In [1]:
import sys
sys.path.append('../')
import pandas as pd
import numpy as np
from tqdm import tqdm
from tenacity import retry, stop_after_attempt, wait_random_exponential
import os

import openai

from utils.embedding_utils import get_embedding, sliding_window

In [2]:
df = pd.read_csv("../data/db/df_text.csv")
df.head()

Unnamed: 0,source_type,text
0,paper,\begin{document}\n\n\preprint{MIT-CTP 5315}\n\...
1,paper,% ============================================...
2,paper,\begin{document}\n\n%\preprint{}\n\n\title{Pre...
3,paper,Once detector data has been processed and high...
4,paper,\begin{document}\n\preprint{MIT--CTP 5223}\n\n...


In [3]:
df_filtered = df

In [4]:
window_size = 256  # Length of text chunks
stride = 192 # Stride of sliding window; have a bit of overlap

text_chunks = []
embeddings = []

for i in tqdm(range(len(df_filtered))):
    text = df_filtered['text'].values[i].replace('\n', ' ').strip()
    text_chunks_i = list(sliding_window(text, window_size, stride))
    if len(text_chunks_i) > 80:  # If text too long, truncate
        text_chunks_i = text_chunks_i[:80]
    embeddings_i = [get_embedding(text) for text in text_chunks_i]
    text_chunks += text_chunks_i
    embeddings += embeddings_i

100%|████████████████████████████████████████████████████████| 68/68 [09:23<00:00,  8.29s/it]


In [5]:
embeddings = np.array(embeddings, dtype=np.float64)

In [6]:
data = [text_chunks]

# Transpose the data to have the inner lists as rows
transposed_data = list(map(list, zip(*data)))

# Column names for the DataFrame
columns = ['text_chunks']

# Create the DataFrame
df = pd.DataFrame(transposed_data, columns=columns)

In [7]:
df.to_csv('../data/db/text_chunks.csv', index=False)
np.save('../data/db/embeddings.npy', embeddings)

In [8]:
from sklearn.metrics.pairwise import cosine_similarity
from utils.embedding_utils import get_embedding

def semantic_search(query_embedding, embeddings):
    """Load context prompt."""
    similarities = cosine_similarity([query_embedding], embeddings)[0]
    ranked_indices = np.argsort(-similarities)
    return ranked_indices

In [16]:
emb = get_embedding("What public engagement have you done?")
semantic_search(emb, embeddings)

array([2848, 2858, 1645, ...,  910, 1509, 1654])

In [17]:
text_chunks[semantic_search(emb, embeddings)[0]]

'Public Engagement - Jesse Thaler, Jesse Thaler, About, Group, Research, Engagement, FAQ, CV, Contact, Public Engagement, Public Talks, Advocacy, Essays, I engage with the public through, classroom visits, opinion pieces, and lively presentations related to my research. I also advocate for the importance of open access to scientific data., Engagement CV, Public Talks, “Collision Course: Artificial Intelligence meets Fundamental Physics”, Keynote Presentation, “Tommy Flowers Network Conference”, Virtual, October 2020, “Confronting the Invisible Universe”, Public Talk, Aspen Center for Physics, March 2017, “The Higgs Boson: Triumph of the Standard Model”, 24th Annual Kavli Frontiers of Science, National Academy of Sciences, U.C. Irvine, November 2012, Advocacy, “Expanding the Space of Machine Learning for Physics”, APS Topical Group on Data Science Newsletter, Winter 2023, “Designing an AI Physicist”, Opinion Viewpoint, CERN Courier, September-October 2021, “Slow and Steady”, coauthored 