In [1]:
import os

import numpy as np
import openai
import pandas as pd
from tqdm import tqdm

In [3]:
# Take the key from the OPENAI_API_KEY environment variable so the secret is
# never typed into (or saved with) the notebook. When the variable is unset
# this falls back to the empty string, which is what was assigned before.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

In [4]:
def sliding_window(text, window_size, stride):
    """Yield overlapping chunks of whitespace-separated tokens from ``text``.

    Args:
        text: String to chunk; it is tokenised with ``str.split()``.
        window_size: Maximum number of tokens per chunk. Must be positive.
        stride: Number of tokens between consecutive chunk starts. Must be
            positive.

    Yields:
        Each chunk as one string, its tokens joined by a single space.
        A new chunk is started every ``stride`` tokens until the start index
        passes the last token, so trailing chunks may contain fewer than
        ``window_size`` tokens.

    Raises:
        ValueError: If ``window_size`` or ``stride`` is not positive. Because
            this is a generator, the error surfaces on the first iteration.
    """
    # A non-positive stride would never advance the window (endless loop),
    # and a non-positive window size cannot produce a meaningful chunk.
    if window_size <= 0:
        raise ValueError(f"window_size must be positive, got {window_size!r}")
    if stride <= 0:
        raise ValueError(f"stride must be positive, got {stride!r}")

    tokens = text.split()
    for window_start in range(0, len(tokens), stride):
        # Slicing clamps at the end of the list, so no explicit min() is needed.
        yield ' '.join(tokens[window_start:window_start + window_size])
        
def get_embedding(text, model="text-embedding-ada-002"):
    """Request an embedding for ``text`` from the OpenAI embeddings endpoint.

    Newlines in ``text`` are replaced by spaces before the request is sent.

    Args:
        text: The string to embed.
        model: Name of the embedding model passed to the API.

    Returns:
        The ``'embedding'`` field of the first entry in the response's
        ``'data'`` list.
    """
    single_line = text.replace("\n", " ")
    response = openai.Embedding.create(input=[single_line], model=model)
    first_item = response['data'][0]
    return first_item['embedding']


In [5]:
# Load the text table; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="../data/text/df_text.csv")
# Preview the first rows to check the columns that were read.
df.head()

Unnamed: 0,source_type,text
0,paper,"%\RequirePackage{lineno}\n\documentclass[aps,p..."
1,paper,"\documentclass[aps,prd,floatfix,preprintnumber..."
2,paper,"\documentclass[letterpaper,11pt]{article}\n\pd..."
3,paper,"\documentclass[11pt,letterpaper]{article}\n\pd..."
4,paper,"\documentclass[aps,twocolumn,nofootinbib,super..."


In [5]:
# Drop every row whose source_location matches 'hep', then order the
# remaining rows by source_location.
df_filtered = (
    df.loc[~df['source_location'].str.contains('hep')]
    .sort_values(by=['source_location'])
)

In [6]:
# Number of rows, counted from the end of the sorted frame, to work with.
last = 35
# Preview the start of that tail selection.
df_filtered.iloc[-last:].head()

Unnamed: 0,source_type,source_location,text
22,paper,https://arxiv.org/abs/1908.08542.pdf,MIT-CTP 5129\nExploring the Space of Jets with...
37,paper,https://arxiv.org/abs/1908.08949.pdf,MIT-CTP 5137\nQuantum Algorithms for Jet Clust...
125,paper,https://arxiv.org/abs/1909.00009.pdf,MIT{CTP 5143\nCircumnavigating Collinear Super...
18,paper,https://arxiv.org/abs/1911.04491.pdf,MIT-CTP 5150\nCutting Multiparticle Correlator...
74,paper,https://arxiv.org/abs/1911.09107.pdf,MIT-CTP 5155\nOmniFold: A Method to Simultaneo...


In [7]:
# Chunking parameters, measured in whitespace-separated tokens.
window_size = 256  # maximum tokens per chunk
stride = 128       # step between chunk starts; half the window, so neighbours overlap

text_chunks = []
embeddings = []

# The selection of documents does not change inside the loop, so slice it once
# instead of re-slicing the frame on every iteration.
texts = df_filtered[-last:]['text'].values

for raw_text in tqdm(texts):
    text = raw_text.replace('\n', ' ').strip()
    text_chunks_i = list(sliding_window(text, window_size, stride))
    # One request per chunk, in chunk order, so text_chunks and embeddings
    # stay index-aligned.
    embeddings_i = [get_embedding(chunk) for chunk in text_chunks_i]
    text_chunks += text_chunks_i
    embeddings += embeddings_i

100%|████████████████████████████████████████████████████████| 35/35 [02:38<00:00,  4.52s/it]


In [187]:
# Convert the collected embeddings into a float64 NumPy array
# (Python's float maps to float64 in NumPy).
embeddings = np.array(embeddings, dtype=float)

In [189]:
# Pair every text chunk with its embedding, one row per chunk.
#
# The embeddings are stored as plain Python lists rather than ndarrays: when
# the frame is written to CSV each cell is stringified, and NumPy summarises
# arrays with more than 1000 elements (its default print threshold) as
# "[a b ... y z]", which would silently drop most of the vector. A list is
# written out in full.
#
# Building from a dict also makes pandas raise if the two sequences differ in
# length, instead of zip() silently truncating to the shorter one.
# NOTE: this rebinds `df`, which earlier in the notebook held the loaded text table.
df = pd.DataFrame(
    {
        'text_chunks': text_chunks,
        'embeddings': np.asarray(embeddings).tolist(),
    }
)

In [190]:
# Write the chunk table to disk without the row index.
df.to_csv(path_or_buf='../data/embeddings/text_chunks.csv', index=False)

In [192]:
# Save the embeddings array in NumPy's binary .npy format.
np.save(file='../data/embeddings/embeddings.npy', arr=embeddings)