In [None]:
import json
from typing import List

import pandas as pd
import torch

from src.embedding import EmbeddingModel
from src.engine import SearchEngine

# Report the PyTorch build and whether a CUDA device is usable, so the
# notebook output records the environment the embeddings were computed in.
cuda_available = torch.cuda.is_available()

print("Pytorch version:", torch.__version__)
print("CUDA enabled:", cuda_available)

# The device name can only be queried when CUDA is actually available.
if cuda_available:
    print("Device:", torch.cuda.get_device_name())

In [None]:
# Create the search engine and keep a reference to the embedding model it
# exposes; the cells below use `embedding_model` to embed each article.
search_engine = SearchEngine()
embedding_model: EmbeddingModel = search_engine.embedding_model
# Bare trailing expression: the notebook displays the `model` attribute.
# NOTE(review): presumably the underlying network wrapped by EmbeddingModel —
# confirm against src.embedding.
embedding_model.model

In [None]:
# Download data from https://data.world/liz-friedman/arxiv-stem-scholarly-articles
# The snapshot is parsed as JSON Lines: one JSON object per line.
with open("arxiv-metadata-oai-snapshot-2020-08-14.json", "r", encoding="utf-8") as file:
    # Skip blank lines (e.g. a trailing newline at the end of the file):
    # json.loads raises on an empty document.
    data = [json.loads(line) for line in file if line.strip()]

# Build the working frame in a single chain instead of mutating it in place:
#   1. drop the metadata columns that are not needed downstream,
#   2. drop every row that still has a missing value,
#   3. parse `update_date` into datetimes,
#   4. order the rows newest-first by `update_date`.
dataframe = (
    pd.DataFrame(data)
    .drop(
        columns=[
            "id", "submitter", "comments", "report-no", "categories",
            "license", "versions", "authors_parsed", "doi",
        ]
    )
    .dropna()
    # `assign` works on a fresh copy, so the column is not written into the
    # filtered result of `dropna` (which would warn about chained assignment).
    .assign(update_date=lambda frame: pd.to_datetime(frame["update_date"]))
    .sort_values(by="update_date", ascending=False)
)

def clean_up_text(text: str) -> str:
    """Normalise the whitespace of ``text``.

    Every run of whitespace (spaces, tabs, newlines, ...) is collapsed into a
    single space, and leading/trailing whitespace is removed.

    Args:
        text: The raw string to normalise.

    Returns:
        The words of ``text`` joined by single spaces; an empty string when
        ``text`` is empty or contains only whitespace.
    """
    # str.split() without a separator splits on any whitespace run and never
    # yields empty pieces, so no separate tab/newline replacement or strip
    # is required.
    words = text.split()
    return " ".join(words)


# Keep only the first 5000 rows (the frame was sorted newest-first above).
# The explicit copy makes the subset an independent frame: assigning columns
# to a plain slice of the full frame triggers pandas' SettingWithCopyWarning.
dataframe = dataframe.iloc[:5000].copy()
print("Number of entries:", len(dataframe))

# Collapse tabs, newlines and repeated spaces in the free-text columns.
dataframe["abstract"] = dataframe["abstract"].apply(clean_up_text)
dataframe["title"] = dataframe["title"].apply(clean_up_text)
dataframe.head()

In [None]:
def create_embeddings(row: pd.Series) -> List[float]:
    """Compute the embedding for a single article row.

    The title and abstract are combined as ``"<title>. <abstract>"`` and the
    resulting string is passed to the notebook-level
    ``embedding_model.get_embedding``.

    Args:
        row: A row providing string values under ``"title"`` and
            ``"abstract"``.

    Returns:
        Whatever ``embedding_model.get_embedding`` returns for the combined
        text (annotated here as a list of floats).
    """
    title, abstract = row["title"], row["abstract"]
    document = ". ".join((title, abstract))
    return embedding_model.get_embedding(document)


# Embed every article: `create_embeddings` is called once per row and the
# results are stored in a new "embedding" column.
row_embeddings = dataframe.apply(create_embeddings, axis="columns")
dataframe["embedding"] = row_embeddings
dataframe.head()

In [None]:
# Pass the prepared frame (including the "embedding" column) to the search engine.
# NOTE(review): presumably this stores the rows and their embeddings in the
# engine's backing database — confirm against src.engine.
search_engine.fill_embedding_database(dataframe)