In [1]:
import pandas as pd
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

# Load Data

In [2]:
# Load the preprocessed arXiv records produced by an earlier pipeline step.
# NOTE: unpickling can execute arbitrary code -- only load files you created yourself.
processed_data_path = '../data/processed_arxiv_data.pkl'
df = pd.read_pickle(processed_data_path)

# Vectorization (TF-IDF)

In [3]:
# Vocabulary pruning settings for the TF-IDF model, kept together so they are easy to tune.
tfidf_params = {
    'max_features': 500000,  # Upper bound on the vocabulary size
    'max_df': 0.8,           # Ignore terms that appear in more than 80% of documents
    'min_df': 5,             # Ignore terms that appear in fewer than 5 documents
}
vectorizer = TfidfVectorizer(**tfidf_params)

In [4]:
# Learn the vocabulary and IDF weights from the cleaned abstracts, then encode
# each abstract as one row of the document-term matrix.
abstracts = df['abstract_clean']
tfidf_matrix = vectorizer.fit_transform(abstracts)
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")

TF-IDF matrix shape: (998709, 165273)


In [5]:
# Persist the fitted vectorizer and the document-term matrix so later sessions
# can reuse them without refitting.
artifacts = (
    ('../data/tfidf_vectorizer.pkl', vectorizer),
    ('../data/tfidf_matrix.pkl', tfidf_matrix),
)
for artifact_path, artifact in artifacts:
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)

# Search Function

In [6]:
def clean_text(text, stopwords=None):
    """Normalize free text before it is passed to the TF-IDF vectorizer.

    The text is lowercased, every character in ``string.punctuation`` is
    removed, and runs of whitespace are collapsed to single spaces.

    Parameters
    ----------
    text : str
        Raw text, e.g. a user query.
    stopwords : iterable of str, optional
        Words to drop after normalization (matched case-insensitively).
        Pass a collection of words, not a single string. When omitted or
        empty, no words are removed, which matches the previous behavior.

    Returns
    -------
    str
        The normalized text with tokens joined by single spaces.
    """
    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    # split() with no arguments already discards leading/trailing whitespace.
    words = text.split()
    if stopwords:
        # Build the lookup set once; the old code rebuilt an empty set for
        # every word, so its filter could never remove anything.
        stop_set = frozenset(word.lower() for word in stopwords)
        words = [word for word in words if word not in stop_set]
    return ' '.join(words)

In [7]:
def search_papers(query, vectorizer, tfidf_matrix, df, top_k=10):
    """Return the ``top_k`` rows of ``df`` most similar to ``query``.

    Parameters
    ----------
    query : str
        Free-text search query; it is normalized with ``clean_text`` first.
    vectorizer
        Fitted vectorizer whose ``transform`` maps text into the same feature
        space as ``tfidf_matrix``.
    tfidf_matrix
        Document vectors, one row per document. Row ``i`` must correspond to
        ``df.iloc[i]`` because results are selected by position.
    df : pandas.DataFrame
        Document metadata aligned row-for-row with ``tfidf_matrix``.
    top_k : int, default 10
        Maximum number of results. ``0`` returns an empty frame.

    Returns
    -------
    pandas.DataFrame
        A copy of the best-matching rows, ordered by descending similarity,
        with an added ``similarity_score`` column. No minimum score is
        applied, so rows with a score of 0 can be included.

    Raises
    ------
    ValueError
        If ``top_k`` is negative.
    """
    if top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")

    query_clean = clean_text(query)  # Same normalization style as the indexed text
    query_vec = vectorizer.transform([query_clean])  # Vectorize with the fitted vocabulary

    # Similarity between the query and every document, as a flat 1-D array.
    cosine_sim = cosine_similarity(query_vec, tfidf_matrix).flatten()

    # Sort descending first and truncate afterwards. The previous
    # ``argsort()[-top_k:]`` turned into ``[-0:]`` (the whole array) when
    # top_k was 0, returning every document instead of none.
    top_indices = cosine_sim.argsort()[::-1][:top_k]

    results = df.iloc[top_indices].copy()
    results['similarity_score'] = cosine_sim[top_indices]

    return results

# Example

In [8]:
# Run one sample query against the fitted index and keep the five best matches.
example_query = "papers about curing diseases with AI"
top_papers = search_papers(
    example_query,
    vectorizer,
    tfidf_matrix,
    df,
    top_k=5,
)

In [9]:
# Show the ranked matches; as the cell's last expression the frame is rendered as a table.
top_papers

Unnamed: 0,id,authors,title,comments,journal-ref,doi,categories,abstract,update_date,abstract_clean,title_clean,primary_category,num_versions,first_version_date,similarity_score
910080,1711.0307,"Mikhail Hayhoe, Fady Alajaji, Bahman Gharesifard",Curing Epidemics on Networks using a Polya Con...,"12 pages, 11 figures",,,math.OC cs.SI math.PR,We study the curing of epidemics of a networ...,2017-11-09,study curing epidemics network contagion model...,curing epidemics networks using polya contagio...,math.OC,1,"Wed, 8 Nov 2017 17:53:18 GMT",0.417213
971690,1804.09997,"Jinyang Gao, Wei Wang, Meihui Zhang, Gang Chen...",PANDA: Facilitating Usable AI Development,,,,cs.AI cs.DB,Recent advances in artificial intelligence (...,2018-04-27,recent advances artificial intelligence ai mac...,panda facilitating usable ai development,cs.AI,1,"Thu, 26 Apr 2018 11:37:03 GMT",0.369809
957366,1803.07233,"Ziv Epstein, Blakeley H. Payne, Judy Hanwen Sh...",Closing the AI Knowledge Gap,"8 pages, 3 figures, under review",,,cs.CY cs.AI,AI researchers employ not only the scientifi...,2018-03-21,ai researchers employ scientific method method...,closing ai knowledge gap,cs.CY,1,"Tue, 20 Mar 2018 03:16:10 GMT",0.363059
473267,1310.8264,"Ralf Landgraf, Martin Rudolph, Robert Scherzer...",Modelling and simulation of adhesive curing pr...,final paper published at Journal of Computatio...,Comp Mech 54(2):547-565 (2014),10.1007/s00466-014-1005-5,cond-mat.mtrl-sci,This work deals with the modelling and simul...,2014-07-08,work deals modelling simulation curing phenome...,modelling simulation adhesive curing processes...,cond-mat.mtrl-sci,2,"Wed, 30 Oct 2013 18:39:22 GMT",0.324882
941573,1802.02033,"Robert Feldt, Francisco G. de Oliveira Neto, R...",Ways of Applying Artificial Intelligence in So...,,,,cs.SE,As Artificial Intelligence (AI) techniques h...,2018-02-08,artificial intelligence ai techniques powerful...,ways applying artificial intelligence software...,cs.SE,2,"Tue, 6 Feb 2018 16:20:17 GMT",0.323755
