In [15]:
import pandas as pd
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

# Load Data

In [16]:
# Load the preprocessed paper table; later cells read its 'title_clean' and
# 'abstract_clean' columns and index it by row position.
# NOTE(review): read_pickle can execute code embedded in the file — only load
# a pickle produced by this project's own preprocessing step.
df = pd.read_pickle('../data/processed_arxiv_data.pkl')

# Vectorization (TF-IDF)

In [17]:
# TF-IDF configuration for paper titles (fitted in the next cell).
#   min_df=2            -> drop terms found in fewer than 2 documents
#   max_df=0.8          -> drop terms found in more than 80% of documents
#   max_features=500000 -> upper bound on the vocabulary size
title_vectorizer = TfidfVectorizer(min_df=2, max_df=0.8, max_features=500000)

In [18]:
# Fit the title vectorizer on the cleaned titles and build the TF-IDF matrix
# (one row per row of df, in the same order).
# assumes df['title_clean'] contains only strings (no NaN) — TODO confirm
title_tfidf = title_vectorizer.fit_transform(df['title_clean'])

# Report (n_documents, n_terms) as a quick sanity check on the vocabulary size.
print(f"TF-IDF title matrix shape: {title_tfidf.shape}")

TF-IDF title matrix shape: (998709, 100891)


In [19]:
# Persist the fitted title vectorizer and its TF-IDF matrix so they can be
# reloaded without refitting.
for pickle_path, artifact in (
    ('../data/tfidf_title_vectorizer.pkl', title_vectorizer),
    ('../data/tfidf_title.pkl', title_tfidf),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(artifact, f)

In [20]:
# TF-IDF configuration for paper abstracts (fitted in the next cell).
#   min_df=5            -> drop terms found in fewer than 5 documents
#   max_df=0.8          -> drop terms found in more than 80% of documents
#   max_features=500000 -> upper bound on the vocabulary size
vectorizer = TfidfVectorizer(min_df=5, max_df=0.8, max_features=500000)

In [21]:
# Fit the abstract vectorizer on the cleaned abstracts and build the TF-IDF
# matrix (one row per row of df, in the same order).
# assumes df['abstract_clean'] contains only strings (no NaN) — TODO confirm
abstract_tfidf = vectorizer.fit_transform(df['abstract_clean'])

# Report (n_documents, n_terms) as a quick sanity check on the vocabulary size.
print(f"TF-IDF matrix shape: {abstract_tfidf.shape}")

TF-IDF matrix shape: (998709, 165273)


In [22]:
# Persist the fitted abstract vectorizer and its TF-IDF matrix so they can be
# reloaded without refitting.
for pickle_path, artifact in (
    ('../data/tfidf_vectorizer.pkl', vectorizer),
    ('../data/tfidf_abstract.pkl', abstract_tfidf),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(artifact, f)

In [23]:
# Load later
# Restores the saved vectorizers and TF-IDF matrices without refitting.
# NOTE(review): unpickling can execute arbitrary code — only load files that
# were written by this notebook.
def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


vectorizer = _read_pickle('../data/tfidf_vectorizer.pkl')
abstract_tfidf = _read_pickle('../data/tfidf_abstract.pkl')
title_vectorizer = _read_pickle('../data/tfidf_title_vectorizer.pkl')
title_tfidf = _read_pickle('../data/tfidf_title.pkl')

# Search Function

In [24]:
def clean_text(text, stop_words=None):
    """Normalize free text before it is handed to a TF-IDF vectorizer.

    The text is lower-cased, every character in ``string.punctuation`` is
    deleted (removed outright, not replaced by a space, so
    ``"state-of-the-art"`` becomes ``"stateoftheart"``), and any run of
    whitespace is collapsed to a single space.

    Parameters
    ----------
    text : str
        Raw text to normalize.
    stop_words : iterable of str, optional
        Lower-case words to drop after punctuation removal. Pass a
        collection (set, list, ...), not a single string. When omitted,
        every word is kept, which is what the previous implementation did
        (it filtered against an always-empty set).

    Returns
    -------
    str
        The remaining words joined by single spaces; ``""`` if none remain.
    """
    # Build the exclusion set once per call instead of once per word.
    excluded = frozenset(stop_words) if stop_words is not None else frozenset()

    without_punct = text.lower().translate(
        str.maketrans('', '', string.punctuation)
    )
    # split() with no arguments already discards leading/trailing whitespace.
    return ' '.join(
        word for word in without_punct.split() if word not in excluded
    )

In [25]:
def search_papers(query, vectorizer, tfidf_matrix, df, top_k=10):
    """Return the rows of ``df`` whose TF-IDF vectors are closest to ``query``.

    Parameters
    ----------
    query : str
        Free-text search query; it is normalized with ``clean_text`` first.
    vectorizer : fitted vectorizer
        Object whose ``transform`` maps a list of strings to TF-IDF vectors.
    tfidf_matrix : matrix
        Document vectors compared against the query. Expected to be the
        matrix produced by ``vectorizer``, with row ``i`` describing
        ``df.iloc[i]`` (rows are matched to ``df`` by position).
    df : pandas.DataFrame
        Source table the results are taken from.
    top_k : int, default 10
        Maximum number of rows to return. Zero or a negative value yields an
        empty result.

    Returns
    -------
    pandas.DataFrame
        A copy of the best-matching rows, highest similarity first, with an
        added ``similarity_score`` column. Rows are chosen purely by rank, so
        rows with a score of 0 can appear when fewer than ``top_k`` documents
        share any term with the query.
    """
    query_clean = clean_text(query)
    query_vec = vectorizer.transform([query_clean])

    # One query row against every document row -> flatten to a 1-D score array.
    cosine_sim = cosine_similarity(query_vec, tfidf_matrix).flatten()

    # Reverse the ascending order first and then take a prefix. The earlier
    # form `argsort()[-top_k:][::-1]` degenerates to `[-0:]` (the whole
    # array) for top_k == 0 and slices from the wrong end for negative
    # values; for positive top_k both forms select the same indices.
    limit = max(top_k, 0)
    top_indices = cosine_sim.argsort()[::-1][:limit]

    results = df.iloc[top_indices].copy()
    results['similarity_score'] = cosine_sim[top_indices]

    return results

# Example

In [26]:
# Abstract search: rank papers by similarity between the query and each abstract.
search_papers(
    query="papers about curing diseases with AI",
    vectorizer=vectorizer,
    tfidf_matrix=abstract_tfidf,
    df=df,
    top_k=5,
)

Unnamed: 0,id,authors,title,comments,journal-ref,doi,categories,abstract,update_date,abstract_clean,title_clean,primary_category,num_versions,first_version_date,similarity_score
910080,1711.0307,"Mikhail Hayhoe, Fady Alajaji, Bahman Gharesifard",Curing Epidemics on Networks using a Polya Con...,"12 pages, 11 figures",,,math.OC cs.SI math.PR,We study the curing of epidemics of a networ...,2017-11-09,study curing epidemics network contagion model...,curing epidemics networks using polya contagio...,math.OC,1,"Wed, 8 Nov 2017 17:53:18 GMT",0.417213
971690,1804.09997,"Jinyang Gao, Wei Wang, Meihui Zhang, Gang Chen...",PANDA: Facilitating Usable AI Development,,,,cs.AI cs.DB,Recent advances in artificial intelligence (...,2018-04-27,recent advances artificial intelligence ai mac...,panda facilitating usable ai development,cs.AI,1,"Thu, 26 Apr 2018 11:37:03 GMT",0.369809
957366,1803.07233,"Ziv Epstein, Blakeley H. Payne, Judy Hanwen Sh...",Closing the AI Knowledge Gap,"8 pages, 3 figures, under review",,,cs.CY cs.AI,AI researchers employ not only the scientifi...,2018-03-21,ai researchers employ scientific method method...,closing ai knowledge gap,cs.CY,1,"Tue, 20 Mar 2018 03:16:10 GMT",0.363059
473267,1310.8264,"Ralf Landgraf, Martin Rudolph, Robert Scherzer...",Modelling and simulation of adhesive curing pr...,final paper published at Journal of Computatio...,Comp Mech 54(2):547-565 (2014),10.1007/s00466-014-1005-5,cond-mat.mtrl-sci,This work deals with the modelling and simul...,2014-07-08,work deals modelling simulation curing phenome...,modelling simulation adhesive curing processes...,cond-mat.mtrl-sci,2,"Wed, 30 Oct 2013 18:39:22 GMT",0.324882
941573,1802.02033,"Robert Feldt, Francisco G. de Oliveira Neto, R...",Ways of Applying Artificial Intelligence in So...,,,,cs.SE,As Artificial Intelligence (AI) techniques h...,2018-02-08,artificial intelligence ai techniques powerful...,ways applying artificial intelligence software...,cs.SE,2,"Tue, 6 Feb 2018 16:20:17 GMT",0.323755


In [27]:
# Title search: rank papers by similarity between the query and each title.
search_papers(
    query="deep learning in medical",
    vectorizer=title_vectorizer,
    tfidf_matrix=title_tfidf,
    df=df,
    top_k=5,
)

Unnamed: 0,id,authors,title,comments,journal-ref,doi,categories,abstract,update_date,abstract_clean,title_clean,primary_category,num_versions,first_version_date,similarity_score
883999,1708.08987,"Mina Rezaei, Haojin Yang and Christoph Meinel",Deep Learning for Medical Image Analysis,Presented in doctoral consortium in the AIME-2...,,,cs.CV,This report describes my research activities...,2017-08-31,report describes research activities hasso pla...,deep learning medical image analysis,cs.CV,1,"Thu, 17 Aug 2017 12:09:12 GMT",0.824845
622272,1505.02,Matthew Lai,Deep Learning for Medical Image Segmentation,,,,cs.LG cs.AI cs.CV,This report provides an overview of the curr...,2015-05-11,report provides overview current state art dee...,deep learning medical image segmentation,cs.LG,1,"Fri, 8 May 2015 11:35:53 GMT",0.767622
820341,1702.05747,"Geert Litjens, Thijs Kooi, Babak Ehteshami Bej...",A Survey on Deep Learning in Medical Image Ana...,Revised survey includes expanded discussion se...,Med Image Anal. (2017) 42:60-88,10.1016/j.media.2017.07.005,cs.CV,"Deep learning algorithms, in particular conv...",2019-01-31,deep learning algorithms particular convolutio...,survey deep learning medical image analysis,cs.CV,2,"Sun, 19 Feb 2017 13:02:28 GMT",0.761767
958824,1803.08691,"Holger R. Roth, Chen Shen, Hirohisa Oda, Masah...",Deep learning and its application to medical i...,Accepted for publication in the journal of the...,"Medical Imaging Technology, Volume 36 (2018), ...",10.11409/mit.36.63,cs.CV,One of the most common tasks in medical imag...,2018-04-10,common tasks medical imaging semantic segmenta...,deep learning application medical image segmen...,cs.CV,1,"Fri, 23 Mar 2018 08:55:10 GMT",0.717212
966989,1804.05296,"Samuel G. Finlayson, Hyung Won Chung, Isaac S....",Adversarial Attacks Against Medical Deep Learn...,,,,cs.CR cs.CY cs.LG stat.ML,The discovery of adversarial examples has ra...,2019-02-05,discovery adversarial examples raised concerns...,adversarial attacks medical deep learning systems,cs.CR,3,"Sun, 15 Apr 2018 02:33:08 GMT",0.694803
