# Semantic Ranking

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load data
# Read the processed arXiv papers table from disk; the path is relative to
# the notebook's working directory.
data_path_processed = '../data/processed/arxiv_papers.csv'
data = pd.read_csv(filepath_or_buffer=data_path_processed)

# Report (rows, columns) as a quick sanity check, then preview the first rows.
print(f"Shape of data: {data.shape}")
data.head(5)

Shape of data: (1000, 6)


Unnamed: 0,id,title,summary,category,published,combined
0,2012.11510v1,Design Rule Checking with a CNN Based Feature ...,Design rule checking (DRC) is getting increasi...,cs.LG,2020,design rule checking with a cnn based feature ...
1,2012.11638v1,Unsupervised in-distribution anomaly detection...,Anomaly detection is a key application of mach...,cs.LG|hep-ex|physics.data-an,2020,unsupervised in-distribution anomaly detection...
2,2012.11325v1,Detecting Botnet Attacks in IoT Environments: ...,The increased reliance on the Internet and the...,cs.CR|cs.LG|cs.NI,2020,detecting botnet attacks in iot environments: ...
3,2012.11327v1,Collaborative residual learners for automatic ...,Clinical coding is an administrative process t...,cs.IR|cs.LG,2020,collaborative residual learners for automatic ...
4,2012.11333v1,Ensemble model for pre-discharge icd10 coding ...,The translation of medical diagnosis to clinic...,cs.IR|cs.LG,2020,ensemble model for pre-discharge icd10 coding ...


## TF-IDF Baseline

In [3]:
# TF-IDF Vectorization
# English stop words are dropped and the vocabulary is capped at 10,000 terms.
vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)

# read_csv turns empty 'combined' fields into NaN, and the vectorizer raises
# ValueError on NaN documents. Treat missing text as an empty document so the
# fit succeeds; fillna keeps every row, so matrix row i still corresponds to
# data row i.
combined_text = data['combined'].fillna('')

# fit_transform learns the vocabulary and IDF weights from the corpus and
# returns the documents as a sparse document-term matrix.
tfidf_matrix = vectorizer.fit_transform(combined_text)

In [4]:
# Example query vectorization
# The query is wrapped in a one-element list because the vectorizer expects an
# iterable of documents. transform (not fit_transform) is used so the query is
# encoded with the vocabulary already learned from the papers.
query = "self-supervised learning for image representation"
query_as_corpus = [query]
query_vector = vectorizer.transform(query_as_corpus)

In [5]:
# Compute cosine similarity between query and documents
# The result has one row per query; there is a single query, so row 0 holds
# that query's score against every document.
query_doc_similarity = cosine_similarity(query_vector, tfidf_matrix)
tfidf_scores = query_doc_similarity[0]

# Store the scores next to the papers so they can be ranked by relevance.
data['tfidf_score'] = tfidf_scores

In [6]:
# Rank the papers from highest to lowest TF-IDF similarity to the query.
data_sorted = data.sort_values('tfidf_score', ascending=False)

# show title and score of top 5 results
top_tfidf_results = data_sorted.head(5)[['title', 'tfidf_score']]

# Lift the column-width limit only for this display so long titles are not
# truncated; the option is restored when the block exits.
with pd.option_context('display.max_colwidth', None):
    display(top_tfidf_results)

Unnamed: 0,title,tfidf_score
966,CompRess: Self-Supervised Learning by Compressing Representations,0.50544
384,Look Listen and Attend: Co-Attention Network for Self-Supervised Audio-Visual Representation Learning,0.248077
510,Uncovering the structure of clinical EEG signals with self-supervised learning,0.229153
814,SLM: Learning a Discourse Language Representation with Sentence Unshuffling,0.207946
93,Self-supervised Body Image Acquisition Using a Deep Neural Network for Sensorimotor Prediction,0.204004
