In [1]:
import numpy as np
import pandas as pd

### Recommendation System

In [2]:
# Load the arXiv dataset from a CSV path relative to the notebook's working directory.
df = pd.read_csv('dataset/arxiv_data_210930-054931.csv')

In [3]:
# Peek at the first two rows to check the columns that were loaded.
df.head(2)

Unnamed: 0,terms,titles,abstracts
0,['cs.LG'],Multi-Level Attention Pooling for Graph Neural...,Graph neural networks (GNNs) have been widely ...
1,"['cs.LG', 'cs.AI']",Decision Forests vs. Deep Networks: Conceptual...,Deep networks and decision forests (such as ra...


In [4]:
# Show the full (untruncated) title stored under index label 7.
df.loc[7, 'titles']

'Understanding and Resolving Performance Degradation in Graph Convolutional Networks'

In [5]:
# Work on a copy so the frame loaded from disk (`df`) stays untouched.
df2 = df.copy()

In [6]:
# Display the frame without the 'terms' and 'abstracts' columns.
# NOTE: the result is not assigned, so df2 itself still contains every column;
# this cell only renders a preview.
df2.drop(columns=['terms','abstracts'])

Unnamed: 0,titles
0,Multi-Level Attention Pooling for Graph Neural...
1,Decision Forests vs. Deep Networks: Conceptual...
2,Power up! Robust Graph Convolutional Network v...
3,Releasing Graph Neural Networks with Different...
4,Recurrence-Aware Long-Term Cognitive Network f...
...,...
56176,Mining Spatio-temporal Data on Industrializati...
56177,Wav2Letter: an End-to-End ConvNet-based Speech...
56178,Deep Reinforcement Learning with Double Q-lear...
56179,Generalized Low Rank Models


In [8]:
from sentence_transformers import SentenceTransformer, util


In [9]:
# Instantiate the sentence-embedding model from the 'all-MiniLM-L6-v2' checkpoint name.
# The model files are downloaded when they are not already available locally.
model = SentenceTransformer('all-MiniLM-L6-v2')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [10]:
# The corpus to embed: the 'titles' column of df2 (a pandas Series that keeps df2's index).
sentences = df2['titles']

In [11]:
# Display the Series to confirm its length and dtype.
sentences

0        Multi-Level Attention Pooling for Graph Neural...
1        Decision Forests vs. Deep Networks: Conceptual...
2        Power up! Robust Graph Convolutional Network v...
3        Releasing Graph Neural Networks with Different...
4        Recurrence-Aware Long-Term Cognitive Network f...
                               ...                        
56176    Mining Spatio-temporal Data on Industrializati...
56177    Wav2Letter: an End-to-End ConvNet-based Speech...
56178    Deep Reinforcement Learning with Double Q-lear...
56179                          Generalized Low Rank Models
56180    Chi-square Tests Driven Method for Learning th...
Name: titles, Length: 56181, dtype: object

In [12]:
# Encode every title with the model; the result holds one embedding per entry of `sentences`,
# in the same order.
# NOTE: the identifier is spelled `word_embaddings`; later cells depend on this exact
# spelling, so any rename must be applied to every reference.
word_embaddings = model.encode(sentences)

In [15]:
# First component of the first title's embedding.
word_embaddings[0, 0]

np.float32(0.06643405)

In [18]:
# Preview the first two (title, embedding) pairs to check that they line up.
c = 0
for sentence, embadding in zip(sentences, word_embaddings):
    print("sentence", sentence, embadding)
    if c < 1:
        # Only one pair printed so far: count it and keep going.
        c += 1
        continue
    # Second pair has been printed; stop here.
    break

sentence Multi-Level Attention Pooling for Graph Neural Networks: Unifying Graph Representations with Multiple Localities [ 6.64340481e-02 -4.95459773e-02  6.38808534e-02 -2.82659121e-02
  6.94541484e-02  5.14023937e-02 -3.92372571e-02 -6.01334982e-02
 -5.88572398e-03 -4.16622385e-02  2.61420552e-02 -7.35904798e-02
 -1.88339911e-02  5.29376417e-02 -3.59026715e-02  7.48135746e-02
  6.94931522e-02 -2.33735982e-03 -1.75370313e-02 -1.01530412e-03
  6.97972402e-02 -5.65803647e-02  5.96184321e-02  4.10587527e-02
  8.57779458e-02  4.63837609e-02 -5.10073379e-02 -4.05410267e-02
  4.53708395e-02  6.67746365e-03  1.06340572e-01  6.82450235e-02
  2.95312214e-03  6.91113174e-02 -1.53357228e-02  7.92678148e-02
 -1.49697274e-01  2.57561449e-02  3.93775515e-02  5.25156334e-02
 -5.99693740e-04  4.33895662e-02  3.59695293e-02  8.13651308e-02
  8.18701312e-02  2.40489673e-02 -2.82293018e-02  9.89285260e-02
  1.97786391e-02 -4.69627604e-02 -8.14746693e-03 -5.35850525e-02
 -5.72294742e-02 -6.20121732e-02 

In [19]:
import pickle

# Persist the notebook artifacts, one pickle file per object, in the working directory.
# NOTE(review): unpickling can execute arbitrary code, so only load these files
# from a trusted source.
_artifacts = (
    ("word_embaddings.pkl", word_embaddings),
    ("sentences.pkl", sentences),
    ("model.pkl", model),
)
for _filename, _artifact in _artifacts:
    with open(_filename, 'wb') as f:
        pickle.dump(_artifact, f)

In [23]:
def predict(user_input, top_k=5):
    """Return the corpus titles most similar to ``user_input``.

    Relies on the notebook-level ``model``, ``util``, ``word_embaddings`` and
    ``sentences`` objects created in earlier cells.

    Args:
        user_input: Query text to embed with ``model``.
        top_k: Maximum number of matches to return. Values larger than the
            corpus are clamped to the corpus size; values <= 0 yield ``[]``.

    Returns:
        A list of ``(title, score)`` tuples, best match first, where ``score``
        is the cosine similarity converted to a Python float.
    """
    # Encode user input
    user_embedding = model.encode(user_input, convert_to_tensor=True)

    # The corpus embeddings live under the notebook's spelling `word_embaddings`.
    # The previously referenced `word_embeddings` is never defined, so the
    # function raised NameError on a fresh kernel.
    cosine_scores = util.cos_sim(user_embedding, word_embaddings)[0]

    # topk raises when k exceeds the number of candidates, so clamp it.
    k = min(top_k, len(cosine_scores))
    if k <= 0:
        return []

    top_results = cosine_scores.topk(k)

    # topk indices are row positions in the embedding matrix, so look titles up
    # positionally (.iloc) rather than by index label.
    return [
        (sentences.iloc[idx.item()], float(score))
        for score, idx in zip(top_results.values, top_results.indices)
    ]

In [24]:
# Smoke-test the recommender with a sample query and list each match with its score.
query = "Transformer model for sequence learning"
results = predict(query)

for text, score in results:
    print(f"{text} (Similarity: {score:.4f})")

Recurrent Transform Learning (Similarity: 0.6936)
Decision Transformer: Reinforcement Learning via Sequence Modeling (Similarity: 0.6826)
Decision Transformer: Reinforcement Learning via Sequence Modeling (Similarity: 0.6826)
Learning Accurate Integer Transformer Machine-Translation Models (Similarity: 0.6774)
Few-shot Sequence Learning with Transformers (Similarity: 0.6690)
