In [None]:
# --- Install dependencies ---
!pip install -q kagglehub pymupdf sentence-transformers scikit-learn pandas faiss-cpu

import kagglehub
import pandas as pd
import numpy as np
import fitz  # PyMuPDF
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import faiss
from google.colab import files


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m59.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m81.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import numpy as np

# Standalone sanity check: fit a linear regression on synthetic data
# (y = 2x + 1 + Gaussian noise) and report the held-out R-squared.

# Seeded generator so the synthetic data -- and therefore the printed score
# and predictions -- are the same on every run.
rng = np.random.default_rng(42)

# Generate some sample data
X = rng.random((100, 1)) * 10
y = 2 * X + 1 + rng.standard_normal((100, 1))

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the model.
# Named `lr_model` rather than `model`: the recommender function in this
# notebook reads a global called `model` and calls `.encode` on it, so binding
# a LinearRegression to that name would break it if this cell is re-run later.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Evaluate the model
score = lr_model.score(X_test, y_test)
print(f"Model R-squared score: {score:.4f}")

# Make predictions
predictions = lr_model.predict(X_test)
print("\nSample predictions:")
# Show at most five rows; never index past the end of the test split.
for i in range(min(5, len(X_test))):
    print(f"Input: {X_test[i][0]:.2f}, Actual: {y_test[i][0]:.2f}, Predicted: {predictions[i][0]:.2f}")

Model R-squared score: 0.9796

Sample predictions:
Input: 8.72, Actual: 17.81, Predicted: 18.41
Input: 9.22, Actual: 18.51, Predicted: 19.40
Input: 2.55, Actual: 6.32, Predicted: 6.32
Input: 6.16, Actual: 11.93, Predicted: 13.39
Input: 1.65, Actual: 3.05, Predicted: 4.54


In [None]:
# ============================================================
# Step 1: Download and Load Dataset
# ============================================================
import json
import os
from itertools import islice

import kagglehub

# Maximum number of records to read from the snapshot (keeps the Colab run
# fast). Set to None to read the whole file.
MAX_PAPERS = 50000

# Download latest version
path = kagglehub.dataset_download("Cornell-University/arxiv")

print("Path to dataset files:", path)

# The metadata snapshot is parsed as JSON Lines: one JSON object per line.
json_path = os.path.join(path, "arxiv-metadata-oai-snapshot.json")

papers = []
# NOTE(review): the file is assumed to be UTF-8 encoded -- confirm against the
# dataset description. An explicit encoding avoids depending on the platform
# default.
with open(json_path, 'r', encoding='utf-8') as f:
    # islice stops after exactly MAX_PAPERS lines (the previous `i > 50000`
    # check let one extra record through, yielding 50001 rows).
    for line in islice(f, MAX_PAPERS):
        paper = json.loads(line)
        papers.append([paper['title'], paper['abstract']])

df = pd.DataFrame(papers, columns=['title', 'abstract'])
# Single text field (title followed by abstract) used by the later TF-IDF and
# embedding steps.
df['text'] = df['title'] + " " + df['abstract']

print("Dataset loaded:", df.shape)
print(df.head(2))

Using Colab cache for faster access to the 'arxiv' dataset.
Path to dataset files: /kaggle/input/arxiv
Dataset loaded: (50001, 3)
                                               title  \
0  Calculation of prompt diphoton production cros...   
1           Sparsity-certifying Graph Decompositions   

                                            abstract  \
0    A fully differential calculation in perturba...   
1    We describe a new algorithm, the $(k,\ell)$-...   

                                                text  
0  Calculation of prompt diphoton production cros...  
1  Sparsity-certifying Graph Decompositions   We ...  


In [None]:
# ============================================================
# Step 3: Embeddings + FAISS Index
# ============================================================
print("\n Building Sentence-BERT embeddings...")

# Encode every "title + abstract" string with the sentence encoder and keep
# the result as a float32 matrix, one row per paper.
model = SentenceTransformer('all-MiniLM-L6-v2')
corpus_texts = df['text'].tolist()
embeddings = np.array(
    model.encode(corpus_texts, show_progress_bar=True)
).astype('float32')

# Normalise each row to unit length (in place) so that the inner-product
# index below ranks by cosine similarity.
faiss.normalize_L2(embeddings)

# `model`, `embeddings` and `dim` are read as globals by recommend_papers().
dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(embeddings)

print("✅ Embeddings + FAISS index ready:", embeddings.shape)


 Building Sentence-BERT embeddings...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1563 [00:00<?, ?it/s]

✅ Embeddings + FAISS index ready: (50001, 384)


In [None]:
# ============================================================
# Step 4: PDF Text Extractor
# ============================================================
def extract_text_from_pdf(pdf_path):
    """Extract the full text of a PDF using PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        str: The text of every page, concatenated in page order with no
        separator inserted between pages. Empty string if no page yields
        any text.
    """
    # The context manager closes the document (and its underlying file
    # handle) even if extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        # join() builds the result in one pass instead of repeated `+=`.
        return "".join(page.get_text() for page in doc)

In [None]:
# ============================================================
# Step 5: Hybrid Recommendation System
# ============================================================
def recommend_papers(pdf_path, top_k=10, shortlist_k=100):
    """
    Print the corpus papers most similar to the given PDF.

    Hybrid recommender:
    1. Use SVD/LSA to shortlist candidates
    2. Use embeddings + FAISS to rerank top papers

    Relies on notebook globals built in other cells, all of which must exist
    before this function is called: `vectorizer`, `svd`, `X_reduced`
    (TF-IDF/SVD stage), `model`, `embeddings`, `dim` (embedding stage) and
    `df` (the paper table).

    Args:
        pdf_path: Path of the query PDF.
        top_k: Number of recommendations to print. Effectively capped at the
            number of shortlisted candidates.
        shortlist_k: Number of candidates kept by the SVD stage before
            reranking.

    Returns:
        None. Results are printed.

    Raises:
        ValueError: If `top_k` or `shortlist_k` is less than 1.
    """
    # shortlist_k == 0 would make the `[-shortlist_k:]` slice below select the
    # WHOLE corpus, and top_k == 0 is meaningless, so reject both up front.
    if top_k < 1 or shortlist_k < 1:
        raise ValueError("top_k and shortlist_k must both be at least 1")

    # Extract PDF text
    input_text = extract_text_from_pdf(pdf_path)
    if not input_text.strip():
        # E.g. an image-only PDF: there is nothing to vectorise or embed.
        print(f"No extractable text found in '{pdf_path}'; no recommendations produced.")
        return

    # ----- Phase 1: SVD-based shortlist -----
    input_vec = vectorizer.transform([input_text])
    input_reduced = svd.transform(input_vec)
    sims_svd = cosine_similarity(input_reduced, X_reduced)[0]
    # argsort is ascending: take the last `shortlist_k` and reverse so the
    # best match comes first. Yields fewer than shortlist_k on a small corpus.
    top_indices_svd = sims_svd.argsort()[-shortlist_k:][::-1]  # shortlist

    # ----- Phase 2: Embedding reranking -----
    # NOTE(review): the sentence encoder presumably truncates long inputs to
    # its maximum sequence length, so only the start of the PDF may influence
    # this embedding -- confirm.
    input_emb = model.encode([input_text]).astype('float32')
    faiss.normalize_L2(input_emb)

    # Restrict FAISS to shortlisted candidates
    candidate_embs = embeddings[top_indices_svd]
    rerank_index = faiss.IndexFlatIP(dim)
    rerank_index.add(candidate_embs)

    # Never ask for more neighbours than there are candidates: FAISS pads
    # missing results with index -1, which would silently map to the last
    # shortlisted paper and print bogus duplicates.
    k = min(top_k, len(top_indices_svd))
    D, I = rerank_index.search(input_emb, k)

    # Map FAISS results (positions within the shortlist) back to dataset
    # indices, dropping any -1 padding defensively.
    hits = [
        (top_indices_svd[pos], score)
        for pos, score in zip(I[0], D[0])
        if pos >= 0
    ]

    # ----- Output -----
    print(f"\n📄 Top {len(hits)} Recommended Papers (Hybrid: SVD shortlist + Embedding rerank):")
    for rank, (idx, score) in enumerate(hits, start=1):
        paper = df.iloc[idx]
        print(f"\n{rank}. 🔹 Title: {paper['title']}")
        print(f"   Score: {score:.4f}")
        print(f"   Abstract: {paper['abstract'][:300]}...")
        print("-"*120)

In [None]:
# ============================================================
# Step 2: SVD/LSA Model for Shortlisting
# ============================================================
# This cell defines `vectorizer`, `svd` and `X_reduced`, which
# recommend_papers() reads as globals; run it before calling that function.
print("\n Building SVD/LSA model for shortlisting...")

# Sparse TF-IDF matrix over the combined title + abstract text, restricted to
# the 10000 highest-ranked terms with English stop words removed.
vectorizer = TfidfVectorizer(
    max_features=10000,
    stop_words='english',
)
X = vectorizer.fit_transform(df['text'])

# Project the TF-IDF rows onto 100 latent components (fixed seed).
svd = TruncatedSVD(
    n_components=100,
    random_state=42,
)
X_reduced = svd.fit_transform(X)

print("✅ SVD/LSA model ready:", X_reduced.shape)


 Building SVD/LSA model for shortlisting...
✅ SVD/LSA model ready: (50001, 100)


In [None]:
# ============================================================
# Step 6: Upload a PDF and Get Recommendations
# ============================================================
print("\n⬆️ Please upload a research paper PDF...")
uploaded = files.upload()

# A cancelled upload dialog leaves the mapping empty; fail with a clear
# message instead of an opaque IndexError.
if not uploaded:
    raise RuntimeError("No file was uploaded; re-run this cell and select a PDF.")

# Use the first uploaded file's name as the path of the query PDF.
pdf_path = next(iter(uploaded))

recommend_papers(pdf_path, top_k=10)


⬆️ Please upload a research paper PDF...


Saving Brain MRI Superresolution.pdf to Brain MRI Superresolution (1).pdf

📄 Top 10 Recommended Papers (Hybrid: SVD shortlist + Embedding rerank):

1. 🔹 Title: Automated identification of neurons and their locations
   Score: 0.3386
   Abstract:   Individual locations of many neuronal cell bodies (>10^4) are needed to
enable statistically significant measurements of spatial organization within
the brain such as nearest-neighbor and microcolumnarity measurements. In this
paper, we introduce an Automated Neuron Recognition Algorithm (ANRA) wh...
------------------------------------------------------------------------------------------------------------------------

2. 🔹 Title: Automated detection of lung nodules in low-dose computed tomography
   Score: 0.3368
   Abstract:   A computer-aided detection (CAD) system for the identification of pulmonary
nodules in low-dose multi-detector computed-tomography (CT) images has been
developed in the framework of the MAGIC-5 Italian project. One o