In [1]:
# !pip install ipywidgets  # one-time installation
# !jupyter nbextension enable --py widgetsnbextension
# import sys, subprocess
#subprocess.check_call([sys.executable, "-m", "pip", "install", "ipywidgets"])
#!pip install scikit-learn
#import sys
#!{sys.executable} -m pip install scikit-learn

In [2]:
import json
import pandas as pd
import numpy as np
from pathlib import Path
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from IPython.display import Javascript
from ipywidgets import Widget, Text, Button, VBox, Label
from traitlets import Unicode

In [3]:
# The data directories sit two levels above the current working directory.
base_path = Path().resolve().parent.parent
corpus_path = base_path / "corpus_json" / "corpus.json"

# The corpus is a JSON list; each entry is read for "id", optional "title" and "text".
corpus = json.loads(corpus_path.read_text(encoding="utf-8"))

# Parallel lists: position i in each list refers to the same document.
doc_ids = [entry["id"] for entry in corpus]
doc_titles = [entry.get("title", "") for entry in corpus]
doc_texts = [entry["text"] for entry in corpus]

# Document-level TF-IDF model.
tfidf_options = dict(
    stop_words="english",
    ngram_range=(1, 2),  # unigrams + bigrams
    max_df=0.9,
    min_df=1,
)
vectorizer = TfidfVectorizer(**tfidf_options)

# Sparse matrix with one row per document and one column per vocabulary term.
tfidf_matrix = vectorizer.fit_transform(doc_texts)


In [4]:
# Inspect the fitted vocabulary and the TF-IDF weights of the first document.
feature_names = vectorizer.get_feature_names_out()

# Window of (up to) ten consecutive vocabulary entries to preview. The upper
# bound is clamped to the vocabulary size so that a smaller corpus does not
# raise an IndexError in the per-term loop below.
start = 12005
stop = min(start + 10, len(feature_names))

print("Number of features:", len(feature_names))
print("Sample features:", feature_names[start:stop])

# Dense copy of row 0 so that individual term weights can be indexed directly.
first_doc_vector = tfidf_matrix[0].toarray()[0]
nonzero_idx = first_doc_vector.nonzero()[0]

print("\nTF-IDF values for sample features):")
for idx in range(start, stop):
    print(f"{feature_names[idx]:25s} {first_doc_vector[idx]:.4f}")

# Only the terms that have a non-zero weight in document 0, strongest first.
df = pd.DataFrame({
    "term": feature_names[nonzero_idx],
    "tfidf": first_doc_vector[nonzero_idx],
}).sort_values("tfidf", ascending=False)

print("\nTop 10 TF-IDF terms in Document 0:")
print(df.head(10))


Number of features: 50406
Sample features: ['crime' 'crime absence' 'crime acknowledge' 'crime actually'
 'crime aggregations' 'crime analysis' 'crime appeared' 'crime attractors'
 'crime baltimore' 'crime best']

TF-IDF values for sample features):
crime                     0.6713
crime absence             0.0043
crime acknowledge         0.0043
crime actually            0.0043
crime aggregations        0.0043
crime analysis            0.0043
crime appeared            0.0043
crime attractors          0.0043
crime baltimore           0.0043
crime best                0.0043

Top 10 TF-IDF terms in Document 0:
                   term     tfidf
1406              crime  0.671330
3568           mobility  0.209394
986              cities  0.138597
1484             crimes  0.134266
2412        forecasting  0.129443
3175                 lb  0.125604
3756                 nn  0.106497
1437  crime forecasting  0.090954
863                cell  0.088116
4189                poi  0.086623


In [5]:
def retrieve_tfidf(query: str, k: int = 5):
    """Return the top-k documents ranked by TF-IDF cosine similarity to ``query``.

    Reads the module-level ``vectorizer``, ``tfidf_matrix``, ``doc_ids``,
    ``doc_titles`` and ``doc_texts``.

    Args:
        query: Free-text query. A blank or whitespace-only query yields ``[]``.
        k: Maximum number of documents to return. Values <= 0 yield ``[]``.

    Returns:
        A list of dicts with the keys ``rank`` (1-based), ``score`` (cosine
        similarity as a float), ``id``, ``title`` and ``text``, ordered from
        most to least similar. No score threshold is applied, so documents
        with a similarity of 0.0 can be returned.

    Raises:
        ValueError: If the query vector and ``tfidf_matrix`` have a different
            number of features, i.e. the global ``vectorizer`` is no longer
            the one that produced ``tfidf_matrix``.
    """
    if not query.strip() or k <= 0:
        return []

    # Project the query into the same term space as the documents: (1, num_terms).
    q_vec = vectorizer.transform([query])

    # Fail with an actionable message when the global vectorizer has been
    # rebound to a model fitted on different data.
    if q_vec.shape[1] != tfidf_matrix.shape[1]:
        raise ValueError(
            f"Query vector has {q_vec.shape[1]} features but tfidf_matrix has "
            f"{tfidf_matrix.shape[1]}; the global `vectorizer` was probably "
            "re-fitted on other data. Re-run the cell that builds the "
            "document-level TF-IDF model."
        )

    # Similarity of the query against every document: (num_docs,).
    sims = cosine_similarity(q_vec, tfidf_matrix)[0]

    # argsort is ascending, so reverse it before keeping the first k entries.
    topk_idx = np.argsort(sims)[::-1][:k]

    return [
        {
            "rank": rank,
            "score": float(sims[idx]),
            "id": doc_ids[idx],
            "title": doc_titles[idx],
            "text": doc_texts[idx],
        }
        for rank, idx in enumerate(topk_idx, start=1)
    ]


In [6]:
# Evaluate document-level retrieval on every test query.
query_path = base_path / "queries_json" / "queries.json"

with query_path.open("r", encoding="utf-8") as f:
    test_queries = json.load(f)

results = []

for entry in test_queries:
    question = entry["question"]
    # str() maps a missing label (presumably null in the JSON) to the string
    # "None", which is used below as the "no known answer" sentinel.
    correct_paper = str(entry["correct_paper_id"])
    correct_paper_title = entry["correct_paper_title"]

    retrieved = retrieve_tfidf(query=question, k=3)

    print("=" * 80)
    print(f"Question: {question}")
    if correct_paper != "None":
        print(f"Correct paper: [{correct_paper}] {correct_paper_title}")
    print("-" * 80)

    for r in retrieved:
        print(f"[{r['rank']}] {r['id']}  (cosine similarity={r['score']:.4f})")
        print(f"Paper title: {r['title']}")
        print("-" * 80)
        print(r["text"][:500], "...")
        print()

    # Store the top-1 and top-3 predictions. retrieve_tfidf returns [] for a
    # blank question, so fall back to None instead of raising IndexError.
    top_3_pred = [r['id'] for r in retrieved]
    top_pred = top_3_pred[0] if top_3_pred else None

    results.append({
        "question": question,
        "predicted_paper": top_pred,
        "correct_paper": correct_paper,
        "is_correct": correct_paper == top_pred,      # correct paper is the top-1 hit
        "is_in_top_3": correct_paper in top_3_pred,   # correct paper is among the top-3 hits
    })


Question: How are Transformers different from RNNs?
--------------------------------------------------------------------------------
[1] 2510.05736v1  (cosine similarity=0.0217)
Paper title: Convolution and Graph-based Deep Learning Approaches for Gamma/Hadron Separation in Imaging Atmospheric Cherenkov Telescopes
--------------------------------------------------------------------------------
Convolution and Graph-based Deep Learning Approaches for Gamma/Hadron Separation in Imaging Atmospheric Cherenkov Telescopes. The identification of γ-rays from the predominant hadronic-background is a key aspect in their ground-based detection using Imaging Atmospheric Cherenkov Telescopes IACTs. While current methods are limited in their ability to exploit correlations in complex data deep learning-based models offer a promising alternative by directly leveraging image-level information. Howeve ...

[2] 2510.05163v1  (cosine similarity=0.0099)
Paper title: Deep Learning-Based Multi-Factor Authen

In [7]:
df_results = pd.DataFrame(results)

# The string "None" marks queries without a known correct paper; store it as NaN.
df_results['correct_paper'] = df_results['correct_paper'].replace("None", np.nan)

# Build the "has a known correct paper" mask once and reuse it.
has_label = df_results['correct_paper'].notna()
labelled = df_results[has_label]

valid_queries = has_label.sum()
accuracy = labelled['is_correct'].mean()  # top-1 accuracy on labelled queries only

print(f"Accuracy over {valid_queries} queries with known correct paper: {accuracy:.2%}")

labelled

Accuracy over 9 queries with known correct paper: 88.89%


Unnamed: 0,question,predicted_paper,correct_paper,is_correct,is_in_top_3
11,Are deep learning methods effective for crime ...,2509.20913v1,2509.20913v1,True,True
12,Should I train separate models for different c...,2509.20913v1,2509.20913v1,True,True
13,Which deep learning approaches work well for g...,2510.05736v1,2510.05736v1,True,True
14,What frameworks and optimization strategies we...,2510.08662v1,2510.08662v1,True,True
15,How do modern architectures perform on complex...,2510.09187v1,2510.09187v1,True,True
16,How can I train models stably with limited com...,2510.13137v1,2510.12850v1,False,False
17,Why is preprocessing important for Ethic-BERT’...,2510.12850v1,2510.12850v1,True,True
18,What are the main strengths of using an LSTM m...,2510.13137v1,2510.13137v1,True,True
19,How does model selection affect responsiveness...,2510.13137v1,2510.13137v1,True,True


In [8]:
def chunk_text(text, chunk_size=220, overlap=40):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Args:
        text: Input string; it is tokenised with ``str.split()``.
        chunk_size: Target number of words per chunk. Must be positive.
        overlap: Number of words shared by consecutive chunks. Must be
            smaller than ``chunk_size``.

    Returns:
        A list of chunk strings whose words are joined by single spaces.
        The last chunk may contain fewer than ``chunk_size`` words. Empty or
        whitespace-only text yields ``[]``.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size``. Either condition would keep the
            window from advancing, so the loop would never terminate.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if overlap >= chunk_size:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    words = text.split()
    if not words:
        return []

    n = len(words)
    step = chunk_size - overlap  # distance between the starts of consecutive chunks

    chunks = []
    for start in range(0, n, step):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))

        # Stop once a chunk reaches the end of the text; a further window
        # would only repeat the overlapping tail.
        if end >= n:
            break

    return chunks

In [9]:
# Chunking parameters, defined once so that the chunker call and the
# word-offset bookkeeping below cannot drift apart.
CHUNK_SIZE = 220
CHUNK_OVERLAP = 40

passage_texts = []   # chunk strings, one entry per passage
passage_meta = []    # metadata dict for the passage at the same index

for doc in corpus:
    doc_id = doc["id"]
    title = doc.get("title", "")
    text = doc["text"]

    chunks = chunk_text(text, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP)
    start_word = 0

    for i, chunk in enumerate(chunks):
        # Offsets are word positions within the document's split() token list.
        end_word = start_word + len(chunk.split())
        passage_texts.append(chunk)
        passage_meta.append({
            "doc_id": doc_id,
            "title": title,
            "chunk_id": f"{doc_id}_chunk_{i}",
            "start_word": start_word,
            "end_word": end_word,
        })
        # The next chunk begins CHUNK_OVERLAP words before this one ended.
        start_word = end_word - CHUNK_OVERLAP


In [10]:
# Corpus size before and after chunking.
num_documents = len(corpus)
num_passages = len(passage_texts)

print(f"Number of documents: {num_documents}")
print(f"Number of passages:  {num_passages}")

Number of documents: 20
Number of passages:  491


In [11]:
# Fit a second TF-IDF model, this time on passages instead of whole documents.
# NOTE: this rebinds the module-level name `vectorizer`. retrieve_tfidf reads
# that same name together with the document-level `tfidf_matrix`, so it should
# not be called after this cell has run.
passage_tfidf_options = {
    "stop_words": "english",
    "ngram_range": (1, 2),  # unigrams + bigrams
    "max_df": 0.9,
    "min_df": 1,
}
vectorizer = TfidfVectorizer(**passage_tfidf_options)

# Sparse matrix with one row per passage and one column per vocabulary term.
tfidf_matrix_passages = vectorizer.fit_transform(passage_texts)


In [12]:
def retrieve_tfidf_chunks(query: str, k: int = 3):
    """Return the top-k passages ranked by TF-IDF cosine similarity to ``query``.

    Reads the module-level ``vectorizer``, ``tfidf_matrix_passages``,
    ``passage_texts`` and ``passage_meta``.

    Args:
        query: Free-text query. A blank or whitespace-only query yields ``[]``.
        k: Maximum number of passages to return. Values <= 0 yield ``[]``.

    Returns:
        A list of dicts with the keys ``rank`` (1-based), ``score`` (cosine
        similarity as a float), ``text``, ``doc_id``, ``title``, ``chunk_id``,
        ``start_word`` and ``end_word``, ordered from most to least similar.
        No score threshold is applied, so passages with a similarity of 0.0
        can be returned.

    Raises:
        ValueError: If the query vector and ``tfidf_matrix_passages`` have a
            different number of features, i.e. the global ``vectorizer`` is
            not the one that produced ``tfidf_matrix_passages``.
    """
    if not query.strip() or k <= 0:
        return []

    # Project the query into the passage term space.
    q_vec = vectorizer.transform([query])

    # Fail with an actionable message when the global vectorizer is not the
    # passage-level model.
    if q_vec.shape[1] != tfidf_matrix_passages.shape[1]:
        raise ValueError(
            f"Query vector has {q_vec.shape[1]} features but "
            f"tfidf_matrix_passages has {tfidf_matrix_passages.shape[1]}; the "
            "global `vectorizer` is not the passage-level model. Re-run the "
            "cell that builds the passage-level TF-IDF model."
        )

    # Similarity of the query against every passage.
    sims = cosine_similarity(q_vec, tfidf_matrix_passages)[0]

    # argsort is ascending, so reverse it before keeping the first k entries.
    topk_idx = np.argsort(sims)[::-1][:k]

    results = []
    for rank, idx in enumerate(topk_idx, start=1):
        meta = passage_meta[idx]
        results.append({
            "rank": rank,
            "score": float(sims[idx]),
            "text": passage_texts[idx],
            "doc_id": meta["doc_id"],
            "title": meta["title"],
            "chunk_id": meta["chunk_id"],
            "start_word": meta["start_word"],
            "end_word": meta["end_word"],
        })

    return results


In [13]:
# Evaluate passage-level retrieval on every test query.
results = []

for entry in test_queries:
    question = entry["question"]
    # str() maps a missing label (presumably null in the JSON) to the string
    # "None", which is used below as the "no known answer" sentinel.
    correct_paper = str(entry["correct_paper_id"])
    correct_paper_title = entry["correct_paper_title"]

    retrieved = retrieve_tfidf_chunks(query=question, k=3)

    print("=" * 80)
    print(f"Question: {question}")
    if correct_paper != "None":
        print(f"Correct paper: [{correct_paper}] {correct_paper_title}")
        print(f"Correct passage: {entry['correct_passage']}")
    print("-" * 80)

    for r in retrieved:
        print(f"[{r['rank']}] Cosine similarity={r['score']:.4f}")
        print(f"Paper: {r['doc_id']} — {r['title']}")
        print(f"Chunk: {r['chunk_id']} (words {r['start_word']}–{r['end_word']})")
        print("-" * 80)
        print(r["text"][:500], "...")
        print()

    # Papers behind the top-3 passages (the same paper can appear more than
    # once). retrieve_tfidf_chunks returns [] for a blank question, so fall
    # back to None instead of raising IndexError.
    top_3_pred = [r['doc_id'] for r in retrieved]
    top_pred = top_3_pred[0] if top_3_pred else None

    results.append({
        "question": question,
        "predicted_paper": top_pred,
        "correct_paper": correct_paper,
        "is_correct": correct_paper == top_pred,      # correct paper is the top-1 hit
        "is_in_top_3": correct_paper in top_3_pred,   # correct paper is among the top-3 hits
    })


Question: How are Transformers different from RNNs?
--------------------------------------------------------------------------------
[1] Cosine similarity=0.0871
Paper: 2510.05736v1 — Convolution and Graph-based Deep Learning Approaches for Gamma/Hadron Separation in Imaging Atmospheric Cherenkov Telescopes
Chunk: 2510.05736v1_chunk_2 (words 360–580)
--------------------------------------------------------------------------------
Trees BDTs trained on parameterized image features or goodness-of-fit parameters for this task 3 5. Consequently a natural motivation for exploring deep learning-based models stems from the possibility of improving event classification by directly using image-level information. Multiple studies have explored deep learning methods for identifying γ-rays and demonstrated exceptional performance on simulated data 6 10. Most model architectures use convolutional neural networks CNNs for extracting i ...

[2] Cosine similarity=0.0667
Paper: 2510.05163v1 — Deep Lear

In [14]:
df_results = pd.DataFrame(results)

# Queries without a known correct paper carry the string "None"; turn it into NaN.
df_results['correct_paper'] = df_results['correct_paper'].replace("None", np.nan)

# Select the rows with a known correct paper once, then score only those.
known = df_results['correct_paper'].notna()
df_known = df_results.loc[known]

valid_queries = known.sum()
accuracy = df_known['is_correct'].mean()  # top-1 accuracy on labelled queries only

print(f"Accuracy over {valid_queries} queries with known correct paper: {accuracy:.2%}")

df_known






Accuracy over 9 queries with known correct paper: 77.78%


Unnamed: 0,question,predicted_paper,correct_paper,is_correct,is_in_top_3
11,Are deep learning methods effective for crime ...,2509.20913v1,2509.20913v1,True,True
12,Should I train separate models for different c...,2509.20913v1,2509.20913v1,True,True
13,Which deep learning approaches work well for g...,2510.05736v1,2510.05736v1,True,True
14,What frameworks and optimization strategies we...,2510.08770v1,2510.08662v1,False,True
15,How do modern architectures perform on complex...,2510.09187v1,2510.09187v1,True,True
16,How can I train models stably with limited com...,2510.12758v1,2510.12850v1,False,False
17,Why is preprocessing important for Ethic-BERT’...,2510.12850v1,2510.12850v1,True,True
18,What are the main strengths of using an LSTM m...,2510.13137v1,2510.13137v1,True,True
19,How does model selection affect responsiveness...,2510.13137v1,2510.13137v1,True,True
