# Installing necessary packages

In [None]:
!pip install lancedb
!pip install sentence_transformers
!pip install datasets

Collecting lancedb
  Downloading lancedb-0.17.0-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (4.7 kB)
Collecting deprecation (from lancedb)
  Downloading deprecation-2.1.0-py2.py3-none-any.whl.metadata (4.6 kB)
Collecting pylance==0.20.0 (from lancedb)
  Downloading pylance-0.20.0-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (7.4 kB)
Collecting overrides>=0.7 (from lancedb)
  Downloading overrides-7.7.0-py3-none-any.whl.metadata (5.8 kB)
Downloading lancedb-0.17.0-cp39-abi3-manylinux_2_28_x86_64.whl (29.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.9/29.9 MB[0m [31m61.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pylance-0.20.0-cp39-abi3-manylinux_2_28_x86_64.whl (33.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading overrides-7.7.0-py3-none-any.whl (17 kB)
Downloading deprecation-2.1.0-py2.py3-none-any.whl (11 kB)
Installing collected packages: overrides, deprecat

# Mounting Drive to access the JSON dataset file


In [None]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import dask.bag as db
import json
data = db.read_text('/content/drive/MyDrive/arxiv-metadata-oai-snapshot.json').map(json.loads)

In [None]:
data.take(1)

({'id': '0704.0001',
  'submitter': 'Pavel Nadolsky',
  'authors': "C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan",
  'title': 'Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies',
  'comments': '37 pages, 15 figures; published version',
  'journal-ref': 'Phys.Rev.D76:013009,2007',
  'doi': '10.1103/PhysRevD.76.013009',
  'report-no': 'ANL-HEP-PR-07-12',
  'categories': 'hep-ph',
  'license': None,
  'abstract': '  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstrated with d

# Building a Stratified Sample of 100K records based on the category in the dataset

In [None]:
import pandas as pd
import json
from collections import Counter
from sklearn.model_selection import train_test_split

# Define the file path
file_name = '/content/drive/MyDrive/arxiv-metadata-oai-snapshot.json'

# Define the columns to extract
cols = ['id', 'title', 'abstract', 'categories', 'authors', 'comments', 'update_date']

# Initialize a Counter to store unique categories
category_counter = Counter()

# Load data and extract relevant fields + count categories
data = []
with open(file_name, encoding='latin-1') as f:
    for line in f:
        doc = json.loads(line)
        categories = doc.get('categories', '').strip()
        if categories and ' ' not in categories:  # Include only rows with a single category
            category_counter.update([categories])
            data.append([
                doc.get('id'),
                doc.get('title', ''),
                doc.get('abstract', ''),
                categories,
                doc.get('authors', ''),
                doc.get('comments', ''),
                doc.get('update_date', '')
            ])

# Print unique categories
unique_categories = list(category_counter.keys())
print(f"Total unique categories: {len(unique_categories)}")
print("Unique categories:")
print(unique_categories)

# Optionally, print the top 20 most common categories with their counts
print("\nTop 20 categories by frequency:")
for category, count in category_counter.most_common(20):
    print(f"{category}: {count}")

# Convert data to DataFrame
df = pd.DataFrame(data, columns=cols)

# Clean the DataFrame
# df['abstract'] = df['abstract'].str.strip().str.lower()
# df['title'] = df['title'].str.strip().str.lower()
# df['categories'] = df['categories'].str.strip()
# df['authors'] = df['authors'].str.strip()
# df['comments'] = df['comments'].str.strip()
df['update_date'] = pd.to_datetime(df['update_date'], errors='coerce')

# Drop rows with missing abstracts or titles
df = df.dropna(subset=['abstract', 'title'])

# Filter out categories with fewer than 10 samples
category_counts = df['categories'].value_counts()
valid_categories = category_counts[category_counts >= 50].index
print(f"\nTotal valid categories: {len(valid_categories)}")

# Filter the DataFrame to include only valid categories
df_filtered = df[df['categories'].isin(valid_categories)]

# Stratified sampling based on categories
if len(df_filtered) < 100000:
    raise ValueError(f"Not enough data to sample 100,000 rows. Available: {len(df_filtered)}")

stratified_sample, _ = train_test_split(
    df_filtered,
    train_size=100000,
    stratify=df_filtered['categories'],
    random_state=62
)

# Reset index
stratified_sample = stratified_sample.reset_index(drop=True)

# Save the final sample to a CSV file
stratified_sample.to_csv('stratified_sample.csv', index=False)

# Display summary
print(f"\nFinal dataset size: {len(stratified_sample)}")
print("Category distribution in the sample:")
print(stratified_sample['categories'].value_counts())


Total unique categories: 149
Unique categories:
['hep-ph', 'physics.gen-ph', 'math.CO', 'cond-mat.mes-hall', 'gr-qc', 'cond-mat.mtrl-sci', 'astro-ph', 'math.NT', 'hep-th', 'hep-ex', 'math.NA', 'nlin.PS', 'math.RA', 'cond-mat.str-el', 'physics.pop-ph', 'nucl-th', 'math.FA', 'cs.DS', 'math.DS', 'physics.soc-ph', 'math.AG', 'math.OA', 'math.PR', 'math.DG', 'physics.optics', 'math.GR', 'nlin.SI', 'math.SG', 'physics.data-an', 'cs.CC', 'math.GT', 'quant-ph', 'cond-mat.other', 'math.CV', 'math.AP', 'cond-mat.supr-con', 'math.RT', 'cond-mat.stat-mech', 'q-bio.OT', 'physics.plasm-ph', 'nlin.CG', 'nucl-ex', 'cond-mat.soft', 'physics.comp-ph', 'math.MG', 'math.QA', 'physics.bio-ph', 'physics.chem-ph', 'math.AT', 'physics.geo-ph', 'q-bio.BM', 'math.OC', 'cs.CR', 'physics.class-ph', 'q-bio.PE', 'q-bio.NC', 'physics.atom-ph', 'math.GM', 'hep-lat', 'math.CA', 'physics.atm-clus', 'cs.PF', 'physics.acc-ph', 'math.SP', 'nlin.CD', 'physics.hist-ph', 'physics.flu-dyn', 'cond-mat.dis-nn', 'cs.CV', 'cs.LG'

# Filtering the sample further to exclude categories with unique counts less than 20 to maintain a stratified distribution throughout

In [None]:
import pandas as pd

# Load your DataFrame
df = pd.read_csv('stratified_sample.csv')

# Get category counts
category_counts = df['categories'].value_counts()

# Identify categories with 20 or more occurrences
valid_categories = category_counts[category_counts >= 20].index

# Filter DataFrame to keep only rows with valid categories
df_filtered = df[df['categories'].isin(valid_categories)]

# Display the number of rows before and after filtering
print(f"Original dataset size: {len(df)}")
print(f"Filtered dataset size: {len(df_filtered)}")

# Save the filtered DataFrame to a new CSV file
df_filtered.to_csv('filtered_stratified_sample.csv', index=False)

# Display summary
print("Categories with fewer than 20 records have been removed.")
print("Updated category distribution:")
print(df_filtered['categories'].value_counts())


Original dataset size: 100000
Filtered dataset size: 99947
Categories with fewer than 20 records have been removed.
Updated category distribution:
categories
astro-ph    6277
hep-ph      5934
quant-ph    5159
cs.CV       4629
hep-th      4301
            ... 
cs.SD         27
cs.NA         26
cs.MS         26
q-bio.CB      25
cs.OS         23
Name: count, Length: 145, dtype: int64


# Splitting the sample into Train(70K), Validation(15K), Test(15K)

In [None]:
from sklearn.model_selection import train_test_split

# Load the stratified sample
df = df_filtered

# Split into train (70k) and remaining (30k)
train_df, remaining_df = train_test_split(
    df,
    train_size=70000,
    stratify=df['categories'],
    random_state=42
)

# Split remaining into validation (15k) and test (15k)
val_df, test_df = train_test_split(
    remaining_df,
    test_size=0.5,
    stratify=remaining_df['categories'],
    random_state=42
)

# Save the splits to CSV files
train_df.to_csv('train_df.csv', index=False)
val_df.to_csv('val_df.csv', index=False)
test_df.to_csv('test_df.csv', index=False)

# Display summary
print(f"Train set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")
print(f"Test set size: {len(test_df)}")


Train set size: 70000
Validation set size: 14973
Test set size: 14974


# Preprocessing data by performing necessary cleaning operations (Lowercasing, Lemmatizing, Removing punctuations, whitespace, special characters)

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK resources if not already downloaded
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Load the stratified sample dataset
df = pd.read_csv('train_df.csv')

# Initialize stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to clean text
def clean_text(text):
    if pd.isnull(text):
        return ""
    # Lowercasing
    text = text.lower()
    # Remove special characters and punctuation
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # Remove extra whitespace and newlines
    text = re.sub(r'\s+', ' ', text).strip()
    # Tokenize and remove stopwords, then lemmatize
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Apply cleaning to relevant fields
df['cleaned_title'] = df['title'].apply(clean_text)
df['cleaned_authors'] = df['authors'].apply(clean_text)
df['cleaned_categories'] = df['categories'].apply(clean_text)
df['cleaned_abstract'] = df['abstract'].apply(clean_text)
df['cleaned_comments'] = df['comments'].apply(clean_text)

# Create the enhanced text field and remove newlines
df['enhanced_text'] = df.apply(lambda row: f"""
Title: {row['cleaned_title']} [SEP]
Authors: {row['cleaned_authors']} [SEP]
Categories: {row['cleaned_categories']} [SEP]
Abstract: {row['cleaned_abstract']} [SEP]
Comments: {row['cleaned_comments']} [SEP]
Updated on: {row['update_date']}
""".replace('\n', ' ').strip(), axis=1)

# Display the first few rows to verify the enhanced text field
print(df[['id', 'enhanced_text']].head())

# Save the updated DataFrame to a new CSV file
df.to_csv('enhanced_stratified_sample_train.csv', index=False)

print("Enhanced text field created and saved successfully.")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


                 id                                      enhanced_text
0        2007.12657  Title: sublimative evolution 486958 arrokoth [...
1         1208.4774  Title: torii phase [SEP] Authors: emmanuel ami...
2         0903.4882  Title: kinetic monte carlo simulation strained...
3        1601.06809  Title: test field cannot destroy extremal blac...
4  astro-ph/0104478  Title: low albedo among extinct comet candidat...
Enhanced text field created and saved successfully.


# Creating the schema for storing the sentence transformer embeddings in LanceDB

In [None]:
import pandas as pd
import lancedb
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector

# Load the enhanced dataset
df = pd.read_csv("enhanced_stratified_sample_train.csv")

# Display the first few rows to verify
print(df.head())

# Load the Sentence-Transformer model
model_name = "allenai-specter"  # You can adjust the model based on your use case
db = lancedb.connect("lancedb_directory")

# Register the embedding function
registry = get_registry()
embedding_function = registry.get("sentence-transformers").create(
    name=model_name,
    device="cuda"  # Use "cuda" for GPU; use "cpu" if GPU is not available
)

# Define the LanceDB schema with Pydantic
class TextData(LanceModel):
    id: str
    title: str
    authors: str
    abstract: str
    categories: str
    comments: str
    update_date: str
    enhanced_text: str = embedding_function.SourceField()  # Source text for embeddings
    embedding: Vector(embedding_function.ndims()) = embedding_function.VectorField()

# Create the table (overwrite if it exists)
table = db.create_table("enhanced_papers", schema=TextData, mode="overwrite")

# Convert the DataFrame to a list of dictionaries
data = df[["id", "title", "authors", "abstract","categories", "comments", "update_date", "enhanced_text"]].astype(str).to_dict(orient="records")

# Add data to the table
table.add(data)

print("Enhanced data added to the LanceDB table successfully!")


                 id                                              title  \
0        2007.12657     The Sublimative Evolution of (486958) Arrokoth   
1         1208.4774                                The Torii of phases   
2         0903.4882  Kinetic Monte Carlo Simulation of Strained Het...   
3        1601.06809    Test fields cannot destroy extremal black holes   
4  astro-ph/0104478         Low Albedos Among Extinct Comet Candidates   

                                            abstract         categories  \
0    We consider the history of New Horizons targ...        astro-ph.EP   
1    The import of the magnitude of fourier coeff...            math.HO   
2    An efficient method for the simulation of st...  cond-mat.mtrl-sci   
3    We prove that (possibly charged) test fields...              gr-qc   
4    We present radiometric effective radii and v...           astro-ph   

                                             authors  \
0  Jordan K. Steckloff, Carey M. Lisse, Taylor K

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.77k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/622 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/331 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/222k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/462k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Enhanced data added to the LanceDB table successfully!


In [None]:
import shutil

# Replace 'folder_name' with the name of your folder
shutil.make_archive('lancedb_directory', 'zip', 'lancedb_directory')

# Download the ZIP file
from google.colab import files
files.download('lancedb_directory.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Cleaning & preprocessing the test data

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK resources if not already downloaded
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Load the stratified sample dataset
df = pd.read_csv('test_df.csv')

# Initialize stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to clean text
def clean_text(text):
    if pd.isnull(text):
        return ""
    # Lowercasing
    text = text.lower()
    # Remove special characters and punctuation
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # Remove extra whitespace and newlines
    text = re.sub(r'\s+', ' ', text).strip()
    # Tokenize and remove stopwords, then lemmatize
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)


df['cleaned_abstract'] = df['abstract'].apply(clean_text)

test_df=df

In [None]:
# import pandas as pd
# import numpy as np
# from sentence_transformers import SentenceTransformer
# from sklearn.metrics.pairwise import cosine_similarity
# import lancedb

# # Load the SBERT model
# model = SentenceTransformer("allenai-specter")

# # Connect to LanceDB and open the table
# db = lancedb.connect("lancedb_directory")
# table = db.open_table("enhanced_papers")

# # Input abstract for which recommendations are needed
# user_abstract = """zeroshot quantization zsq promising compressing accelerating deep neural networks data training
# fullprecision models inaccessible zsq network quantization performed using synthetic samples performance quantized
# models depends heavily quality synthetic samples nonetheless synthetic samples constructed existing zsq methods
# easily fitted models accordingly quantized models obtained methods suffer significant performance degradation
# hard samples address issue propose hard sample synthesizing training hast specifically hast pays attention hard
# samples synthesizing samples makes synthetic samples hard fit training quantized models hast aligns features
# extracted fullprecision quantized models ensure similarity features extracted models extensive experiments hast
# significantly outperforms existing zsq methods achieving performance comparable models quantized real data"""

# # Generate embedding for the input abstract
# user_embedding = model.encode(user_abstract, convert_to_tensor=True).cpu().numpy()

# # Perform similarity search in LanceDB with a limit of 5 recommendations
# recommendations = table.search(user_embedding).metric("cosine").limit(5).to_pandas()

# # Compute cosine similarity scores
# recommendation_vectors = np.vstack(recommendations['embedding'].tolist())
# cosine_similarities = cosine_similarity(user_embedding.reshape(1, -1), recommendation_vectors)[0]

# # Add similarity scores to the recommendations DataFrame
# recommendations['similarity_score'] = cosine_similarities

# # Display recommendations with title, abstract, and similarity score
# print("\nTop Recommendations:")
# for idx, row in recommendations.iterrows():
#     print(f"\nRecommendation {idx + 1}:")
#     print(f"Title: {row['title']}")
#     print(f"Abstract: {row['abstract']}")
#     print(f"Similarity Score: {row['similarity_score']:.4f}")
#     print("-" * 80)


In [None]:
# import pandas as pd
# import numpy as np
# from tqdm import tqdm
# from sentence_transformers import SentenceTransformer
# from sklearn.metrics.pairwise import cosine_similarity
# from sklearn.cluster import KMeans
# from datetime import datetime
# import lancedb
# from concurrent.futures import ThreadPoolExecutor, as_completed
# import threading

# # Load the SBERT model
# model = SentenceTransformer("all-MiniLM-L6-v2")

# # Connect to LanceDB and open the table
# db = lancedb.connect("lancedb_directory")
# table = db.open_table("enhanced_papers")

# # Load the training data from CSV (for metadata like categories)
# train_df = pd.read_csv("train_df.csv")

# # Perform clustering on train embeddings for clustering-based ground truth
# train_embeddings = np.vstack(table.to_pandas()["embedding"].tolist())
# num_clusters = 20  # Adjust the number of clusters as needed
# kmeans = KMeans(n_clusters=num_clusters, random_state=42)
# train_df['cluster'] = kmeans.fit_predict(train_embeddings)

# # Define temporal evaluation parameters
# time_window_days = 365  # 1-year window for temporal evaluation

# # Similarity threshold
# similarity_threshold = 0.7

# # Weights for hybrid scoring
# weights = {"category": 0.3, "cluster": 0.2, "similarity": 0.3, "temporal": 0.2}

# # Precompute category counts in the training data
# category_counts = train_df["categories"].value_counts().to_dict()

# # Initialize metrics for each ground truth method and k values
# k_values = [5, 10, 15]
# metrics = {
#     k: {
#         "category": {"precision": 0, "recall": 0, "mrr": 0},
#         "clustering": {"precision": 0, "recall": 0, "mrr": 0},
#         "similarity": {"precision": 0, "recall": 0, "mrr": 0},
#         "temporal": {"precision": 0, "recall": 0, "mrr": 0},
#         "hybrid": {"precision": 0, "recall": 0, "mrr": 0},
#     }
#     for k in k_values
# }

# # Number of queries to process
# num_queries = len(test_df)

# # Lock for thread-safe metric updates
# lock = threading.Lock()

# # Evaluation functions
# def precision_at_k(recommendations, true_label, k):
#     relevant = sum(1 for label in recommendations["categories"].tolist()[:k] if label == true_label)
#     return relevant / k

# def recall_at_k(recommendations, true_label, all_relevant_count, k):
#     relevant = sum(1 for label in recommendations["categories"].tolist()[:k] if label == true_label)
#     return relevant / all_relevant_count if all_relevant_count > 0 else 0

# def mean_reciprocal_rank(recommendations, true_label, k):
#     for i, label in enumerate(recommendations["categories"].tolist()[:k]):
#         if label == true_label:
#             return 1 / (i + 1)
#     return 0

# def temporal_score(query_date, rec_date, window=time_window_days):
#     rec_date = pd.to_datetime(rec_date, errors='coerce')
#     return 1 if pd.notnull(rec_date) and abs((query_date - rec_date).days) <= window else 0

# # Function to process a single query
# def process_query(idx, query, true_category, query_date):
#     local_metrics = {k: {method: {"precision": 0, "recall": 0, "mrr": 0} for method in metrics[k]} for k in k_values}

#     # Generate embedding for the query
#     query_embedding = model.encode(query, batch_size=32, show_progress_bar=False)

#     # Perform similarity search in LanceDB with a limit of 15
#     recommendations = table.search(query_embedding).metric("cosine").limit(15).to_pandas()
#     recommendation_vectors = np.vstack(recommendations['embedding'].tolist())

#     # Precompute true cluster and all relevant count
#     all_relevant_count = category_counts.get(true_category, 0)
#     true_cluster = train_df[train_df["categories"] == true_category]["cluster"].iloc[0]

#     for k in k_values:
#         # CATEGORY-BASED EVALUATION
#         local_metrics[k]["category"]["precision"] += precision_at_k(recommendations, true_category, k)
#         local_metrics[k]["category"]["recall"] += recall_at_k(recommendations, true_category, all_relevant_count, k)
#         local_metrics[k]["category"]["mrr"] += mean_reciprocal_rank(recommendations, true_category, k)

#         # CLUSTERING-BASED EVALUATION
#         predicted_clusters = recommendations["categories"].map(lambda cat: train_df[train_df["categories"] == cat]["cluster"].iloc[0])
#         relevant_clusters = sum(1 for cluster in predicted_clusters[:k] if cluster == true_cluster)
#         local_metrics[k]["clustering"]["precision"] += relevant_clusters / k
#         local_metrics[k]["clustering"]["recall"] += relevant_clusters / all_relevant_count if all_relevant_count > 0 else 0
#         local_metrics[k]["clustering"]["mrr"] += mean_reciprocal_rank(recommendations, true_cluster, k)

#         # SIMILARITY-BASED EVALUATION
#         cosine_similarities = cosine_similarity(query_embedding.reshape(1, -1), recommendation_vectors)[0]
#         relevant_similar = sum(1 for score in cosine_similarities[:k] if score >= similarity_threshold)
#         local_metrics[k]["similarity"]["precision"] += relevant_similar / k
#         local_metrics[k]["similarity"]["recall"] += relevant_similar / all_relevant_count if all_relevant_count > 0 else 0
#         local_metrics[k]["similarity"]["mrr"] += mean_reciprocal_rank(recommendations, true_category, k)

#         # TEMPORAL-BASED EVALUATION
#         relevant_temporal = sum(1 for rec_date in recommendations["update_date"][:k] if temporal_score(query_date, rec_date))
#         local_metrics[k]["temporal"]["precision"] += relevant_temporal / k
#         local_metrics[k]["temporal"]["recall"] += relevant_temporal / all_relevant_count if all_relevant_count > 0 else 0
#         local_metrics[k]["temporal"]["mrr"] += mean_reciprocal_rank(recommendations, true_category, k)

#     return local_metrics

# # Process queries in parallel
# with ThreadPoolExecutor(max_workers=8) as executor:
#     futures = [
#         executor.submit(process_query, idx, row["cleaned_abstract"], row["categories"], pd.to_datetime(row["update_date"], errors='coerce'))
#         for idx, row in test_df.iterrows()
#     ]

#     for future in tqdm(as_completed(futures), total=num_queries, desc="Processing Results"):
#         result = future.result()
#         with lock:
#             for k in k_values:
#                 for method in result[k]:
#                     for metric in result[k][method]:
#                         metrics[k][method][metric] += result[k][method][metric]

# # Compute average metrics
# for k in k_values:
#     for method in metrics[k]:
#         for metric in metrics[k][method]:
#             metrics[k][method][metric] /= num_queries

# # Display final metrics
# print("\nFinal Metrics:")
# for k in k_values:
#     print(f"\nMetrics for k={k}:")
#     for method, scores in metrics[k].items():
#         print(f"\n{method.capitalize()} Ground Truth:")
#         print(f" - Average Precision@{k}: {scores['precision']:.2f}")
#         print(f" - Average Recall@{k}: {scores['recall']:.2f}")
#         print(f" - Average MRR@{k}: {scores['mrr']:.2f}")


In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from datetime import datetime
import lancedb

# Load the SBERT model
model = SentenceTransformer("allenai-specter")

# Connect to LanceDB and open the table
db = lancedb.connect("lancedb_directory")
table = db.open_table("enhanced_papers")

# Load the training data from CSV (for metadata like categories)
train_df = pd.read_csv("train_df.csv")

# Perform clustering on train embeddings for clustering-based ground truth
train_embeddings = np.vstack(table.to_pandas()["embedding"].tolist())
num_clusters = 20  # Adjust the number of clusters as needed
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
train_df['cluster'] = kmeans.fit_predict(train_embeddings)

# Define temporal evaluation parameters
time_window_days = 365  # 1-year window for temporal evaluation

# Similarity threshold
similarity_threshold = 0.7

# Weights for hybrid scoring
weights = {"category": 0.3, "cluster": 0.2, "similarity": 0.3, "temporal": 0.2}

# Precompute category counts in the training data
category_counts = train_df["categories"].value_counts().to_dict()

# Initialize metrics for each ground truth method and k values
k_values = [5, 10, 15]
metrics = {
    k: {
        "category": {"precision": 0, "recall": 0, "mrr": 0},
        "clustering": {"precision": 0, "recall": 0, "mrr": 0},
        "similarity": {"precision": 0, "recall": 0, "mrr": 0},
        "temporal": {"precision": 0, "recall": 0, "mrr": 0},
        "hybrid": {"precision": 0, "recall": 0, "mrr": 0},
    }
    for k in k_values
}

# Number of queries to process
num_queries = 5000

# Define batch size
batch_size = 100

# Evaluation functions
def precision_at_k(recommendations, true_label, k):
    relevant = sum(1 for label in recommendations["categories"].tolist()[:k] if label == true_label)
    return relevant / k

def recall_at_k(recommendations, true_label, all_relevant_count, k):
    relevant = sum(1 for label in recommendations["categories"].tolist()[:k] if label == true_label)
    return relevant / all_relevant_count if all_relevant_count > 0 else 0

def mean_reciprocal_rank(recommendations, true_label, k):
    for i, label in enumerate(recommendations["categories"].tolist()[:k]):
        if label == true_label:
            return 1 / (i + 1)
    return 0

def temporal_score(query_date, rec_date, window=time_window_days):
    rec_date = pd.to_datetime(rec_date, errors='coerce')
    return 1 if pd.notnull(rec_date) and abs((query_date - rec_date).days) <= window else 0

# Process queries in batches
for start in tqdm(range(0, num_queries, batch_size), desc="Processing Batches"):
    end = min(start + batch_size, num_queries)

    # Extract batch queries, true categories, and update dates
    batch_queries = test_df.iloc[start:end]["cleaned_abstract"].tolist()
    batch_categories = test_df.iloc[start:end]["categories"].values
    batch_dates = pd.to_datetime(test_df.iloc[start:end]["update_date"], errors='coerce').values

    # Generate embeddings for the batch of queries
    query_embeddings = model.encode(batch_queries, batch_size=32, show_progress_bar=False)

    for idx, (query_embedding, true_category, query_date) in enumerate(zip(query_embeddings, batch_categories, batch_dates)):
        # Perform similarity search in LanceDB with a limit of 15
        recommendations = table.search(query_embedding).metric("cosine").limit(15).to_pandas()
        recommendation_vectors = np.vstack(recommendations['embedding'].tolist())

        # Precompute the true cluster and relevant counts
        all_relevant_count = category_counts.get(true_category, 0)
        true_cluster = train_df[train_df["categories"] == true_category]["cluster"].iloc[0]

        for k in k_values:
            # CATEGORY-BASED EVALUATION
            metrics[k]["category"]["precision"] += precision_at_k(recommendations, true_category, k)
            metrics[k]["category"]["recall"] += recall_at_k(recommendations, true_category, all_relevant_count, k)
            metrics[k]["category"]["mrr"] += mean_reciprocal_rank(recommendations, true_category, k)

            # CLUSTERING-BASED EVALUATION
            predicted_clusters = recommendations["categories"].map(lambda cat: train_df[train_df["categories"] == cat]["cluster"].iloc[0])
            relevant_clusters = sum(1 for cluster in predicted_clusters[:k] if cluster == true_cluster)
            metrics[k]["clustering"]["precision"] += relevant_clusters / k
            metrics[k]["clustering"]["recall"] += relevant_clusters / all_relevant_count if all_relevant_count > 0 else 0
            metrics[k]["clustering"]["mrr"] += mean_reciprocal_rank(recommendations, true_cluster, k)

            # SIMILARITY-BASED EVALUATION
            cosine_similarities = cosine_similarity(query_embedding.reshape(1, -1), recommendation_vectors)[0]
            relevant_similar = sum(1 for score in cosine_similarities[:k] if score >= similarity_threshold)
            metrics[k]["similarity"]["precision"] += relevant_similar / k
            metrics[k]["similarity"]["recall"] += relevant_similar / all_relevant_count if all_relevant_count > 0 else 0
            metrics[k]["similarity"]["mrr"] += mean_reciprocal_rank(recommendations, true_category, k)

            # HYBRID SCORING
            hybrid_score = (
                weights["category"] * precision_at_k(recommendations, true_category, k) +
                weights["cluster"] * (relevant_clusters / k) +
                weights["similarity"] * (relevant_similar / k)
            )
            metrics[k]["hybrid"]["precision"] += hybrid_score
            metrics[k]["hybrid"]["recall"] += hybrid_score
            metrics[k]["hybrid"]["mrr"] += hybrid_score

# Compute average metrics
for k in k_values:
    for method in metrics[k]:
        metrics[k][method]["precision"] /= num_queries
        metrics[k][method]["recall"] /= num_queries
        metrics[k][method]["mrr"] /= num_queries

# Display final metrics
print("\nFinal Metrics:")
for k in k_values:
    print(f"\nMetrics for k={k}:")
    for method, scores in metrics[k].items():
        print(f"\n{method.capitalize()} Ground Truth:")
        print(f" - Average Precision@{k}: {scores['precision']:.2f}")
        print(f" - Average Recall@{k}: {scores['recall']:.2f}")
        print(f" - Average MRR: {scores['mrr']:.2f}")


Processing Batches: 100%|██████████| 50/50 [51:11<00:00, 61.43s/it]


Final Metrics:

Metrics for k=5:

Category Ground Truth:
 - Average Precision@5: 0.55
 - Average Recall@5: 0.00
 - Average MRR: 0.68

Clustering Ground Truth:
 - Average Precision@5: 0.67
 - Average Recall@5: 0.01
 - Average MRR: 0.00

Similarity Ground Truth:
 - Average Precision@5: 1.00
 - Average Recall@5: 0.01
 - Average MRR: 0.68

Temporal Ground Truth:
 - Average Precision@5: 0.00
 - Average Recall@5: 0.00
 - Average MRR: 0.00

Hybrid Ground Truth:
 - Average Precision@5: 0.60
 - Average Recall@5: 0.60
 - Average MRR: 0.60

Metrics for k=10:

Category Ground Truth:
 - Average Precision@10: 0.52
 - Average Recall@10: 0.01
 - Average MRR: 0.69

Clustering Ground Truth:
 - Average Precision@10: 0.65
 - Average Recall@10: 0.01
 - Average MRR: 0.00

Similarity Ground Truth:
 - Average Precision@10: 1.00
 - Average Recall@10: 0.02
 - Average MRR: 0.69

Temporal Ground Truth:
 - Average Precision@10: 0.00
 - Average Recall@10: 0.00
 - Average MRR: 0.00

Hybrid Ground Truth:
 - Average 


