# Installing necessary packages

In [1]:
!pip install lancedb
!pip install sentence_transformers
!pip install datasets

Collecting lancedb
  Downloading lancedb-0.17.0-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (4.7 kB)
Collecting deprecation (from lancedb)
  Downloading deprecation-2.1.0-py2.py3-none-any.whl.metadata (4.6 kB)
Collecting pylance==0.20.0 (from lancedb)
  Downloading pylance-0.20.0-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (7.4 kB)
Collecting overrides>=0.7 (from lancedb)
  Downloading overrides-7.7.0-py3-none-any.whl.metadata (5.8 kB)
Downloading lancedb-0.17.0-cp39-abi3-manylinux_2_28_x86_64.whl (29.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.9/29.9 MB[0m [31m61.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pylance-0.20.0-cp39-abi3-manylinux_2_28_x86_64.whl (33.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading overrides-7.7.0-py3-none-any.whl (17 kB)
Downloading deprecation-2.1.0-py2.py3-none-any.whl (11 kB)
Installing collected packages: overrides, deprecat

In [2]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')


MessageError: Error: credential propagation was unsuccessful

In [None]:
import dask.bag as db
import json
data = db.read_text('/content/drive/MyDrive/arxiv-metadata-oai-snapshot.json').map(json.loads)

In [None]:
data.take(1)

# Building a Stratified Sample of 100K records based on the category in the dataset

In [None]:
import pandas as pd
import json
from collections import Counter
from sklearn.model_selection import train_test_split

# Define the file path
file_name = '/content/drive/MyDrive/arxiv-metadata-oai-snapshot.json'

# Define the columns to extract
cols = ['id', 'title', 'abstract', 'categories', 'authors', 'comments', 'update_date']

# Initialize a Counter to store unique categories
category_counter = Counter()

# Load data and extract relevant fields + count categories
data = []
with open(file_name, encoding='latin-1') as f:
    for line in f:
        doc = json.loads(line)
        categories = doc.get('categories', '').strip()
        if categories and ' ' not in categories:  # Include only rows with a single category
            category_counter.update([categories])
            data.append([
                doc.get('id'),
                doc.get('title', ''),
                doc.get('abstract', ''),
                categories,
                doc.get('authors', ''),
                doc.get('comments', ''),
                doc.get('update_date', '')
            ])

# Print unique categories
unique_categories = list(category_counter.keys())
print(f"Total unique categories: {len(unique_categories)}")
print("Unique categories:")
print(unique_categories)

# Optionally, print the top 20 most common categories with their counts
print("\nTop 20 categories by frequency:")
for category, count in category_counter.most_common(20):
    print(f"{category}: {count}")

# Convert data to DataFrame
df = pd.DataFrame(data, columns=cols)

# Clean the DataFrame
df['abstract'] = df['abstract'].str.strip().str.lower()
df['title'] = df['title'].str.strip().str.lower()
df['categories'] = df['categories'].str.strip()
df['authors'] = df['authors'].str.strip()
df['comments'] = df['comments'].str.strip()
df['update_date'] = pd.to_datetime(df['update_date'], errors='coerce')

# Drop rows with missing abstracts or titles
df = df.dropna(subset=['abstract', 'title'])

# Filter out categories with fewer than 10 samples
category_counts = df['categories'].value_counts()
valid_categories = category_counts[category_counts >= 50].index
print(f"\nTotal valid categories: {len(valid_categories)}")

# Filter the DataFrame to include only valid categories
df_filtered = df[df['categories'].isin(valid_categories)]

# Stratified sampling based on categories
if len(df_filtered) < 100000:
    raise ValueError(f"Not enough data to sample 100,000 rows. Available: {len(df_filtered)}")

stratified_sample, _ = train_test_split(
    df_filtered,
    train_size=100000,
    stratify=df_filtered['categories'],
    random_state=62
)

# Reset index
stratified_sample = stratified_sample.reset_index(drop=True)

# Save the final sample to a CSV file
stratified_sample.to_csv('stratified_sample.csv', index=False)

# Display summary
print(f"\nFinal dataset size: {len(stratified_sample)}")
print("Category distribution in the sample:")
print(stratified_sample['categories'].value_counts())


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/arxiv-metadata-oai-snapshot.json'

# Filtering the sample further to exclude categories with unique counts less than 20 to maintain a stratified distribution throughout

In [3]:
import pandas as pd

# Load your DataFrame
df = pd.read_csv('stratified_sample.csv')

# Get category counts
category_counts = df['categories'].value_counts()

# Identify categories with 20 or more occurrences
valid_categories = category_counts[category_counts >= 20].index

# Filter DataFrame to keep only rows with valid categories
df_filtered = df[df['categories'].isin(valid_categories)]

# Display the number of rows before and after filtering
print(f"Original dataset size: {len(df)}")
print(f"Filtered dataset size: {len(df_filtered)}")

# Save the filtered DataFrame to a new CSV file
df_filtered.to_csv('filtered_stratified_sample.csv', index=False)

# Display summary
print("Categories with fewer than 20 records have been removed.")
print("Updated category distribution:")
print(df_filtered['categories'].value_counts())


Original dataset size: 100000
Filtered dataset size: 99947
Categories with fewer than 20 records have been removed.
Updated category distribution:
categories
astro-ph    6277
hep-ph      5934
quant-ph    5159
cs.CV       4629
hep-th      4301
            ... 
cs.SD         27
cs.NA         26
cs.MS         26
q-bio.CB      25
cs.OS         23
Name: count, Length: 145, dtype: int64


# Splitting the sample into Train(70K), Validation(15K), Test(15K)

In [4]:
from sklearn.model_selection import train_test_split

# Load the stratified sample
df = df_filtered

# Split into train (70k) and remaining (30k)
train_df, remaining_df = train_test_split(
    df,
    train_size=70000,
    stratify=df['categories'],
    random_state=42
)

# Split remaining into validation (15k) and test (15k)
val_df, test_df = train_test_split(
    remaining_df,
    test_size=0.5,
    stratify=remaining_df['categories'],
    random_state=42
)

# Save the splits to CSV files
train_df.to_csv('train_df.csv', index=False)
val_df.to_csv('val_df.csv', index=False)
test_df.to_csv('test_df.csv', index=False)

# Display summary
print(f"Train set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")
print(f"Test set size: {len(test_df)}")


Train set size: 70000
Validation set size: 14973
Test set size: 14974


# Preprocessing data by performing necessary cleaning operations (Lowercasing, Lemmatizing, Removing punctuations, whitespace, special characters)

In [5]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK resources if not already downloaded
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Load the stratified sample dataset
df = pd.read_csv('train_df.csv')

# Initialize stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to clean text
def clean_text(text):
    if pd.isnull(text):
        return ""
    # Lowercasing
    text = text.lower()
    # Remove special characters and punctuation
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # Remove extra whitespace and newlines
    text = re.sub(r'\s+', ' ', text).strip()
    # Tokenize and remove stopwords, then lemmatize
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Apply cleaning to relevant fields
df['cleaned_title'] = df['title'].apply(clean_text)
df['cleaned_authors'] = df['authors'].apply(clean_text)
df['cleaned_categories'] = df['categories'].apply(clean_text)
df['cleaned_abstract'] = df['abstract'].apply(clean_text)
df['cleaned_comments'] = df['comments'].apply(clean_text)

# Create the enhanced text field and remove newlines
df['enhanced_text'] = df.apply(lambda row: f"""
Title: {row['cleaned_title']} [SEP]
Authors: {row['cleaned_authors']} [SEP]
Categories: {row['cleaned_categories']} [SEP]
Abstract: {row['cleaned_abstract']} [SEP]
Comments: {row['cleaned_comments']} [SEP]
Updated on: {row['update_date']}
""".replace('\n', ' ').strip(), axis=1)

# Display the first few rows to verify the enhanced text field
print(df[['id', 'enhanced_text']].head())

# Save the updated DataFrame to a new CSV file
df.to_csv('enhanced_stratified_sample_train.csv', index=False)

print("Enhanced text field created and saved successfully.")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


                 id                                      enhanced_text
0        2007.12657  Title: sublimative evolution 486958 arrokoth [...
1         1208.4774  Title: torii phase [SEP] Authors: emmanuel ami...
2         0903.4882  Title: kinetic monte carlo simulation strained...
3        1601.06809  Title: test field cannot destroy extremal blac...
4  astro-ph/0104478  Title: low albedo among extinct comet candidat...
Enhanced text field created and saved successfully.


# Cleaning & Preprocessing the validation data

In [6]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK resources if not already downloaded
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Load the validation dataset
val_df = pd.read_csv('val_df.csv')

# Initialize stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to clean text
def clean_text(text):
    if pd.isnull(text):
        return ""
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Clean the 'abstract' field
val_df['cleaned_abstract'] = val_df['abstract'].apply(clean_text)

# Save the cleaned validation data
val_df.to_csv('cleaned_val_df.csv', index=False)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Finetuning the all_miniLM_L6_v2 model

In [7]:
import pandas as pd
import random
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments # Import AutoModelForSequenceClassification
from sentence_transformers import losses
from torch.utils.data import DataLoader

# Load the dataset
file_path = "enhanced_stratified_sample_train.csv"
df = pd.read_csv(file_path)

# Extract abstracts and categories
df = df.dropna(subset=['abstract', 'categories'])
val_df = pd.read_csv('cleaned_val_df.csv')
# Function to generate annotated pairs based on categories
def generate_pairs_with_categories(df, num_pairs=10000, positive_ratio=0.5):
    pairs = []
    num_positive = int(num_pairs * positive_ratio)
    num_negative = num_pairs - num_positive

    # Group abstracts by category
    category_groups = df.groupby('categories')['abstract'].apply(list).to_dict()

    # Positive pairs (same category)
    for _ in range(num_positive):
        category = random.choice(list(category_groups.keys()))
        abstracts_in_category = category_groups[category]
        if len(abstracts_in_category) > 1:
            abstract1, abstract2 = random.sample(abstracts_in_category, 2)
            pairs.append({"sentence1": abstract1, "sentence2": abstract2, "label": 1})

    # Negative pairs (different categories)
    categories = list(category_groups.keys())
    for _ in range(num_negative):
        category1, category2 = random.sample(categories, 2)
        abstract1 = random.choice(category_groups[category1])
        abstract2 = random.choice(category_groups[category2])
        pairs.append({"sentence1": abstract1, "sentence2": abstract2, "label": 0})

    return pairs

# Generate annotated pairs using categories
pairs = generate_pairs_with_categories(df, num_pairs=10000, positive_ratio=0.5)
valpairs = generate_pairs_with_categories(val_df, num_pairs=3000, positive_ratio=0.5)
# Convert to Hugging Face Dataset
hf_dataset = Dataset.from_list(pairs)
val_dataset = Dataset.from_list(valpairs)
# Tokenizer and model
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load the model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2) # Assuming binary classification (0 or 1)

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["sentence1"], examples["sentence2"], padding="max_length", truncation=True, max_length=512)

hf_dataset = hf_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
# Define training arguments
training_args = TrainingArguments(
    output_dir="./finetuned_model",
    evaluation_strategy="steps",
    eval_steps=500,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=100,
)

# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_dataset,
    eval_dataset=val_dataset
)

# Train the model
trainer.train()

# Save the model
model.save_pretrained("./finetuned_all_minilm_l6_v2")
tokenizer.save_pretrained("./finetuned_all_minilm_l6_v2")

print("Model fine-tuned and saved successfully!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-MiniLM-L6-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss
500,0.42,0.490723


Model fine-tuned and saved successfully!


In [9]:
from sentence_transformers import SentenceTransformer

# Wrap the model in SentenceTransformer and save it
sentence_model = SentenceTransformer(model_name)
sentence_model._modules['0'].auto_model = model  # Replace the underlying AutoModel with the fine-tuned model
sentence_model.save("./finetuned_all_minilm_l6_v2")


In [15]:
# Replace 'folder_name' with your actual folder name
!zip -r finetuned_all_minilm_l6_v2.zip finetuned_all_minilm_l6_v2

# Download the ZIP file
from google.colab import files
files.download('finetuned_all_minilm_l6_v2.zip')


  adding: finetuned_all_minilm_l6_v2/ (stored 0%)
  adding: finetuned_all_minilm_l6_v2/README.md (deflated 64%)
  adding: finetuned_all_minilm_l6_v2/config_sentence_transformers.json (deflated 35%)
  adding: finetuned_all_minilm_l6_v2/2_Normalize/ (stored 0%)
  adding: finetuned_all_minilm_l6_v2/config.json (deflated 47%)
  adding: finetuned_all_minilm_l6_v2/model.safetensors (deflated 9%)
  adding: finetuned_all_minilm_l6_v2/vocab.txt (deflated 53%)
  adding: finetuned_all_minilm_l6_v2/tokenizer.json (deflated 71%)
  adding: finetuned_all_minilm_l6_v2/sentence_bert_config.json (deflated 4%)
  adding: finetuned_all_minilm_l6_v2/1_Pooling/ (stored 0%)
  adding: finetuned_all_minilm_l6_v2/1_Pooling/config.json (deflated 57%)
  adding: finetuned_all_minilm_l6_v2/tokenizer_config.json (deflated 74%)
  adding: finetuned_all_minilm_l6_v2/modules.json (deflated 62%)
  adding: finetuned_all_minilm_l6_v2/special_tokens_map.json (deflated 80%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Cleaning & Preprocessing Test data

In [10]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK resources if not already downloaded
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Load the stratified sample dataset
df = pd.read_csv('test_df.csv')

# Initialize stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to clean text
def clean_text(text):
    if pd.isnull(text):
        return ""
    # Lowercasing
    text = text.lower()
    # Remove special characters and punctuation
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # Remove extra whitespace and newlines
    text = re.sub(r'\s+', ' ', text).strip()
    # Tokenize and remove stopwords, then lemmatize
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)


df['cleaned_abstract'] = df['abstract'].apply(clean_text)


test_df = df


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
# import numpy as np
# import pandas as pd
# from sentence_transformers import SentenceTransformer
# from tqdm import tqdm

# # Load the fine-tuned model
# model = SentenceTransformer("./finetuned_all_minilm_l6_v2")

# # Load the training data
# train_df = pd.read_csv("/content/enhanced_stratified_sample_train.csv")

# # Ensure there are no missing abstracts
# train_df = train_df.dropna(subset=["enhanced_text"])

# # Generate embeddings for each abstract in the training data
# train_embeddings = []
# for abstract in tqdm(train_df["enhanced_text"].tolist(), desc="Generating Train Embeddings"):
#     embedding = model.encode(abstract, show_progress_bar=False)
#     train_embeddings.append(embedding)

# # Convert embeddings to a NumPy array
# train_embeddings = np.array(train_embeddings)

# # Save the embeddings to a .npy file
# np.save("train_embeddings.npy", train_embeddings)

# print("Train embeddings generated and saved successfully as 'train_embeddings.npy'!")


# Generate and store new embeddings in LanceDB


In [12]:
import pandas as pd
import lancedb
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector

# Load the enhanced dataset
df = pd.read_csv("enhanced_stratified_sample_train.csv")

# Display the first few rows to verify
print(df.head())

# Load the Sentence-Transformer model
model_name = "./finetuned_all_minilm_l6_v2"  # You can adjust the model based on your use case
db = lancedb.connect("lancedb_directory")

# Register the embedding function
registry = get_registry()
embedding_function = registry.get("sentence-transformers").create(
    name=model_name,
    device="cuda"  # Use "cuda" for GPU; use "cpu" if GPU is not available
)

# Define the LanceDB schema with Pydantic
class TextData(LanceModel):
    id: str
    title: str
    authors: str
    abstract: str
    categories: str
    comments: str
    update_date: str
    enhanced_text: str = embedding_function.SourceField()  # Source text for embeddings
    embedding: Vector(embedding_function.ndims()) = embedding_function.VectorField()

# Create the table (overwrite if it exists)
table = db.create_table("enhanced_papers_finetuned", schema=TextData, mode="overwrite")

# Convert the DataFrame to a list of dictionaries
data = df[["id", "title", "authors", "abstract","categories", "comments", "update_date", "enhanced_text"]].astype(str).to_dict(orient="records")

# Add data to the table
table.add(data)

print("Enhanced data added to the LanceDB table successfully!")


                 id                                              title  \
0        2007.12657     the sublimative evolution of (486958) arrokoth   
1         1208.4774                                the torii of phases   
2         0903.4882  kinetic monte carlo simulation of strained het...   
3        1601.06809    test fields cannot destroy extremal black holes   
4  astro-ph/0104478         low albedos among extinct comet candidates   

                                            abstract         categories  \
0  we consider the history of new horizons target...        astro-ph.EP   
1  the import of the magnitude of fourier coeffic...            math.HO   
2  an efficient method for the simulation of stra...  cond-mat.mtrl-sci   
3  we prove that (possibly charged) test fields s...              gr-qc   
4  we present radiometric effective radii and vis...           astro-ph   

                                             authors  \
0  Jordan K. Steckloff, Carey M. Lisse, Taylor K

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from datetime import datetime
import lancedb
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading

# Load the SBERT model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Connect to LanceDB and open the table
db = lancedb.connect("lancedb_directory")
table = db.open_table("enhanced_papers_finetuned")

# Load the training data from CSV (for metadata like categories)
train_df = pd.read_csv("train_df.csv")

# Perform clustering on train embeddings for clustering-based ground truth
train_embeddings = np.vstack(table.to_pandas()["embedding"].tolist())
num_clusters = 20  # Adjust the number of clusters as needed
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
train_df['cluster'] = kmeans.fit_predict(train_embeddings)

# Define temporal evaluation parameters
time_window_days = 365  # 1-year window for temporal evaluation

# Similarity threshold
similarity_threshold = 0.7

# Weights for hybrid scoring
weights = {"category": 0.3, "cluster": 0.2, "similarity": 0.3, "temporal": 0.2}

# Precompute category counts in the training data
category_counts = train_df["categories"].value_counts().to_dict()

# Initialize metrics for each ground truth method and k values
k_values = [5, 10, 15]
metrics = {
    k: {
        "category": {"precision": 0, "recall": 0, "mrr": 0},
        "clustering": {"precision": 0, "recall": 0, "mrr": 0},
        "similarity": {"precision": 0, "recall": 0, "mrr": 0},
        "temporal": {"precision": 0, "recall": 0, "mrr": 0},
        "hybrid": {"precision": 0, "recall": 0, "mrr": 0},
    }
    for k in k_values
}

# Number of queries to process
num_queries = 5000

# Lock for thread-safe metric updates
lock = threading.Lock()

# Evaluation functions
def precision_at_k(recommendations, true_label, k):
    relevant = sum(1 for label in recommendations["categories"].tolist()[:k] if label == true_label)
    return relevant / k

def recall_at_k(recommendations, true_label, all_relevant_count, k):
    relevant = sum(1 for label in recommendations["categories"].tolist()[:k] if label == true_label)
    return relevant / all_relevant_count if all_relevant_count > 0 else 0

def mean_reciprocal_rank(recommendations, true_label, k):
    for i, label in enumerate(recommendations["categories"].tolist()[:k]):
        if label == true_label:
            return 1 / (i + 1)
    return 0

def temporal_score(query_date, rec_date, window=time_window_days):
    rec_date = pd.to_datetime(rec_date, errors='coerce')
    return 1 if pd.notnull(rec_date) and abs((query_date - rec_date).days) <= window else 0

# Function to process a single query
def process_query(idx, query, true_category, query_date):
    local_metrics = {k: {method: {"precision": 0, "recall": 0, "mrr": 0} for method in metrics[k]} for k in k_values}

    # Generate embedding for the query
    query_embedding = model.encode(query, batch_size=32, show_progress_bar=False)

    # Perform similarity search in LanceDB with a limit of 15
    recommendations = table.search(query_embedding).metric("cosine").limit(15).to_pandas()
    recommendation_vectors = np.vstack(recommendations['embedding'].tolist())

    # Precompute true cluster and all relevant count
    all_relevant_count = category_counts.get(true_category, 0)
    true_cluster = train_df[train_df["categories"] == true_category]["cluster"].iloc[0]

    for k in k_values:
        # CATEGORY-BASED EVALUATION
        local_metrics[k]["category"]["precision"] += precision_at_k(recommendations, true_category, k)
        local_metrics[k]["category"]["recall"] += recall_at_k(recommendations, true_category, all_relevant_count, k)
        local_metrics[k]["category"]["mrr"] += mean_reciprocal_rank(recommendations, true_category, k)

        # CLUSTERING-BASED EVALUATION
        predicted_clusters = recommendations["categories"].map(lambda cat: train_df[train_df["categories"] == cat]["cluster"].iloc[0])
        relevant_clusters = sum(1 for cluster in predicted_clusters[:k] if cluster == true_cluster)
        local_metrics[k]["clustering"]["precision"] += relevant_clusters / k
        local_metrics[k]["clustering"]["recall"] += relevant_clusters / all_relevant_count if all_relevant_count > 0 else 0
        local_metrics[k]["clustering"]["mrr"] += mean_reciprocal_rank(recommendations, true_cluster, k)

        # SIMILARITY-BASED EVALUATION
        cosine_similarities = cosine_similarity(query_embedding.reshape(1, -1), recommendation_vectors)[0]
        relevant_similar = sum(1 for score in cosine_similarities[:k] if score >= similarity_threshold)
        local_metrics[k]["similarity"]["precision"] += relevant_similar / k
        local_metrics[k]["similarity"]["recall"] += relevant_similar / all_relevant_count if all_relevant_count > 0 else 0
        local_metrics[k]["similarity"]["mrr"] += mean_reciprocal_rank(recommendations, true_category, k)

        # TEMPORAL-BASED EVALUATION
        relevant_temporal = sum(1 for rec_date in recommendations["update_date"][:k] if temporal_score(query_date, rec_date))
        local_metrics[k]["temporal"]["precision"] += relevant_temporal / k
        local_metrics[k]["temporal"]["recall"] += relevant_temporal / all_relevant_count if all_relevant_count > 0 else 0
        local_metrics[k]["temporal"]["mrr"] += mean_reciprocal_rank(recommendations, true_category, k)

    return local_metrics

# Process queries in parallel
with ThreadPoolExecutor(max_workers=8) as executor:
    futures = [
        executor.submit(process_query, idx, row["cleaned_abstract"], row["categories"], pd.to_datetime(row["update_date"], errors='coerce'))
        for idx, row in test_df.iterrows()
    ]

    for future in tqdm(as_completed(futures), total=num_queries, desc="Processing Results"):
        result = future.result()
        with lock:
            for k in k_values:
                for method in result[k]:
                    for metric in result[k][method]:
                        metrics[k][method][metric] += result[k][method][metric]

# Compute average metrics
for k in k_values:
    for method in metrics[k]:
        for metric in metrics[k][method]:
            metrics[k][method][metric] /= num_queries

# Display final metrics
print("\nFinal Metrics:")
for k in k_values:
    print(f"\nMetrics for k={k}:")
    for method, scores in metrics[k].items():
        print(f"\n{method.capitalize()} Ground Truth:")
        print(f" - Average Precision@{k}: {scores['precision']:.2f}")
        print(f" - Average Recall@{k}: {scores['recall']:.2f}")
        print(f" - Average MRR@{k}: {scores['mrr']:.2f}")


Processing Results:  81%|████████  | 4059/5000 [50:42<16:30,  1.05s/it]

In [None]:
# import pandas as pd
# import numpy as np
# from tqdm import tqdm
# from sentence_transformers import SentenceTransformer
# from sklearn.metrics.pairwise import cosine_similarity
# from sklearn.cluster import KMeans
# from datetime import datetime
# import lancedb

# # Load the SBERT model
# model = SentenceTransformer("./finetuned_all_minilm_l6_v2")

# # Connect to LanceDB and open the table
# db = lancedb.connect("lancedb_directory")
# table = db.open_table("enhanced_papers")

# # Load the training data from CSV (for metadata like categories)
# train_df = pd.read_csv("train_df.csv")
# # Load the training data and precomputed embeddings
# train_df = pd.read_csv("train_df.csv")
# train_embeddings = np.load("train_embeddings.npy")

# # Ensure train_df and train_embeddings are aligned
# assert len(train_df) == len(train_embeddings), "Mismatch between train_df and train_embeddings lengths"
# # Perform clustering on train embeddings for clustering-based ground truth
# # train_embeddings = np.vstack(table.to_pandas()["embedding"].tolist())
# num_clusters = 20  # Adjust the number of clusters as needed
# kmeans = KMeans(n_clusters=num_clusters, random_state=42)
# train_df['cluster'] = kmeans.fit_predict(train_embeddings)

# # Define temporal evaluation parameters
# time_window_days = 365  # 1-year window for temporal evaluation

# # Similarity threshold
# similarity_threshold = 0.7

# # Weights for hybrid scoring
# weights = {"category": 0.3, "cluster": 0.2, "similarity": 0.3, "temporal": 0.2}

# # Precompute category counts in the training data
# category_counts = train_df["categories"].value_counts().to_dict()

# # Initialize metrics for each ground truth method and k values
# k_values = [5, 10, 15]
# metrics = {
#     k: {
#         "category": {"precision": 0, "recall": 0, "mrr": 0},
#         "clustering": {"precision": 0, "recall": 0, "mrr": 0},
#         "similarity": {"precision": 0, "recall": 0, "mrr": 0},
#         "temporal": {"precision": 0, "recall": 0, "mrr": 0},
#         "hybrid": {"precision": 0, "recall": 0, "mrr": 0},
#     }
#     for k in k_values
# }

# # Number of queries to process
# num_queries = 5000

# # Define batch size
# batch_size = 100

# # Evaluation functions
# def precision_at_k(recommendations, true_label, k):
#     relevant = sum(1 for label in recommendations["categories"].tolist()[:k] if label == true_label)
#     return relevant / k

# def recall_at_k(recommendations, true_label, all_relevant_count, k):
#     relevant = sum(1 for label in recommendations["categories"].tolist()[:k] if label == true_label)
#     return relevant / all_relevant_count if all_relevant_count > 0 else 0

# def mean_reciprocal_rank(recommendations, true_label, k):
#     for i, label in enumerate(recommendations["categories"].tolist()[:k]):
#         if label == true_label:
#             return 1 / (i + 1)
#     return 0

# def temporal_score(query_date, rec_date, window=time_window_days):
#     rec_date = pd.to_datetime(rec_date, errors='coerce')
#     return 1 if pd.notnull(rec_date) and abs((query_date - rec_date).days) <= window else 0

# # Process queries in batches
# for start in tqdm(range(0, num_queries, batch_size), desc="Processing Batches"):
#     end = min(start + batch_size, num_queries)

#     # Extract batch queries, true categories, and update dates
#     batch_queries = test_df.iloc[start:end]["cleaned_abstract"].tolist()
#     batch_categories = test_df.iloc[start:end]["categories"].values
#     batch_dates = pd.to_datetime(test_df.iloc[start:end]["update_date"], errors='coerce').values

#     # Generate embeddings for the batch of queries
#     query_embeddings = model.encode(batch_queries, batch_size=32, show_progress_bar=False)

#     for idx, (query_embedding, true_category, query_date) in enumerate(zip(query_embeddings, batch_categories, batch_dates)):
#         # Perform similarity search in LanceDB with a limit of 15
#         recommendations = table.search(query_embedding).metric("cosine").limit(15).to_pandas()
#         recommendation_vectors = np.vstack(recommendations['embedding'].tolist())

#         # Precompute true cluster and all relevant count
#         all_relevant_count = category_counts.get(true_category, 0)
#         true_cluster = train_df[train_df["categories"] == true_category]["cluster"].iloc[0]

#         for k in k_values:
#             # CATEGORY-BASED EVALUATION
#             metrics[k]["category"]["precision"] += precision_at_k(recommendations, true_category, k)
#             metrics[k]["category"]["recall"] += recall_at_k(recommendations, true_category, all_relevant_count, k)
#             metrics[k]["category"]["mrr"] += mean_reciprocal_rank(recommendations, true_category, k)

#             # CLUSTERING-BASED EVALUATION
#             predicted_clusters = recommendations["categories"].map(lambda cat: train_df[train_df["categories"] == cat]["cluster"].iloc[0])
#             relevant_clusters = sum(1 for cluster in predicted_clusters[:k] if cluster == true_cluster)
#             metrics[k]["clustering"]["precision"] += relevant_clusters / k
#             metrics[k]["clustering"]["recall"] += relevant_clusters / all_relevant_count if all_relevant_count > 0 else 0
#             metrics[k]["clustering"]["mrr"] += mean_reciprocal_rank(recommendations, true_cluster, k)

#             # SIMILARITY-BASED EVALUATION
#             cosine_similarities = cosine_similarity(query_embedding.reshape(1, -1), recommendation_vectors)[0]
#             relevant_similar = sum(1 for score in cosine_similarities[:k] if score >= similarity_threshold)
#             metrics[k]["similarity"]["precision"] += relevant_similar / k
#             metrics[k]["similarity"]["recall"] += relevant_similar / all_relevant_count if all_relevant_count > 0 else 0
#             metrics[k]["similarity"]["mrr"] += mean_reciprocal_rank(recommendations, true_category, k)

#             # TEMPORAL-BASED EVALUATION
#             relevant_temporal = sum(1 for rec_date in recommendations["update_date"][:k] if temporal_score(query_date, rec_date))
#             metrics[k]["temporal"]["precision"] += relevant_temporal / k
#             metrics[k]["temporal"]["recall"] += relevant_temporal / all_relevant_count if all_relevant_count > 0 else 0
#             metrics[k]["temporal"]["mrr"] += mean_reciprocal_rank(recommendations, true_category, k)

#             # HYBRID SCORING
#             hybrid_score = (
#                 weights["category"] * precision_at_k(recommendations, true_category, k) +
#                 weights["cluster"] * (relevant_clusters / k) +
#                 weights["similarity"] * (relevant_similar / k) +
#                 weights["temporal"] * (relevant_temporal / k)
#             )
#             metrics[k]["hybrid"]["precision"] += hybrid_score
#             metrics[k]["hybrid"]["recall"] += hybrid_score
#             metrics[k]["hybrid"]["mrr"] += hybrid_score

# # Compute average metrics
# for k in k_values:
#     for method in metrics[k]:
#         metrics[k][method]["precision"] /= num_queries
#         metrics[k][method]["recall"] /= num_queries
#         metrics[k][method]["mrr"] /= num_queries

# # Display final metrics
# print("\nFinal Metrics:")
# for k in k_values:
#     print(f"\nMetrics for k={k}:")
#     for method, scores in metrics[k].items():
#         print(f"\n{method.capitalize()} Ground Truth:")
#         print(f" - Average Precision@{k}: {scores['precision']:.2f}")
#         print(f" - Average Recall@{k}: {scores['recall']:.2f}")
#         print(f" - Average MRR@{k}: {scores['mrr']:.2f}")


Processing Batches:  48%|████▊     | 24/50 [16:16<17:21, 40.04s/it]

In [None]:
# from sentence_transformers import CrossEncoder
# import pandas as pd
# import random

# # Load the cleaned datasets
# train_df = pd.read_csv('enhanced_stratified_sample_train.csv')
# val_df = pd.read_csv('cleaned_val_df.csv')

# # Load a pre-trained Cross-Encoder
# cross_encoder = CrossEncoder('cross-encoder/stsb-roberta-base')

# def generate_annotated_pairs(df, num_pairs=10000, threshold=0.7):
#     pairs = []
#     for _ in range(num_pairs):
#         idx1, idx2 = random.sample(range(len(df)), 2)
#         pairs.append((df.iloc[idx1]['cleaned_abstract'], df.iloc[idx2]['cleaned_abstract']))

#     # Compute similarity scores
#     scores = cross_encoder.predict(pairs)

#     # Create annotated pairs with labels
#     annotated_pairs = []
#     for (abstract1, abstract2), score in zip(pairs, scores):
#         label = 1 if score >= threshold else 0
#         annotated_pairs.append((abstract1, abstract2, label))

#     return pd.DataFrame(annotated_pairs, columns=['abstract1', 'abstract2', 'label'])

# # Generate pairs for training and validation
# train_pairs = generate_annotated_pairs(train_df, num_pairs=10000)
# val_pairs = generate_annotated_pairs(val_df, num_pairs=2000)

# # Save the pairs to CSV
# train_pairs.to_csv('train_pairs.csv', index=False)
# val_pairs.to_csv('val_pairs.csv', index=False)


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [None]:
# import pandas as pd
# from datasets import Dataset
# from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
# import torch

# # Load the annotated pairs
# train_pairs = pd.read_csv('train_pairs.csv')
# val_pairs = pd.read_csv('val_pairs.csv')

# # Prepare datasets
# def prepare_dataset(pairs):
#     return Dataset.from_pandas(pairs[['abstract1', 'abstract2', 'label']])

# train_dataset = prepare_dataset(train_pairs)
# val_dataset = prepare_dataset(val_pairs)

# # Display sample
# print(train_dataset[0])


{'abstract1': 'measurement cosmological parameter investigated representation leastaction method us redshiftspace dataset simultaneously constrain realspace field deltab v method robust recovering entire evolution matter density contrast peculiar velocity galaxy real space current galaxy redshift survey main strength method permit u break degeneracy parameter b omegam customarily measured ratio betaequiv omegam06b redshiftspace distortion evaluated current context separately procedure provides simple numerical mean extract much information possible given sample simplest linear bias model resorting cosmic complementarity resolve degeneracy measurement omegam premise applies sophisticated choice bias model construct likelihood parameter lambdabomegam evaluate relative likelihood different value b omegam method applied ira redshift survey lowresolution gaussian smoothing length 1200 km within spherical region xrm max sim 15000 km reconstructed velocity field compared potentreconstructed v

In [None]:
# tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

# def tokenize_function(examples):
#     # Tokenize the inputs
#     tokenized = tokenizer(examples["abstract1"], examples["abstract2"], padding="max_length", truncation=True)

#     # Cast labels to floats
#     tokenized['labels'] = [float(label) for label in examples['label']]
#     return tokenized

# # Tokenize the datasets
# train_tokenized = train_dataset.map(tokenize_function, batched=True)
# val_tokenized = val_dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
# from transformers import AutoModelForSequenceClassification

# # Load the model with dropout
# model = AutoModelForSequenceClassification.from_pretrained(
#     "sentence-transformers/all-MiniLM-L6-v2",
#     num_labels=1,  # Regression task
#     hidden_dropout_prob=0.3,  # Dropout regularization
#     attention_probs_dropout_prob=0.3
# )

# # Move model to GPU if available
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-MiniLM-L6-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 384, padding_idx=0)
      (position_embeddings): Embedding(512, 384)
      (token_type_embeddings): Embedding(2, 384)
      (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.3, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-5): 6 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=384, out_features=384, bias=True)
              (key): Linear(in_features=384, out_features=384, bias=True)
              (value): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.3, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=384, out_features=384, bias=True)
              (LayerNorm): LayerNorm((384,), eps=1e-1

In [None]:
# training_args = TrainingArguments(
#     output_dir="./fine_tuned_all_minilm",
#     evaluation_strategy="epoch",
#     save_strategy="epoch",
#     logging_dir="./logs",
#     logging_steps=50,
#     num_train_epochs=10,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     learning_rate=2e-5,
#     weight_decay=0.01,
#     load_best_model_at_end=True,
#     metric_for_best_model="loss",
#     save_total_limit=2,  # Keep the last 2 checkpoints
#     report_to="tensorboard"  # Log to TensorBoard
# )

# # Early stopping callback
# early_stopping_callback = EarlyStoppingCallback(
#     early_stopping_patience=3,  # Stop if no improvement after 3 evaluations
#     early_stopping_threshold=0.01  # Minimum improvement threshold
# )




In [None]:
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_tokenized,
#     eval_dataset=val_tokenized,
#     # compute_metrics=compute_metrics,
#     callbacks=[early_stopping_callback]
# )

# # Start training
# trainer.train()


Epoch,Training Loss,Validation Loss
1,0.0733,0.077487
2,0.0788,0.07214
3,0.0607,0.092006
4,0.0574,0.081104


TrainOutput(global_step=2500, training_loss=0.0698766222000122, metrics={'train_runtime': 770.5921, 'train_samples_per_second': 129.77, 'train_steps_per_second': 8.111, 'total_flos': 1326585077760000.0, 'train_loss': 0.0698766222000122, 'epoch': 4.0})

In [None]:
# trainer.save_model("./fine_tuned_all_minilm")
# tokenizer.save_pretrained("./fine_tuned_all_minilm")


('./fine_tuned_all_minilm/tokenizer_config.json',
 './fine_tuned_all_minilm/special_tokens_map.json',
 './fine_tuned_all_minilm/vocab.txt',
 './fine_tuned_all_minilm/added_tokens.json',
 './fine_tuned_all_minilm/tokenizer.json')

In [None]:
# metrics = trainer.evaluate()
# print(metrics)


{'eval_loss': 0.07214009016752243, 'eval_runtime': 13.6419, 'eval_samples_per_second': 146.607, 'eval_steps_per_second': 9.163, 'epoch': 4.0}


In [None]:
# import pandas as pd
# import re
# import nltk
# from nltk.corpus import stopwords
# from nltk.stem import WordNetLemmatizer

# # Download NLTK resources if not already downloaded
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

# # Load the stratified sample dataset
# df = pd.read_csv('test_df.csv')

# # Initialize stopwords and lemmatizer
# stop_words = set(stopwords.words('english'))
# lemmatizer = WordNetLemmatizer()

# # Function to clean text
# def clean_text(text):
#     if pd.isnull(text):
#         return ""
#     # Lowercasing
#     text = text.lower()
#     # Remove special characters and punctuation
#     text = re.sub(r'[^a-z0-9\s]', '', text)
#     # Remove extra whitespace and newlines
#     text = re.sub(r'\s+', ' ', text).strip()
#     # Tokenize and remove stopwords, then lemmatize
#     tokens = text.split()
#     tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
#     return ' '.join(tokens)


# df['cleaned_abstract'] = df['abstract'].apply(clean_text)


# test_df = df


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
# from transformers import AutoModel, AutoTokenizer
# import numpy as np
# import torch # Import torch

# # Load the fine-tuned model and tokenizer
# model = AutoModel.from_pretrained("./fine_tuned_all_minilm")
# tokenizer = AutoTokenizer.from_pretrained("./fine_tuned_all_minilm")

# # Move model to GPU if available
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Define device
# model.to(device) # Move model to device

# def generate_embeddings(texts, model, tokenizer, batch_size=16):
#     model.eval()
#     embeddings = []
#     for i in range(0, len(texts), batch_size):
#         batch = texts[i:i + batch_size]
#         inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(device)
#         with torch.no_grad():
#             outputs = model(**inputs)
#             batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
#             embeddings.append(batch_embeddings)
#     return np.vstack(embeddings)

# # Load the test dataset
# train_df = pd.read_csv('enhanced_stratified_sample_train.csv')
# test_abstracts = train_df['enhanced_text'].tolist()

# # Generate embeddings
# test_embeddings = generate_embeddings(test_abstracts, model, tokenizer)

# # Save the embeddings
# np.save('fine_tuned_test_embeddings.npy', test_embeddings)
# print("Embeddings generated and saved successfully!")

Embeddings generated and saved successfully!


In [None]:
# from transformers import AutoModel, AutoTokenizer
# import numpy as np
# import torch # Import torch

# # Load the fine-tuned model and tokenizer
# model = AutoModel.from_pretrained("./fine_tuned_all_minilm")
# tokenizer = AutoTokenizer.from_pretrained("./fine_tuned_all_minilm")

# # Move model to GPU if available
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Define device
# model.to(device) # Move model to device

# def generate_embeddings(texts, model, tokenizer, batch_size=16):
#     model.eval()
#     embeddings = []
#     for i in range(0, len(texts), batch_size):
#         batch = texts[i:i + batch_size]
#         inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(device)
#         with torch.no_grad():
#             outputs = model(**inputs)
#             batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
#             embeddings.append(batch_embeddings)
#     return np.vstack(embeddings)

# # Load the test dataset
# test_abstracts = test_df['cleaned_abstract'].tolist()

# # Generate embeddings
# test_embeddings = generate_embeddings(test_abstracts, model, tokenizer)

# # Save the embeddings
# np.save('fine_tuned_testset_embeddings.npy', test_embeddings)
# print("Embeddings generated and saved successfully!")

Embeddings generated and saved successfully!


In [None]:
# import pandas as pd
# import numpy as np
# from tqdm import tqdm
# from sentence_transformers import SentenceTransformer
# from sklearn.metrics.pairwise import cosine_similarity
# from sklearn.cluster import KMeans
# from datetime import datetime
# import lancedb

# # Load the SBERT model
# model = SentenceTransformer("all-MiniLM-L6-v2")

# # Connect to LanceDB and open the table
# db = lancedb.connect("lancedb_directory")
# table = db.open_table("enhanced_papers")

# # Load the training data from CSV (for metadata like categories)
# train_df = pd.read_csv("train_df.csv")

# # Perform clustering on train embeddings for clustering-based ground truth
# train_embeddings = np.load("fine_tuned_test_embeddings.npy")
# # Load the precomputed test embeddings
# test_embeddings = np.load("fine_tuned_test_embeddings.npy")
# print(f"Loaded test embeddings shape: {test_embeddings.shape}")

# num_clusters = 20  # Adjust the number of clusters as needed
# kmeans = KMeans(n_clusters=num_clusters, random_state=42)
# train_df['cluster'] = kmeans.fit_predict(train_embeddings)

# # Define temporal evaluation parameters
# time_window_days = 365  # 1-year window for temporal evaluation

# # Similarity threshold
# similarity_threshold = 0.7

# # Weights for hybrid scoring
# weights = {"category": 0.3, "cluster": 0.2, "similarity": 0.3, "temporal": 0.2}

# # Precompute category counts in the training data
# category_counts = train_df["categories"].value_counts().to_dict()

# # Initialize metrics for each ground truth method
# metrics = {
#     "category": {"precision": 0, "recall": 0, "mrr": 0},
#     "clustering": {"precision": 0, "recall": 0, "mrr": 0},
#     "similarity": {"precision": 0, "recall": 0, "mrr": 0},
#     "temporal": {"precision": 0, "recall": 0, "mrr": 0},
#     "hybrid": {"precision": 0, "recall": 0, "mrr": 0},
# }

# # Number of queries to process
# num_queries = len(test_df)

# # Define batch size
# batch_size = 100

# # Evaluation functions
# def precision_at_k(recommendations, true_label, k=5):
#     relevant = sum(1 for label in recommendations["categories"].tolist()[:k] if label == true_label)
#     return relevant / k

# def recall_at_k(recommendations, true_label, all_relevant_count, k=5):
#     relevant = sum(1 for label in recommendations["categories"].tolist()[:k] if label == true_label)
#     return relevant / all_relevant_count if all_relevant_count > 0 else 0

# def mean_reciprocal_rank(recommendations, true_label):
#     for i, label in enumerate(recommendations["categories"].tolist()):
#         if label == true_label:
#             return 1 / (i + 1)
#     return 0

# def temporal_score(query_date, rec_date, window=time_window_days):
#     rec_date = pd.to_datetime(rec_date, errors='coerce')
#     return 1 if pd.notnull(rec_date) and abs((query_date - rec_date).days) <= window else 0

# # Process queries in batches
# for start in tqdm(range(0, num_queries, batch_size), desc="Processing Batches"):
#     end = min(start + batch_size, num_queries)

#     # Extract batch queries, true categories, and update dates
#     batch_queries = test_df.iloc[start:end]["cleaned_abstract"].tolist()
#     batch_categories = test_df.iloc[start:end]["categories"].values
#     batch_dates = pd.to_datetime(test_df.iloc[start:end]["update_date"], errors='coerce').values

#     # Generate embeddings for the batch of queries
#     # query_embeddings = model.encode(batch_queries, batch_size=32, show_progress_bar=False)
#     # Extract the batch of precomputed embeddings
#     batch_embeddings = test_embeddings[start:end]
#     for idx, (query_embedding, true_category, query_date) in enumerate(zip(batch_embeddings, batch_categories, batch_dates)):
#         # Perform similarity search in LanceDB
#         recommendations = table.search(query_embedding).metric("cosine").limit(5).to_pandas()
#         recommendation_vectors = np.vstack(recommendations['embedding'].tolist())

#         # CATEGORY-BASED EVALUATION
#         all_relevant_count = category_counts.get(true_category, 0)
#         metrics["category"]["precision"] += precision_at_k(recommendations, true_category, k=5)
#         metrics["category"]["recall"] += recall_at_k(recommendations, true_category, all_relevant_count, k=5)
#         metrics["category"]["mrr"] += mean_reciprocal_rank(recommendations, true_category)

#         # CLUSTERING-BASED EVALUATION
#         true_cluster = train_df[train_df["categories"] == true_category]["cluster"].iloc[0]
#         predicted_clusters = recommendations["categories"].map(lambda cat: train_df[train_df["categories"] == cat]["cluster"].iloc[0])
#         relevant_clusters = sum(1 for cluster in predicted_clusters[:5] if cluster == true_cluster)
#         metrics["clustering"]["precision"] += relevant_clusters / 5
#         metrics["clustering"]["recall"] += relevant_clusters / all_relevant_count if all_relevant_count > 0 else 0
#         metrics["clustering"]["mrr"] += mean_reciprocal_rank(recommendations, true_cluster)

#         # SIMILARITY THRESHOLD-BASED EVALUATION
#         cosine_similarities = cosine_similarity(query_embedding.reshape(1, -1), recommendation_vectors)[0]
#         relevant_similar = sum(1 for score in cosine_similarities if score >= similarity_threshold)
#         metrics["similarity"]["precision"] += relevant_similar / 5
#         metrics["similarity"]["recall"] += relevant_similar / all_relevant_count if all_relevant_count > 0 else 0
#         metrics["similarity"]["mrr"] += mean_reciprocal_rank(recommendations, true_category)

#         # TEMPORAL-BASED EVALUATION
#         relevant_temporal = sum(1 for rec_date in recommendations["update_date"] if temporal_score(query_date, rec_date))
#         metrics["temporal"]["precision"] += relevant_temporal / 5
#         metrics["temporal"]["recall"] += relevant_temporal / all_relevant_count if all_relevant_count > 0 else 0
#         metrics["temporal"]["mrr"] += mean_reciprocal_rank(recommendations, true_category)

#         # HYBRID SCORING (weighted combination)
#         hybrid_score = (
#             weights["category"] * precision_at_k(recommendations, true_category, k=5) +
#             weights["cluster"] * (relevant_clusters / 5) +
#             weights["similarity"] * (relevant_similar / 5) +
#             weights["temporal"] * (relevant_temporal / 5)
#         )
#         metrics["hybrid"]["precision"] += hybrid_score
#         metrics["hybrid"]["recall"] += hybrid_score
#         metrics["hybrid"]["mrr"] += hybrid_score

# # Compute average metrics
# for method in metrics:
#     metrics[method]["precision"] /= num_queries
#     metrics[method]["recall"] /= num_queries
#     metrics[method]["mrr"] /= num_queries

# # Display final metrics
# print("\nFinal Metrics:")
# for method, scores in metrics.items():
#     print(f"\n{method.capitalize()} Ground Truth:")
#     print(f" - Average Precision@5: {scores['precision']:.2f}")
#     print(f" - Average Recall@5: {scores['recall']:.2f}")
#     print(f" - Average MRR: {scores['mrr']:.2f}")


Loaded test embeddings shape: (70000, 384)


Processing Batches: 100%|██████████| 150/150 [27:00<00:00, 10.80s/it]


Final Metrics:

Category Ground Truth:
 - Average Precision@5: 0.02
 - Average Recall@5: 0.00
 - Average MRR: 0.03

Clustering Ground Truth:
 - Average Precision@5: 0.07
 - Average Recall@5: 0.00
 - Average MRR: 0.00

Similarity Ground Truth:
 - Average Precision@5: 0.00
 - Average Recall@5: 0.00
 - Average MRR: 0.03

Temporal Ground Truth:
 - Average Precision@5: 0.12
 - Average Recall@5: 0.00
 - Average MRR: 0.03

Hybrid Ground Truth:
 - Average Precision@5: 0.04
 - Average Recall@5: 0.04
 - Average MRR: 0.04



