# Installing necessary packages

In [6]:
# Python dependencies used later in this notebook: LanceDB (vector tables),
# sentence-transformers (embedding models), Hugging Face datasets (fine-tuning
# data) and Streamlit (the app written at the end of the notebook).
!pip install lancedb
!pip install sentence_transformers
!pip install datasets
!pip install streamlit
# NOTE(review): localtunnel is presumably used to expose the Streamlit app from
# the Colab runtime -- confirm against the app-launch cell.
!npm install localtunnel

[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K
up to date, audited 23 packages in 767ms
[1G[0K⠴[1G[0K
[1G[0K⠴[1G[0K3 packages are looking for funding
[1G[0K⠴[1G[0K  run `npm fund` for details
[1G[0K⠴[1G[0K
2 [33m[1mmoderate[22m[39m severity vulnerabilities

To address all issues (including breaking changes), run:
  npm audit fix --force

Run `npm audit` for details.
[1G[0K⠴[1G[0K

In [7]:
# Fetch ipv4.icanhazip.com quietly and write the response (this runtime's
# public IPv4 address) to stdout.
# NOTE(review): presumably needed as the localtunnel access password -- confirm.
!wget -q -O - ipv4.icanhazip.com

104.198.105.23


# Mounting Drive to access the JSON dataset file


In [8]:
from google.colab import drive

# Mount Google Drive at /content/drive; the arXiv snapshot read by the
# following cells lives under /content/drive/MyDrive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
import dask.bag as db
import json
# Read the arXiv snapshot as a Dask bag of text lines and parse every line as
# one JSON record. Note that `data` is rebound to a plain list by the sampling
# cell further down.
data = db.read_text('/content/drive/MyDrive/arxiv-metadata-oai-snapshot.json').map(json.loads)

In [10]:
# Peek at the first parsed record to see which metadata fields are available.
data.take(1)

({'id': '0704.0001',
  'submitter': 'Pavel Nadolsky',
  'authors': "C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan",
  'title': 'Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies',
  'comments': '37 pages, 15 figures; published version',
  'journal-ref': 'Phys.Rev.D76:013009,2007',
  'doi': '10.1103/PhysRevD.76.013009',
  'report-no': 'ANL-HEP-PR-07-12',
  'categories': 'hep-ph',
  'license': None,
  'abstract': '  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstrated with d

# Building a Stratified Sample of 100K records based on the category in the dataset

In [11]:
import pandas as pd
import json
from collections import Counter
from sklearn.model_selection import train_test_split

# Path to the arXiv metadata snapshot (one JSON document per line)
file_name = '/content/drive/MyDrive/arxiv-metadata-oai-snapshot.json'

# Fields kept for every paper
cols = ['id', 'title', 'abstract', 'categories', 'authors', 'comments', 'update_date']

# Sampling parameters
MIN_CATEGORY_SIZE = 50   # categories with fewer papers than this are dropped before sampling
SAMPLE_SIZE = 100000     # number of papers in the stratified sample

# Counts papers per category while the file is streamed
category_counter = Counter()

# Stream the file line by line, keeping only papers that have exactly one category
data = []
# JSON text is UTF-8 by specification; decoding it as latin-1 would turn any
# multi-byte character into mojibake.
with open(file_name, encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines such as a trailing newline
        doc = json.loads(line)
        # `or ''` also covers an explicit JSON null, which .get()'s default does not
        categories = (doc.get('categories') or '').strip()
        if categories and ' ' not in categories:  # a space separates multiple categories
            category_counter[categories] += 1
            data.append([
                doc.get('id'),
                doc.get('title', ''),
                doc.get('abstract', ''),
                categories,
                doc.get('authors', ''),
                doc.get('comments', ''),
                doc.get('update_date', '')
            ])

# Print unique categories
unique_categories = list(category_counter.keys())
print(f"Total unique categories: {len(unique_categories)}")
print("Unique categories:")
print(unique_categories)

# Print the 20 most common categories with their counts
print("\nTop 20 categories by frequency:")
for category, count in category_counter.most_common(20):
    print(f"{category}: {count}")

# Convert data to DataFrame
df = pd.DataFrame(data, columns=cols)

# Text fields are left untouched here (they are cleaned in a later cell);
# only the date is parsed, with unparseable values becoming NaT.
df['update_date'] = pd.to_datetime(df['update_date'], errors='coerce')

# Drop rows with missing abstracts or titles
df = df.dropna(subset=['abstract', 'title'])

# Keep only categories with at least MIN_CATEGORY_SIZE papers
category_counts = df['categories'].value_counts()
valid_categories = category_counts[category_counts >= MIN_CATEGORY_SIZE].index
print(f"\nTotal valid categories: {len(valid_categories)}")

# Filter the DataFrame to include only valid categories
df_filtered = df[df['categories'].isin(valid_categories)]

# Stratified sampling based on categories
if len(df_filtered) < SAMPLE_SIZE:
    raise ValueError(f"Not enough data to sample {SAMPLE_SIZE:,} rows. Available: {len(df_filtered)}")

if len(df_filtered) == SAMPLE_SIZE:
    # train_test_split refuses to leave an empty remainder, so when every row
    # is needed the frame is simply shuffled.
    stratified_sample = df_filtered.sample(frac=1, random_state=62)
else:
    stratified_sample, _ = train_test_split(
        df_filtered,
        train_size=SAMPLE_SIZE,
        stratify=df_filtered['categories'],
        random_state=62
    )

# Reset index
stratified_sample = stratified_sample.reset_index(drop=True)

# Save the final sample to a CSV file
stratified_sample.to_csv('stratified_sample.csv', index=False)

# Display summary
print(f"\nFinal dataset size: {len(stratified_sample)}")
print("Category distribution in the sample:")
print(stratified_sample['categories'].value_counts())


Total unique categories: 149
Unique categories:
['hep-ph', 'physics.gen-ph', 'math.CO', 'cond-mat.mes-hall', 'gr-qc', 'cond-mat.mtrl-sci', 'astro-ph', 'math.NT', 'hep-th', 'hep-ex', 'math.NA', 'nlin.PS', 'math.RA', 'cond-mat.str-el', 'physics.pop-ph', 'nucl-th', 'math.FA', 'cs.DS', 'math.DS', 'physics.soc-ph', 'math.AG', 'math.OA', 'math.PR', 'math.DG', 'physics.optics', 'math.GR', 'nlin.SI', 'math.SG', 'physics.data-an', 'cs.CC', 'math.GT', 'quant-ph', 'cond-mat.other', 'math.CV', 'math.AP', 'cond-mat.supr-con', 'math.RT', 'cond-mat.stat-mech', 'q-bio.OT', 'physics.plasm-ph', 'nlin.CG', 'nucl-ex', 'cond-mat.soft', 'physics.comp-ph', 'math.MG', 'math.QA', 'physics.bio-ph', 'physics.chem-ph', 'math.AT', 'physics.geo-ph', 'q-bio.BM', 'math.OC', 'cs.CR', 'physics.class-ph', 'q-bio.PE', 'q-bio.NC', 'physics.atom-ph', 'math.GM', 'hep-lat', 'math.CA', 'physics.atm-clus', 'cs.PF', 'physics.acc-ph', 'math.SP', 'nlin.CD', 'physics.hist-ph', 'physics.flu-dyn', 'cond-mat.dis-nn', 'cs.CV', 'cs.LG'

# Filtering the sample further: categories with fewer than 20 records are dropped so that every remaining category can be stratified across the later splits

In [12]:
import pandas as pd

# Re-read the stratified sample written by the sampling step
df = pd.read_csv('stratified_sample.csv')

# Number of records per category
category_counts = df['categories'].value_counts()

# Categories that have at least 20 records in the sample
valid_categories = category_counts.loc[category_counts.ge(20)].index

# Rows whose category survived the threshold
keep_mask = df['categories'].isin(valid_categories)
df_filtered = df.loc[keep_mask]

# Report the row counts before and after filtering
print(f"Original dataset size: {len(df)}")
print(f"Filtered dataset size: {len(df_filtered)}")

# Persist the filtered sample for the following steps
df_filtered.to_csv('filtered_stratified_sample.csv', index=False)

# Summarise the resulting category distribution
print("Categories with fewer than 20 records have been removed.")
print("Updated category distribution:")
print(df_filtered['categories'].value_counts())


Original dataset size: 100000
Filtered dataset size: 99947
Categories with fewer than 20 records have been removed.
Updated category distribution:
categories
astro-ph    6277
hep-ph      5934
quant-ph    5159
cs.CV       4629
hep-th      4301
            ... 
cs.SD         27
cs.NA         26
cs.MS         26
q-bio.CB      25
cs.OS         23
Name: count, Length: 145, dtype: int64


# Splitting the sample into Train(70K), Validation(15K), Test(15K)

In [13]:
from sklearn.model_selection import train_test_split

# Continue from the in-memory frame produced by the category filter
# (this cell does not re-read the CSV).
df = df_filtered

# Stage 1: carve out 70,000 training rows, stratified by category
train_df, remaining_df = train_test_split(
    df, train_size=70000, stratify=df['categories'], random_state=42
)

# Stage 2: halve what is left into validation and test, again stratified
val_df, test_df = train_test_split(
    remaining_df, test_size=0.5, stratify=remaining_df['categories'], random_state=42
)

# Write each split to its own CSV file
for split_frame, csv_name in (
    (train_df, 'train_df.csv'),
    (val_df, 'val_df.csv'),
    (test_df, 'test_df.csv'),
):
    split_frame.to_csv(csv_name, index=False)

# Report the size of every split
print(f"Train set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")
print(f"Test set size: {len(test_df)}")


Train set size: 70000
Validation set size: 14973
Test set size: 14974


# Preprocessing data by performing necessary cleaning operations (Lowercasing, Lemmatizing, Removing punctuations, whitespace, special characters)

In [14]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK corpora required for stopword removal and lemmatization
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# Load the training split written by the splitting step
df = pd.read_csv('train_df.csv')

# English stopword set and WordNet lemmatizer shared by clean_text
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to clean text
def clean_text(text):
    """Normalise one text value for embedding.

    Lowercases the text, deletes every character that is not a-z, 0-9 or
    whitespace, collapses whitespace, drops English stopwords and lemmatizes
    the remaining tokens. Missing values (per ``pd.isnull``) become "".

    Uses the module-level ``stop_words`` set and ``lemmatizer``.
    """
    if pd.isnull(text):
        return ""
    # Lowercase first so the character filter only needs to whitelist a-z
    lowered = text.lower()
    # Punctuation is deleted rather than replaced by a space, so hyphenated or
    # dotted terms are fused into a single token
    alnum_only = re.sub(r'[^a-z0-9\s]', '', lowered)
    # Collapse whitespace runs (including newlines) into single spaces
    collapsed = re.sub(r'\s+', ' ', alnum_only).strip()
    kept = []
    for word in collapsed.split():
        if word in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(word))
    return ' '.join(kept)

# Clean each text column into a parallel cleaned_* column (target -> source)
cleaned_columns = {
    'cleaned_title': 'title',
    'cleaned_authors': 'authors',
    'cleaned_categories': 'categories',
    'cleaned_abstract': 'abstract',
    'cleaned_comments': 'comments',
}
for target_column, source_column in cleaned_columns.items():
    df[target_column] = df[source_column].apply(clean_text)


def _compose_enhanced_text(row):
    """Join the cleaned fields of one row into a single [SEP]-delimited line."""
    segments = (
        f"Title: {row['cleaned_title']} [SEP]",
        f"Authors: {row['cleaned_authors']} [SEP]",
        f"Categories: {row['cleaned_categories']} [SEP]",
        f"Abstract: {row['cleaned_abstract']} [SEP]",
        f"Comments: {row['cleaned_comments']} [SEP]",
        f"Updated on: {row['update_date']}",
    )
    # Any newline inside a value is flattened to a space, then the ends are trimmed
    return ' '.join(segments).replace('\n', ' ').strip()


# Single text field that is later used as the embedding source
df['enhanced_text'] = df.apply(_compose_enhanced_text, axis=1)

# Show a few rows to verify the enhanced text field
print(df[['id', 'enhanced_text']].head())

# Save the updated DataFrame to a new CSV file
df.to_csv('enhanced_stratified_sample_train.csv', index=False)

print("Enhanced text field created and saved successfully.")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


                 id                                      enhanced_text
0        2007.12657  Title: sublimative evolution 486958 arrokoth [...
1         1208.4774  Title: torii phase [SEP] Authors: emmanuel ami...
2         0903.4882  Title: kinetic monte carlo simulation strained...
3        1601.06809  Title: test field cannot destroy extremal blac...
4  astro-ph/0104478  Title: low albedo among extinct comet candidat...
Enhanced text field created and saved successfully.


# Creating the schema for storing the sentence transformer embeddings in LanceDB (all_miniLM_L6_v2)

In [15]:
import pandas as pd
import lancedb
import torch
from sentence_transformers import SentenceTransformer
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector

# Load the enhanced training data produced by the cleaning step
df = pd.read_csv("enhanced_stratified_sample_train.csv")

# Display the first few rows to verify
print(df.head())

# Embedding model and LanceDB location
model_name = "all-MiniLM-L6-v2"  # You can adjust the model based on your use case
db = lancedb.connect("lancedb_directory")

# Use the GPU when one is attached, otherwise fall back to the CPU instead of
# failing on a CPU-only runtime.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Register the embedding function
registry = get_registry()
embedding_function = registry.get("sentence-transformers").create(
    name=model_name,
    device=device
)

# Define the LanceDB schema with Pydantic
class TextData(LanceModel):
    """Row schema of the papers table: metadata, embedding source text and vector."""
    id: str
    title: str
    authors: str
    abstract: str
    categories: str
    comments: str
    update_date: str
    # Text that LanceDB passes to the embedding function when rows are added
    enhanced_text: str = embedding_function.SourceField()
    # Vector column; its width is taken from the embedding model
    embedding: Vector(embedding_function.ndims()) = embedding_function.VectorField()

# Create the table (overwrite if it exists)
table = db.create_table("enhanced_papers_pretrained_1", schema=TextData, mode="overwrite")

# Convert the DataFrame to a list of dictionaries. Missing values are replaced
# by "" first; otherwise astype(str) would store them as the literal text "nan".
record_columns = ["id", "title", "authors", "abstract", "categories", "comments", "update_date", "enhanced_text"]
data = df[record_columns].fillna("").astype(str).to_dict(orient="records")

# Add data to the table (embeddings are computed from enhanced_text on insert)
table.add(data)

print("Enhanced data added to the LanceDB table successfully!")


KeyboardInterrupt: 

# Creating the schema for storing the sentence transformer embeddings in LanceDB (allenai_specter)

In [12]:
import pandas as pd
import lancedb
import torch
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector

# Load the enhanced training data produced by the cleaning step
df = pd.read_csv("enhanced_stratified_sample_train.csv")

# Display the first few rows to verify
print(df.head())

# Embedding model and LanceDB location
model_name = "allenai-specter"  # You can adjust the model based on your use case
db = lancedb.connect("lancedb_directory")

# Use the GPU when one is attached, otherwise fall back to the CPU instead of
# failing on a CPU-only runtime.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Register the embedding function
registry = get_registry()
embedding_function = registry.get("sentence-transformers").create(
    name=model_name,
    device=device
)

# Define the LanceDB schema with Pydantic
class TextData(LanceModel):
    """Row schema of the papers table: metadata, embedding source text and vector."""
    id: str
    title: str
    authors: str
    abstract: str
    categories: str
    comments: str
    update_date: str
    # Text that LanceDB passes to the embedding function when rows are added
    enhanced_text: str = embedding_function.SourceField()
    # Vector column; its width is taken from the embedding model
    embedding: Vector(embedding_function.ndims()) = embedding_function.VectorField()

# Create the table (overwrite if it exists)
table = db.create_table("enhanced_papers_pretrained_2", schema=TextData, mode="overwrite")

# Convert the DataFrame to a list of dictionaries. Missing values are replaced
# by "" first; otherwise astype(str) would store them as the literal text "nan".
record_columns = ["id", "title", "authors", "abstract", "categories", "comments", "update_date", "enhanced_text"]
data = df[record_columns].fillna("").astype(str).to_dict(orient="records")

# Add data to the table (embeddings are computed from enhanced_text on insert)
table.add(data)

print("Enhanced data added to the LanceDB table successfully!")


                 id                                              title  \
0        2007.12657     The Sublimative Evolution of (486958) Arrokoth   
1         1208.4774                                The Torii of phases   
2         0903.4882  Kinetic Monte Carlo Simulation of Strained Het...   
3        1601.06809    Test fields cannot destroy extremal black holes   
4  astro-ph/0104478         Low Albedos Among Extinct Comet Candidates   

                                            abstract         categories  \
0    We consider the history of New Horizons targ...        astro-ph.EP   
1    The import of the magnitude of fourier coeff...            math.HO   
2    An efficient method for the simulation of st...  cond-mat.mtrl-sci   
3    We prove that (possibly charged) test fields...              gr-qc   
4    We present radiometric effective radii and v...           astro-ph   

                                             authors  \
0  Jordan K. Steckloff, Carey M. Lisse, Taylor K

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.77k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/622 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/331 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/222k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/462k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Enhanced data added to the LanceDB table successfully!


# Finetuning the all_miniLM_L6_v2 model


In [16]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK corpora required for stopword removal and lemmatization
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# Load the validation split written by the splitting step
val_df = pd.read_csv('val_df.csv')

# English stopword set and WordNet lemmatizer shared by clean_text
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to clean text
def clean_text(text):
    """Normalise one text value for embedding.

    Lowercases the text, deletes every character that is not a-z, 0-9 or
    whitespace, collapses whitespace, drops English stopwords and lemmatizes
    the remaining tokens. Missing values (per ``pd.isnull``) become "".

    Uses the module-level ``stop_words`` set and ``lemmatizer``.
    """
    if pd.isnull(text):
        return ""
    # Lowercase first so the character filter only needs to whitelist a-z
    lowered = text.lower()
    # Punctuation is deleted rather than replaced by a space, so hyphenated or
    # dotted terms are fused into a single token
    alnum_only = re.sub(r'[^a-z0-9\s]', '', lowered)
    # Collapse whitespace runs (including newlines) into single spaces
    collapsed = re.sub(r'\s+', ' ', alnum_only).strip()
    kept = []
    for word in collapsed.split():
        if word in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(word))
    return ' '.join(kept)

# Clean the 'abstract' field into a new 'cleaned_abstract' column.
# NOTE(review): the pair-generation cell below groups on the raw 'abstract'
# column, so 'cleaned_abstract' does not appear to be consumed there -- confirm
# which column is intended for fine-tuning.
val_df['cleaned_abstract'] = val_df['abstract'].apply(clean_text)

# Save the validation data (original columns plus 'cleaned_abstract')
val_df.to_csv('cleaned_val_df.csv', index=False)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [17]:
import pandas as pd
import random
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments # Import AutoModelForSequenceClassification
from sentence_transformers import losses
from torch.utils.data import DataLoader

# Training rows: the enhanced training CSV, minus rows lacking an abstract or category
file_path = "enhanced_stratified_sample_train.csv"
df = pd.read_csv(file_path).dropna(subset=['abstract', 'categories'])

# Validation rows written by the validation-cleaning cell
val_df = pd.read_csv('cleaned_val_df.csv')
# Function to generate annotated pairs based on categories
def generate_pairs_with_categories(df, num_pairs=10000, positive_ratio=0.5, seed=None):
    """Build labelled abstract pairs, using the category as a similarity signal.

    Args:
        df: DataFrame with a 'categories' column and an 'abstract' column.
        num_pairs: Total number of pairs to generate.
        positive_ratio: Fraction of pairs drawn from the same category (label 1);
            the remainder are drawn from two different categories (label 0).
        seed: Optional seed for a private random generator. When None the
            module-level ``random`` state is used, as before.

    Returns:
        A list of dicts with keys 'sentence1', 'sentence2' and 'label'. Exactly
        ``int(num_pairs * positive_ratio)`` positives are returned, followed by
        the negatives.

    Raises:
        ValueError: if positives are requested but no category has two
            abstracts, or negatives are requested with fewer than two categories.
    """
    rng = random if seed is None else random.Random(seed)
    num_positive = int(num_pairs * positive_ratio)
    num_negative = num_pairs - num_positive

    # Group abstracts by category
    category_groups = df.groupby('categories')['abstract'].apply(list).to_dict()
    categories = list(category_groups.keys())

    # Only categories with at least two abstracts can yield a positive pair.
    # Sampling from these alone guarantees the requested number of positives
    # instead of silently skipping draws that land on a singleton category.
    positive_categories = [c for c in categories if len(category_groups[c]) > 1]

    if num_positive > 0 and not positive_categories:
        raise ValueError("No category has at least two abstracts; cannot build positive pairs.")
    if num_negative > 0 and len(categories) < 2:
        raise ValueError("At least two categories are required to build negative pairs.")

    pairs = []

    # Positive pairs (same category, two distinct rows)
    for _ in range(num_positive):
        category = rng.choice(positive_categories)
        abstract1, abstract2 = rng.sample(category_groups[category], 2)
        pairs.append({"sentence1": abstract1, "sentence2": abstract2, "label": 1})

    # Negative pairs (two different categories)
    for _ in range(num_negative):
        category1, category2 = rng.sample(categories, 2)
        abstract1 = rng.choice(category_groups[category1])
        abstract2 = rng.choice(category_groups[category2])
        pairs.append({"sentence1": abstract1, "sentence2": abstract2, "label": 0})

    return pairs

# Seed Python's RNG so the sampled pairs, and therefore the training data, are
# the same on every run.
random.seed(42)

# Generate annotated pairs using categories
pairs = generate_pairs_with_categories(df, num_pairs=10000, positive_ratio=0.5)
valpairs = generate_pairs_with_categories(val_df, num_pairs=3000, positive_ratio=0.5)

# Convert to Hugging Face Dataset
hf_dataset = Dataset.from_list(pairs)
val_dataset = Dataset.from_list(valpairs)

# Tokenizer and model
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the checkpoint with a 2-way classification head (label 0 = different
# category, label 1 = same category). The head is newly initialised and is
# trained together with the encoder.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize a batch of sentence pairs with the module-level ``tokenizer``.

    ``examples`` must provide 'sentence1' and 'sentence2'; both are passed to
    the tokenizer together, padded to 512 tokens and truncated beyond that.
    """
    first_sentences = examples["sentence1"]
    second_sentences = examples["sentence2"]
    return tokenizer(
        first_sentences,
        second_sentences,
        padding="max_length",
        truncation=True,
        max_length=512,
    )

# Tokenize the training and validation pairs in batches
hf_dataset = hf_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Training hyper-parameters.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
training_options = dict(
    output_dir="./finetuned_model",
    evaluation_strategy="steps",
    eval_steps=500,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=100,
)
training_args = TrainingArguments(**training_options)

# Trainer that fine-tunes `model` on the training pairs and evaluates on the validation pairs
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_dataset,
    eval_dataset=val_dataset,
)

# Run the fine-tuning
trainer.train()

# Persist the fine-tuned weights, then the tokenizer, into the same folder
finetuned_dir = "./finetuned_all_minilm_l6_v2"
for artifact in (model, tokenizer):
    artifact.save_pretrained(finetuned_dir)

print("Model fine-tuned and saved successfully!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-MiniLM-L6-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss
500,0.418,0.518349


Model fine-tuned and saved successfully!


In [18]:
from sentence_transformers import SentenceTransformer

# Start from the stock sentence-transformers pipeline for `model_name`
sentence_model = SentenceTransformer(model_name)

# `model` carries a sequence-classification head, so its first output is the
# class logits. The first sentence-transformers module expects the bare encoder,
# whose first output is the token embeddings that the pooling module consumes.
# `base_model` returns that encoder without the classifier head.
sentence_model[0].auto_model = model.base_model

# Save in sentence-transformers format so the folder can be loaded with
# SentenceTransformer("./finetuned_all_minilm_l6_v2")
sentence_model.save("./finetuned_all_minilm_l6_v2")

In [19]:
import shutil
# Destination folder in Google Drive (replace 'MyDrive' and folder name as needed)
drive_destination = '/content/drive/MyDrive/finetuned_all_minilm_l6_v2'
# Source folder on Colab (replace with your actual folder name)
colab_folder = '/content/finetuned_all_minilm_l6_v2'
# Copy the folder to Google Drive. dirs_exist_ok=True lets the cell be re-run:
# without it copytree raises FileExistsError once the destination exists.
shutil.copytree(colab_folder, drive_destination, dirs_exist_ok=True)

print(f"Folder '{colab_folder}' has been successfully copied to '{drive_destination}'")

Folder '/content/finetuned_all_minilm_l6_v2' has been successfully copied to '/content/drive/MyDrive/finetuned_all_minilm_l6_v2'


# Cleaning & preprocessing the test data

In [16]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK corpora required for stopword removal and lemmatization
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# Load the test split written by the splitting step
df = pd.read_csv('test_df.csv')

# English stopword set and WordNet lemmatizer shared by clean_text
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to clean text
def clean_text(text):
    """Normalise one text value for embedding.

    Lowercases the text, deletes every character that is not a-z, 0-9 or
    whitespace, collapses whitespace, drops English stopwords and lemmatizes
    the remaining tokens. Missing values (per ``pd.isnull``) become "".

    Uses the module-level ``stop_words`` set and ``lemmatizer``.
    """
    if pd.isnull(text):
        return ""
    # Lowercase first so the character filter only needs to whitelist a-z
    lowered = text.lower()
    # Punctuation is deleted rather than replaced by a space, so hyphenated or
    # dotted terms are fused into a single token
    alnum_only = re.sub(r'[^a-z0-9\s]', '', lowered)
    # Collapse whitespace runs (including newlines) into single spaces
    collapsed = re.sub(r'\s+', ' ', alnum_only).strip()
    kept = []
    for word in collapsed.split():
        if word in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(word))
    return ' '.join(kept)


# Add a cleaned copy of every test abstract (same cleaning as the training data)
df['cleaned_abstract'] = df['abstract'].apply(clean_text)


# Expose the cleaned frame under the name test_df; this is the same object as df, not a copy
test_df = df


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Generating new embeddings on train data using the finetuned model

In [None]:
# import numpy as np
# import pandas as pd
# from sentence_transformers import SentenceTransformer
# from tqdm import tqdm

# # Load the fine-tuned model
# model = SentenceTransformer("./finetuned_all_minilm_l6_v2")

# # Load the training data
# train_df = pd.read_csv("/content/enhanced_stratified_sample_train.csv")

# # Ensure there are no missing abstracts
# train_df = train_df.dropna(subset=["enhanced_text"])

# # Generate embeddings for each abstract in the training data
# train_embeddings = []
# for abstract in tqdm(train_df["enhanced_text"].tolist(), desc="Generating Train Embeddings"):
#     embedding = model.encode(abstract, show_progress_bar=False)
#     train_embeddings.append(embedding)

# # Convert embeddings to a NumPy array
# train_embeddings = np.array(train_embeddings)

# # Save the embeddings to a .npy file
# np.save("train_embeddings.npy", train_embeddings)

# print("Train embeddings generated and saved successfully as 'train_embeddings.npy'!")


# Storing new embeddings in LanceDB


In [17]:
import pandas as pd
import lancedb
import torch
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector

# Load the enhanced training data produced by the cleaning step
df = pd.read_csv("enhanced_stratified_sample_train.csv")

# Display the first few rows to verify
print(df.head())

# Fine-tuned embedding model (local folder) and LanceDB location
model_name = "./finetuned_all_minilm_l6_v2"  # You can adjust the model based on your use case
db = lancedb.connect("lancedb_directory")

# Use the GPU when one is attached, otherwise fall back to the CPU instead of
# failing on a CPU-only runtime.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Register the embedding function
registry = get_registry()
embedding_function = registry.get("sentence-transformers").create(
    name=model_name,
    device=device
)

# Define the LanceDB schema with Pydantic
class TextData(LanceModel):
    """Row schema of the papers table: metadata, embedding source text and vector."""
    id: str
    title: str
    authors: str
    abstract: str
    categories: str
    comments: str
    update_date: str
    # Text that LanceDB passes to the embedding function when rows are added
    enhanced_text: str = embedding_function.SourceField()
    # Vector column; its width is taken from the embedding model
    embedding: Vector(embedding_function.ndims()) = embedding_function.VectorField()

# Create the table (overwrite if it exists)
table = db.create_table("enhanced_papers_finetuned", schema=TextData, mode="overwrite")

# Convert the DataFrame to a list of dictionaries. Missing values are replaced
# by "" first; otherwise astype(str) would store them as the literal text "nan".
record_columns = ["id", "title", "authors", "abstract", "categories", "comments", "update_date", "enhanced_text"]
data = df[record_columns].fillna("").astype(str).to_dict(orient="records")

# Add data to the table (embeddings are computed from enhanced_text on insert)
table.add(data)

print("Enhanced data added to the LanceDB table successfully!")


                 id                                              title  \
0        2007.12657     The Sublimative Evolution of (486958) Arrokoth   
1         1208.4774                                The Torii of phases   
2         0903.4882  Kinetic Monte Carlo Simulation of Strained Het...   
3        1601.06809    Test fields cannot destroy extremal black holes   
4  astro-ph/0104478         Low Albedos Among Extinct Comet Candidates   

                                            abstract         categories  \
0    We consider the history of New Horizons targ...        astro-ph.EP   
1    The import of the magnitude of fourier coeff...            math.HO   
2    An efficient method for the simulation of st...  cond-mat.mtrl-sci   
3    We prove that (possibly charged) test fields...              gr-qc   
4    We present radiometric effective radii and v...           astro-ph   

                                             authors  \
0  Jordan K. Steckloff, Carey M. Lisse, Taylor K

# Storing main LanceDB in Google Drive

In [18]:
import shutil
# Destination folder in Google Drive (replace 'MyDrive' and folder name as needed)
drive_destination = '/content/drive/MyDrive/lancedb_directory_main'
# Source folder on Colab (replace with your actual folder name)
colab_folder = '/content/lancedb_directory'
# Copy the folder to Google Drive.
# dirs_exist_ok=True makes the cell safe to re-run: without it copytree raises
# FileExistsError as soon as the destination folder already exists. Files with
# the same name are overwritten; files present only in the destination are kept.
shutil.copytree(colab_folder, drive_destination, dirs_exist_ok=True)

print(f"Folder '{colab_folder}' has been successfully copied to '{drive_destination}'")

Folder '/content/lancedb_directory' has been successfully copied to '/content/drive/MyDrive/lancedb_directory_main'


In [None]:
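# %%writefile app.py
# # NOTE: Superseded single-model draft of app.py, kept commented out for reference only.
# # It is not runnable as written: the main logic below calls `embedding_model` and `table`,
# # but this draft only defines `embedding_model_1..3` and `table1..3`.
# # The active app is the uncommented `%%writefile app.py` cell later in this notebook.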
# import streamlit as st
# import pandas as pd
# import numpy as np
# import torch
# from sentence_transformers import SentenceTransformer
# import lancedb
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
# import time

# # --------------------------- Load the LanceDB Table and Models --------------------------- #

# # Connect to LanceDB
# DB_PATH = "lancedb_directory"
# TABLE_NAME_1 = "enhanced_papers_pretrained_1"
# TABLE_NAME_2 = "enhanced_papers_pretrained_2"
# TABLE_NAME_3 = "enhanced_papers_finetuned"

# db = lancedb.connect(DB_PATH)
# table1 = db.open_table(TABLE_NAME_1)
# table2 = db.open_table(TABLE_NAME_2)
# table3 = db.open_table(TABLE_NAME_3)
# # Load the fine-tuned SentenceTransformer model
# embedding_model_1 = SentenceTransformer('all-MiniLM-L6-v2')
# embedding_model_2 = SentenceTransformer('allenai-specter')
# embedding_model_3 = SentenceTransformer('./finetuned_all_minilm_l6_v2')

# # Load the tokenizer and summarization model for RAG-based explanations
# MODEL_NAME = "google/flan-t5-large"  # Use a capable model like "google/flan-t5-large"
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# rag_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
# rag_pipeline = pipeline("text2text-generation", model=rag_model, tokenizer=tokenizer, device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))

# # --------------------------- Streamlit UI Components --------------------------- #

# st.title("Research Paper Recommendation System with RAG-based Explanations")

# # Input abstract from the user
# user_abstract = st.text_area("Enter the abstract of your paper:", height=200)

# # Number of recommendations slider
# k = st.slider("Select the number of recommendations (k):", min_value=1, max_value=20, value=5)

# # Metadata filters
# st.sidebar.header("Filter Recommendations by Metadata")
# filter_category = st.sidebar.text_input("Filter by Category (optional):")
# filter_author = st.sidebar.text_input("Filter by Author (optional):")

# # --------------------------- Helper Functions --------------------------- #

# def generate_explanation(user_abstract, recommended_title, recommended_authors, recommended_abstract, max_input_length=512, max_output_length=200):
#     # Create an enhanced prompt with few-shot examples and clearer instructions
#     prompt = (
#         "You are an AI assistant tasked with explaining why a recommended research paper is relevant to a user's research.\n\n"

#         f"User's Research Abstract:\n{user_abstract}\n\n"
#         f"Recommended Paper:\n"
#         f"Title: {recommended_title}\n"
#         f"Authors: {recommended_authors}\n"
#         f"Abstract: {recommended_abstract}\n\n"
#         "Provide a detailed explanation of how the recommended paper is relevant to the user's research goals, methods, or findings."
#     )

#     # Generate explanation using the pipeline
#     try:
#         explanation = rag_pipeline(
#             prompt,
#             max_length=max_output_length,
#             min_length=50,
#             do_sample=True,
#             temperature=0.7,
#             top_p=0.9,
#             truncation=True
#         )[0]['generated_text']
#         return explanation
#     except Exception as e:
#         return f"Error during generation: {e}"

# def post_process_explanation(text):
#     # Remove any repetitive sentences or phrases
#     sentences = list(dict.fromkeys(text.split('. ')))
#     return '. '.join(sentences).strip()

# # --------------------------- Main Logic for Recommendations --------------------------- #

# # Submit button
# if st.button("Get Recommendations"):
#     if not user_abstract:
#         st.error("Please enter an abstract to proceed.")
#     else:
#         with st.spinner("Generating embedding for your abstract..."):
#             user_embedding = embedding_model.encode(user_abstract, convert_to_tensor=True).cpu().numpy()

#         # --------------------------- Perform Search in LanceDB --------------------------- #

#         # Perform similarity search
#         query = table.search(user_embedding).metric("cosine").limit(k)

#         # Apply metadata filters if provided
#         if filter_category:
#             query = query.where(f"categories == '{filter_category}'")
#         if filter_author:
#             query = query.where(f"authors LIKE '%{filter_author}%'")

#         # Get recommendations
#         recommendations = query.to_pandas()

#         if recommendations.empty:
#             st.warning("No recommendations found based on the current filters.")
#         else:
#             st.success(f"Top {len(recommendations)} Recommendations:")

#             # --------------------------- Display Recommendations with Explanations --------------------------- #

#             # Display recommendations with explanations
#             for idx, row in recommendations.iterrows():
#                 st.write(f"### {idx + 1}. {row['title']}")
#                 st.write(f"**Category:** {row['categories']}")
#                 st.write(f"**Authors:** {row['authors']}")
#                 st.write(f"**Abstract:** {row['abstract']}")
#                 st.write(f"**Last Updated:** {row['update_date']}")
#                 st.write("---")

#                 # Generate and display explanation
#                 explanation = generate_explanation(user_abstract, row['title'], row['authors'], row['abstract'])
#                 explanation = post_process_explanation(explanation)
#                 st.write(f"**Explanation:** {explanation}")
#                 st.write("---")


In [20]:
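# %%writefile app.py
# # NOTE: Superseded draft of app.py that queries all three embedding models for every
# # request; kept commented out for reference only. The active app is the uncommented
# # `%%writefile app.py` cell later in this notebook.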
# import streamlit as st
# import pandas as pd
# import numpy as np
# import torch
# from sentence_transformers import SentenceTransformer
# import lancedb
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
# import time

# # --------------------------- Load the LanceDB Table and Models --------------------------- #

# # Connect to LanceDB
# DB_PATH = "lancedb_directory"
# TABLE_NAME_1 = "enhanced_papers_pretrained_1"
# TABLE_NAME_2 = "enhanced_papers_pretrained_2"
# TABLE_NAME_3 = "enhanced_papers_finetuned"

# db = lancedb.connect(DB_PATH)
# table1 = db.open_table(TABLE_NAME_1)
# table2 = db.open_table(TABLE_NAME_2)
# table3 = db.open_table(TABLE_NAME_3)

# # Load the SentenceTransformer models
# embedding_model_1 = SentenceTransformer('all-MiniLM-L6-v2')
# embedding_model_2 = SentenceTransformer('allenai-specter')
# embedding_model_3 = SentenceTransformer('./finetuned_all_minilm_l6_v2')

# # Load the tokenizer and summarization model for RAG-based explanations
# MODEL_NAME = "google/flan-t5-large"
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# rag_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
# rag_pipeline = pipeline("text2text-generation", model=rag_model, tokenizer=tokenizer, device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))

# # --------------------------- Streamlit UI Components --------------------------- #

# st.title("Research Paper Recommendation System with RAG-based Explanations")

# # Input abstract from the user
# user_abstract = st.text_area("Enter the abstract of your paper:", height=200)

# # Number of recommendations slider
# k = st.slider("Select the number of recommendations (k):", min_value=1, max_value=20, value=5)

# # Fetch unique metadata values for filters
# def get_unique_values(table, column):
#     df = table.to_pandas()
#     return sorted(df[column].dropna().unique())

# categories1 = get_unique_values(table1, 'categories')
# categories2 = get_unique_values(table2, 'categories')
# categories3 = get_unique_values(table3, 'categories')
# categories = sorted(set(categories1 + categories2 + categories3))

# authors1 = get_unique_values(table1, 'authors')
# authors2 = get_unique_values(table2, 'authors')
# authors3 = get_unique_values(table3, 'authors')
# authors = sorted(set(authors1 + authors2 + authors3))

# # Metadata filters
# st.sidebar.header("Filter Recommendations by Metadata")
# filter_category = st.sidebar.selectbox("Filter by Category (optional):", [""] + categories)
# filter_author = st.sidebar.selectbox("Filter by Author (optional):", [""] + authors)

# # --------------------------- Helper Functions --------------------------- #

# def generate_explanation(user_abstract, recommended_title, recommended_authors, recommended_abstract, max_input_length=512, max_output_length=200):
#     prompt = (
#         "You are an AI assistant tasked with explaining why a recommended research paper is relevant to a user's research.\n\n"
#         f"User's Research Abstract:\n{user_abstract}\n\n"
#         f"Recommended Paper:\n"
#         f"Title: {recommended_title}\n"
#         f"Authors: {recommended_authors}\n"
#         f"Abstract: {recommended_abstract}\n\n"
#         "Provide a detailed explanation of how the recommended paper is relevant to the user's research goals, methods, or findings."
#     )
#     try:
#         explanation = rag_pipeline(
#             prompt,
#             max_length=max_output_length,
#             min_length=50,
#             do_sample=True,
#             temperature=0.7,
#             top_p=0.9,
#             truncation=True
#         )[0]['generated_text']
#         return explanation
#     except Exception as e:
#         return f"Error during generation: {e}"

# def post_process_explanation(text):
#     sentences = list(dict.fromkeys(text.split('. ')))
#     return '. '.join(sentences).strip()

# def get_recommendations(table, embedding_model, model_name):
#     with st.spinner(f"Generating embedding for your abstract using {model_name}..."):
#         user_embedding = embedding_model.encode(user_abstract, convert_to_tensor=True).cpu().numpy()

#     # Perform similarity search
#     query = table.search(user_embedding).metric("cosine").limit(k)

#     if filter_category:
#         query = query.where(f"categories == '{filter_category}'")
#     if filter_author:
#         query = query.where(f"authors LIKE '%{filter_author}%'")

#     return query.to_pandas()

# # --------------------------- Main Logic for Recommendations --------------------------- #

# if st.button("Get Recommendations"):
#     if not user_abstract:
#         st.error("Please enter an abstract to proceed.")
#     else:
#         models_and_tables = [
#             (embedding_model_1, table1, "all-MiniLM-L6-v2"),
#             (embedding_model_2, table2, "allenai-specter"),
#             (embedding_model_3, table3, "finetuned_all_minilm_l6_v2")
#         ]

#         for embedding_model, table, model_name in models_and_tables:
#             st.header(f"Recommendations using {model_name}")
#             recommendations = get_recommendations(table, embedding_model, model_name)

#             if recommendations.empty:
#                 st.warning(f"No recommendations found for {model_name} based on the current filters.")
#             else:
#                 st.success(f"Top {len(recommendations)} Recommendations from {model_name}:")

#                 for idx, row in recommendations.iterrows():
#                     st.write(f"### {idx + 1}. {row['title']}")
#                     st.write(f"**Category:** {row['categories']}")
#                     st.write(f"**Authors:** {row['authors']}")
#                     st.write(f"**Abstract:** {row['abstract']}")
#                     st.write(f"**Last Updated:** {row['update_date']}")
#                     st.write("---")

#                     explanation = generate_explanation(user_abstract, row['title'], row['authors'], row['abstract'])
#                     explanation = post_process_explanation(explanation)
#                     st.write(f"**Explanation:** {explanation}")
#                     st.write("---")


Writing app.py


In [1]:
%%writefile app.py
import streamlit as st
import pandas as pd
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
import lancedb
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import time
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used by the text cleaning step
# (stopword list, WordNet, and the Open Multilingual WordNet data).
for _resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_resource)

# --------------------------- Load the LanceDB Table and Models --------------------------- #

# Connect to LanceDB
DB_PATH = "/content/drive/MyDrive/lancedb_directory_main"
TABLE_NAME_1 = "enhanced_papers_pretrained_1"
TABLE_NAME_2 = "enhanced_papers_pretrained_2"
TABLE_NAME_3 = "enhanced_papers_finetuned"

db = lancedb.connect(DB_PATH)
table1 = db.open_table(TABLE_NAME_1)
table2 = db.open_table(TABLE_NAME_2)
table3 = db.open_table(TABLE_NAME_3)


# Streamlit re-executes this whole script on every widget interaction. The two
# loaders below are wrapped in st.cache_resource so the heavy models are built
# once and reused across reruns instead of being reloaded each time.
@st.cache_resource
def _load_embedding_models():
    """Return the SentenceTransformer encoders keyed by their display name."""
    return {
        "all-MiniLM-L6-v2": SentenceTransformer('all-MiniLM-L6-v2'),
        "allenai-specter": SentenceTransformer('allenai-specter'),
        "finetuned_all_minilm_l6_v2": SentenceTransformer('./finetuned_all_minilm_l6_v2'),
    }


@st.cache_resource
def _load_rag_components(model_name):
    """Load the tokenizer, seq2seq model and generation pipeline for `model_name`.

    Returns:
        tuple: ``(tokenizer, model, pipeline)``. The text2text-generation
        pipeline is placed on CUDA when available, otherwise on the CPU.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_name)
    loaded_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    generator = pipeline(
        "text2text-generation",
        model=loaded_model,
        tokenizer=loaded_tokenizer,
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    )
    return loaded_tokenizer, loaded_model, generator


# Load the SentenceTransformer models
embedding_models = _load_embedding_models()

# Each embedding model searches the table that was built with it
model_tables = {
    "all-MiniLM-L6-v2": table1,
    "allenai-specter": table2,
    "finetuned_all_minilm_l6_v2": table3
}

# Load the tokenizer and summarization model for RAG-based explanations
MODEL_NAME = "google/flan-t5-large"
tokenizer, rag_model, rag_pipeline = _load_rag_components(MODEL_NAME)

# --------------------------- Streamlit UI Components --------------------------- #

# Page heading
st.title("Research Paper Recommendation System with RAG-based Explanations")

# Resources for text cleaning: a WordNet lemmatizer and the English stopword set
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

# Function to clean text
def clean_text(text):
    """Normalize free text into space-separated, lemmatized keywords.

    Null input yields an empty string. Otherwise the text is lowercased, every
    character other than a-z, 0-9 and whitespace is dropped, whitespace runs are
    collapsed to single spaces, and each remaining word that is not in the
    module-level ``stop_words`` is passed through ``lemmatizer.lemmatize``.

    Args:
        text: The string to clean (or a null value).

    Returns:
        str: The cleaned words joined by single spaces.
    """
    if pd.isnull(text):
        return ""

    lowered = text.lower()
    # Keep only lowercase letters, digits and whitespace
    alnum_only = re.sub(r'[^a-z0-9\s]', '', lowered)
    # Collapse newlines / repeated blanks into single spaces
    normalized = re.sub(r'\s+', ' ', alnum_only).strip()

    lemmas = []
    for word in normalized.split():
        if word in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(word))
    return ' '.join(lemmas)

# Input abstract from the user, preprocessed right away with clean_text.
# NOTE(review): from here on `user_abstract` holds the cleaned text, so that is
# also what the main logic passes to generate_explanation -- confirm intended.
user_abstract = clean_text(
    st.text_area("Enter the abstract of your paper:", height=200)
)

# Number of recommendations slider
k = st.slider(
    "Select the number of recommendations (k):",
    min_value=1,
    max_value=20,
    value=5,
)

# Model selection dropdown (one entry per loaded embedding model)
selected_model_name = st.sidebar.selectbox(
    "Select the embedding model:",
    list(embedding_models),
)

# Fetch unique metadata values for filters
def get_unique_values(table, column):
    """Return the sorted distinct non-null values of one table column.

    Args:
        table: Object exposing ``to_pandas()``; the whole table is converted
            to a DataFrame on every call.
        column: Name of the column to inspect.

    Returns:
        list: Distinct non-null values of ``column`` in ascending order.
    """
    frame = table.to_pandas()
    present = frame[column].dropna()
    distinct = list(present.unique())
    distinct.sort()
    return distinct

# Table searched by the selected model; its values populate the filter dropdowns
table = model_tables[selected_model_name]
categories, authors = (
    get_unique_values(table, column_name)
    for column_name in ('categories', 'authors')
)

# Metadata filters (the leading empty entry means "no filter")
st.sidebar.header("Filter Recommendations by Metadata")
filter_category = st.sidebar.selectbox("Filter by Category (optional):", ["", *categories])
filter_author = st.sidebar.selectbox("Filter by Author (optional):", ["", *authors])

# --------------------------- Helper Functions --------------------------- #

def generate_explanation(user_abstract, recommended_title, recommended_authors, recommended_abstract, max_input_length=512, max_output_length=200):
    """Ask the text2text pipeline why a recommended paper is relevant.

    Args:
        user_abstract: Text supplied by the user.
        recommended_title: Title of the recommended paper.
        recommended_authors: Authors of the recommended paper.
        recommended_abstract: Abstract of the recommended paper.
        max_input_length: Accepted for backward compatibility; not used here.
        max_output_length: Passed to the pipeline as ``max_length``.

    Returns:
        str: The generated text, or ``"Error during generation: ..."`` when the
        pipeline raises (best effort, so one failure does not stop the page).
    """
    # The instruction goes first. The pipeline is called with truncation=True,
    # which cuts over-long prompts at the end; with the instruction last, a long
    # pair of abstracts would silently drop the instruction itself.
    prompt = (
        "Explain briefly, how the recommended paper is relevant to the user's input\n\n"
        f"User's Input:\n{user_abstract}\n\n"
        f"Recommended Paper:\n"
        f"Title: {recommended_title}\n"
        f"Authors: {recommended_authors}\n"
        f"Abstract: {recommended_abstract}"
    )
    try:
        explanation = rag_pipeline(
            prompt,
            max_length=max_output_length,
            min_length=50,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            truncation=True
        )[0]['generated_text']
        return explanation
    except Exception as e:
        return f"Error during generation: {e}"

def post_process_explanation(text):
    """Drop repeated sentences from generated text, keeping first occurrences.

    The text is split on '. '. Two pieces count as the same sentence when they
    match after trimming surrounding whitespace and trailing periods, so a
    repeat at the very end of the text (which still carries its final '.') is
    recognised as a duplicate.

    Args:
        text: The generated explanation.

    Returns:
        str: The remaining sentences joined with '. ' and stripped. A final
        period is kept when the input ended with one.
    """
    first_seen = {}
    for sentence in text.split('. '):
        # Comparison key ignores outer whitespace and a trailing '.'
        key = sentence.strip().rstrip('.').rstrip()
        first_seen.setdefault(key, sentence)

    result = '. '.join(first_seen.values()).strip()
    # Removing a duplicated last sentence also removes the closing period; restore it
    if result and text.rstrip().endswith('.') and not result.endswith('.'):
        result += '.'
    return result

def get_recommendations(table, embedding_model, model_name):
    """Run a cosine similarity search for the user's abstract.

    Reads the module-level ``user_abstract``, ``k``, ``filter_category`` and
    ``filter_author`` set by the Streamlit widgets.

    Args:
        table: LanceDB table to search.
        embedding_model: Encoder used to embed ``user_abstract``.
        model_name: Display name shown in the spinner message.

    Returns:
        The query result converted with ``to_pandas()``.
    """
    with st.spinner(f"Generating embedding for your abstract using {model_name}..."):
        user_embedding = embedding_model.encode(user_abstract, convert_to_tensor=True).cpu().numpy()

    # Perform similarity search
    query = table.search(user_embedding).metric("cosine").limit(k)

    def _sql_text(value):
        # Double single quotes so values such as "O'Neil" stay inside the SQL literal
        return str(value).replace("'", "''")

    # Build ONE filter expression. The query builder keeps a single filter
    # string, so chaining two where() calls would leave only the last one in
    # effect and drop the category filter whenever an author is also selected.
    # NOTE(review): LIKE wildcards (% and _) inside the author text are not escaped.
    conditions = []
    if filter_category:
        conditions.append(f"categories = '{_sql_text(filter_category)}'")
    if filter_author:
        conditions.append(f"authors LIKE '%{_sql_text(filter_author)}%'")
    if conditions:
        query = query.where(" AND ".join(conditions))

    return query.to_pandas()

# --------------------------- Main Logic for Recommendations --------------------------- #

# Runs only on the rerun triggered by pressing the button
if st.button("Get Recommendations"):
    if user_abstract:
        # Resolve the encoder and the table that belong to the chosen model
        embedding_model = embedding_models[selected_model_name]
        table = model_tables[selected_model_name]

        st.header(f"Recommendations using {selected_model_name}")
        recommendations = get_recommendations(table, embedding_model, selected_model_name)

        if not recommendations.empty:
            st.success(f"Top {len(recommendations)} Recommendations from {selected_model_name}:")

            for idx, row in recommendations.iterrows():
                # Paper card: numbered title followed by its metadata fields
                st.write(f"### {idx + 1}. {row['title']}")
                for label, column_name in (
                    ("Category", "categories"),
                    ("Authors", "authors"),
                    ("Abstract", "abstract"),
                    ("Last Updated", "update_date"),
                ):
                    st.write(f"**{label}:** {row[column_name]}")
                st.write("---")

                # Generated relevance explanation with repeated sentences removed
                explanation = post_process_explanation(
                    generate_explanation(user_abstract, row['title'], row['authors'], row['abstract'])
                )
                st.write(f"**Explanation:** {explanation}")
                st.write("---")
        else:
            st.warning(f"No recommendations found for {selected_model_name} based on the current filters.")
    else:
        st.error("Please enter an abstract to proceed.")


Overwriting app.py


In [None]:
# Start the Streamlit app in the background with its output sent to
# /content/logs.txt, then open a localtunnel to port 8501; the public URL is
# printed by localtunnel.
!streamlit run app.py &>/content/logs.txt & npx localtunnel --port 8501

[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0Kyour url is: https://forty-bushes-roll.loca.lt
