In [1]:
import os
import torch
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import numpy as np
from sqlalchemy import create_engine, text
import pandas as pd
import tarfile
import shutil
import re
import unicodedata
from tqdm import tqdm 

In [2]:
# Build the PostgreSQL engine without committing credentials to the notebook.
# ARXIV_DB_USER and ARXIV_DB_PASSWORD are required; host, port and database
# name fall back to the values this notebook has always used.
from urllib.parse import quote_plus

_db_user = os.environ.get("ARXIV_DB_USER")
_db_password = os.environ.get("ARXIV_DB_PASSWORD")
if not _db_user or not _db_password:
    raise RuntimeError(
        "Database credentials are not configured: set the ARXIV_DB_USER and "
        "ARXIV_DB_PASSWORD environment variables before running this cell."
    )
_db_host = os.environ.get("ARXIV_DB_HOST", "meta_data_postgres")
_db_port = os.environ.get("ARXIV_DB_PORT", "5432")
_db_name = os.environ.get("ARXIV_DB_NAME", "cleaned_meta_data_db")

engine = create_engine(
    # quote_plus keeps characters such as '@', ':' or '/' inside the user
    # name or password from being parsed as URL structure.
    f"postgresql+psycopg2://{quote_plus(_db_user)}:{quote_plus(_db_password)}"
    f"@{_db_host}:{_db_port}/{_db_name}",
    pool_size=10,
    max_overflow=0,
    pool_timeout=30,
)

In [9]:
# Sanity check: pull five rows from arxiv_training and show them as plain text.
query_preview = "SELECT * FROM arxiv_training LIMIT 5;"
preview = pd.read_sql(sql=query_preview, con=engine)
# One print call with a newline separator emits the header line followed by
# the frame's text rendering.
print(" Preview of data:", preview, sep="\n")

 Preview of data:
      paper_id  chunk_id     txt_filename  \
0  0801.4459v4         1  0801.4459v4.txt   
1  0801.1351v2         1  0801.1351v2.txt   
2  0801.1351v2         2  0801.1351v2.txt   
3  0801.3014v1         1  0801.3014v1.txt   
4  0801.0677v1         5  0801.0677v1.txt   

                                               query  \
0  What is the name of the genus 2 hyperelliptic ...   
1  What is the neotropy in the magnetic and elect...   
2                                               None   
3            What is the name of the f ur Physica D?   
4                                               None   

                                          chunk_data  
0  arXiv 0801.4459v4 math.NT 16 Mar 2010 INTEGRAL...  
1  arXiv 0801.1351v2 cond mat.str el 11 Jan 2008 ...  
2  ing a Laue Camera and were polished into recta...  
3  Physica D, in press revised version Genesis of...  
4  RJ . In we give, in pairwise normal form, solu...  


In [4]:
# Use the %pip magic rather than a `!pip` shell call so the package is
# installed into the environment of the running kernel.
%pip install sentence-transformers




In [5]:

# Install the remaining dependencies in a single %pip call: the magic targets
# the running kernel's environment, and one invocation lets pip resolve the
# whole set together instead of five times independently.
# NOTE(review): torch, pandas and sqlalchemy are already imported by the first
# cell, so on a fresh environment this cell has to run before that one.
# NOTE(review): no versions are pinned here - consider pinning for reproducibility.
%pip install transformers accelerate torch pandas sqlalchemy psycopg2-binary




In [6]:
# Fine-tune a DistilBERT sentence encoder on (query, chunk_data) pairs read
# from the arxiv_training table, then save it under a timestamped directory.
from sentence_transformers import SentenceTransformer, InputExample, losses, models, LoggingHandler
from torch.utils.data import DataLoader
from sqlalchemy import create_engine
import pandas as pd
import logging, torch, os
from datetime import datetime
from datasets import Dataset

# Disable wandb + tokenizer warning
os.environ["WANDB_DISABLED"] = "true"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

logging.basicConfig(format="%(asctime)s - %(message)s",
                    datefmt="%Y-%m-%d %H:%M:%S",
                    level=logging.INFO,
                    handlers=[LoggingHandler()])

# The database connection is the `engine` object created in the connection
# cell near the top of the notebook. It is deliberately not re-created here,
# so the connection string (and its credentials) lives in exactly one place.

query = """
SELECT query, chunk_data FROM arxiv_training
WHERE query IS NOT NULL AND LENGTH(TRIM(query)) > 0
LIMIT 200
"""
# NOTE(review): LIMIT without ORDER BY leaves the choice of the 200 rows to
# the database, so the training sample may differ between runs.
# The SQL only filters `query`; drop rows whose chunk text is NULL as well so
# every training pair contains two usable strings.
df = pd.read_sql(query, engine).dropna(subset=["query", "chunk_data"])
if df.empty:
    # Fail here with a clear message rather than deep inside the DataLoader
    # sampler, which rejects an empty dataset when shuffle=True.
    raise ValueError("No usable (query, chunk_data) rows were returned from arxiv_training.")

# One InputExample per row, holding the query text and its chunk text.
train_examples = [
    InputExample(texts=[question, passage])
    for question, passage in zip(df["query"], df["chunk_data"])
]

# Encoder: DistilBERT token embeddings followed by mean pooling over tokens.
model_name = 'distilbert-base-uncased'
word_embedding_model = models.Transformer(model_name, max_seq_length=300)
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=8)
train_loss = losses.MultipleNegativesRankingLoss(model)

# The timestamp is taken before training starts, so the directory name
# reflects when the run began.
model_save_path = f'output/bert-reranker-{datetime.now().strftime("%Y%m%d-%H%M%S")}'
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=2,
    warmup_steps=10,
    show_progress_bar=True,
    # Request mixed precision only when training on CUDA; the recorded run of
    # this notebook executed on CPU.
    use_amp=(device == "cuda"),
    optimizer_params={'lr': 2e-5}
)

model.save(model_save_path)
print(f" Model saved to: {model_save_path}")


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


2025-04-29 21:22:04 - Use pytorch device_name: cpu


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss


2025-04-29 21:25:22 - Save model to output/bert-reranker-20250429-212204
 Model savde to: output/bert-reranker-20250429-212204


In [8]:
# Load the fine-tuned encoder and score one example query/chunk pair by the
# cosine similarity of their embeddings.
from sentence_transformers import SentenceTransformer, util
import torch

# Prefer the directory written by the training cell in this kernel session
# (`model_save_path`); its name is timestamped, so a fixed literal goes stale
# as soon as training is re-run. The literal is kept only as a fallback for
# sessions where the training cell has not been executed.
model_path = globals().get("model_save_path", "output/bert-reranker-20250429-212204")
model = SentenceTransformer(model_path)
query = "What is the name of the astrophysics fluid?"
chunk = "arXiv 0801.3931v1 astro ph 25 Jan 2008 Dynamic equations of the astrophysical fluid..."

# Encode each text on its own and keep the results as tensors.
query_embedding = model.encode(query, convert_to_tensor=True)
chunk_embedding = model.encode(chunk, convert_to_tensor=True)

# .item() extracts the single similarity value as a Python float.
cosine_score = util.pytorch_cos_sim(query_embedding, chunk_embedding).item()
print(f"Cosine Similarity: {cosine_score:.4f}")


2025-04-29 21:27:22 - Use pytorch device_name: cpu
2025-04-29 21:27:22 - Load pretrained SentenceTransformer: output/bert-reranker-20250429-212204


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine Similarity: 0.7351
