In [1]:
import os
import torch
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import numpy as np
from sqlalchemy import create_engine, text
import pandas as pd
import tarfile
import shutil
import re
import unicodedata
from tqdm import tqdm 

In [2]:
# Connection URL for the cleaned-metadata Postgres database.
# SECURITY: the fallback below embeds a username/password in the notebook.
# Set META_DB_URL in the environment to override it, and remove the literal
# (and rotate the password) before sharing or committing this notebook.
engine = create_engine(
    os.environ.get(
        "META_DB_URL",
        'postgresql+psycopg2://rg5073:rg5073pass@meta_data_postgres:5432/cleaned_meta_data_db',
    ),
    pool_size=10,     # connections kept open in the pool
    max_overflow=0,   # never open connections beyond pool_size
    pool_timeout=30,  # seconds to wait for a free connection
)

In [3]:
# Fetch five rows from the training table and print them as a quick sanity check.
query_preview = "SELECT * FROM arxiv_chunks_training_initial_1 LIMIT 5;"
preview = pd.read_sql(sql=query_preview, con=engine)
print(" Preview of data:", preview, sep="\n")

 Preview of data:
      paper_id chunk_no       chunk_id     txt_filename query  \
0  0904.4879v2     None  0904.4879v2_3  0904.4879v2.txt         
1  0904.1039v1     None  0904.1039v1_1  0904.1039v1.txt         
2  0904.1039v1     None  0904.1039v1_2  0904.1039v1.txt         
3  0904.1039v1     None  0904.1039v1_3  0904.1039v1.txt         
4  0904.1039v1     None  0904.1039v1_4  0904.1039v1.txt         

                                          chunk_data  
0  sulphonate or carboxylate surface groups detai...  
1  1 Kinetics and thermodynamics of carbon segreg...  
2  C in the two dimensional 2D adatom gas rather ...  
3  differences imposed on the upper and lower sla...  
4  the effects of adatom adatom interactions. The...  


In [4]:
import json
import pandas as pd

# Pull every chunk whose `query` column is non-NULL so the stored queries can
# be inspected by eye.
check_query = """
    SELECT paper_id, chunk_id, query
    FROM arxiv_chunks_training_initial_1
    WHERE query IS NOT NULL
"""
df = pd.read_sql(check_query, engine)


def _parse_query_list(raw):
    """Decode one `query` cell into a list of queries.

    Returns an empty list (after printing a notice) when the value cannot be
    decoded as JSON, and wraps a bare JSON scalar in a list so the display
    loop never iterates over the characters of a string.
    """
    try:
        parsed = json.loads(raw)
    except (TypeError, ValueError) as e:
        print(f" Skipping unparseable query value: {e}")
        return []
    if parsed is None:
        return []
    return list(parsed) if isinstance(parsed, (list, tuple)) else [parsed]


# Drop blank strings. `.copy()` yields an independent frame so that adding a
# column below does not raise pandas' SettingWithCopyWarning.
df = df[df["query"].str.strip().ne("")].copy()
df["query_list"] = df["query"].apply(_parse_query_list)

# Print each chunk's identifiers followed by its numbered queries.
for _, row in df.iterrows():
    print(f"\n Paper ID: {row['paper_id']}")
    print(f" Chunk ID: {row['chunk_id']}")
    print("Queries:")
    for i, q in enumerate(row["query_list"], 1):
        print(f"  {i}. {q}")



 Paper ID: 0904.0001v1
 Chunk ID: 0904.0001v1_1
Queries:
  1. lattice eigenstates
  2. Lattice - qs
  3. illustrative

 Paper ID: 0904.0001v1
 Chunk ID: 0904.0001v1_2
Queries:
  1. we apply probability theory to the varying eigenenergies
  2. We first explore parameters with real BA quasimomenta
  3. low lying N levels

 Paper ID: 0904.0002v2
 Chunk ID: 0904.0002v2_11
Queries:
  1. NNNN
  2. LOCAL, ORIGINAL, TEXOT
  3. color

 Paper ID: 0904.0002v2
 Chunk ID: 0904.0002v2_7
Queries:
  1. clumpy ISM is disproportional to the number of pixels that contain the clumpy ISM
  2. A clumpy ISM
  3. parameter 2 is a clumpy ISM.

 Paper ID: 0904.0002v2
 Chunk ID: 0904.0002v2_1
Queries:
  1. EXPRESS
  2. ABSTRACT Models
  3. ABSTRACT

 Paper ID: 0904.0002v2
 Chunk ID: 0904.0002v2_13
Queries:
  1. 5000A
  2. IUPAC
  3. 550A , the SED is sensitive to stars which appear to turn off mass

 Paper ID: 0904.0001v1
 Chunk ID: 0904.0001v1_3
Queries:
  1. arrows denote potential changes in the position of 

In [5]:
!pip install datasets



In [6]:
from datasets import Dataset


In [7]:

!pip install -q sentence-transformers transformers accelerate torch pandas sqlalchemy psycopg2-binary

import os
import torch
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses, models, util, LoggingHandler
import pandas as pd
from sqlalchemy import create_engine
import json
import logging
from datetime import datetime
# Environment switches set before training starts.
# NOTE(review): WANDB_DISABLED is presumably meant to turn off Weights & Biases
# reporting and TOKENIZERS_PARALLELISM to silence tokenizer fork warnings — confirm.
os.environ.update({
    "WANDB_DISABLED": "true",
    "TOKENIZERS_PARALLELISM": "false",
})

# Route INFO-level log records through sentence-transformers' LoggingHandler
# with a timestamped message format.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    handlers=[LoggingHandler()],
)

# Connection URL for the cleaned-metadata Postgres database.
# SECURITY: the fallback below embeds a username/password in the notebook.
# Set META_DB_URL in the environment to override it, and remove the literal
# (and rotate the password) before sharing or committing this notebook.
engine = create_engine(
    os.environ.get(
        "META_DB_URL",
        'postgresql+psycopg2://rg5073:rg5073pass@meta_data_postgres:5432/cleaned_meta_data_db',
    ),
    pool_size=10,     # connections kept open in the pool
    max_overflow=0,   # never open connections beyond pool_size
    pool_timeout=30   # seconds to wait for a free connection
)

# Load up to 200 chunks that have a non-blank `query` value.
# NOTE(review): LIMIT without ORDER BY does not guarantee which rows come back,
# so the training subset may differ between runs — confirm whether that matters.
query = """
SELECT query, chunk_data FROM arxiv_chunks_training_initial_1
WHERE query IS NOT NULL AND LENGTH(TRIM(query)) > 0
LIMIT 200
"""
df = pd.read_sql(query, engine)

# Build one (query, chunk) positive pair per usable query string.
train_examples = []
for _, row in df.iterrows():
    chunk_text = row['chunk_data']
    # A pair needs a real passage; NULL/blank chunks would break tokenization later.
    if not isinstance(chunk_text, str) or not chunk_text.strip():
        print(" Skipping row due to error: empty chunk_data")
        continue

    raw_query = row['query']
    try:
        query_list = json.loads(raw_query) if isinstance(raw_query, str) else raw_query
    except ValueError as e:  # json.JSONDecodeError is a ValueError
        print(f" Skipping row due to error: {e}")
        continue

    # A bare JSON string must be treated as ONE query; iterating it directly
    # would yield single characters as separate training queries.
    if isinstance(query_list, str):
        query_list = [query_list]
    elif not isinstance(query_list, (list, tuple)):
        print(f" Skipping row due to error: unexpected query payload of type {type(query_list).__name__}")
        continue

    for q in query_list:
        if isinstance(q, str) and len(q.strip()) > 0:
            train_examples.append(InputExample(texts=[q.strip(), chunk_text]))

# Assemble a sentence encoder: a DistilBERT transformer (inputs capped at 300
# tokens) followed by mean pooling over its token embeddings.
model_name = 'distilbert-base-uncased'
word_embedding_model = models.Transformer(model_name, max_seq_length=300)
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Use the GPU when one is visible to torch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model = model.to(device)

# Fail early with a clear message; an empty list would otherwise surface as an
# opaque sampler error from the shuffling DataLoader.
if not train_examples:
    raise ValueError("No training examples were built; check the query results before training.")

# Shuffled batches of 8 (query, chunk) pairs, scored with
# MultipleNegativesRankingLoss.
# NOTE(review): the same chunk can appear more than once in a batch because
# each chunk contributes several queries — confirm whether that is acceptable
# for this loss.
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=8)
train_loss = losses.MultipleNegativesRankingLoss(model)

# Timestamped output directory so successive runs do not overwrite each other.
model_save_path = f'output/bert-3queries-{datetime.now().strftime("%Y%m%d-%H%M%S")}'
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=2,
    warmup_steps=10,
    show_progress_bar=True,
    # Mixed precision is requested only when CUDA is available; on a CPU-only
    # host it is switched off rather than relying on library fallbacks.
    use_amp=torch.cuda.is_available(),
    optimizer_params={'lr': 2e-5}
)
model.save(model_save_path)
print(f"\n Model saved to: {model_save_path}")


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


2025-05-02 18:30:24 - Use pytorch device_name: cpu


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss


2025-05-02 18:31:12 - Save model to output/bert-3queries-20250502-183024

 Model saved to: output/bert-3queries-20250502-183024
