In [1]:
!pip install torch torchvision tqdm numpy pandas sqlalchemy
!pip install psycopg2-binary



In [2]:
import os
import torch
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import numpy as np
from sqlalchemy import create_engine, text
import pandas as pd
import tarfile
import shutil
import re
import unicodedata
from tqdm import tqdm 

In [31]:
engine = create_engine(
    'postgresql://rg5073:rg5073pass@129.114.27.3:5432/cleaned_meta_data_db',
    pool_size=10,
    max_overflow=0,
    pool_timeout=30,
)



In [4]:
query_preview = "SELECT * FROM arxiv_chunks_training_4_phrases1 LIMIT 5;"
preview = pd.read_sql(query_preview, engine)
print(" Data:")
print(preview)

 Data:
      paper_id chunk_no       chunk_id     txt_filename  \
0  0704.0107v1     None  0704.0107v1_5  0704.0107v1.txt   
1  0704.0107v1     None  0704.0107v1_6  0704.0107v1.txt   
2  0704.0107v1     None  0704.0107v1_7  0704.0107v1.txt   
3  0704.0674v2     None  0704.0674v2_1  0704.0674v2.txt   
4  0704.0674v2     None  0704.0674v2_2  0704.0674v2.txt   

                                               query  \
0  ["What is the Parzen's estimator formula used ...   
1  ["How to adapt the model Eq. 19 to experimenta...   
2  ["What is the effect of increasing T on the mo...   
3  ["Galaxy alignment types", "Galaxy group catal...   
4  ["What are the preferential distributions of s...   

                                          chunk_data  
0  lim N 1 N N X x qi, , which we consider as the...  
1  model relative to experimentally estimated fT ...  
2  a new term with the parameters xT , , x g x xT...  
3  arXiv 0704.0674v2 astro ph 8 Jun 2007 Draft ve...  
4  neous samples. This has

In [5]:

with engine.connect() as conn:
    result = conn.execute(text("SELECT COUNT(*) FROM arxiv_chunks_training_4_phrases1;"))
    count = result.scalar()
    print(f"Num of records: {count}")


Total chunks with exactly 3 queries: 6043


In [12]:
inspect_query = """
SELECT column_name, data_type
FROM information_schema.columns
WHERE table_name = 'arxiv_chunks_training_4_phrases1'
"""
df_schema = pd.read_sql(inspect_query, engine)
print(df_schema)


     column_name data_type
0       chunk_no   integer
1       chunk_id      text
2       paper_id      text
3          query      text
4     chunk_data      text
5  query_phrases      text
6   txt_filename      text


In [11]:
from sqlalchemy import text

with engine.connect() as conn:
    conn.execute(text("ALTER TABLE arxiv_chunks_training_4_phrases1 ADD COLUMN query_phrases TEXT;"))
    conn.commit()


In [13]:
!pip install -q keybert sentence-transformers


In [14]:
from keybert import KeyBERT
import json


In [15]:
query = """
SELECT chunk_id, query
FROM arxiv_chunks_training_4_phrases1
WHERE query IS NOT NULL AND query <> ''
"""

df = pd.read_sql(query, engine)
df['query_list'] = df['query'].apply(json.loads)


In [18]:
from sqlalchemy import text
import time

batch_size = 100
rows = df.to_dict(orient="records")

with engine.begin() as conn:
    for i in range(0, len(rows), batch_size):
        batch = rows[i:i + batch_size]
        for row in batch:
            update_stmt = text("""
                UPDATE arxiv_chunks_training_4_phrases1
                SET query_phrases = :phrases
                WHERE chunk_id = :chunk_id
            """)
            conn.execute(update_stmt, {
                'phrases': json.dumps(row['query_phrases']),
                'chunk_id': row['chunk_id']
            })
        print(f"Updated batch {i} to {i + len(batch)}")
        time.sleep(0.1)  


Updated batch 0 to 100
Updated batch 100 to 200
Updated batch 200 to 300
Updated batch 300 to 400
Updated batch 400 to 500
Updated batch 500 to 600
Updated batch 600 to 700
Updated batch 700 to 800
Updated batch 800 to 900
Updated batch 900 to 1000
Updated batch 1000 to 1100
Updated batch 1100 to 1200
Updated batch 1200 to 1300
Updated batch 1300 to 1400
Updated batch 1400 to 1500
Updated batch 1500 to 1600
Updated batch 1600 to 1700
Updated batch 1700 to 1800
Updated batch 1800 to 1900
Updated batch 1900 to 2000
Updated batch 2000 to 2100
Updated batch 2100 to 2200
Updated batch 2200 to 2300
Updated batch 2300 to 2400
Updated batch 2400 to 2500
Updated batch 2500 to 2600
Updated batch 2600 to 2700
Updated batch 2700 to 2800
Updated batch 2800 to 2900
Updated batch 2900 to 3000
Updated batch 3000 to 3100
Updated batch 3100 to 3200
Updated batch 3200 to 3300
Updated batch 3300 to 3400
Updated batch 3400 to 3500
Updated batch 3500 to 3600
Updated batch 3600 to 3700
Updated batch 3700 to 

In [33]:
query_preview = "SELECT * FROM arxiv_chunks_training_4_phrases1 LIMIT 5;"
preview = pd.read_sql(query_preview, engine)
print(" Data:")
print(preview)

 Data:
      paper_id chunk_no        chunk_id     txt_filename  \
0  0704.0107v1     None   0704.0107v1_5  0704.0107v1.txt   
1  0704.0107v1     None   0704.0107v1_6  0704.0107v1.txt   
2  0704.0076v2     None  0704.0076v2_12  0704.0076v2.txt   
3  0704.0107v1     None   0704.0107v1_7  0704.0107v1.txt   
4  0704.0674v2     None   0704.0674v2_1  0704.0674v2.txt   

                                               query  \
0  ["What is the Parzen's estimator formula used ...   
1  ["How to adapt the model Eq. 19 to experimenta...   
2  ["CP asymmetry", "Amplitude C and T", "SU rela...   
3  ["What is the effect of increasing T on the mo...   
4  ["Galaxy alignment types", "Galaxy group catal...   

                                          chunk_data  \
0  lim N 1 N N X x qi, , which we consider as the...   
1  model relative to experimentally estimated fT ...   
2  the CP asymmetry sum rule predicts ACP B0 K0 0...   
3  a new term with the parameters xT , , x g x xT...   
4  arXiv 0704.0

In [20]:
phrases_dict = dict(zip(df['chunk_id'], df['query_phrases']))


In [21]:
with open("query_phrases_by_chunk.json", "w") as f:
    json.dump(phrases_dict, f, indent=2)