In [2]:
!pip install torch torchvision tqdm numpy pandas sqlalchemy
!pip install psycopg2-binary



In [3]:
import os
import torch
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import numpy as np
from sqlalchemy import create_engine, text
import pandas as pd
import tarfile
import shutil
import re
import unicodedata
from tqdm import tqdm 

In [4]:
engine = create_engine(
    'postgresql://rg5073:rg5073pass@129.114.27.14:30002/cleaned_meta_data_db',
    pool_size=10,
    max_overflow=0,
    pool_timeout=30,
)



In [None]:
'model' in globals()


In [6]:
query_preview = "SELECT * FROM arxiv_chunks_training_4 LIMIT 5;"
preview = pd.read_sql(query_preview, engine)
print(" Data:")
print(preview)

 Data:
      paper_id chunk_no       chunk_id     txt_filename query  \
0  0704.1728v1     None  0704.1728v1_1  0704.1728v1.txt         
1  0704.1728v1     None  0704.1728v1_2  0704.1728v1.txt         
2  0704.1728v1     None  0704.1728v1_3  0704.1728v1.txt         
3  0704.1728v1     None  0704.1728v1_4  0704.1728v1.txt         
4  0704.1479v2     None  0704.1479v2_1  0704.1479v2.txt         

                                          chunk_data  
0  arXiv 0704.1728v1 gr qc 13 Apr 2007 April 2007...  
1  Z L q d . The Lagrangian L depends only on the...  
2  involved form. We want now to investigate the ...  
3  space, instead of configuration space, or perh...  
4  arXiv 0704.1479v2 cond mat.mes hall 12 Apr 200...  


In [7]:
import json
import pandas as pd
check_query = """
    SELECT paper_id, chunk_id, query
    FROM arxiv_chunks_training_3
    WHERE query IS NOT NULL
"""
df = pd.read_sql(check_query, engine)
df = df[df["query"].str.strip().ne("")]
df["query_list"] = df["query"].apply(json.loads)


for _, row in df.iterrows():
    print(f"\n Paper ID: {row['paper_id']}")
    print(f" Chunk ID: {row['chunk_id']}")
    print("Queries:")
    for i, q in enumerate(row["query_list"], 1):
        print(f"  {i}. {q}")


In [10]:
import json
import pandas as pd

check_query = """
    SELECT paper_id, chunk_id, query
    FROM arxiv_chunks_training_2
    WHERE query IS NOT NULL
"""
df = pd.read_sql(check_query, engine)
df = df[df["query"].str.strip().ne("")]
df["query_list"] = df["query"].apply(json.loads)

df["query_count"] = df["query_list"].apply(len)
three_query_chunks = df[df["query_count"] == 3]

print(f"Total chunks with exactly 3 queries: {len(three_query_chunks)}")


Total chunks with exactly 3 queries: 1889


In [11]:
df_tables = pd.read_sql("""
    SELECT table_schema, table_name 
    FROM information_schema.tables 
    WHERE table_type = 'BASE TABLE'
    ORDER BY table_schema, table_name;
""", engine)
pd.set_option('display.max_rows', None)
print(df_tables)


          table_schema                       table_name
0   information_schema                     sql_features
1   information_schema          sql_implementation_info
2   information_schema                        sql_parts
3   information_schema                       sql_sizing
4           pg_catalog                     pg_aggregate
5           pg_catalog                            pg_am
6           pg_catalog                          pg_amop
7           pg_catalog                        pg_amproc
8           pg_catalog                       pg_attrdef
9           pg_catalog                     pg_attribute
10          pg_catalog                  pg_auth_members
11          pg_catalog                        pg_authid
12          pg_catalog                          pg_cast
13          pg_catalog                         pg_class
14          pg_catalog                     pg_collation
15          pg_catalog                    pg_constraint
16          pg_catalog                    pg_con

In [12]:
with engine.connect() as conn:
    conn.execute(text("""
        DROP TABLE IF EXISTS arxiv_chunks_training_5;
        CREATE TABLE arxiv_chunks_training_5 AS 
        TABLE arxiv_chunks_training_initial;
    """))
    conn.commit()

In [13]:
df_tables = pd.read_sql("""
    SELECT table_schema, table_name 
    FROM information_schema.tables 
    WHERE table_type = 'BASE TABLE'
    ORDER BY table_schema, table_name;
""", engine)
pd.set_option('display.max_rows', None)
print(df_tables)


          table_schema                       table_name
0   information_schema                     sql_features
1   information_schema          sql_implementation_info
2   information_schema                        sql_parts
3   information_schema                       sql_sizing
4           pg_catalog                     pg_aggregate
5           pg_catalog                            pg_am
6           pg_catalog                          pg_amop
7           pg_catalog                        pg_amproc
8           pg_catalog                       pg_attrdef
9           pg_catalog                     pg_attribute
10          pg_catalog                  pg_auth_members
11          pg_catalog                        pg_authid
12          pg_catalog                          pg_cast
13          pg_catalog                         pg_class
14          pg_catalog                     pg_collation
15          pg_catalog                    pg_constraint
16          pg_catalog                    pg_con

In [9]:
!pip install transformers sqlalchemy tqdm pandas torch




In [10]:
!pip install accelerate
import accelerate



In [19]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from sqlalchemy import create_engine, text
from tqdm import tqdm
import pandas as pd
import torch
import json

model_name = "openchat/openchat-3.5-1210"

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    trust_remote_code=True
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

engine = create_engine(
    'postgresql://rg5073:rg5073pass@129.114.27.14:30002/cleaned_meta_data_db',
    pool_size=10, max_overflow=0, pool_timeout=30
)

query = """
    SELECT paper_id, chunk_id, chunk_data
    FROM arxiv_chunks_training_4
    ORDER BY chunk_id
    LIMIT 30
"""
df = pd.read_sql(query, engine)

records = []
for _, row in df.iterrows():
    chunk = row["chunk_data"]
    if len(chunk.split()) < 30:
        continue
    row["chunk_data"] = chunk
    records.append(row)

batch_size = 4

for i in tqdm(range(0, len(records), batch_size)):
    batch = records[i:i+batch_size]
    prompts = [
        f"""You are a helpful assistant. Generate 3 search queries based on the following academic chunk:
###
Chunk: {r['chunk_data']}
###
Queries:"""
        for r in batch
    ]

    inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=1024)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            do_sample=True,
            temperature=0.95,
            num_return_sequences=3,
        )

    decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    grouped_outputs = [decoded_outputs[j:j+3] for j in range(0, len(decoded_outputs), 3)]

    for record, phrases in zip(batch, grouped_outputs):
        print(f"\n Raw model outputs for chunk {record['chunk_id']} (Paper: {record['paper_id']}):")
        for idx, p in enumerate(phrases, 1):
            print(f"[Output {idx}]: {p}")

        cleaned = []
        for phrase in phrases:
            for line in phrase.split("\n"):
                line = line.strip()
                if any(c.isalpha() for c in line) and len(line) > 3:
                    cleaned.append(line)
                    break

        if len(cleaned) == 3:
            print(f" 3 clean phrases extracted for chunk {record['chunk_id']}")
        else:
            print(f" Skipped {record['chunk_id']} — only {len(cleaned)} valid phrases")

print(f"\n{len(records)} valid chunks processed — 3 phrases printed per chunk if valid.")


Loading checkpoint shards: 100%|██████████| 3/3 [00:01<00:00,  2.48it/s]
 12%|█▎        | 1/8 [00:05<00:38,  5.55s/it]


 Raw model outputs for chunk 0704.0001v2_1 (Paper: 0704.0001v2):
[Output 1]: You are a helpful assistant. Generate 3 search queries based on the following academic chunk:
###
Chunk: arXiv 0704.0001v2 hep ph 24 Jul 2007 ANL HEP PR 07 12, arXiv 0704.0001 Cal ulation of prompt diphoton pro du tion ross se tions at T ev atron and LHC energies C. Bala zs1 , E. L. Berger1 , P . Nadolsky1 , and C. P . Y uan2 1 High Ener gy Physi s Division, A r gonne National L ab or atory, A r gonne, IL 60439 2 Dep artment of Physi s and Astr onomy, Mi higan State University, East L ansing, MI 48824 Dated Ma y 3, 2007 Abstra t A fully di eren tial al ulation in p erturbativ e quan tum hromo dynami s is presen ted for the pro du tion of massiv e photon pairs at hadron olliders. All next to leading order p erturbativ e on tributions from quark an tiquark, gluon an ti quark, and gluon gluon subpro esses are in luded, as w ell as all orders resummation of initial state gluon radiation v alid at next to next to 

 25%|██▌       | 2/8 [00:10<00:32,  5.35s/it]


 Raw model outputs for chunk 0704.0001v2_13 (Paper: 0704.0001v2):
[Output 1]: You are a helpful assistant. Generate 3 search queries based on the following academic chunk:
###
Chunk: o pres riptions yield iden ti al predi tions outside of this restri ted region, notably at , where our NLO p erturbativ e expression P Q, QT, y, in the q q qg hannel is on trolled only b y quasi exp erimen tal isolation and oin ides with the orresp onding dire t ross se tion in DIPHO X. The default subtra tion pres ription predi ts a v anishing d dQT in the extreme QT 0 limit, while the smo oth one pres ription has an in tegrable singularit y in this limit, a v oided b y an expli it small QT uto in the al ulation of our Y pie e. Both pres riptions are free of the logarithmi singularit y at DIPHO X al ulation. 2. L ow Q diphoton fr agmentation Another lass of large radiativ e orre tions arises when the in v arian t mass Q is smaller than the transv erse momen tum QT . In this ase, one nal state quark or gl

 38%|███▊      | 3/8 [00:15<00:26,  5.30s/it]


 Raw model outputs for chunk 0704.0001v2_17 (Paper: 0704.0001v2):
[Output 1]: You are a helpful assistant. Generate 3 search queries based on the following academic chunk:
###
Chunk: along with the CDF data a the xed order predi tion P dashes and its asymptoti appro ximation A dots b the full resummed ross se tion solid , obtained b y mat hing the resummed W Y to the xed order predi tion P dashed, same as in a at large QT . b et w een the P and A distributions in ludes the nite regular terms not in luded in A and logarithmi terms from the nal state fragmen tation singularities, with the latter subtra ted when , as des rib ed in Se . I I C . The data learly disfa v or the xed order predi tion in the region of lo w QT . Figure 5 b features the resummed W Y on tribution solid urv e . Resummation of the initial state logarithmi terms renders W nite in the region of small QT . The sum of W and Y in ludes the resummed initial state singular on tributions plus the remaining relev an t terms 

 50%|█████     | 4/8 [00:21<00:21,  5.28s/it]


 Raw model outputs for chunk 0704.0001v2_20 (Paper: 0704.0001v2):
[Output 1]: You are a helpful assistant. Generate 3 search queries based on the following academic chunk:
###
Chunk: I I C . The one fragmen tation on tribution is enhan ed on a v erage b y 400 if Eiso T is in reased in the al ulation from 1 to 4 Ge V. The rate in the shoulder region is enhan ed further if the fa torization s ale F is redu ed. Sin e the theoreti al sp e i ations for isolation and for the fragmen tation on tribution are admittedly appro ximate, w e question whether great imp ortan e should b e pla ed on the agreemen t of theory and exp erimen t in the region of small or in the shoulder region in the QT distribution. A straigh tforw ard w a y to redu e sensitivit y to fragmen tation is to require Q, as dis ussed ab o v e. The t w o uts ha v e similar e e ts on the ev en t distributions. Figure 8 sho ws the e e ts of the distributions. The ut and the region of small altogether , while only a small p ortion

 62%|██████▎   | 5/8 [00:26<00:15,  5.27s/it]


 Raw model outputs for chunk 0704.0001v2_24 (Paper: 0704.0001v2):
[Output 1]: You are a helpful assistant. Generate 3 search queries based on the following academic chunk:
###
Chunk: w the DIPHO X rate at all Q. The largest di eren e o urs at the lo w est v alues of Q b elo w the uto , where the rates an di er b y a fa tor of 2. In this region, orresp onding to diphoton ev en ts with small and QT larger than Q, the photon fragmen tation on tributions in luded in the DIPHO X al ulation are large in omparison to the dire t rate. Finally , w e note that the in tegrated rate in DIPHO X is more stable with resp e t to v ariations in Eiso T than the di eren tial distributions in DIPHO X, b e ause Eiso T dep enden e for Eiso T . T o obtain the nal pro du tion ross se tions, after in lusion of all hannels, w e om bine the resp e tiv e q q qg results with the resummed NLO gg gqS ross se tion in our ase and with the LO gg ross se tion in the DIPHO X ase. The distributions in the in v arian t ma

 75%|███████▌  | 6/8 [00:31<00:10,  5.20s/it]


 Raw model outputs for chunk 0704.0001v2_28 (Paper: 0704.0001v2):
[Output 1]: You are a helpful assistant. Generate 3 search queries based on the following academic chunk:
###
Chunk: cos distributions are go o d dis riminators b et w een the Higgs b oson signal and ba kground in su h an analysis. IV. CONCLUSIONS The theoreti al study of on tin uum diphoton pro du tion in hadron ollisions is in teresting and v aluable for sev eral reasons there are data from the CDF and D ollab orations at F ermilab with the promise of larger ev en t samples there are new theoreti al hallenges asso iated with all orders soft gluon resummation of t w o lo op amplitudes and on tin uum diphotons are a large standard mo del ba kground ab o v e whi h one ma y observ e the pro du ts of Higgs b oson de a y in to a pair of photons at the LHC. In this pap er and Refs. 2, 3 , w e presen t our al ulation of the fully di eren tial ross se tion d dQdQTdyd as a fun tion of the mass Q, transv erse momen tum QT , and 

 88%|████████▊ | 7/8 [00:36<00:05,  5.22s/it]


 Raw model outputs for chunk 0704.0001v2_31 (Paper: 0704.0001v2):
[Output 1]: You are a helpful assistant. Generate 3 search queries based on the following academic chunk:
###
Chunk: set of s ales C1 b and C2Q an b e expressed in terms of its v alue F n,c obtained for the anoni al om bination 1. Here c0 2e E 1.123, where .5772 . . . is the Euler onstan t. The relationships b et w een F n and F n,c tak e the form A a C1 A 1,c a A1 A a C1 A 2,c a A 1,c a 0 ln c0 C1 A2 A a C1 A 3,c a 2A 2,c a 0 ln c0 C1 A 1,c a 2 1 ln c0 C1 A 1,c a 2 0 ln c0 C1 2 A3 B a C1, C2 B 1,c a A 1,c a ln c2 0C2 2 C2 1 A4 B a C1, C2 B 2,c a A 2,c a ln c2 0C2 2 C2 1 0 A 1,c a ln2 c0 C1 B 1,c a ln C2 A 1,c a ln2 C2 A5 C a a1 x, b , C1 C2 C 1,c a a1 x aa1 1 x B 1,c a 2 ln c2 0C2 2 C2 1 A 1,c a 4 ln c2 0C2 2 C2 1 2! Pa a1 x ln b c0 . A6 They dep end on the QCD b eta fun tion o e ien ts 11Nc 2Nf 6 , 17N2 c 5NcNf 3CFNf 6 for Nc olors and Nf a tiv e quark a v ors, with N2 c 1 2Nc 4 3 for . The relev an t O s splitting fu

100%|██████████| 8/8 [00:40<00:00,  5.02s/it]


 Raw model outputs for chunk 0704.0001v2_4 (Paper: 0704.0001v2):
[Output 1]: You are a helpful assistant. Generate 3 search queries based on the following academic chunk:
###
Chunk: a fragmen tation on tribution of en tirely di eren t nature arises when the pair is relativ ely ligh t and pro du ed from fragmen tation of one parton, as dis ussed in Se s. I I C 2 and I I I A 3. A full and onsisten t treatmen t of the nal state logarithms b ey ond lo w est order w ould require a join t resummation of the initial and nal state logarithmi singularities. In the w ork rep orted here, w e are guided b y our in terest in des ribing the ross se tion for isolate d photons, in whi h the fragmen tation on tributions are largely suppressed. A t ypi al isolation ondition requires the hadroni a tivit y to b e minimal e.g., omparable to the underlying ev en t in the immediate neigh b orho o d of ea h andidate photon. Candidate photons an b e reje ted b y energy dep osit nearb y in the hadroni alorimet




The above run has not been great as it is producing queries properly as it has  You are a helpful assistant and other irrevelant output

In [26]:

query = """
    SELECT paper_id, chunk_id, chunk_data
    FROM arxiv_chunks_training_4
    ORDER BY chunk_id
    LIMIT 20
"""
df = pd.read_sql(query, engine)

records = []
for _, row in df.iterrows():
    chunk = row["chunk_data"]
    if len(chunk.split()) < 30:
        continue
    row["chunk_data"] = chunk
    records.append(row)

batch_size = 4

for i in tqdm(range(0, len(records), batch_size)):
    batch = records[i:i+batch_size]
    prompts = [
        f"""You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: {r['chunk_data']}
###
Queries:
1."""
        for r in batch
    ]

    inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=1024)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.95,
            num_return_sequences=1
        )

    decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    with engine.begin() as connection:
        for record, response in zip(batch, decoded_outputs):
            print(f"\nRaw output for chunk {record['chunk_id']} (Paper: {record['paper_id']}):\n{response}")

            cleaned = []
            matches = re.findall(r"^\s*(?:\d+[\.\:\-]?)\s+(.*)", response, re.MULTILINE)
            for match in matches:
                line = match.strip()
                if any(c.isalpha() for c in line):
                    cleaned.append(line)
                if len(cleaned) == 3:
                    break

            if len(cleaned) == 3:
                connection.execute(text("""
                    UPDATE arxiv_chunks_training_4
                    SET query = :query_data
                    WHERE paper_id = :pid AND chunk_id = :cid
                """), {
                    "query_data": json.dumps(cleaned),
                    "pid": record["paper_id"],
                    "cid": record["chunk_id"]
                })
                print(f" Stored 3 queries for chunk {record['chunk_id']}")
            else:
                print(f" Skipped {record['chunk_id']} — only {len(cleaned)} valid queries")


  0%|          | 0/5 [00:00<?, ?it/s]


Raw output for chunk 0704.0001v2_1 (Paper: 0704.0001v2):
You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: arXiv 0704.0001v2 hep ph 24 Jul 2007 ANL HEP PR 07 12, arXiv 0704.0001 Cal ulation of prompt diphoton pro du tion ross se tions at T ev atron and LHC energies C. Bala zs1 , E. L. Berger1 , P . Nadolsky1 , and C. P . Y uan2 1 High Ener gy Physi s Division, A r gonne National L ab or atory, A r gonne, IL 60439 2 Dep artment of Physi s and Astr onomy, Mi higan State University, East L ansing, MI 48824 Dated Ma y 3, 2007 Abstra t A fully di eren tial al ulation in p erturbativ e quan tum hromo dynami s is presen ted for the pro du tion of massiv e photon pairs at hadron olliders. All next to leading order p erturbativ e on tributions from quark an tiquark, gluon an ti quark, and gluon gluon subpro esses are in luded, as w ell as all orders resummation of initial state gluon radiat

 20%|██        | 1/5 [00:08<00:35,  8.90s/it]

 Stored 3 queries for chunk 0704.0001v2_12

Raw output for chunk 0704.0001v2_13 (Paper: 0704.0001v2):
You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: o pres riptions yield iden ti al predi tions outside of this restri ted region, notably at , where our NLO p erturbativ e expression P Q, QT, y, in the q q qg hannel is on trolled only b y quasi exp erimen tal isolation and oin ides with the orresp onding dire t ross se tion in DIPHO X. The default subtra tion pres ription predi ts a v anishing d dQT in the extreme QT 0 limit, while the smo oth one pres ription has an in tegrable singularit y in this limit, a v oided b y an expli it small QT uto in the al ulation of our Y pie e. Both pres riptions are free of the logarithmi singularit y at DIPHO X al ulation. 2. L ow Q diphoton fr agmentation Another lass of large radiativ e orre tions arises when the in v arian t mass Q is smaller t

 40%|████      | 2/5 [00:17<00:26,  8.85s/it]

 Stored 3 queries for chunk 0704.0001v2_15

Raw output for chunk 0704.0001v2_16 (Paper: 0704.0001v2):
You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: photon pairs, sho wn in Fig. 4 as solid and dashed lines, resp e tiv ely . The nite order ross se tion is ev aluated at O s a ura y in the q q qg hannel and at O 3 s a ura y in the gg gqS hannel. These nite order al ulations are p erformed with the 15 pp X, .96 TeV Q GeV d dQ pb GeV Resummed NNLL Fixed order NLO CDF, 207 pb 1 0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 10 20 30 40 50 60 70 80 90 100 Figure 4 In v arian t mass distributions of photon pairs in p p X at .96 T e V with QCD on tributions al ulated in the soft gluon resummation formalism red solid and at NLO blue dashed . The al ulations in lude the uts used b y the CDF ollab oration whose data are sho wn 1 . phase spa e sli ing metho d des rib ed in Se . I I B . When in tegrate

 60%|██████    | 3/5 [00:27<00:18,  9.07s/it]

 Stored 3 queries for chunk 0704.0001v2_2

Raw output for chunk 0704.0001v2_20 (Paper: 0704.0001v2):
You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: I I C . The one fragmen tation on tribution is enhan ed on a v erage b y 400 if Eiso T is in reased in the al ulation from 1 to 4 Ge V. The rate in the shoulder region is enhan ed further if the fa torization s ale F is redu ed. Sin e the theoreti al sp e i ations for isolation and for the fragmen tation on tribution are admittedly appro ximate, w e question whether great imp ortan e should b e pla ed on the agreemen t of theory and exp erimen t in the region of small or in the shoulder region in the QT distribution. A straigh tforw ard w a y to redu e sensitivit y to fragmen tation is to require Q, as dis ussed ab o v e. The t w o uts ha v e similar e e ts on the ev en t distributions. Figure 8 sho ws the e e ts of the distributions.

 80%|████████  | 4/5 [00:35<00:08,  8.77s/it]

 Stored 3 queries for chunk 0704.0001v2_22

Raw output for chunk 0704.0001v2_23 (Paper: 0704.0001v2):
You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: es not v ary strongly with x 3 . 3. Final state fr agmentation and omp arison with DIPHO X The impa t of the nal state fragmen tation at the LHC an b e ev aluated if w e ompare our results with DIPHO X predi tions. The transv erse momen tum and in v arian t mass dis tributions in the q q qg hannel from the t w o approa hes are sho wn in Fig. 12 . In b oth al ulations, quasi exp erimen tal isolation remo v es dire t NLO ev en ts with ollinear nal state photons and partons when 15 Ge V, but not when QT is b elo w Eiso T . Con en trating rst on ev en ts with , w e observ e that, at , the resummed q q qg ross se tion redu es to the dire t xed order ross se tion, ev aluated in the same w a y as in the DIPHO X o de. Our resummed and the di

100%|██████████| 5/5 [00:44<00:00,  8.94s/it]

 Stored 3 queries for chunk 0704.0001v2_27





In [27]:
import json
import pandas as pd

check_query = """
    SELECT paper_id, chunk_id, query
    FROM arxiv_chunks_training_4
    WHERE query IS NOT NULL
"""
df = pd.read_sql(check_query, engine)
df = df[df["query"].str.strip().ne("")]
df["query_list"] = df["query"].apply(json.loads)

df["query_count"] = df["query_list"].apply(len)
three_query_chunks = df[df["query_count"] == 3]

print(f"Total chunks with exactly 3 queries: {len(three_query_chunks)}")


Total chunks with exactly 3 queries: 16


In [None]:

query = """
    SELECT paper_id, chunk_id, chunk_data
    FROM arxiv_chunks_training_4
    ORDER BY chunk_id
    LIMIT 20
"""
df = pd.read_sql(query, engine)

records = []
for _, row in df.iterrows():
    chunk = row["chunk_data"]
    if len(chunk.split()) < 30:
        continue
    row["chunk_data"] = chunk
    records.append(row)

batch_size = 4

for i in tqdm(range(0, len(records), batch_size)):
    batch = records[i:i+batch_size]
    prompts = [
        f"""You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: {r['chunk_data']}
###
Queries:
1."""
        for r in batch
    ]

    inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=1024)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.95,
            num_return_sequences=1
        )

    decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    with engine.begin() as connection:
        for record, response in zip(batch, decoded_outputs):
            print(f"\nRaw output for chunk {record['chunk_id']} (Paper: {record['paper_id']}):\n{response}")

            cleaned = []
            matches = re.findall(r"^\s*(?:\d+[\.\:\-]?)\s+(.*)", response, re.MULTILINE)
            for match in matches:
                line = match.strip()
                if any(c.isalpha() for c in line):
                    cleaned.append(line)
                if len(cleaned) == 3:
                    break

            if len(cleaned) == 3:
                connection.execute(text("""
                    UPDATE arxiv_chunks_training_4
                    SET query = :query_data
                    WHERE paper_id = :pid AND chunk_id = :cid
                """), {
                    "query_data": json.dumps(cleaned),
                    "pid": record["paper_id"],
                    "cid": record["chunk_id"]
                })
                print(f" Stored 3 queries for chunk {record['chunk_id']}")
            else:
                print(f" Skipped {record['chunk_id']} — only {len(cleaned)} valid queries")


In [11]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from sqlalchemy import create_engine, text
from tqdm import tqdm
import pandas as pd
import torch
import json

model_name = "openchat/openchat-3.5-1210"

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    trust_remote_code=True
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

engine = create_engine(
    'postgresql://rg5073:rg5073pass@129.114.27.14:30002/cleaned_meta_data_db',
    pool_size=10, max_overflow=0, pool_timeout=30
)


Loading checkpoint shards: 100%|██████████| 3/3 [00:01<00:00,  2.41it/s]


In [12]:

query = """
    SELECT paper_id, chunk_id, query, chunk_data
    FROM arxiv_chunks_training_4
    WHERE query IS NULL
       OR LENGTH(query::text) < 1
    ORDER BY chunk_id
    LIMIT 20
"""



df = pd.read_sql(query, engine)

records = []
for _, row in df.iterrows():
    chunk = row["chunk_data"]
    if len(chunk.split()) < 30:
        continue
    row["chunk_data"] = chunk
    records.append(row)

batch_size = 4

for i in tqdm(range(0, len(records), batch_size)):
    batch = records[i:i+batch_size]
    prompts = [
        f"""You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: {r['chunk_data']}
###
Queries:
1."""
        for r in batch
    ]

    inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=1024)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.95,
            num_return_sequences=1
        )

    decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    with engine.begin() as connection:
        for record, response in zip(batch, decoded_outputs):
            print(f"\nRaw output for chunk {record['chunk_id']} (Paper: {record['paper_id']}):\n{response}")

            cleaned = []
            matches = re.findall(r"^\s*(?:\d+[\.\:\-]?)\s+(.*)", response, re.MULTILINE)
            for match in matches:
                line = match.strip()
                if any(c.isalpha() for c in line):
                    cleaned.append(line)
                if len(cleaned) == 3:
                    break

            if len(cleaned) == 3:
                connection.execute(text("""
                    UPDATE arxiv_chunks_training_4
                    SET query = :query_data
                    WHERE paper_id = :pid AND chunk_id = :cid
                """), {
                    "query_data": json.dumps(cleaned),
                    "pid": record["paper_id"],
                    "cid": record["chunk_id"]
                })
                print(f" Stored 3 queries for chunk {record['chunk_id']}")
            else:
                print(f" Skipped {record['chunk_id']} — only {len(cleaned)} valid queries")


  0%|          | 0/5 [00:00<?, ?it/s]


Raw output for chunk 0704.0001v2_1 (Paper: 0704.0001v2):
You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: arXiv 0704.0001v2 hep ph 24 Jul 2007 ANL HEP PR 07 12, arXiv 0704.0001 Cal ulation of prompt diphoton pro du tion ross se tions at T ev atron and LHC energies C. Bala zs1 , E. L. Berger1 , P . Nadolsky1 , and C. P . Y uan2 1 High Ener gy Physi s Division, A r gonne National L ab or atory, A r gonne, IL 60439 2 Dep artment of Physi s and Astr onomy, Mi higan State University, East L ansing, MI 48824 Dated Ma y 3, 2007 Abstra t A fully di eren tial al ulation in p erturbativ e quan tum hromo dynami s is presen ted for the pro du tion of massiv e photon pairs at hadron olliders. All next to leading order p erturbativ e on tributions from quark an tiquark, gluon an ti quark, and gluon gluon subpro esses are in luded, as w ell as all orders resummation of initial state gluon radiat

 20%|██        | 1/5 [00:08<00:33,  8.39s/it]

 Stored 3 queries for chunk 0704.0001v2_23

Raw output for chunk 0704.0001v2_28 (Paper: 0704.0001v2):
You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: cos distributions are go o d dis riminators b et w een the Higgs b oson signal and ba kground in su h an analysis. IV. CONCLUSIONS The theoreti al study of on tin uum diphoton pro du tion in hadron ollisions is in teresting and v aluable for sev eral reasons there are data from the CDF and D ollab orations at F ermilab with the promise of larger ev en t samples there are new theoreti al hallenges asso iated with all orders soft gluon resummation of t w o lo op amplitudes and on tin uum diphotons are a large standard mo del ba kground ab o v e whi h one ma y observ e the pro du ts of Higgs b oson de a y in to a pair of photons at the LHC. In this pap er and Refs. 2, 3 , w e presen t our al ulation of the fully di eren tial ross se tio

 40%|████      | 2/5 [00:17<00:26,  8.88s/it]

 Stored 3 queries for chunk 0704.0001v2_30

Raw output for chunk 0704.0001v2_31 (Paper: 0704.0001v2):
You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: set of s ales C1 b and C2Q an b e expressed in terms of its v alue F n,c obtained for the anoni al om bination 1. Here c0 2e E 1.123, where .5772 . . . is the Euler onstan t. The relationships b et w een F n and F n,c tak e the form A a C1 A 1,c a A1 A a C1 A 2,c a A 1,c a 0 ln c0 C1 A2 A a C1 A 3,c a 2A 2,c a 0 ln c0 C1 A 1,c a 2 1 ln c0 C1 A 1,c a 2 0 ln c0 C1 2 A3 B a C1, C2 B 1,c a A 1,c a ln c2 0C2 2 C2 1 A4 B a C1, C2 B 2,c a A 2,c a ln c2 0C2 2 C2 1 0 A 1,c a ln2 c0 C1 B 1,c a ln C2 A 1,c a ln2 C2 A5 C a a1 x, b , C1 C2 C 1,c a a1 x aa1 1 x B 1,c a 2 ln c2 0C2 2 C2 1 A 1,c a 4 ln c2 0C2 2 C2 1 2! Pa a1 x ln b c0 . A6 They dep end on the QCD b eta fun tion o e ien ts 11Nc 2Nf 6 , 17N2 c 5NcNf 3CFNf 6 for Nc olors and Nf a tiv e

 60%|██████    | 3/5 [00:25<00:17,  8.60s/it]

 Stored 3 queries for chunk 0704.0001v2_34

Raw output for chunk 0704.0001v2_4 (Paper: 0704.0001v2):
You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: a fragmen tation on tribution of en tirely di eren t nature arises when the pair is relativ ely ligh t and pro du ed from fragmen tation of one parton, as dis ussed in Se s. I I C 2 and I I I A 3. A full and onsisten t treatmen t of the nal state logarithms b ey ond lo w est order w ould require a join t resummation of the initial and nal state logarithmi singularities. In the w ork rep orted here, w e are guided b y our in terest in des ribing the ross se tion for isolate d photons, in whi h the fragmen tation on tributions are largely suppressed. A t ypi al isolation ondition requires the hadroni a tivit y to b e minimal e.g., omparable to the underlying ev en t in the immediate neigh b orho o d of ea h andidate photon. Candidate ph

 80%|████████  | 4/5 [00:35<00:08,  8.89s/it]

 Stored 3 queries for chunk 0704.0001v2_7

Raw output for chunk 0704.0001v2_8 (Paper: 0704.0001v2):
You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: at a fa torization s ale and summed o v er in termediate parton a v ors a1 Ca a1 fa1 h x, b X a1 Z 1 x d Ca a1 x , b C1 C2 , fa1 h , . W e ompute the fun tions ha, Aa , Ba and Ca a1 up to orders s, 3 s, 2 s, and s, resp e tiv ely , orresp onding to the NNLL a ura y of resummation. The p erturbativ e o e ien ts at these orders in s are listed in App endix A. The subleading on tribution from the nonp erturbativ e region b 1 Ge V 1 is in luded in our al ulation using a revised b mo del 18 , whi h pro vides ex ellen t agreemen t with pT dep enden t data on Drell Y an pair and Z b oson pro du tion. In this mo del, the p erturbativ e form fa tor f Wpert Q, b , y, in Eq. is ev aluated as a fun tion of b b 1 b2 b2 max 1 2, with .5 Ge V 1 . The

100%|██████████| 5/5 [00:44<00:00,  8.88s/it]

 Stored 3 queries for chunk 0704.0002v2_10





In [13]:
import json
import pandas as pd

check_query = """
    SELECT paper_id, chunk_id, query
    FROM arxiv_chunks_training_4
    WHERE query IS NOT NULL
"""
df = pd.read_sql(check_query, engine)
df = df[df["query"].str.strip().ne("")]
df["query_list"] = df["query"].apply(json.loads)

df["query_count"] = df["query_list"].apply(len)
three_query_chunks = df[df["query_count"] == 3]

print(f"Total chunks with exactly 3 queries: {len(three_query_chunks)}")


Total chunks with exactly 3 queries: 32


In [14]:
import json
import pandas as pd
check_query = """
    SELECT paper_id, chunk_id, query
    FROM arxiv_chunks_training_4
    WHERE query IS NOT NULL
"""
df = pd.read_sql(check_query, engine)
df = df[df["query"].str.strip().ne("")]
df["query_list"] = df["query"].apply(json.loads)


for _, row in df.iterrows():
    print(f"\n Paper ID: {row['paper_id']}")
    print(f" Chunk ID: {row['chunk_id']}")
    print("Queries:")
    for i, q in enumerate(row["query_list"], 1):
        print(f"  {i}. {q}")



 Paper ID: 0704.0001v2
 Chunk ID: 0704.0001v2_29
Queries:
  1. What is the effect of varying Eiso T on the DIPHO X cross section?
  2. How do the isolation criteria in DIPHO X affect the photon fragmentation contributions at NLO?
  3. Does the olinear subtraction prescription for defining an isolated photon agree well with experimental data in the diphoton production?

 Paper ID: 0704.0001v2
 Chunk ID: 0704.0001v2_30
Queries:
  1. How do uts affect the fragmentation and enhancement at low and intermediate QT in the shoulder region?
  2. Can you provide a summary of perturbative coefficients in the context of diphoton data analysis?
  3. Are additional logarithmic singularities and fragmentations observed in the region QT Q?

 Paper ID: 0704.0001v2
 Chunk ID: 0704.0001v2_34
Queries:
  1. What is the relationship between Z. Bern, L. J. Dixon, D. C. Dunbar, and D. A. Kosowsky's work in the fields of particle physics and nuclear physics?
  2. Compare and contrast the contributions of Z. B

In [15]:

query = """
    SELECT paper_id, chunk_id, query, chunk_data
    FROM arxiv_chunks_training_4
    WHERE query IS NULL
       OR LENGTH(query::text) < 1
    ORDER BY chunk_id
    LIMIT 20
"""



df = pd.read_sql(query, engine)

records = []
for _, row in df.iterrows():
    chunk = row["chunk_data"]
    if len(chunk.split()) < 30:
        continue
    row["chunk_data"] = chunk
    records.append(row)

batch_size = 4

for i in tqdm(range(0, len(records), batch_size)):
    batch = records[i:i+batch_size]
    prompts = [
        f"""You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: {r['chunk_data']}
###
Queries:
1."""
        for r in batch
    ]

    inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=1024)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=40,
            do_sample=True,
            temperature=0.95,
            num_return_sequences=1
        )

    decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    with engine.begin() as connection:
        for record, response in zip(batch, decoded_outputs):
            print(f"\nRaw output for chunk {record['chunk_id']} (Paper: {record['paper_id']}):\n{response}")

            cleaned = []
            matches = re.findall(r"^\s*(?:\d+[\.\:\-]?)\s+(.*)", response, re.MULTILINE)
            for match in matches:
                line = match.strip()
                if any(c.isalpha() for c in line):
                    cleaned.append(line)
                if len(cleaned) == 3:
                    break

            if len(cleaned) == 3:
                connection.execute(text("""
                    UPDATE arxiv_chunks_training_4
                    SET query = :query_data
                    WHERE paper_id = :pid AND chunk_id = :cid
                """), {
                    "query_data": json.dumps(cleaned),
                    "pid": record["paper_id"],
                    "cid": record["chunk_id"]
                })
                print(f" Stored 3 queries for chunk {record['chunk_id']}")
            else:
                print(f" Skipped {record['chunk_id']} — only {len(cleaned)} valid queries")


 20%|██        | 1/5 [00:01<00:05,  1.38s/it]


Raw output for chunk 0704.0001v2_1 (Paper: 0704.0001v2):
You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: arXiv 0704.0001v2 hep ph 24 Jul 2007 ANL HEP PR 07 12, arXiv 0704.0001 Cal ulation of prompt diphoton pro du tion ross se tions at T ev atron and LHC energies C. Bala zs1 , E. L. Berger1 , P . Nadolsky1 , and C. P . Y uan2 1 High Ener gy Physi s Division, A r gonne National L ab or atory, A r gonne, IL 60439 2 Dep artment of Physi s and Astr onomy, Mi higan State University, East L ansing, MI 48824 Dated Ma y 3, 2007 Abstra t A fully di eren tial al ulation in p erturbativ e quan tum hromo dynami s is presen ted for the pro du tion of massiv e photon pairs at hadron olliders. All next to leading order p erturbativ e on tributions from quark an tiquark, gluon an ti quark, and gluon gluon subpro esses are in luded, as w ell as all orders resummation of initial state gluon radiat

 40%|████      | 2/5 [00:03<00:06,  2.05s/it]

 Stored 3 queries for chunk 0704.0002v2_2

Raw output for chunk 0704.0002v2_3 (Paper: 0704.0002v2):
You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: one. Table 1 contains the decomposition terminology used in this paper. The decomposition problem. We define the decomposition problem for sparse graphs as tak ing a graph as its input and producing as output, a decomposition that can be used to certify spar sity. In this paper, we will study three kinds of outputs maps and trees proper lTk decompositions and the pebble game with colors decomposition, which is defined in the next section. 2. Historical background The well known theorems of Tutte and Nash Williams relate the k,k tight graphs to the existence of decompositions into edge disjoint spanning trees. Taking a matroidal viewpoint, 4 Ileana Streinu, Louis Theran 0 1 2 3 4 5 a 0 1 2 3 4 5 b 0 1 2 3 4 5 c Fig. 2. a A graph with a 

 60%|██████    | 3/5 [00:06<00:04,  2.24s/it]

 Stored 3 queries for chunk 0704.0002v2_4

Raw output for chunk 0704.0002v2_5 (Paper: 0704.0002v2):
You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: outi v Number of edges vw colored ci for v V Table 2. Pebble game notation used in this paper. nected subgraphs of G there may be more than one of the same color . Such a monochromatic subgraph is called a map graph piece of G if it contains a cycle in G and a tree piece of G otherwise. The set of tree pieces of G is the collection of tree pieces induced by G . As with the corresponding definition for lTks, the set of tree pieces is defined relative to a specific sub graph in particular a tree piece may be part of a larger cycle that includes edges not spanned by G . The properties of pebble game decompositions are studied in Section 6, and Theorem 2 shows that each color must be 1,0 sparse. The orientation of the edges in Figure 4 a s

 80%|████████  | 4/5 [00:07<00:01,  1.89s/it]


Raw output for chunk 0704.0002v2_7 (Paper: 0704.0002v2):
You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: w. It follows that any sparse graph has a pebble game construction. Theorem 1 Sparse graphs and pebble game graphs coincide . A graph G is k,l sparse with 0 l 2k 1 if and only if G is a pebble game graph. 6. The pebble game with colors decomposition In this section we prove Theorem 2, which characterizes all pebble game decompositions. We start with the following lemmas about the structure of monochromatic connected components in H, the directed graph maintained during the pebble game. Sparsity certifying Graph Decompositions 9 Lemma 10 Monochromatic pebble game subgraphs are 1,0 sparse . Let Hi be the sub graph of H induced by edges with pebbles of color ci on them. Then Hi is 1,0 sparse, for ,...,k. Proof. By I4 Hi is a set of edges with out degree at most one for every vert

100%|██████████| 5/5 [00:09<00:00,  1.82s/it]


Raw output for chunk 0704.0003v3_2 (Paper: 0704.0003v3):
You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: is anomalously high because the tidal force is close to a resonance in the response function of ocean Brush 1983 . Kagan gave a detailed review about those tidal friction models Kagan 1997 . Those models are based on many assumptions about geological continental position and drifting and physical conditions in the past, and many parameters such as phase lag angle, multi mode approximation with time dependent frequencies of the resonance modes, etc. have to be introduced and carefully adjusted to make their predictions close to the geological evidence. However, those assumptions and parameters are still challenged, to certain extent, as concoction. The second possible scenario is that another mechanism could dominate the evolution of the Earth Moon system and the role of the ti




In [18]:

query = """
    SELECT paper_id, chunk_id, query, chunk_data
    FROM arxiv_chunks_training_4
    WHERE query IS NULL
       OR LENGTH(query::text) < 1
    ORDER BY chunk_id
    LIMIT 20
"""



df = pd.read_sql(query, engine)

records = []
for _, row in df.iterrows():
    chunk = row["chunk_data"]
    if len(chunk.split()) < 30:
        continue
    row["chunk_data"] = chunk
    records.append(row)

batch_size = 4

for i in tqdm(range(0, len(records), batch_size)):
    batch = records[i:i+batch_size]
    prompts = [
        f"""You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: {r['chunk_data']}
###
Queries:
1."""
        for r in batch
    ]

    inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=1024)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            do_sample=True,
            temperature=0.95,
            num_return_sequences=1
        )

    decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    with engine.begin() as connection:
        for record, response in zip(batch, decoded_outputs):
            print(f"\nRaw output for chunk {record['chunk_id']} (Paper: {record['paper_id']}):\n{response}")

            cleaned = []
            matches = re.findall(r"^\s*(?:\d+[\.\:\-]?)\s+(.*)", response, re.MULTILINE)
            for match in matches:
                line = match.strip()
                if any(c.isalpha() for c in line):
                    cleaned.append(line)
                if len(cleaned) == 3:
                    break

            if len(cleaned) == 3:
       
                print(f" Stored 3 queries for chunk {record['chunk_id']}")
            else:
                print(f" Skipped {record['chunk_id']} — only {len(cleaned)} valid queries")


 20%|██        | 1/5 [00:02<00:11,  2.81s/it]


Raw output for chunk 0704.0001v2_1 (Paper: 0704.0001v2):
You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: arXiv 0704.0001v2 hep ph 24 Jul 2007 ANL HEP PR 07 12, arXiv 0704.0001 Cal ulation of prompt diphoton pro du tion ross se tions at T ev atron and LHC energies C. Bala zs1 , E. L. Berger1 , P . Nadolsky1 , and C. P . Y uan2 1 High Ener gy Physi s Division, A r gonne National L ab or atory, A r gonne, IL 60439 2 Dep artment of Physi s and Astr onomy, Mi higan State University, East L ansing, MI 48824 Dated Ma y 3, 2007 Abstra t A fully di eren tial al ulation in p erturbativ e quan tum hromo dynami s is presen ted for the pro du tion of massiv e photon pairs at hadron olliders. All next to leading order p erturbativ e on tributions from quark an tiquark, gluon an ti quark, and gluon gluon subpro esses are in luded, as w ell as all orders resummation of initial state gluon radiat

 40%|████      | 2/5 [00:05<00:08,  2.96s/it]


Raw output for chunk 0704.0002v2_12 (Paper: 0704.0002v2):
You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: to x using this same procedure. Finally, slide pebbles along the path from the original endpoints v to u specified by the successor array s v , s s v , ... The correctness of Algorithm 18 comes from the fact that it is implementing the shortcut construction. Efficiency comes from the fact that instead of potentially moving the pebble back and forth, Algorithm 18 pre computes a canonical path crossing each edge of H at most three times once in the initial depth first search, and twice while converting the initial path to a canonical one. It follows that each accepted edges takes O n time, for a total of O n2 time spent processing edges in H. Although we have not discussed this explicity, for the algorithm to be efficient we need to maintain components as in . After each accept

 60%|██████    | 3/5 [00:08<00:05,  2.93s/it]


Raw output for chunk 0704.0002v2_7 (Paper: 0704.0002v2):
You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: w. It follows that any sparse graph has a pebble game construction. Theorem 1 Sparse graphs and pebble game graphs coincide . A graph G is k,l sparse with 0 l 2k 1 if and only if G is a pebble game graph. 6. The pebble game with colors decomposition In this section we prove Theorem 2, which characterizes all pebble game decompositions. We start with the following lemmas about the structure of monochromatic connected components in H, the directed graph maintained during the pebble game. Sparsity certifying Graph Decompositions 9 Lemma 10 Monochromatic pebble game subgraphs are 1,0 sparse . Let Hi be the sub graph of H induced by edges with pebbles of color ci on them. Then Hi is 1,0 sparse, for ,...,k. Proof. By I4 Hi is a set of edges with out degree at most one for every vert

 80%|████████  | 4/5 [00:11<00:02,  2.94s/it]


Raw output for chunk 0704.0003v3_2 (Paper: 0704.0003v3):
You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: is anomalously high because the tidal force is close to a resonance in the response function of ocean Brush 1983 . Kagan gave a detailed review about those tidal friction models Kagan 1997 . Those models are based on many assumptions about geological continental position and drifting and physical conditions in the past, and many parameters such as phase lag angle, multi mode approximation with time dependent frequencies of the resonance modes, etc. have to be introduced and carefully adjusted to make their predictions close to the geological evidence. However, those assumptions and parameters are still challenged, to certain extent, as concoction. The second possible scenario is that another mechanism could dominate the evolution of the Earth Moon system and the role of the ti

100%|██████████| 5/5 [00:14<00:00,  2.93s/it]


Raw output for chunk 0704.0004v1_2 (Paper: 0704.0004v1):
You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: the following recurrence relation for the number ak n 2 of acyclic automata of size n on a k letter input alphabet k 1 ak n n 1 X 1 n j 1 n j j 1 k n j ak j , n 1. A source is a vertex with no incoming edges. A finite acyclic automaton has at least one source because a path traversed backward v1 v2 v3 . . . must have distinct vertices and so cannot continue indefinitely. An automaton is single source or initially connected if it has only one source. Let Bk n denote the set of single source acyclic finite SAF automata on a k letter input alphabet with vertices 1, 2, . . ., n 1 where 1 is the source and n 1 is the sink, and set bk n Bk n . The two line representation of an automaton in Bk n is the 2 kn matrix whose columns list the edges in order. For example, ! is in B3 and the




In [19]:

query = """
    SELECT paper_id, chunk_id, query, chunk_data
    FROM arxiv_chunks_training_4
    WHERE query IS NULL
       OR LENGTH(query::text) < 1
    ORDER BY chunk_id
    LIMIT 20
"""



df = pd.read_sql(query, engine)

records = []
for _, row in df.iterrows():
    chunk = row["chunk_data"]
    if len(chunk.split()) < 30:
        continue
    row["chunk_data"] = chunk
    records.append(row)

batch_size = 4

for i in tqdm(range(0, len(records), batch_size)):
    batch = records[i:i+batch_size]
    prompts = [
        f"""You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: {r['chunk_data']}
###
Queries:
1."""
        for r in batch
    ]

    inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=1024)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.95,
            num_return_sequences=1
        )

    decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    with engine.begin() as connection:
        for record, response in zip(batch, decoded_outputs):
            print(f"\nRaw output for chunk {record['chunk_id']} (Paper: {record['paper_id']}):\n{response}")

            cleaned = []
            matches = re.findall(r"^\s*(?:\d+[\.\:\-]?)\s+(.*)", response, re.MULTILINE)
            for match in matches:
                line = match.strip()
                if any(c.isalpha() for c in line):
                    cleaned.append(line)
                if len(cleaned) == 3:
                    break

            if len(cleaned) == 3:
                connection.execute(text("""
                    UPDATE arxiv_chunks_training_4
                    SET query = :query_data
                    WHERE paper_id = :pid AND chunk_id = :cid
                """), {
                    "query_data": json.dumps(cleaned),
                    "pid": record["paper_id"],
                    "cid": record["chunk_id"]
                })
                print(f" Stored 3 queries for chunk {record['chunk_id']}")
            else:
                print(f" Skipped {record['chunk_id']} — only {len(cleaned)} valid queries")


  0%|          | 0/5 [00:00<?, ?it/s]


Raw output for chunk 0704.0001v2_1 (Paper: 0704.0001v2):
You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: arXiv 0704.0001v2 hep ph 24 Jul 2007 ANL HEP PR 07 12, arXiv 0704.0001 Cal ulation of prompt diphoton pro du tion ross se tions at T ev atron and LHC energies C. Bala zs1 , E. L. Berger1 , P . Nadolsky1 , and C. P . Y uan2 1 High Ener gy Physi s Division, A r gonne National L ab or atory, A r gonne, IL 60439 2 Dep artment of Physi s and Astr onomy, Mi higan State University, East L ansing, MI 48824 Dated Ma y 3, 2007 Abstra t A fully di eren tial al ulation in p erturbativ e quan tum hromo dynami s is presen ted for the pro du tion of massiv e photon pairs at hadron olliders. All next to leading order p erturbativ e on tributions from quark an tiquark, gluon an ti quark, and gluon gluon subpro esses are in luded, as w ell as all orders resummation of initial state gluon radiat

 20%|██        | 1/5 [00:07<00:30,  7.60s/it]

 Stored 3 queries for chunk 0704.0001v2_16

Raw output for chunk 0704.0001v2_32 (Paper: 0704.0001v2):
You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: . I I B w e in tro du e asymptoti small QT appro ximations for the q q qg and gg gqS NLO ross se tions, Aq q Q, QT, y, X , u,d, d,... i S n QT Fi, Q, y, Fi, Q, y, QT o , B1 34 and Agg Q, QT, y, 1 S g h QT Fg, Q, y, Fg, Q, y, QT i g , F g Q, y, QT . B2 The fun tions F in these equations are de ned as Fi, Q, y, fqi h1 x1, F f qi h2 x2, F 1 2 s h q s h C 1,c qi a fa h1 i x1, F Pqi a fa h1 x1, F ln F Q f qi h2 x2, F fqi h1 x1, F h C 1,c qi a fa h2 i x2, F P qi a fa h2 x2, F ln F Q B3 Fq, 1 2 s fqi h1 x1, F f qi h2 x2, F A 1,c q 1 Q2 T ln Q2 Q2 T B 1,c q 1 Q2 T 1 Q2 T Pqi a fa h1 x1, F f qi h2 x2, F fqi h1 x1, F P qi a fa h2 x2, F B4 Fg, fg h1 x1, F fg h2 x2, F 1 2 s h g s h C 1,c g a fa h1 i x1, F Pg a fa h1 x1, F ln F Q fg h2 x2, F fg h

 40%|████      | 2/5 [00:16<00:25,  8.49s/it]

 Stored 3 queries for chunk 0704.0002v2_6

Raw output for chunk 0704.0002v2_7 (Paper: 0704.0002v2):
You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: w. It follows that any sparse graph has a pebble game construction. Theorem 1 Sparse graphs and pebble game graphs coincide . A graph G is k,l sparse with 0 l 2k 1 if and only if G is a pebble game graph. 6. The pebble game with colors decomposition In this section we prove Theorem 2, which characterizes all pebble game decompositions. We start with the following lemmas about the structure of monochromatic connected components in H, the directed graph maintained during the pebble game. Sparsity certifying Graph Decompositions 9 Lemma 10 Monochromatic pebble game subgraphs are 1,0 sparse . Let Hi be the sub graph of H induced by edges with pebbles of color ci on them. Then Hi is 1,0 sparse, for ,...,k. Proof. By I4 Hi is a set of edges 

 60%|██████    | 3/5 [00:25<00:17,  8.74s/it]

 Stored 3 queries for chunk 0704.0003v3_1

Raw output for chunk 0704.0003v3_2 (Paper: 0704.0003v3):
You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: is anomalously high because the tidal force is close to a resonance in the response function of ocean Brush 1983 . Kagan gave a detailed review about those tidal friction models Kagan 1997 . Those models are based on many assumptions about geological continental position and drifting and physical conditions in the past, and many parameters such as phase lag angle, multi mode approximation with time dependent frequencies of the resonance modes, etc. have to be introduced and carefully adjusted to make their predictions close to the geological evidence. However, those assumptions and parameters are still challenged, to certain extent, as concoction. The second possible scenario is that another mechanism could dominate the evolution of th

 80%|████████  | 4/5 [00:34<00:08,  8.91s/it]

 Stored 3 queries for chunk 0704.0004v1_1

Raw output for chunk 0704.0004v1_2 (Paper: 0704.0004v1):
You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: the following recurrence relation for the number ak n 2 of acyclic automata of size n on a k letter input alphabet k 1 ak n n 1 X 1 n j 1 n j j 1 k n j ak j , n 1. A source is a vertex with no incoming edges. A finite acyclic automaton has at least one source because a path traversed backward v1 v2 v3 . . . must have distinct vertices and so cannot continue indefinitely. An automaton is single source or initially connected if it has only one source. Let Bk n denote the set of single source acyclic finite SAF automata on a k letter input alphabet with vertices 1, 2, . . ., n 1 where 1 is the source and n 1 is the sink, and set bk n Bk n . The two line representation of an automaton in Bk n is the 2 kn matrix whose columns list the edges

100%|██████████| 5/5 [00:44<00:00,  8.82s/it]

 Stored 3 queries for chunk 0704.0004v1_5





In [20]:
import json
import pandas as pd

check_query = """
    SELECT paper_id, chunk_id, query
    FROM arxiv_chunks_training_4
    WHERE query IS NOT NULL
"""
df = pd.read_sql(check_query, engine)
df = df[df["query"].str.strip().ne("")]
df["query_list"] = df["query"].apply(json.loads)

df["query_count"] = df["query_list"].apply(len)
three_query_chunks = df[df["query_count"] == 3]

print(f"Total chunks with exactly 3 queries: {len(three_query_chunks)}")


Total chunks with exactly 3 queries: 53


In [None]:
import json
import pandas as pd
check_query = """
    SELECT paper_id, chunk_id, query
    FROM arxiv_chunks_training_4
    WHERE query IS NOT NULL
"""
df = pd.read_sql(check_query, engine)
df = df[df["query"].str.strip().ne("")]
df["query_list"] = df["query"].apply(json.loads)


for _, row in df.iterrows():
    print(f"\n Paper ID: {row['paper_id']}")
    print(f" Chunk ID: {row['chunk_id']}")
    print("Queries:")
    for i, q in enumerate(row["query_list"], 1):
        print(f"  {i}. {q}")


In [None]:

query = """
    SELECT paper_id, chunk_id, query, chunk_data
    FROM arxiv_chunks_training_4
    WHERE query IS NULL
       OR LENGTH(query::text) < 1
    ORDER BY chunk_id
    LIMIT 1000
"""



df = pd.read_sql(query, engine)

records = []
for _, row in df.iterrows():
    chunk = row["chunk_data"]
    if len(chunk.split()) < 30:
        continue
    row["chunk_data"] = chunk
    records.append(row)

batch_size = 4

for i in tqdm(range(0, len(records), batch_size)):
    batch = records[i:i+batch_size]
    prompts = [
        f"""You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: {r['chunk_data']}
###
Queries:
1."""
        for r in batch
    ]

    inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=1024)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.95,
            num_return_sequences=1
        )

    decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    with engine.begin() as connection:
        for record, response in zip(batch, decoded_outputs):
            print(f"\nRaw output for chunk {record['chunk_id']} (Paper: {record['paper_id']}):\n{response}")

            cleaned = []
            matches = re.findall(r"^\s*(?:\d+[\.\:\-]?)\s+(.*)", response, re.MULTILINE)
            for match in matches:
                line = match.strip()
                if any(c.isalpha() for c in line):
                    cleaned.append(line)
                if len(cleaned) == 3:
                    break

            if len(cleaned) == 3:
                connection.execute(text("""
                    UPDATE arxiv_chunks_training_4
                    SET query = :query_data
                    WHERE paper_id = :pid AND chunk_id = :cid
                """), {
                    "query_data": json.dumps(cleaned),
                    "pid": record["paper_id"],
                    "cid": record["chunk_id"]
                })
                print(f" Stored 3 queries for chunk {record['chunk_id']}")
            else:
                print(f" Skipped {record['chunk_id']} — only {len(cleaned)} valid queries")


In [24]:
import json
import pandas as pd

check_query = """
    SELECT paper_id, chunk_id, query
    FROM arxiv_chunks_training_4
    WHERE query IS NOT NULL
           OR LENGTH(query::text) >1
"""
df = pd.read_sql(check_query, engine)
df = df[df["query"].str.strip().ne("")]
df["query_list"] = df["query"].apply(json.loads)

df["query_count"] = df["query_list"].apply(len)
three_query_chunks = df[df["query_count"] == 3]

print(f"Total chunks with exactly 3 queries: {len(three_query_chunks)}")


Total chunks with exactly 3 queries: 963


In [28]:
import json
import pandas as pd
check_query = """
    SELECT paper_id, chunk_id, query
    FROM arxiv_chunks_training_4
    WHERE query IS NOT NULL
    ORDER BY chunk_id
    LIMIT 100
"""
df = pd.read_sql(check_query, engine)
df = df[df["query"].str.strip().ne("")]
df["query_list"] = df["query"].apply(json.loads)


for _, row in df.iterrows():
    print(f"\n Paper ID: {row['paper_id']}")
    print(f" Chunk ID: {row['chunk_id']}")
    print("Queries:")
    for i, q in enumerate(row["query_list"], 1):
        print(f"  {i}. {q}")



 Paper ID: 0704.0001v2
 Chunk ID: 0704.0001v2_10
Queries:
  1. What are the effects of resummation in the prediction of physical QT distributions in production?
  2. How does the NLO cross section become unstable in the presence of initial state radiation?
  3. What estimates can be made on the effects of experimental data on the distribution of initial and final state singularities?

 Paper ID: 0704.0001v2
 Chunk ID: 0704.0001v2_11
Queries:
  1. What is the role of photon production within hadron jets in enhancing the inclusive rate?
  2. How does the isolation constraint affect the comparison of cross sections before data?
  3. What are the challenges in reconstructing the effects of isolation on fragmentation contributions?

 Paper ID: 0704.0001v2
 Chunk ID: 0704.0001v2_12
Queries:
  1. What is the role of auxiliary regulator in the approach to simulate experimental isolation?
  2. How does application of the condition to NLO cross section P Q, QT, y maintain the collinear finitene

In [29]:

query = """
    SELECT paper_id, chunk_id, query, chunk_data
    FROM arxiv_chunks_training_4
    WHERE query IS NULL
       OR LENGTH(query::text) < 1
    ORDER BY chunk_id
    LIMIT 20
"""

df = pd.read_sql(query, engine)

records = []
for _, row in df.iterrows():
    chunk = row["chunk_data"]
    if len(chunk.split()) < 30:
        continue
    row["chunk_data"] = chunk
    records.append(row)

batch_sizes = [8, 6, 4]

for batch_size in batch_sizes:
    try:
        print(f"Trying batch size {batch_size}")
        for i in tqdm(range(0, len(records), batch_size)):
            batch = records[i:i+batch_size]
            prompts = [
                f"""You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: {r['chunk_data']}
###
Queries:
1."""
                for r in batch
            ]

            inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=1024)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=256,
                    do_sample=True,
                    temperature=0.95,
                    num_return_sequences=1
                )

            decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            with engine.begin() as connection:
                for record, response in zip(batch, decoded_outputs):
                    print(f"\nRaw output for chunk {record['chunk_id']} (Paper: {record['paper_id']}):\n{response}")

                    cleaned = []
                    matches = re.findall(r"^\s*(?:\d+[\.\:\-]?)\s+(.*)", response, re.MULTILINE)
                    for match in matches:
                        line = match.strip()
                        if any(c.isalpha() for c in line):
                            cleaned.append(line)
                        if len(cleaned) == 3:
                            break

                    if len(cleaned) == 3:
                        connection.execute(text("""
                            UPDATE arxiv_chunks_training_4
                            SET query = :query_data
                            WHERE paper_id = :pid AND chunk_id = :cid
                        """), {
                            "query_data": json.dumps(cleaned),
                            "pid": record["paper_id"],
                            "cid": record["chunk_id"]
                        })
                        print(f" Stored 3 queries for chunk {record['chunk_id']}")
                    else:
                        print(f" Skipped {record['chunk_id']} — only {len(cleaned)} valid queries")
        break
    except torch.cuda.OutOfMemoryError:
        print(f"\nsize {batch_size}, tryin smaller batch.")
        gc.collect()
        torch.cuda.empty_cache()


Trying batch size 8


  0%|          | 0/3 [00:00<?, ?it/s]


Raw output for chunk 0704.0001v2_1 (Paper: 0704.0001v2):
You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: arXiv 0704.0001v2 hep ph 24 Jul 2007 ANL HEP PR 07 12, arXiv 0704.0001 Cal ulation of prompt diphoton pro du tion ross se tions at T ev atron and LHC energies C. Bala zs1 , E. L. Berger1 , P . Nadolsky1 , and C. P . Y uan2 1 High Ener gy Physi s Division, A r gonne National L ab or atory, A r gonne, IL 60439 2 Dep artment of Physi s and Astr onomy, Mi higan State University, East L ansing, MI 48824 Dated Ma y 3, 2007 Abstra t A fully di eren tial al ulation in p erturbativ e quan tum hromo dynami s is presen ted for the pro du tion of massiv e photon pairs at hadron olliders. All next to leading order p erturbativ e on tributions from quark an tiquark, gluon an ti quark, and gluon gluon subpro esses are in luded, as w ell as all orders resummation of initial state gluon radiat

 33%|███▎      | 1/3 [00:11<00:22, 11.01s/it]

 Stored 3 queries for chunk 0704.0009v1_22

Raw output for chunk 0704.0009v1_23 (Paper: 0704.0009v1):
You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: 0117561 K 202 11.9 0.6 10.5 0.5 9.98 0.49 11.7 0.6 18.4 1.7 118 18294020 0015131 3.93 0.22 7.40 0.41 13.4 0.7 24.4 1.3 113 10 126 15 119 18294121 0049020 7.47 0.37 6.04 0.29 5.67 0.28 7.37 0.35 7.54 0.71 120 18294124 0047296 0.96 0.05 0.93 0.05 0.88 0.06 0.84 0.06 1.17 0.17 56 Table 2 Continued ID Name Position Prev. Namea 3.6 m 4.5 m 5.8 m 8.0 m 24.0 m 70.0 m SSTc2dJ... mJy mJy mJy mJy mJy mJy 121 18294146 0107380 K 207 95.4 4.8 83.2 4.1 72.2 3.4 81.0 4.3 147 13 122 18294152 0110043 K 210 33.2 1.6 21.2 1.0 16.5 0.8 11.0 0.6 7.69 0.75 123 18294168 0044270 20.6 1.0 16.5 0.8 14.9 0.7 20.1 1.0 40.0 3.7 124 18294216 0120211 K 216 7.27 0.42 6.13 0.30 7.00 0.35 13.0 0.6 19.9 1.9 125 18294392 0107208 K 219 18.4 0.9 17.8 0.9 16.1 0.8 13.3 0.

 67%|██████▋   | 2/3 [00:22<00:11, 11.25s/it]

 Stored 3 queries for chunk 0704.0009v1_32

Raw output for chunk 0704.0009v1_33 (Paper: 0704.0009v1):
You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: 2. K311 18295756 0110471 67.1 3.5 44.7 2.1 28.1 1.4 16.9 0.8 2.00 0.31 K312 EC088 18295758 0113005 66.1 3.2 175 8 251 11 289 15 1990 191 P21 18295764 0110536 181 9 117 5 71.9 3.4 40.9 2.0 4.63 0.48 EC089 P22 K314 EC090 18295772 0114057 2480 189 2970 279 5100 375 5360 384 7860 2350 26000 2440 79 Table 4 Continued Sourcea Spitzer 3.6 m 4.5 m 5.8 m 8.0 m 24 m 70 m SSTc2dJ... mJy mJy mJy mJy mJy mJy P24 K319 EC093 18295780 0115318 65.0 3.2 68.4 3.2 59.3 2.8 52.6 2.6 86.2 9.9 K320 EC091 18295780 0112279 14.1 0.7 20.6 1.0 20.2 1.0 17.2 0.9 22.9 5.0 P23 K317 EC092 18295783 0112514 108 5 167 8 222 10 361 19 2860 285 K318 EC094 18295784 0112378 40.1 2.0 55.9 2.7 61.1 2.9 61.6 3.2 84.2 9.8 EC095 18295789 0112462 131 6 160 7 181 8 180 10 388 50

100%|██████████| 3/3 [00:29<00:00,  9.87s/it]

 Stored 3 queries for chunk 0704.0011v3_8





In [30]:
import json
import pandas as pd

check_query = """
    SELECT paper_id, chunk_id, query
    FROM arxiv_chunks_training_4
    WHERE query IS NOT NULL
           OR LENGTH(query::text) >1
"""
df = pd.read_sql(check_query, engine)
df = df[df["query"].str.strip().ne("")]
df["query_list"] = df["query"].apply(json.loads)

df["query_count"] = df["query_list"].apply(len)
three_query_chunks = df[df["query_count"] == 3]

print(f"Total chunks with exactly 3 queries: {len(three_query_chunks)}")


Total chunks with exactly 3 queries: 971


In [None]:

query = """
    SELECT paper_id, chunk_id, query, chunk_data
    FROM arxiv_chunks_training_4
    WHERE query IS NULL
       OR LENGTH(query::text) < 1
    ORDER BY chunk_id
    LIMIT 20
"""

df = pd.read_sql(query, engine)

records = []
for _, row in df.iterrows():
    chunk = row["chunk_data"]
    if len(chunk.split()) < 30:
        continue
    row["chunk_data"] = chunk
    records.append(row)

batch_sizes = [8, 6, 4]

for batch_size in batch_sizes:
    try:
        print(f"Trying batch size {batch_size}")
        for i in tqdm(range(0, len(records), batch_size)):
            batch = records[i:i+batch_size]
            prompts = [
                f"""You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: {r['chunk_data']}
###
Queries:
1."""
                for r in batch
            ]

            inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=1024)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=256,
                    do_sample=True,
                    temperature=0.7,
                    num_return_sequences=1
                )

            decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            with engine.begin() as connection:
                for record, response in zip(batch, decoded_outputs):
                    print(f"\nRaw output for chunk {record['chunk_id']} (Paper: {record['paper_id']}):\n{response}")

                    cleaned = []
                    matches = re.findall(r"^\s*(?:\d+[\.\:\-]?)\s+(.*)", response, re.MULTILINE)
                    for match in matches:
                        line = match.strip()
                        if any(c.isalpha() for c in line):
                            cleaned.append(line)
                        if len(cleaned) == 3:
                            break

                    if len(cleaned) == 3:
                        connection.execute(text("""
                            UPDATE arxiv_chunks_training_4
                            SET query = :query_data
                            WHERE paper_id = :pid AND chunk_id = :cid
                        """), {
                            "query_data": json.dumps(cleaned),
                            "pid": record["paper_id"],
                            "cid": record["chunk_id"]
                        })
                        print(f" Stored 3 queries for chunk {record['chunk_id']}")
                    else:
                        print(f" Skipped {record['chunk_id']} — only {len(cleaned)} valid queries")
        break
    except torch.cuda.OutOfMemoryError:
        print(f"\nsize {batch_size}, tryin smaller batch.")
        gc.collect()
        torch.cuda.empty_cache()


In [None]:

query = """
    SELECT paper_id, chunk_id, query, chunk_data
    FROM arxiv_chunks_training_4
    WHERE query IS NULL
       OR LENGTH(query::text) < 1
    ORDER BY chunk_id
    LIMIT 2000
"""

df = pd.read_sql(query, engine)

records = []
for _, row in df.iterrows():
    chunk = row["chunk_data"]
    if len(chunk.split()) < 30:
        continue
    row["chunk_data"] = chunk
    records.append(row)

batch_sizes = [8, 6, 4]

for batch_size in batch_sizes:
    try:
        print(f"Trying batch size {batch_size}")
        for i in tqdm(range(0, len(records), batch_size)):
            batch = records[i:i+batch_size]
            prompts = [
                f"""You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: {r['chunk_data']}
###
Queries:
1."""
                for r in batch
            ]

            inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=1024)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=256,
                    do_sample=True,
                    temperature=0.7,
                    num_return_sequences=1
                )

            decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            with engine.begin() as connection:
                for record, response in zip(batch, decoded_outputs):
                    print(f"\nRaw output for chunk {record['chunk_id']} (Paper: {record['paper_id']}):\n{response}")

                    cleaned = []
                    matches = re.findall(r"^\s*(?:\d+[\.\:\-]?)\s+(.*)", response, re.MULTILINE)
                    for match in matches:
                        line = match.strip()
                        if any(c.isalpha() for c in line):
                            cleaned.append(line)
                        if len(cleaned) == 3:
                            break

                    if len(cleaned) == 3:
                        connection.execute(text("""
                            UPDATE arxiv_chunks_training_4
                            SET query = :query_data
                            WHERE paper_id = :pid AND chunk_id = :cid
                        """), {
                            "query_data": json.dumps(cleaned),
                            "pid": record["paper_id"],
                            "cid": record["chunk_id"]
                        })
                        print(f" Stored 3 queries for chunk {record['chunk_id']}")
                    else:
                        print(f" Skipped {record['chunk_id']} — only {len(cleaned)} valid queries")
        break
    except torch.cuda.OutOfMemoryError:
        print(f"\nsize {batch_size}, tryin smaller batch.")
        gc.collect()
        torch.cuda.empty_cache()


In [33]:
import json
import pandas as pd

check_query = """
    SELECT paper_id, chunk_id, query
    FROM arxiv_chunks_training_4
    WHERE query IS NOT NULL
           OR LENGTH(query::text) >1
"""
df = pd.read_sql(check_query, engine)
df = df[df["query"].str.strip().ne("")]
df["query_list"] = df["query"].apply(json.loads)

df["query_count"] = df["query_list"].apply(len)
three_query_chunks = df[df["query_count"] == 3]

print(f"Total chunks with exactly 3 queries: {len(three_query_chunks)}")


Total chunks with exactly 3 queries: 2698


In [None]:

query = """
    SELECT paper_id, chunk_id, query, chunk_data
    FROM arxiv_chunks_training_4
    WHERE query IS NULL
       OR LENGTH(query::text) < 1
    ORDER BY chunk_id
    LIMIT 40
"""

df = pd.read_sql(query, engine)

records = []
for _, row in df.iterrows():
    chunk = row["chunk_data"]
    if len(chunk.split()) < 30:
        continue
    row["chunk_data"] = chunk
    records.append(row)

batch_sizes = [8,6, 4]

for batch_size in batch_sizes:
    try:
        print(f"Trying batch size {batch_size}")
        for i in tqdm(range(0, len(records), batch_size)):
            batch = records[i:i+batch_size]
            prompts = [
                f"""You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: {r['chunk_data']}
###
Queries:
1."""
                for r in batch
            ]

            inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=1024)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=100,
                    do_sample=True,
                    temperature=0.7,
                    num_return_sequences=1
                )

            decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            with engine.begin() as connection:
                for record, response in zip(batch, decoded_outputs):
                    print(f"\nRaw output for chunk {record['chunk_id']} (Paper: {record['paper_id']}):\n{response}")

                    cleaned = []
                    matches = re.findall(r"^\s*(?:\d+[\.\:\-]?)\s+(.*)", response, re.MULTILINE)
                    for match in matches:
                        line = match.strip()
                        if any(c.isalpha() for c in line):
                            cleaned.append(line)
                        if len(cleaned) == 3:
                            break

                    if len(cleaned) == 3:
                        connection.execute(text("""
                            UPDATE arxiv_chunks_training_4
                            SET query = :query_data
                            WHERE paper_id = :pid AND chunk_id = :cid
                        """), {
                            "query_data": json.dumps(cleaned),
                            "pid": record["paper_id"],
                            "cid": record["chunk_id"]
                        })
                        print(f" Stored 3 queries for chunk {record['chunk_id']}")
                    else:
                        print(f" Skipped {record['chunk_id']} — only {len(cleaned)} valid queries")
        break
    except torch.cuda.OutOfMemoryError:
        print(f"\nsize {batch_size}, tryin smaller batch.")
        gc.collect()
        torch.cuda.empty_cache()


In [36]:
import json
import pandas as pd

check_query = """
    SELECT paper_id, chunk_id, query
    FROM arxiv_chunks_training_4
    WHERE query IS NOT NULL
           OR LENGTH(query::text) >1
"""
df = pd.read_sql(check_query, engine)
df = df[df["query"].str.strip().ne("")]
df["query_list"] = df["query"].apply(json.loads)

df["query_count"] = df["query_list"].apply(len)
three_query_chunks = df[df["query_count"] == 3]

print(f"Total chunks with exactly 3 queries: {len(three_query_chunks)}")


Total chunks with exactly 3 queries: 2698


In [None]:

query = """
    SELECT paper_id, chunk_id, query, chunk_data
    FROM arxiv_chunks_training_4
    WHERE query IS NULL
       OR LENGTH(query::text) < 1
    ORDER BY chunk_id
    LIMIT 40
"""

df = pd.read_sql(query, engine)

records = []
for _, row in df.iterrows():
    chunk = row["chunk_data"]
    if len(chunk.split()) < 30:
        continue
    row["chunk_data"] = chunk
    records.append(row)

batch_sizes = [6, 4]

for batch_size in batch_sizes:
    try:
        print(f"Trying batch size {batch_size}")
        for i in tqdm(range(0, len(records), batch_size)):
            batch = records[i:i+batch_size]
            prompts = [
                f"""You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: {r['chunk_data']}
###
Queries:
1."""
                for r in batch
            ]

            inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=1024)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=400,
                    do_sample=True,
                    temperature=0.7,
                    num_return_sequences=1
                )

            decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            with engine.begin() as connection:
                for record, response in zip(batch, decoded_outputs):
                    print(f"\nRaw output for chunk {record['chunk_id']} (Paper: {record['paper_id']}):\n{response}")

                    cleaned = []
                    matches = re.findall(r"^\s*(?:\d+[\.\:\-]?)\s+(.*)", response, re.MULTILINE)
                    for match in matches:
                        line = match.strip()
                        if any(c.isalpha() for c in line):
                            cleaned.append(line)
                        if len(cleaned) == 3:
                            break

                    if len(cleaned) == 3:
                        connection.execute(text("""
                            UPDATE arxiv_chunks_training_4
                            SET query = :query_data
                            WHERE paper_id = :pid AND chunk_id = :cid
                        """), {
                            "query_data": json.dumps(cleaned),
                            "pid": record["paper_id"],
                            "cid": record["chunk_id"]
                        })
                        print(f" Stored 3 queries for chunk {record['chunk_id']}")
                    else:
                        print(f" Skipped {record['chunk_id']} — only {len(cleaned)} valid queries")
        break
    except torch.cuda.OutOfMemoryError:
        print(f"\nsize {batch_size}, tryin smaller batch.")
        gc.collect()
        torch.cuda.empty_cache()


In [38]:
import json
import pandas as pd

check_query = """
    SELECT paper_id, chunk_id, query
    FROM arxiv_chunks_training_4
    WHERE query IS NOT NULL
           OR LENGTH(query::text) >1
"""
df = pd.read_sql(check_query, engine)
df = df[df["query"].str.strip().ne("")]
df["query_list"] = df["query"].apply(json.loads)

df["query_count"] = df["query_list"].apply(len)
three_query_chunks = df[df["query_count"] == 3]

print(f"Total chunks with exactly 3 queries: {len(three_query_chunks)}")


Total chunks with exactly 3 queries: 2709


In [None]:

query = """
    SELECT paper_id, chunk_id, query, chunk_data
    FROM arxiv_chunks_training_4
    WHERE query IS NULL
       OR LENGTH(query::text) < 1
    ORDER BY chunk_id
    LIMIT 40
"""

df = pd.read_sql(query, engine)

records = []
for _, row in df.iterrows():
    chunk = row["chunk_data"]
    if len(chunk.split()) < 30:
        continue
    row["chunk_data"] = chunk
    records.append(row)

batch_sizes = [8, 4]

for batch_size in batch_sizes:
    try:
        print(f"Trying batch size {batch_size}")
        for i in tqdm(range(0, len(records), batch_size)):
            batch = records[i:i+batch_size]
            prompts = [
                f"""You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: {r['chunk_data']}
###
Queries:
1."""
                for r in batch
            ]

            inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=1024)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=256,
                    do_sample=True,
                    temperature=0.7,
                    num_return_sequences=1
                )

            decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            with engine.begin() as connection:
                for record, response in zip(batch, decoded_outputs):
                    print(f"\nRaw output for chunk {record['chunk_id']} (Paper: {record['paper_id']}):\n{response}")

                    cleaned = []
                    matches = re.findall(r"^\s*(?:\d+[\.\:\-]?)\s+(.*)", response, re.MULTILINE)
                    for match in matches:
                        line = match.strip()
                        if any(c.isalpha() for c in line):
                            cleaned.append(line)
                        if len(cleaned) == 3:
                            break

                    if len(cleaned) == 3:
                        connection.execute(text("""
                            UPDATE arxiv_chunks_training_4
                            SET query = :query_data
                            WHERE paper_id = :pid AND chunk_id = :cid
                        """), {
                            "query_data": json.dumps(cleaned),
                            "pid": record["paper_id"],
                            "cid": record["chunk_id"]
                        })
                        print(f" Stored 3 queries for chunk {record['chunk_id']}")
                    else:
                        print(f" Skipped {record['chunk_id']} — only {len(cleaned)} valid queries")
        break
    except torch.cuda.OutOfMemoryError:
        print(f"\nsize {batch_size}, tryin smaller batch.")
        gc.collect()
        torch.cuda.empty_cache()


In [41]:
import json
import pandas as pd

check_query = """
    SELECT paper_id, chunk_id, query
    FROM arxiv_chunks_training_4
    WHERE query IS NOT NULL
           OR LENGTH(query::text) >1
"""
df = pd.read_sql(check_query, engine)
df = df[df["query"].str.strip().ne("")]
df["query_list"] = df["query"].apply(json.loads)

df["query_count"] = df["query_list"].apply(len)
three_query_chunks = df[df["query_count"] == 3]

print(f"Total chunks with exactly 3 queries: {len(three_query_chunks)}")


Total chunks with exactly 3 queries: 2715


In [47]:
query = """
SELECT MAX(chunk_id)
FROM arxiv_chunks_training_4
WHERE LENGTH(query::text) > 1



"""

df = pd.read_sql(query, engine)
print(df)


             max
0  0704.0339v1_8


In [48]:
query = """
SELECT COUNT(*)
FROM arxiv_chunks_training_4
WHERE LENGTH(query::text) > 1
  AND chunk_id < '0704.0339v1_8'

"""

df = pd.read_sql(query, engine)
print(df)


   count
0   2714


In [49]:
query = """
SELECT COUNT(*)
FROM arxiv_chunks_training_4
WHERE LENGTH(query::text) > 1
  AND chunk_id > '0704.0339v1_8'


"""

df = pd.read_sql(query, engine)
print(df)


   count
0      0


In [None]:

query = """
SELECT paper_id, chunk_id, query, chunk_data
FROM arxiv_chunks_training_4
WHERE chunk_id > '0704.0339v1_8'
  AND (query IS NULL OR LENGTH(query::text) < 1)
ORDER BY chunk_id
LIMIT 40

"""

df = pd.read_sql(query, engine)

records = []
for _, row in df.iterrows():
    chunk = row["chunk_data"]
    if len(chunk.split()) < 30:
        continue
    row["chunk_data"] = chunk
    records.append(row)

batch_sizes = [9,8, 4]

for batch_size in batch_sizes:
    try:
        print(f"Trying batch size {batch_size}")
        for i in tqdm(range(0, len(records), batch_size)):
            batch = records[i:i+batch_size]
            prompts = [
                f"""You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: {r['chunk_data']}
###
Queries:
1."""
                for r in batch
            ]

            inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=1024)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=256,
                    do_sample=True,
                    temperature=0.7,
                    num_return_sequences=1
                )

            decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            with engine.begin() as connection:
                for record, response in zip(batch, decoded_outputs):
                    print(f"\nRaw output for chunk {record['chunk_id']} (Paper: {record['paper_id']}):\n{response}")

                    cleaned = []
                    matches = re.findall(r"^\s*(?:\d+[\.\:\-]?)\s+(.*)", response, re.MULTILINE)
                    for match in matches:
                        line = match.strip()
                        if any(c.isalpha() for c in line):
                            cleaned.append(line)
                        if len(cleaned) == 3:
                            break

                    if len(cleaned) == 3:
                        connection.execute(text("""
                            UPDATE arxiv_chunks_training_4
                            SET query = :query_data
                            WHERE paper_id = :pid AND chunk_id = :cid
                        """), {
                            "query_data": json.dumps(cleaned),
                            "pid": record["paper_id"],
                            "cid": record["chunk_id"]
                        })
                        print(f" Stored 3 queries for chunk {record['chunk_id']}")
                    else:
                        print(f" Skipped {record['chunk_id']} — only {len(cleaned)} valid queries")
        break
    except torch.cuda.OutOfMemoryError:
        print(f"\nsize {batch_size}, tryin smaller batch.")
        gc.collect()
        torch.cuda.empty_cache()


In [51]:
import json
import pandas as pd

check_query = """
    SELECT paper_id, chunk_id, query
    FROM arxiv_chunks_training_4
    WHERE query IS NOT NULL
           OR LENGTH(query::text) >1
"""
df = pd.read_sql(check_query, engine)
df = df[df["query"].str.strip().ne("")]
df["query_list"] = df["query"].apply(json.loads)

df["query_count"] = df["query_list"].apply(len)
three_query_chunks = df[df["query_count"] == 3]

print(f"Total chunks with exactly 3 queries: {len(three_query_chunks)}")


Total chunks with exactly 3 queries: 2753


In [52]:
query = """
SELECT MAX(chunk_id)
FROM arxiv_chunks_training_4
WHERE LENGTH(query::text) > 1



"""

df = pd.read_sql(query, engine)
print(df)


             max
0  0704.0342v1_6


In [53]:
query = """
SELECT COUNT(*)
FROM arxiv_chunks_training_4
WHERE LENGTH(query::text) > 1
  AND chunk_id < '0704.0342v1_6'

"""

df = pd.read_sql(query, engine)
print(df)


   count
0   2752


In [54]:
query = """
SELECT COUNT(*)
FROM arxiv_chunks_training_4
WHERE LENGTH(query::text) > 1
  AND chunk_id > '0704.0342v1_6'

"""

df = pd.read_sql(query, engine)
print(df)


   count
0      0


In [None]:

query = """
SELECT paper_id, chunk_id, query, chunk_data
FROM arxiv_chunks_training_4
WHERE chunk_id > '0704.0342v1_6'
  AND (query IS NULL OR LENGTH(query::text) < 1)
ORDER BY chunk_id
LIMIT 20

"""

df = pd.read_sql(query, engine)

records = []
for _, row in df.iterrows():
    chunk = row["chunk_data"]
    if len(chunk.split()) < 30:
        continue
    row["chunk_data"] = chunk
    records.append(row)

batch_sizes = [10,8, 4]

for batch_size in batch_sizes:
    try:
        print(f"Trying batch size {batch_size}")
        for i in tqdm(range(0, len(records), batch_size)):
            batch = records[i:i+batch_size]
            prompts = [
                f"""You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: {r['chunk_data']}
###
Queries:
1."""
                for r in batch
            ]

            inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=1024)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=256,
                    do_sample=True,
                    temperature=0.7,
                    num_return_sequences=1
                )

            decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            with engine.begin() as connection:
                for record, response in zip(batch, decoded_outputs):
                    print(f"\nRaw output for chunk {record['chunk_id']} (Paper: {record['paper_id']}):\n{response}")

                    cleaned = []
                    matches = re.findall(r"^\s*(?:\d+[\.\:\-]?)\s+(.*)", response, re.MULTILINE)
                    for match in matches:
                        line = match.strip()
                        if any(c.isalpha() for c in line):
                            cleaned.append(line)
                        if len(cleaned) == 3:
                            break

                    if len(cleaned) == 3:
                        connection.execute(text("""
                            UPDATE arxiv_chunks_training_4
                            SET query = :query_data
                            WHERE paper_id = :pid AND chunk_id = :cid
                        """), {
                            "query_data": json.dumps(cleaned),
                            "pid": record["paper_id"],
                            "cid": record["chunk_id"]
                        })
                        print(f" Stored 3 queries for chunk {record['chunk_id']}")
                    else:
                        print(f" Skipped {record['chunk_id']} — only {len(cleaned)} valid queries")
        break
    except torch.cuda.OutOfMemoryError:
        print(f"\nsize {batch_size}, tryin smaller batch.")
        gc.collect()
        torch.cuda.empty_cache()


In [56]:
query = """
SELECT MAX(chunk_id)
FROM arxiv_chunks_training_4
WHERE LENGTH(query::text) > 1



"""

df = pd.read_sql(query, engine)
print(df)


             max
0  0704.0349v3_2


In [None]:

query = """
SELECT paper_id, chunk_id, query, chunk_data
FROM arxiv_chunks_training_4
WHERE chunk_id > '0704.0349v3_2'
  AND (query IS NULL OR LENGTH(query::text) < 1)
ORDER BY chunk_id
LIMIT 20

"""

df = pd.read_sql(query, engine)

records = []
for _, row in df.iterrows():
    chunk = row["chunk_data"]
    if len(chunk.split()) < 30:
        continue
    row["chunk_data"] = chunk
    records.append(row)

batch_sizes = [12,10,8, 4]

for batch_size in batch_sizes:
    try:
        print(f"Trying batch size {batch_size}")
        for i in tqdm(range(0, len(records), batch_size)):
            batch = records[i:i+batch_size]
            prompts = [
                f"""You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: {r['chunk_data']}
###
Queries:
1."""
                for r in batch
            ]

            inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=1024)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=256,
                    do_sample=True,
                    temperature=0.7,
                    num_return_sequences=1
                )

            decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            with engine.begin() as connection:
                for record, response in zip(batch, decoded_outputs):
                    print(f"\nRaw output for chunk {record['chunk_id']} (Paper: {record['paper_id']}):\n{response}")

                    cleaned = []
                    matches = re.findall(r"^\s*(?:\d+[\.\:\-]?)\s+(.*)", response, re.MULTILINE)
                    for match in matches:
                        line = match.strip()
                        if any(c.isalpha() for c in line):
                            cleaned.append(line)
                        if len(cleaned) == 3:
                            break

                    if len(cleaned) == 3:
                        connection.execute(text("""
                            UPDATE arxiv_chunks_training_4
                            SET query = :query_data
                            WHERE paper_id = :pid AND chunk_id = :cid
                        """), {
                            "query_data": json.dumps(cleaned),
                            "pid": record["paper_id"],
                            "cid": record["chunk_id"]
                        })
                        print(f" Stored 3 queries for chunk {record['chunk_id']}")
                    else:
                        print(f" Skipped {record['chunk_id']} — only {len(cleaned)} valid queries")
        break
    except torch.cuda.OutOfMemoryError:
        print(f"\nsize {batch_size}, tryin smaller batch.")
        gc.collect()
        torch.cuda.empty_cache()


In [58]:
import json
import pandas as pd

check_query = """
    SELECT paper_id, chunk_id, query
    FROM arxiv_chunks_training_4
    WHERE query IS NOT NULL
           OR LENGTH(query::text) >1
"""
df = pd.read_sql(check_query, engine)
df = df[df["query"].str.strip().ne("")]
df["query_list"] = df["query"].apply(json.loads)

df["query_count"] = df["query_list"].apply(len)
three_query_chunks = df[df["query_count"] == 3]

print(f"Total chunks with exactly 3 queries: {len(three_query_chunks)}")


Total chunks with exactly 3 queries: 2790


In [59]:

query = """
SELECT MAX(chunk_id)
FROM arxiv_chunks_training_4
WHERE LENGTH(query::text) > 1



"""

df = pd.read_sql(query, engine)
print(df)


             max
0  0704.0350v1_8


In [None]:

query = """
SELECT paper_id, chunk_id, query, chunk_data
FROM arxiv_chunks_training_4
WHERE chunk_id > '0704.0350v1_8'
  AND (query IS NULL OR LENGTH(query::text) < 1)
ORDER BY chunk_id
LIMIT 200

"""

df = pd.read_sql(query, engine)

records = []
for _, row in df.iterrows():
    chunk = row["chunk_data"]
    if len(chunk.split()) < 30:
        continue
    row["chunk_data"] = chunk
    records.append(row)

batch_sizes = [12,10,8, 4]

for batch_size in batch_sizes:
    try:
        print(f"Trying batch size {batch_size}")
        for i in tqdm(range(0, len(records), batch_size)):
            batch = records[i:i+batch_size]
            prompts = [
                f"""You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: {r['chunk_data']}
###
Queries:
1."""
                for r in batch
            ]

            inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=1024)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=256,
                    do_sample=True,
                    temperature=0.7,
                    num_return_sequences=1
                )

            decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            with engine.begin() as connection:
                for record, response in zip(batch, decoded_outputs):
                    print(f"\nRaw output for chunk {record['chunk_id']} (Paper: {record['paper_id']}):\n{response}")

                    cleaned = []
                    matches = re.findall(r"^\s*(?:\d+[\.\:\-]?)\s+(.*)", response, re.MULTILINE)
                    for match in matches:
                        line = match.strip()
                        if any(c.isalpha() for c in line):
                            cleaned.append(line)
                        if len(cleaned) == 3:
                            break

                    if len(cleaned) == 3:
                        connection.execute(text("""
                            UPDATE arxiv_chunks_training_4
                            SET query = :query_data
                            WHERE paper_id = :pid AND chunk_id = :cid
                        """), {
                            "query_data": json.dumps(cleaned),
                            "pid": record["paper_id"],
                            "cid": record["chunk_id"]
                        })
                        print(f" Stored 3 queries for chunk {record['chunk_id']}")
                    else:
                        print(f" Skipped {record['chunk_id']} — only {len(cleaned)} valid queries")
        break
    except torch.cuda.OutOfMemoryError:
        print(f"\nsize {batch_size}, tryin smaller batch.")
        gc.collect()
        torch.cuda.empty_cache()


In [62]:
check_query = """
    SELECT paper_id, chunk_id, query
    FROM arxiv_chunks_training_4
    WHERE query IS NOT NULL
           OR LENGTH(query::text) >1
"""
df = pd.read_sql(check_query, engine)
df = df[df["query"].str.strip().ne("")]
df["query_list"] = df["query"].apply(json.loads)
df["query_count"] = df["query_list"].apply(len)
three_query_chunks = df[df["query_count"] == 3]
print(f"Total chunks with exactly 3 queries: {len(three_query_chunks)}")
query = """
SELECT MAX(chunk_id)
FROM arxiv_chunks_training_4
WHERE LENGTH(query::text) > 1

"""
df = pd.read_sql(query, engine)
print(df)

Total chunks with exactly 3 queries: 2971
             max
0  0704.0383v1_5


In [None]:
import time

start = time.time()

query = """
SELECT paper_id, chunk_id, query, chunk_data
FROM arxiv_chunks_training_4
WHERE chunk_id > '0704.0383v1_5'
  AND (query IS NULL OR LENGTH(query::text) < 1)
ORDER BY chunk_id
LIMIT 84
"""

df = pd.read_sql(query, engine)

records = []
for _, row in df.iterrows():
    chunk = row["chunk_data"]
    if len(chunk.split()) < 30:
        continue
    row["chunk_data"] = chunk
    records.append(row)

batch_sizes = [12, 10, 8, 4]

for batch_size in batch_sizes:
    try:
        print(f"Trying batch size {batch_size}")
        for i in tqdm(range(0, len(records), batch_size)):
            batch = records[i:i+batch_size]
            prompts = [
                f"""You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: {r['chunk_data']}
###
Queries:
1."""
                for r in batch
            ]

            inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=1024)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=256,
                    do_sample=True,
                    temperature=0.7,
                    num_return_sequences=1
                )

            decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            with engine.begin() as connection:
                for record, response in zip(batch, decoded_outputs):
                    print(f"\nRaw output for chunk {record['chunk_id']} (Paper: {record['paper_id']}):\n{response}")

                    cleaned = []
                    matches = re.findall(r"^\s*(?:\d+[\.\:\-]?)\s+(.*)", response, re.MULTILINE)
                    for match in matches:
                        line = match.strip()
                        if any(c.isalpha() for c in line):
                            cleaned.append(line)
                        if len(cleaned) == 3:
                            break

                    if len(cleaned) == 3:
                        connection.execute(text("""
                            UPDATE arxiv_chunks_training_4
                            SET query = :query_data
                            WHERE paper_id = :pid AND chunk_id = :cid
                        """), {
                            "query_data": json.dumps(cleaned),
                            "pid": record["paper_id"],
                            "cid": record["chunk_id"]
                        })
                        print(f" Stored 3 queries for chunk {record['chunk_id']}")
                    else:
                        print(f" Skipped {record['chunk_id']} — only {len(cleaned)} valid queries")
        break
    except torch.cuda.OutOfMemoryError:
        print(f"\nsize {batch_size}, tryin smaller batch.")
        torch.cuda.empty_cache()

end = time.time()
print(f"Total time taken: {end - start:.2f} seconds")


In [64]:
check_query = """
    SELECT paper_id, chunk_id, query
    FROM arxiv_chunks_training_4
    WHERE query IS NOT NULL
           OR LENGTH(query::text) >1
"""
df = pd.read_sql(check_query, engine)
df = df[df["query"].str.strip().ne("")]
df["query_list"] = df["query"].apply(json.loads)
df["query_count"] = df["query_list"].apply(len)
three_query_chunks = df[df["query_count"] == 3]
print(f"Total chunks with exactly 3 queries: {len(three_query_chunks)}")
query = """
SELECT MAX(chunk_id)
FROM arxiv_chunks_training_4
WHERE LENGTH(query::text) > 1

"""
df = pd.read_sql(query, engine)
print(df)

Total chunks with exactly 3 queries: 3047
             max
0  0704.0397v2_8


In [None]:
import time

start = time.time()

query = """
SELECT paper_id, chunk_id, query, chunk_data
FROM arxiv_chunks_training_4
WHERE chunk_id > '0704.0397v2_8'
  AND (query IS NULL OR LENGTH(query::text) < 1)
ORDER BY chunk_id
LIMIT 80
"""

df = pd.read_sql(query, engine)

records = []
for _, row in df.iterrows():
    chunk = row["chunk_data"]
    if len(chunk.split()) < 30:
        continue
    row["chunk_data"] = chunk
    records.append(row)

batch_sizes = [10, 8, 4]

for batch_size in batch_sizes:
    try:
        print(f"Trying batch size {batch_size}")
        for i in tqdm(range(0, len(records), batch_size)):
            batch = records[i:i+batch_size]
            prompts = [
                f"""You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: {r['chunk_data']}
###
Queries:
1."""
                for r in batch
            ]

            inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=1024)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=256,
                    do_sample=True,
                    temperature=0.7,
                    num_return_sequences=1
                )

            decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            with engine.begin() as connection:
                for record, response in zip(batch, decoded_outputs):
                    print(f"\nRaw output for chunk {record['chunk_id']} (Paper: {record['paper_id']}):\n{response}")

                    cleaned = []
                    matches = re.findall(r"^\s*(?:\d+[\.\:\-]?)\s+(.*)", response, re.MULTILINE)
                    for match in matches:
                        line = match.strip()
                        if any(c.isalpha() for c in line):
                            cleaned.append(line)
                        if len(cleaned) == 3:
                            break

                    if len(cleaned) == 3:
                        connection.execute(text("""
                            UPDATE arxiv_chunks_training_4
                            SET query = :query_data
                            WHERE paper_id = :pid AND chunk_id = :cid
                        """), {
                            "query_data": json.dumps(cleaned),
                            "pid": record["paper_id"],
                            "cid": record["chunk_id"]
                        })
                        print(f" Stored 3 queries for chunk {record['chunk_id']}")
                    else:
                        print(f" Skipped {record['chunk_id']} — only {len(cleaned)} valid queries")
        break
    except torch.cuda.OutOfMemoryError:
        print(f"\nsize {batch_size}, tryin smaller batch.")
        torch.cuda.empty_cache()

end = time.time()
print(f"Total time taken: {end - start:.2f} seconds")


In [66]:
check_query = """
    SELECT paper_id, chunk_id, query
    FROM arxiv_chunks_training_4
    WHERE query IS NOT NULL
           OR LENGTH(query::text) >1
"""
df = pd.read_sql(check_query, engine)
df = df[df["query"].str.strip().ne("")]
df["query_list"] = df["query"].apply(json.loads)
df["query_count"] = df["query_list"].apply(len)
three_query_chunks = df[df["query_count"] == 3]
print(f"Total chunks with exactly 3 queries: {len(three_query_chunks)}")
query = """
SELECT MAX(chunk_id)
FROM arxiv_chunks_training_4
WHERE LENGTH(query::text) > 1

"""
df = pd.read_sql(query, engine)
print(df)

Total chunks with exactly 3 queries: 3123
              max
0  0704.0409v2_17


In [None]:
import time

start = time.time()

query = """
SELECT paper_id, chunk_id, query, chunk_data
FROM arxiv_chunks_training_4
WHERE chunk_id > '0704.0409v2_17'
  AND (query IS NULL OR LENGTH(query::text) < 1)
ORDER BY chunk_id
LIMIT 168
"""

df = pd.read_sql(query, engine)

records = []
for _, row in df.iterrows():
    chunk = row["chunk_data"]
    if len(chunk.split()) < 30:
        continue
    row["chunk_data"] = chunk
    records.append(row)

batch_sizes = [12,10, 8, 4]

for batch_size in batch_sizes:
    try:
        print(f"Trying batch size {batch_size}")
        for i in tqdm(range(0, len(records), batch_size)):
            batch = records[i:i+batch_size]
            prompts = [
                f"""You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: {r['chunk_data']}
###
Queries:
1."""
                for r in batch
            ]

            inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=1024)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=256,
                    do_sample=True,
                    temperature=0.7,
                    num_return_sequences=1
                )

            decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            with engine.begin() as connection:
                for record, response in zip(batch, decoded_outputs):
                    print(f"\nRaw output for chunk {record['chunk_id']} (Paper: {record['paper_id']}):\n{response}")

                    cleaned = []
                    matches = re.findall(r"^\s*(?:\d+[\.\:\-]?)\s+(.*)", response, re.MULTILINE)
                    for match in matches:
                        line = match.strip()
                        if any(c.isalpha() for c in line):
                            cleaned.append(line)
                        if len(cleaned) == 3:
                            break

                    if len(cleaned) == 3:
                        connection.execute(text("""
                            UPDATE arxiv_chunks_training_4
                            SET query = :query_data
                            WHERE paper_id = :pid AND chunk_id = :cid
                        """), {
                            "query_data": json.dumps(cleaned),
                            "pid": record["paper_id"],
                            "cid": record["chunk_id"]
                        })
                        print(f" Stored 3 queries for chunk {record['chunk_id']}")
                    else:
                        print(f" Skipped {record['chunk_id']} — only {len(cleaned)} valid queries")
        break
    except torch.cuda.OutOfMemoryError:
        print(f"\nsize {batch_size}, tryin smaller batch.")
        torch.cuda.empty_cache()

end = time.time()
print(f"Total time taken: {end - start:.2f} seconds")


In [68]:
check_query = """
    SELECT paper_id, chunk_id, query
    FROM arxiv_chunks_training_4
    WHERE query IS NOT NULL
           OR LENGTH(query::text) >1
"""
df = pd.read_sql(check_query, engine)
df = df[df["query"].str.strip().ne("")]
df["query_list"] = df["query"].apply(json.loads)
df["query_count"] = df["query_list"].apply(len)
three_query_chunks = df[df["query_count"] == 3]
print(f"Total chunks with exactly 3 queries: {len(three_query_chunks)}")
query = """
SELECT MAX(chunk_id)
FROM arxiv_chunks_training_4
WHERE LENGTH(query::text) > 1

"""
df = pd.read_sql(query, engine)
print(df)

Total chunks with exactly 3 queries: 3278
             max
0  0704.0436v2_3


In [None]:
import time

start = time.time()

query = """
SELECT paper_id, chunk_id, query, chunk_data
FROM arxiv_chunks_training_4
WHERE chunk_id > '0704.0436v2_3'
  AND (query IS NULL OR LENGTH(query::text) < 1)
ORDER BY chunk_id
LIMIT 432
"""

df = pd.read_sql(query, engine)

records = []
for _, row in df.iterrows():
    chunk = row["chunk_data"]
    if len(chunk.split()) < 30:
        continue
    row["chunk_data"] = chunk
    records.append(row)

batch_sizes = [12,10, 8, 4]

for batch_size in batch_sizes:
    try:
        print(f"Trying batch size {batch_size}")
        for i in tqdm(range(0, len(records), batch_size)):
            batch = records[i:i+batch_size]
            prompts = [
                f"""You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: {r['chunk_data']}
###
Queries:
1."""
                for r in batch
            ]

            inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=1024)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=256,
                    do_sample=True,
                    temperature=0.7,
                    num_return_sequences=1
                )

            decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            with engine.begin() as connection:
                for record, response in zip(batch, decoded_outputs):
                    print(f"\nRaw output for chunk {record['chunk_id']} (Paper: {record['paper_id']}):\n{response}")

                    cleaned = []
                    matches = re.findall(r"^\s*(?:\d+[\.\:\-]?)\s+(.*)", response, re.MULTILINE)
                    for match in matches:
                        line = match.strip()
                        if any(c.isalpha() for c in line):
                            cleaned.append(line)
                        if len(cleaned) == 3:
                            break

                    if len(cleaned) == 3:
                        connection.execute(text("""
                            UPDATE arxiv_chunks_training_4
                            SET query = :query_data
                            WHERE paper_id = :pid AND chunk_id = :cid
                        """), {
                            "query_data": json.dumps(cleaned),
                            "pid": record["paper_id"],
                            "cid": record["chunk_id"]
                        })
                        print(f" Stored 3 queries for chunk {record['chunk_id']}")
                    else:
                        print(f" Skipped {record['chunk_id']} — only {len(cleaned)} valid queries")
        break
    except torch.cuda.OutOfMemoryError:
        print(f"\nsize {batch_size}, tryin smaller batch.")
        torch.cuda.empty_cache()

end = time.time()
print(f"Total time taken: {end - start:.2f} seconds")


In [70]:
check_query = """
    SELECT paper_id, chunk_id, query
    FROM arxiv_chunks_training_4
    WHERE query IS NOT NULL
           OR LENGTH(query::text) >1
"""
df = pd.read_sql(check_query, engine)
df = df[df["query"].str.strip().ne("")]
df["query_list"] = df["query"].apply(json.loads)
df["query_count"] = df["query_list"].apply(len)
three_query_chunks = df[df["query_count"] == 3]
print(f"Total chunks with exactly 3 queries: {len(three_query_chunks)}")
query = """
SELECT MAX(chunk_id)
FROM arxiv_chunks_training_4
WHERE LENGTH(query::text) > 1

"""
df = pd.read_sql(query, engine)
print(df)

Total chunks with exactly 3 queries: 3669
             max
0  0704.0496v3_6


In [None]:
import time

start = time.time()

query = """
SELECT paper_id, chunk_id, query, chunk_data
FROM arxiv_chunks_training_4
WHERE chunk_id > '0704.0496v3_6'
  AND (query IS NULL OR LENGTH(query::text) < 1)
ORDER BY chunk_id
LIMIT 432
"""

df = pd.read_sql(query, engine)

records = []
for _, row in df.iterrows():
    chunk = row["chunk_data"]
    if len(chunk.split()) < 30:
        continue
    row["chunk_data"] = chunk
    records.append(row)

batch_sizes = [16,12,10, 8, 4]

for batch_size in batch_sizes:
    try:
        print(f"Trying batch size {batch_size}")
        for i in tqdm(range(0, len(records), batch_size)):
            batch = records[i:i+batch_size]
            prompts = [
                f"""You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: {r['chunk_data']}
###
Queries:
1."""
                for r in batch
            ]

            inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=1024)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=256,
                    do_sample=True,
                    temperature=0.7,
                    num_return_sequences=1
                )

            decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            with engine.begin() as connection:
                for record, response in zip(batch, decoded_outputs):
                    print(f"\nRaw output for chunk {record['chunk_id']} (Paper: {record['paper_id']}):\n{response}")

                    cleaned = []
                    matches = re.findall(r"^\s*(?:\d+[\.\:\-]?)\s+(.*)", response, re.MULTILINE)
                    for match in matches:
                        line = match.strip()
                        if any(c.isalpha() for c in line):
                            cleaned.append(line)
                        if len(cleaned) == 3:
                            break

                    if len(cleaned) == 3:
                        connection.execute(text("""
                            UPDATE arxiv_chunks_training_4
                            SET query = :query_data
                            WHERE paper_id = :pid AND chunk_id = :cid
                        """), {
                            "query_data": json.dumps(cleaned),
                            "pid": record["paper_id"],
                            "cid": record["chunk_id"]
                        })
                        print(f" Stored 3 queries for chunk {record['chunk_id']}")
                    else:
                        print(f" Skipped {record['chunk_id']} — only {len(cleaned)} valid queries")
        break
    except torch.cuda.OutOfMemoryError:
        print(f"\nsize {batch_size}, tryin smaller batch.")
        torch.cuda.empty_cache()

end = time.time()
print(f"Total time taken: {end - start:.2f} seconds")


In [72]:
check_query = """
    SELECT paper_id, chunk_id, query
    FROM arxiv_chunks_training_4
    WHERE query IS NOT NULL
           OR LENGTH(query::text) >1
"""
df = pd.read_sql(check_query, engine)
df = df[df["query"].str.strip().ne("")]
df["query_list"] = df["query"].apply(json.loads)
df["query_count"] = df["query_list"].apply(len)
three_query_chunks = df[df["query_count"] == 3]
print(f"Total chunks with exactly 3 queries: {len(three_query_chunks)}")
query = """
SELECT MAX(chunk_id)
FROM arxiv_chunks_training_4
WHERE LENGTH(query::text) > 1

"""
df = pd.read_sql(query, engine)
print(df)

Total chunks with exactly 3 queries: 4047
             max
0  0704.0552v1_3


In [None]:
import time

start = time.time()

query = """
SELECT paper_id, chunk_id, query, chunk_data
FROM arxiv_chunks_training_4
WHERE chunk_id > '0704.0552v1_3'
  AND (query IS NULL OR LENGTH(query::text) < 1)
ORDER BY chunk_id
LIMIT 432
"""

df = pd.read_sql(query, engine)

records = []
for _, row in df.iterrows():
    chunk = row["chunk_data"]
    if len(chunk.split()) < 30:
        continue
    row["chunk_data"] = chunk
    records.append(row)

batch_sizes = [16,12,10, 8, 4]

for batch_size in batch_sizes:
    try:
        print(f"Trying batch size {batch_size}")
        for i in tqdm(range(0, len(records), batch_size)):
            batch = records[i:i+batch_size]
            prompts = [
                f"""You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: {r['chunk_data']}
###
Queries:
1."""
                for r in batch
            ]

            inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=1024)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=256,
                    do_sample=True,
                    temperature=0.7,
                    num_return_sequences=1
                )

            decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            with engine.begin() as connection:
                for record, response in zip(batch, decoded_outputs):
                    print(f"\nRaw output for chunk {record['chunk_id']} (Paper: {record['paper_id']}):\n{response}")

                    cleaned = []
                    matches = re.findall(r"^\s*(?:\d+[\.\:\-]?)\s+(.*)", response, re.MULTILINE)
                    for match in matches:
                        line = match.strip()
                        if any(c.isalpha() for c in line):
                            cleaned.append(line)
                        if len(cleaned) == 3:
                            break

                    if len(cleaned) == 3:
                        connection.execute(text("""
                            UPDATE arxiv_chunks_training_4
                            SET query = :query_data
                            WHERE paper_id = :pid AND chunk_id = :cid
                        """), {
                            "query_data": json.dumps(cleaned),
                            "pid": record["paper_id"],
                            "cid": record["chunk_id"]
                        })
                        print(f" Stored 3 queries for chunk {record['chunk_id']}")
                    else:
                        print(f" Skipped {record['chunk_id']} — only {len(cleaned)} valid queries")
        break
    except torch.cuda.OutOfMemoryError:
        print(f"\nsize {batch_size}, tryin smaller batch.")
        torch.cuda.empty_cache()

end = time.time()
print(f"Total time taken: {end - start:.2f} seconds")


In [74]:
check_query = """
    SELECT paper_id, chunk_id, query
    FROM arxiv_chunks_training_4
    WHERE query IS NOT NULL
           OR LENGTH(query::text) >1
"""
df = pd.read_sql(check_query, engine)
df = df[df["query"].str.strip().ne("")]
df["query_list"] = df["query"].apply(json.loads)
df["query_count"] = df["query_list"].apply(len)
three_query_chunks = df[df["query_count"] == 3]
print(f"Total chunks with exactly 3 queries: {len(three_query_chunks)}")
query = """
SELECT MAX(chunk_id)
FROM arxiv_chunks_training_4
WHERE LENGTH(query::text) > 1

"""
df = pd.read_sql(query, engine)
print(df)

Total chunks with exactly 3 queries: 4424
             max
0  0704.0607v2_3


In [None]:
import time
start = time.time()
query = """
SELECT paper_id, chunk_id, query, chunk_data
FROM arxiv_chunks_training_4
WHERE chunk_id > '0704.0607v2_3'
  AND (query IS NULL OR LENGTH(query::text) < 1)
ORDER BY chunk_id
LIMIT 430
"""

df = pd.read_sql(query, engine)

records = []
for _, row in df.iterrows():
    chunk = row["chunk_data"]
    if len(chunk.split()) < 30:
        continue
    row["chunk_data"] = chunk
    records.append(row)

batch_sizes = [20,16,12,10, 8, 4]

for batch_size in batch_sizes:
    try:
        print(f"Trying batch size {batch_size}")
        for i in tqdm(range(0, len(records), batch_size)):
            batch = records[i:i+batch_size]
            prompts = [
                f"""You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: {r['chunk_data']}
###
Queries:
1."""
                for r in batch
            ]

            inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=1024)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=256,
                    do_sample=True,
                    temperature=0.7,
                    num_return_sequences=1
                )

            decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            with engine.begin() as connection:
                for record, response in zip(batch, decoded_outputs):
                    print(f"\nRaw output for chunk {record['chunk_id']} (Paper: {record['paper_id']}):\n{response}")

                    cleaned = []
                    matches = re.findall(r"^\s*(?:\d+[\.\:\-]?)\s+(.*)", response, re.MULTILINE)
                    for match in matches:
                        line = match.strip()
                        if any(c.isalpha() for c in line):
                            cleaned.append(line)
                        if len(cleaned) == 3:
                            break

                    if len(cleaned) == 3:
                        connection.execute(text("""
                            UPDATE arxiv_chunks_training_4
                            SET query = :query_data
                            WHERE paper_id = :pid AND chunk_id = :cid
                        """), {
                            "query_data": json.dumps(cleaned),
                            "pid": record["paper_id"],
                            "cid": record["chunk_id"]
                        })
                        print(f" Stored 3 queries for chunk {record['chunk_id']}")
                    else:
                        print(f" Skipped {record['chunk_id']} — only {len(cleaned)} valid queries")
        break
    except torch.cuda.OutOfMemoryError:
        print(f"\nsize {batch_size}, tryin smaller batch.")
        torch.cuda.empty_cache()

end = time.time()
print(f"Total time taken: {end - start:.2f} seconds")


In [76]:
check_query = """
    SELECT paper_id, chunk_id, query
    FROM arxiv_chunks_training_4
    WHERE query IS NOT NULL
           OR LENGTH(query::text) >1
"""
df = pd.read_sql(check_query, engine)
df = df[df["query"].str.strip().ne("")]
df["query_list"] = df["query"].apply(json.loads)
df["query_count"] = df["query_list"].apply(len)
three_query_chunks = df[df["query_count"] == 3]
print(f"Total chunks with exactly 3 queries: {len(three_query_chunks)}")
query = """
SELECT MAX(chunk_id)
FROM arxiv_chunks_training_4
WHERE LENGTH(query::text) > 1

"""
df = pd.read_sql(query, engine)
print(df)

Total chunks with exactly 3 queries: 4797
             max
0  0704.0663v1_1


In [None]:
import time
start = time.time()
query = """
SELECT paper_id, chunk_id, query, chunk_data
FROM arxiv_chunks_training_4
WHERE chunk_id > '0704.0663v1_1'
  AND (query IS NULL OR LENGTH(query::text) < 1)
ORDER BY chunk_id
LIMIT 450
"""

df = pd.read_sql(query, engine)

records = []
for _, row in df.iterrows():
    chunk = row["chunk_data"]
    if len(chunk.split()) < 30:
        continue
    row["chunk_data"] = chunk
    records.append(row)

batch_sizes = [30,22,20,16,12,10, 8, 4]

for batch_size in batch_sizes:
    try:
        print(f"Trying batch size {batch_size}")
        for i in tqdm(range(0, len(records), batch_size)):
            batch = records[i:i+batch_size]
            prompts = [
                f"""You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: {r['chunk_data']}
###
Queries:
1."""
                for r in batch
            ]

            inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=1024)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=256,
                    do_sample=True,
                    temperature=0.7,
                    num_return_sequences=1
                )

            decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            with engine.begin() as connection:
                for record, response in zip(batch, decoded_outputs):
                    print(f"\nRaw output for chunk {record['chunk_id']} (Paper: {record['paper_id']}):\n{response}")

                    cleaned = []
                    matches = re.findall(r"^\s*(?:\d+[\.\:\-]?)\s+(.*)", response, re.MULTILINE)
                    for match in matches:
                        line = match.strip()
                        if any(c.isalpha() for c in line):
                            cleaned.append(line)
                        if len(cleaned) == 3:
                            break

                    if len(cleaned) == 3:
                        connection.execute(text("""
                            UPDATE arxiv_chunks_training_4
                            SET query = :query_data
                            WHERE paper_id = :pid AND chunk_id = :cid
                        """), {
                            "query_data": json.dumps(cleaned),
                            "pid": record["paper_id"],
                            "cid": record["chunk_id"]
                        })
                        print(f" Stored 3 queries for chunk {record['chunk_id']}")
                    else:
                        print(f" Skipped {record['chunk_id']} — only {len(cleaned)} valid queries")
        break
    except torch.cuda.OutOfMemoryError:
        print(f"\nsize {batch_size}, tryin smaller batch.")
        torch.cuda.empty_cache()

end = time.time()
print(f"Total time taken: {end - start:.2f} seconds")


In [78]:
check_query = """
    SELECT paper_id, chunk_id, query
    FROM arxiv_chunks_training_4
    WHERE query IS NOT NULL
           OR LENGTH(query::text) >1
"""
df = pd.read_sql(check_query, engine)
df = df[df["query"].str.strip().ne("")]
df["query_list"] = df["query"].apply(json.loads)
df["query_count"] = df["query_list"].apply(len)
three_query_chunks = df[df["query_count"] == 3]
print(f"Total chunks with exactly 3 queries: {len(three_query_chunks)}")
query = """
SELECT MAX(chunk_id)
FROM arxiv_chunks_training_4
WHERE LENGTH(query::text) > 1

"""
df = pd.read_sql(query, engine)
print(df)

Total chunks with exactly 3 queries: 5197
             max
0  0704.0709v3_7


In [None]:
import time
start = time.time()
query = """
SELECT paper_id, chunk_id, query, chunk_data
FROM arxiv_chunks_training_4
WHERE chunk_id > '0704.0709v3_7'
  AND (query IS NULL OR LENGTH(query::text) < 1)
ORDER BY chunk_id
LIMIT 480
"""

df = pd.read_sql(query, engine)

records = []
for _, row in df.iterrows():
    chunk = row["chunk_data"]
    if len(chunk.split()) < 30:
        continue
    row["chunk_data"] = chunk
    records.append(row)

batch_sizes = [24,20,16,12,10, 8, 4]

for batch_size in batch_sizes:
    try:
        print(f"Trying batch size {batch_size}")
        for i in tqdm(range(0, len(records), batch_size)):
            batch = records[i:i+batch_size]
            prompts = [
                f"""You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: {r['chunk_data']}
###
Queries:
1."""
                for r in batch
            ]

            inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=1024)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=256,
                    do_sample=True,
                    temperature=0.7,
                    num_return_sequences=1
                )

            decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            with engine.begin() as connection:
                for record, response in zip(batch, decoded_outputs):
                    print(f"\nRaw output for chunk {record['chunk_id']} (Paper: {record['paper_id']}):\n{response}")

                    cleaned = []
                    matches = re.findall(r"^\s*(?:\d+[\.\:\-]?)\s+(.*)", response, re.MULTILINE)
                    for match in matches:
                        line = match.strip()
                        if any(c.isalpha() for c in line):
                            cleaned.append(line)
                        if len(cleaned) == 3:
                            break

                    if len(cleaned) == 3:
                        connection.execute(text("""
                            UPDATE arxiv_chunks_training_4
                            SET query = :query_data
                            WHERE paper_id = :pid AND chunk_id = :cid
                        """), {
                            "query_data": json.dumps(cleaned),
                            "pid": record["paper_id"],
                            "cid": record["chunk_id"]
                        })
                        print(f" Stored 3 queries for chunk {record['chunk_id']}")
                    else:
                        print(f" Skipped {record['chunk_id']} — only {len(cleaned)} valid queries")
        break
    except torch.cuda.OutOfMemoryError:
        print(f"\nsize {batch_size}, tryin smaller batch.")
        torch.cuda.empty_cache()

end = time.time()
print(f"Total time taken: {end - start:.2f} seconds")


In [80]:
check_query = """
    SELECT paper_id, chunk_id, query
    FROM arxiv_chunks_training_4
    WHERE query IS NOT NULL
           OR LENGTH(query::text) >1
"""
df = pd.read_sql(check_query, engine)
df = df[df["query"].str.strip().ne("")]
df["query_list"] = df["query"].apply(json.loads)
df["query_count"] = df["query_list"].apply(len)
three_query_chunks = df[df["query_count"] == 3]
print(f"Total chunks with exactly 3 queries: {len(three_query_chunks)}")
query = """
SELECT MAX(chunk_id)
FROM arxiv_chunks_training_4
WHERE LENGTH(query::text) > 1

"""
df = pd.read_sql(query, engine)
print(df)

Total chunks with exactly 3 queries: 5622
             max
0  0704.0766v2_8


In [None]:
import time
start = time.time()
query = """
SELECT paper_id, chunk_id, query, chunk_data
FROM arxiv_chunks_training_4
WHERE chunk_id > '0704.0766v2_8'
  AND (query IS NULL OR LENGTH(query::text) < 1)
ORDER BY chunk_id
LIMIT 480
"""

df = pd.read_sql(query, engine)

records = []
for _, row in df.iterrows():
    chunk = row["chunk_data"]
    if len(chunk.split()) < 30:
        continue
    row["chunk_data"] = chunk
    records.append(row)

batch_sizes = [32,24,20,16,12,10, 8, 4]

for batch_size in batch_sizes:
    try:
        print(f"Trying batch size {batch_size}")
        for i in tqdm(range(0, len(records), batch_size)):
            batch = records[i:i+batch_size]
            prompts = [
                f"""You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: {r['chunk_data']}
###
Queries:
1."""
                for r in batch
            ]

            inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=1024)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=256,
                    do_sample=True,
                    temperature=0.7,
                    num_return_sequences=1
                )

            decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            with engine.begin() as connection:
                for record, response in zip(batch, decoded_outputs):
                    print(f"\nRaw output for chunk {record['chunk_id']} (Paper: {record['paper_id']}):\n{response}")

                    cleaned = []
                    matches = re.findall(r"^\s*(?:\d+[\.\:\-]?)\s+(.*)", response, re.MULTILINE)
                    for match in matches:
                        line = match.strip()
                        if any(c.isalpha() for c in line):
                            cleaned.append(line)
                        if len(cleaned) == 3:
                            break

                    if len(cleaned) == 3:
                        connection.execute(text("""
                            UPDATE arxiv_chunks_training_4
                            SET query = :query_data
                            WHERE paper_id = :pid AND chunk_id = :cid
                        """), {
                            "query_data": json.dumps(cleaned),
                            "pid": record["paper_id"],
                            "cid": record["chunk_id"]
                        })
                        print(f" Stored 3 queries for chunk {record['chunk_id']}")
                    else:
                        print(f" Skipped {record['chunk_id']} — only {len(cleaned)} valid queries")
        break
    except torch.cuda.OutOfMemoryError:
        print(f"\nsize {batch_size}, tryin smaller batch.")
        torch.cuda.empty_cache()

end = time.time()
print(f"Total time taken: {end - start:.2f} seconds")


In [None]:
check_query = """
    SELECT paper_id, chunk_id, query
    FROM arxiv_chunks_training_4
    WHERE query IS NOT NULL
           OR LENGTH(query::text) >1
"""
df = pd.read_sql(check_query, engine)
df = df[df["query"].str.strip().ne("")]
df["query_list"] = df["query"].apply(json.loads)
df["query_count"] = df["query_list"].apply(len)
three_query_chunks = df[df["query_count"] == 3]
print(f"Total chunks with exactly 3 queries: {len(three_query_chunks)}")
query = """
SELECT MAX(chunk_id)
FROM arxiv_chunks_training_4
WHERE LENGTH(query::text) > 1

"""
df = pd.read_sql(query, engine)
print(df)

In [None]:
import time
start = time.time()
query = """
SELECT paper_id, chunk_id, query, chunk_data
FROM arxiv_chunks_training_4
WHERE chunk_id > '0704.0766v2_8'
  AND (query IS NULL OR LENGTH(query::text) < 1)
ORDER BY chunk_id
LIMIT 64
"""

df = pd.read_sql(query, engine)

records = []
for _, row in df.iterrows():
    chunk = row["chunk_data"]
    if len(chunk.split()) < 30:
        continue
    row["chunk_data"] = chunk
    records.append(row)

batch_sizes = [32,24,20,16,12,10, 8, 4]

for batch_size in batch_sizes:
    try:
        print(f"Trying batch size {batch_size}")
        for i in tqdm(range(0, len(records), batch_size)):
            batch = records[i:i+batch_size]
            prompts = [
                f"""You are a helpful assistant. Generate exactly 3 concise search queries based on the following academic chunk. 
Format them as a numbered list.

###
Chunk: {r['chunk_data']}
###
Queries:
1."""
                for r in batch
            ]

            inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=1024)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=256,
                    do_sample=True,
                    temperature=0.7,
                    num_return_sequences=1
                )

            decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            with engine.begin() as connection:
                for record, response in zip(batch, decoded_outputs):
                    print(f"\nRaw output for chunk {record['chunk_id']} (Paper: {record['paper_id']}):\n{response}")

                    cleaned = []
                    matches = re.findall(r"^\s*(?:\d+[\.\:\-]?)\s+(.*)", response, re.MULTILINE)
                    for match in matches:
                        line = match.strip()
                        if any(c.isalpha() for c in line):
                            cleaned.append(line)
                        if len(cleaned) == 3:
                            break

                    if len(cleaned) == 3:
                        connection.execute(text("""
                            UPDATE arxiv_chunks_training_4
                            SET query = :query_data
                            WHERE paper_id = :pid AND chunk_id = :cid
                        """), {
                            "query_data": json.dumps(cleaned),
                            "pid": record["paper_id"],
                            "cid": record["chunk_id"]
                        })
                        print(f" Stored 3 queries for chunk {record['chunk_id']}")
                    else:
                        print(f" Skipped {record['chunk_id']} — only {len(cleaned)} valid queries")
        break
    except torch.cuda.OutOfMemoryError:
        print(f"\nsize {batch_size}, tryin smaller batch.")
        torch.cuda.empty_cache()

end = time.time()
print(f"Total time taken: {end - start:.2f} seconds")


In [None]:
check_query = """
    SELECT paper_id, chunk_id, query
    FROM arxiv_chunks_training_4
    WHERE query IS NOT NULL
           OR LENGTH(query::text) >1
"""
df = pd.read_sql(check_query, engine)
df = df[df["query"].str.strip().ne("")]
df["query_list"] = df["query"].apply(json.loads)
df["query_count"] = df["query_list"].apply(len)
three_query_chunks = df[df["query_count"] == 3]
print(f"Total chunks with exactly 3 queries: {len(three_query_chunks)}")
query = """
SELECT MAX(chunk_id)
FROM arxiv_chunks_training_4
WHERE LENGTH(query::text) > 1

"""
df = pd.read_sql(query, engine)
print(df)