In [1]:
!pip install torch torchvision tqdm numpy pandas sqlalchemy
!pip install psycopg2-binary



In [1]:
import os
import torch
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import numpy as np
from sqlalchemy import create_engine, text
import pandas as pd
import tarfile
import shutil
import re
import unicodedata
from tqdm import tqdm 

In [2]:
# Build the SQLAlchemy engine used to read chunk data.
# The connection URL embeds the database password, so it is read from the
# ARXIV_DB_URL environment variable instead of being stored in the notebook.
db_url = os.environ.get("ARXIV_DB_URL")
if not db_url:
    raise RuntimeError(
        "Set the ARXIV_DB_URL environment variable to the PostgreSQL "
        "connection URL (postgresql://user:password@host:port/dbname)."
    )

engine = create_engine(
    db_url,
    pool_size=10,     # connections kept open in the pool
    max_overflow=0,   # never open connections beyond pool_size
    pool_timeout=30,  # seconds to wait for a free connection before failing
)


In [3]:
!pip install transformers accelerate



In [4]:
import accelerate

  from .autonotebook import tqdm as notebook_tqdm


Experiments with different prompts

In [5]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sqlalchemy import create_engine, text
from tqdm import tqdm
import pandas as pd
import torch
import json
import os

# Prompt experiment 1: ask FLAN-T5 for search phrases for the first chunks of
# `arxiv_chunks_training_2` and print every sampled output for inspection.

# --- Configuration -----------------------------------------------------------
model_name = "google/flan-t5-large"
batch_size = 4             # chunks sent to generate() per call
NUM_RETURN_SEQUENCES = 3   # sampled candidates per chunk; also the grouping stride below
MIN_CHUNK_WORDS = 30       # chunks with fewer whitespace-separated words are skipped
SEED = 42                  # fixes the sampling RNG so reruns are comparable

torch.manual_seed(SEED)

# --- Database ----------------------------------------------------------------
# The connection URL embeds the database password, so it is read from the
# environment instead of being stored in the notebook. This runs before the
# model is loaded so a missing setting fails fast.
db_url = os.environ.get("ARXIV_DB_URL")
if not db_url:
    raise RuntimeError(
        "Set the ARXIV_DB_URL environment variable to the PostgreSQL "
        "connection URL (postgresql://user:password@host:port/dbname)."
    )
engine = create_engine(db_url, pool_size=10, max_overflow=0, pool_timeout=30)

query = """
    SELECT paper_id, chunk_id, chunk_data
    FROM arxiv_chunks_training_2
    ORDER BY chunk_id
    LIMIT 20
"""
df = pd.read_sql(query, engine)

# Keep only rows whose text is long enough to prompt on. The isinstance guard
# skips NULL chunk_data values, which would otherwise crash on `.split()`.
records = [
    row
    for _, row in df.iterrows()
    if isinstance(row["chunk_data"], str)
    and len(row["chunk_data"].split()) >= MIN_CHUNK_WORDS
]

# --- Model -------------------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    device_map="auto",
    # Half precision is only used on GPU; fall back to float32 on CPU-only hosts.
    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
)
model.eval()

# --- Generation --------------------------------------------------------------
for i in tqdm(range(0, len(records), batch_size)):
    batch = records[i:i+batch_size]
    prompts = [
        f"List 3 short search phrases (not questions) that are relevant for this scientific text:\n\n{r['chunk_data']}"
        for r in batch
    ]

    inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=1024)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            num_return_sequences=NUM_RETURN_SEQUENCES,
            do_sample=True,
            temperature=0.95,
        )

    decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    # generate() returns the candidates for each prompt contiguously, so slicing
    # with a stride of NUM_RETURN_SEQUENCES regroups them per chunk.
    grouped_outputs = [
        decoded_outputs[j:j+NUM_RETURN_SEQUENCES]
        for j in range(0, len(decoded_outputs), NUM_RETURN_SEQUENCES)
    ]

    for record, phrases in zip(batch, grouped_outputs):
        print(f"\nRaw model outputs for chunk {record['chunk_id']} (Paper: {record['paper_id']}):")
        for idx, p in enumerate(phrases, 1):
            print(f"[Output {idx}]: {p}")



print(f"\n{len(records)} valid chunks processed — just printed 3 clean phrases each (if valid).")


 20%|██        | 1/5 [00:03<00:12,  3.03s/it]


Raw model outputs for chunk 0704.0001v2_1 (Paper: 0704.0001v2):
[Output 1]: Cal ulation of prompt difoton pro du tion ross se tions at T ev atron and LHC energies
[Output 2]: Cal ulating of prompt diphoton produ tion ross se tions at T ev atron and LHC energies
[Output 3]: Higgs b oson LHC betatrons L H jones - arXiv 0704.0001

Raw model outputs for chunk 0704.0001v2_10 (Paper: 0704.0001v2):
[Output 1]: j l1 p4 q p5
[Output 2]: Quark formation In addition to the QCD singularities asso iated with initial state radiation des rib ed b y the asymptoti terms in Equations and 14 e when a photon is ollinear to the quark
[Output 3]: sales of order Q, the resummed ross se tion b e omes appro ximately equal to the nite order NLO ross se tion, augmen ted t ypi ally

Raw model outputs for chunk 0704.0001v2_11 (Paper: 0704.0001v2):
[Output 1]: Single photon t w o fragmen tation 1 in the soft, or E5 0, limit,
[Output 2]: An infrared safe pro edure an b e form ulated to apply isolation uts at ea h o

 40%|████      | 2/5 [00:05<00:08,  2.91s/it]


Raw model outputs for chunk 0704.0001v2_13 (Paper: 0704.0001v2):
[Output 1]: resummation al ulation, QT 0 limit, pres riptions
[Output 2]: p erturbative function definitions
[Output 3]: L ow Q diphoton fr agmentation an lash of large radiativ e orre tions arises when the in v arian t mass Q is smaller than the transv er er se momen tum QT in b oth the q q qg and gg gqS hannels.

Raw model outputs for chunk 0704.0001v2_14 (Paper: 0704.0001v2):
[Output 1]: Numerical Simulation of Resummation and NLO Roses from qqqg and gg gqS
[Output 2]: Large Hadron Collider (LHC)
[Output 3]: A LHC A LHC A T C G

Raw model outputs for chunk 0704.0001v2_15 (Paper: 0704.0001v2):
[Output 1]: A. Results for Run 2 at the T ev atron 1. Kinemati onstr aints
[Output 2]: p o llider
[Output 3]: A. Results for Run 2 at the T ev atron 1 Kinemati onstr aints

Raw model outputs for chunk 0704.0001v2_16 (Paper: 0704.0001v2):
[Output 1]: d dQ distribution of photon pairs with in v arian mass distribution QT  0
[Output

 60%|██████    | 3/5 [00:08<00:05,  2.59s/it]


Raw model outputs for chunk 0704.0001v2_17 (Paper: 0704.0001v2):
[Output 1]: CDF data a xed order predic tion P dashes and its asymptoti appro ximation A dots b the full resummed ross se tion solid observed
[Output 2]: xed order nnll fixed order
[Output 3]: Resummation of the initial state logarithmi terms renders W nite in the region of small QT

Raw model outputs for chunk 0704.0001v2_18 (Paper: 0704.0001v2):
[Output 1]: DIPHO X o de
[Output 2]: SUM AND EMISSION CHARACTER BLADE 1 SINGLE PICTUREN AND TYP HOUSE BUXTURES PERCENT LEVEL GATE DEFINE TERMINALS FOR HAND AND SHAT INVESTMENT OF STRUCTURE
[Output 3]: fragmen tation of a parton arrying large transv er se momen tum QT in to a system of small in v arian t mass Q 27, 28 , a ligh t pair in our ase

Raw model outputs for chunk 0704.0001v2_19 (Paper: 0704.0001v2):
[Output 1]: ionization energy of a photon frag men tation event of the same type as an element with a
[Output 2]: resum ed qq qg qg DIPHOX qq qg q g
[Output 3]: a term in w

 80%|████████  | 4/5 [00:10<00:02,  2.61s/it]


Raw model outputs for chunk 0704.0001v2_20 (Paper: 0704.0001v2):
[Output 1]: [I I C ]
[Output 2]: fragmen tation on tribution and fa torization se tions
[Output 3]: 

Raw model outputs for chunk 0704.0001v2_21 (Paper: 0704.0001v2):
[Output 1]: pp ollisions
[Output 2]: (PubMed)
[Output 3]: dQT pT to shift to large QT v alues

Raw model outputs for chunk 0704.0001v2_22 (Paper: 0704.0001v2):
[Output 1]: Q e tries of in reased relativized QT in v ariant mass
[Output 2]: et w een the t w o isolated photons to b e ab o v e 0.4. The uts listed ab o v e, optimized for the Higgs b oson sear h, ma y require adjustmen
[Output 3]: resummed QT distributions and aver age tr ansverse momentum ab o v e 0.4.

Raw model outputs for chunk 0704.0001v2_23 (Paper: 0704.0001v2):
[Output 1]: J
[Output 2]: What are the results of DIPHOX and DIPHO X predi tions a t the LHC and compare to the results for their applicability as final state fragmen tation?
[Output 3]: a t l p o p de v  rt del t de pos a l dif er 

100%|██████████| 5/5 [00:12<00:00,  2.43s/it]


Raw model outputs for chunk 0704.0001v2_24 (Paper: 0704.0001v2):
[Output 1]: resummed vl a gqs and resummed NLO gg gqS
[Output 2]: gqS on tribution, resummed al ulation, and DIPHO X at the LHC
[Output 3]: Does resumption of gg gqS in DIPHO X and NLO provide better results on the scalars?

Raw model outputs for chunk 0704.0001v2_25 (Paper: 0704.0001v2):
[Output 1]: ison fusion background at LHC ( )
[Output 2]: T ev atron: The Large Hadron Collider, particle identification , mass and sp er a ence
[Output 3]: helium plasma

Raw model outputs for chunk 0704.0001v2_26 (Paper: 0704.0001v2):
[Output 1]: QCD on tin uum and kinemati s
[Output 2]: kinemati c d for Higgs boson pro du tion
[Output 3]: QCD on tin uum

Raw model outputs for chunk 0704.0001v2_27 (Paper: 0704.0001v2):
[Output 1]: the inverse relation between the signal and ba kground rates, the QT resummed distributions, the fast t w o photons distribution,
[Output 2]: resummed flow-time dis pla y es for the gg ba kground
[Output 3]:




In [10]:
import gc
import torch

# Run Python's garbage collector, then ask PyTorch to release cached CUDA
# memory that is no longer in use. No names are unbound here, so objects that
# are still referenced (for example a loaded model) are not freed by this cell.
gc.collect()
torch.cuda.empty_cache()


In [None]:
import gc
import torch

# Unbind the large inference objects from the notebook namespace (names that
# are not currently defined are ignored), then collect garbage and release
# PyTorch's cached CUDA memory.
for name in ("model", "tokenizer", "pipe", "generator"):
    globals().pop(name, None)

gc.collect()
torch.cuda.empty_cache()


In [1]:
# Sanity check: evaluates to True while a global named `model` is still bound
# in this kernel, and False otherwise.
'model' in globals()


False

In [6]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sqlalchemy import create_engine, text
from tqdm import tqdm
import pandas as pd
import torch
import json
import os

# Prompt experiment 2: same task on `arxiv_chunks_training_3`, with a
# "Please complete words" hint in the prompt, shorter generations, and a
# cleaning step that keeps the first usable line of each sampled output.

# --- Configuration -----------------------------------------------------------
model_name = "google/flan-t5-large"
batch_size = 4             # chunks sent to generate() per call
NUM_RETURN_SEQUENCES = 3   # sampled candidates per chunk; also the grouping stride below
MIN_CHUNK_WORDS = 30       # chunks with fewer whitespace-separated words are skipped
SEED = 42                  # fixes the sampling RNG so reruns are comparable

torch.manual_seed(SEED)

# --- Database ----------------------------------------------------------------
# The connection URL embeds the database password, so it is read from the
# environment instead of being stored in the notebook. This runs before the
# model is loaded so a missing setting fails fast.
db_url = os.environ.get("ARXIV_DB_URL")
if not db_url:
    raise RuntimeError(
        "Set the ARXIV_DB_URL environment variable to the PostgreSQL "
        "connection URL (postgresql://user:password@host:port/dbname)."
    )
engine = create_engine(db_url, pool_size=10, max_overflow=0, pool_timeout=30)

query = """
    SELECT paper_id, chunk_id, chunk_data
    FROM arxiv_chunks_training_3
    ORDER BY chunk_id
    LIMIT 20
"""
df = pd.read_sql(query, engine)

# Keep only rows whose text is long enough to prompt on. The isinstance guard
# skips NULL chunk_data values, which would otherwise crash on `.split()`.
records = [
    row
    for _, row in df.iterrows()
    if isinstance(row["chunk_data"], str)
    and len(row["chunk_data"].split()) >= MIN_CHUNK_WORDS
]

# --- Model -------------------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    device_map="auto",
    # Half precision is only used on GPU; fall back to float32 on CPU-only hosts.
    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
)
# NOTE(review): model.to(device) is not called here; presumably
# device_map="auto" already places the weights — confirm before re-adding it.
model.eval()

# --- Generation --------------------------------------------------------------
for i in tqdm(range(0, len(records), batch_size)):
    batch = records[i:i+batch_size]

    prompts = [
        f"List 3 short search phrases (not questions) that are relevant for this scientific text(Please complete words):\n\n{r['chunk_data']}"
        for r in batch
    ]

    inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=1024)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=30,
            num_return_sequences=NUM_RETURN_SEQUENCES,
            do_sample=True,
            temperature=0.95,
        )

    decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    # generate() returns the candidates for each prompt contiguously, so slicing
    # with a stride of NUM_RETURN_SEQUENCES regroups them per chunk.
    grouped_outputs = [
        decoded_outputs[j:j+NUM_RETURN_SEQUENCES]
        for j in range(0, len(decoded_outputs), NUM_RETURN_SEQUENCES)
    ]

    for record, phrases in zip(batch, grouped_outputs):
        print(f"\nRaw model outputs for chunk {record['chunk_id']} (Paper: {record['paper_id']}):")
        for idx, p in enumerate(phrases, 1):
            print(f"[Output {idx}]: {p}")

        # From each sampled output keep the first line that contains a letter
        # and is longer than 3 characters; outputs with no such line add nothing.
        cleaned = []
        for phrase in phrases:
            for line in phrase.split("\n"):
                line = line.strip()
                if any(c.isalpha() for c in line) and len(line) > 3:
                    cleaned.append(line)
                    break

        # A chunk counts as usable only if every sampled output yielded a phrase.
        if len(cleaned) == NUM_RETURN_SEQUENCES:
            print(f"Extracted 3 clean phrases for chunk {record['chunk_id']}")
        else:
            print(f"Skipped {record['chunk_id']} — only {len(cleaned)} valid phrases")

print(f"\n{len(records)} valid chunks processed — just printed 3 clean phrases each (if valid).")


 20%|██        | 1/5 [00:01<00:04,  1.24s/it]


Raw model outputs for chunk 0704.0001v2_1 (Paper: 0704.0001v2):
[Output 1]: ar xiv 0704.0001v2 hep ph 24 Jul 2007 HEP PR 07 12, arXiv 07
[Output 2]: CERN, HEP PR 07 12, arXiv 0704.0001
[Output 3]: an eren tial al ulation in p erturbativ e quan tum hromo dyna
Extracted 3 clean phrases for chunk 0704.0001v2_1

Raw model outputs for chunk 0704.0001v2_10 (Paper: 0704.0001v2):
[Output 1]: I n ua tional m ood for prodi tion of ph ysi al
[Output 2]: "Square-expandable quarks and photon acceleration"
[Output 3]: to s ales of order Q, the resummed ross se tion b e omes appro x
Extracted 3 clean phrases for chunk 0704.0001v2_10

Raw model outputs for chunk 0704.0001v2_11 (Paper: 0704.0001v2):
[Output 1]: F eynman | DIPHOST  (DIPHO X )  (DISPO 'S
[Output 2]: Single photon t w o fracmen tation on tributions
[Output 3]: Instability and strain limitations on single fragmen tation single photons with di pherent angular vibration of a
Extracted 3 clean phrases for chunk 0704.0001v2_11

Raw model outp

 40%|████      | 2/5 [00:02<00:03,  1.23s/it]


Raw model outputs for chunk 0704.0001v2_13 (Paper: 0704.0001v2):
[Output 1]: u lt dire t NLO ross se tions o n troll de ployment of the
[Output 2]: W e on lude this se tion b y summarizing the main features of our al ulation
[Output 3]: l ow Q diphoton fr agmentation
Extracted 3 clean phrases for chunk 0704.0001v2_13

Raw model outputs for chunk 0704.0001v2_14 (Paper: 0704.0001v2):
[Output 1]: S de n and the Large Hadron Collider
[Output 2]: T ev atron ollider and then mak e predi tions for the Large Hadron Collider
[Output 3]: a rte Carlo in tegration
Extracted 3 clean phrases for chunk 0704.0001v2_14

Raw model outputs for chunk 0704.0001v2_15 (Paper: 0704.0001v2):
[Output 1]: Results for Run 2 at the T evatron
[Output 2]: Results for Run 2 at the T evatron 1. Kinemati onstr aints
[Output 3]: I. Results for Run 2 at the T evatron 1. Kinemati onstr aints
Extracted 3 clean phrases for chunk 0704.0001v2_15

Raw model outputs for chunk 0704.0001v2_16 (Paper: 0704.0001v2):
[Output 1]: d 

 60%|██████    | 3/5 [00:03<00:02,  1.22s/it]


Raw model outputs for chunk 0704.0001v2_17 (Paper: 0704.0001v2):
[Output 1]: xed order
[Output 2]: QT for small t ea h other
[Output 3]: ESO 479.
Extracted 3 clean phrases for chunk 0704.0001v2_17

Raw model outputs for chunk 0704.0001v2_18 (Paper: 0704.0001v2):
[Output 1]: Expo er imen tal isolation of tributions and the r esitance of a convex parton
[Output 2]: Brut t and bu hannel
[Output 3]: s trategy to estimate theoreti ally resummed logarithmi
Extracted 3 clean phrases for chunk 0704.0001v2_18

Raw model outputs for chunk 0704.0001v2_19 (Paper: 0704.0001v2):
[Output 1]: m s q qg on tribution agrees with the dire t on tribution in DIPHO X
[Output 2]: D nexition pt
[Output 3]:          
Skipped 0704.0001v2_19 — only 2 valid phrases

Raw model outputs for chunk 0704.0001v2_2 (Paper: 0704.0001v2):
[Output 1]: germs for nuclear produ tion for diphotons in strong QCD
[Output 2]: Produ tions to isolated photon pro du tion from the basi short distan e hannels for pro du tion
[Output 3]

 80%|████████  | 4/5 [00:04<00:01,  1.21s/it]


Raw model outputs for chunk 0704.0001v2_20 (Paper: 0704.0001v2):
[Output 1]: resummation formalism, a resummation formalism, the resummation formalism, transv 
[Output 2]: D ep en t syt em for r esum m ation formalism
[Output 3]: F with an err e rissidic energy
Extracted 3 clean phrases for chunk 0704.0001v2_20

Raw model outputs for chunk 0704.0001v2_21 (Paper: 0704.0001v2):
[Output 1]: dQT to shift to larger QT v alues. The shift of the p eak ma y or
[Output 2]: dQT to shift to larger QT v alues. The shift of the p eak ma y or
[Output 3]: dQT to shift to larger QT v alues dQT to shift to larger QT v alu
Extracted 3 clean phrases for chunk 0704.0001v2_21

Raw model outputs for chunk 0704.0001v2_22 (Paper: 0704.0001v2):
[Output 1]: resummed QT distributions
[Output 2]: resummation ef ectives and the qg onnes on tribution
[Output 3]: QCD, and in v arian t mass
Extracted 3 clean phrases for chunk 0704.0001v2_22

Raw model outputs for chunk 0704.0001v2_23 (Paper: 0704.0001v2):
[Output 1]

100%|██████████| 5/5 [00:06<00:00,  1.21s/it]


Raw model outputs for chunk 0704.0001v2_24 (Paper: 0704.0001v2):
[Output 1]: DIPHO X rate at all Q
[Output 2]: The resuming, the DIPHO X and the gamma-ray fa a ture ratio s
[Output 3]: g = [1, 2, 3,5, 6, 8, 7] fgg = 0; g = [10, 20, 11, 11, 2,
Extracted 3 clean phrases for chunk 0704.0001v2_24

Raw model outputs for chunk 0704.0001v2_25 (Paper: 0704.0001v2):
[Output 1]: higgs b oson n rg isotope positon and Higgs b oson
[Output 2]: Higgs bosons signal in quaternary decays = a
[Output 3]: h0 Higgs bi valent Higgs b oson signal distributions at LHC, including the Higgs Ba
Extracted 3 clean phrases for chunk 0704.0001v2_25

Raw model outputs for chunk 0704.0001v2_26 (Paper: 0704.0001v2):
[Output 1]: Higgs b oson  Higgs b oson  QCD on tin uum
[Output 2]: xE1D:Aq2xE*X2CC|xes
[Output 3]: qt distributions
Extracted 3 clean phrases for chunk 0704.0001v2_26

Raw model outputs for chunk 0704.0001v2_27 (Paper: 0704.0001v2):
[Output 1]: Analyzing QT Distributions of the qg Ba kground
[Output 2]: T




In [7]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sqlalchemy import create_engine, text
from tqdm import tqdm
import pandas as pd
import torch
import json
import os

# Prompt experiment 3: same prompt and cleaning as the previous experiment,
# plus a filter that rejects a chunk when any cleaned phrase looks like an
# arXiv identifier (contains "arxiv" and at least one digit).

# --- Configuration -----------------------------------------------------------
model_name = "google/flan-t5-large"
batch_size = 4             # chunks sent to generate() per call
NUM_RETURN_SEQUENCES = 3   # sampled candidates per chunk; also the grouping stride below
MIN_CHUNK_WORDS = 30       # chunks with fewer whitespace-separated words are skipped
SEED = 42                  # fixes the sampling RNG so reruns are comparable

torch.manual_seed(SEED)

# --- Database ----------------------------------------------------------------
# The connection URL embeds the database password, so it is read from the
# environment instead of being stored in the notebook. This runs before the
# model is loaded so a missing setting fails fast.
db_url = os.environ.get("ARXIV_DB_URL")
if not db_url:
    raise RuntimeError(
        "Set the ARXIV_DB_URL environment variable to the PostgreSQL "
        "connection URL (postgresql://user:password@host:port/dbname)."
    )
engine = create_engine(db_url, pool_size=10, max_overflow=0, pool_timeout=30)

query = """
    SELECT paper_id, chunk_id, chunk_data
    FROM arxiv_chunks_training_3
    ORDER BY chunk_id
    LIMIT 20
"""
df = pd.read_sql(query, engine)

# Keep only rows whose text is long enough to prompt on. The isinstance guard
# skips NULL chunk_data values, which would otherwise crash on `.split()`.
records = [
    row
    for _, row in df.iterrows()
    if isinstance(row["chunk_data"], str)
    and len(row["chunk_data"].split()) >= MIN_CHUNK_WORDS
]

# --- Model -------------------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    device_map="auto",
    # Half precision is only used on GPU; fall back to float32 on CPU-only hosts.
    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
)
model.eval()

# --- Generation --------------------------------------------------------------
for i in tqdm(range(0, len(records), batch_size)):
    batch = records[i:i+batch_size]

    prompts = [
        f"List 3 short search phrases (not questions) that are relevant for this scientific text(Please complete words):\n\n{r['chunk_data']}"
        for r in batch
    ]

    inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=1024)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=30,
            num_return_sequences=NUM_RETURN_SEQUENCES,
            do_sample=True,
            temperature=0.95,
        )

    decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    # generate() returns the candidates for each prompt contiguously, so slicing
    # with a stride of NUM_RETURN_SEQUENCES regroups them per chunk.
    grouped_outputs = [
        decoded_outputs[j:j+NUM_RETURN_SEQUENCES]
        for j in range(0, len(decoded_outputs), NUM_RETURN_SEQUENCES)
    ]

    for record, phrases in zip(batch, grouped_outputs):
        print(f"\nRaw model outputs for chunk {record['chunk_id']} (Paper: {record['paper_id']}):")
        for idx, p in enumerate(phrases, 1):
            print(f"[Output {idx}]: {p}")

        # From each sampled output keep the first line that contains a letter
        # and is longer than 3 characters; outputs with no such line add nothing.
        cleaned = []
        for phrase in phrases:
            for line in phrase.split("\n"):
                line = line.strip()
                if any(c.isalpha() for c in line) and len(line) > 3:
                    cleaned.append(line)
                    break

        # Reject the whole chunk when a phrase mentions "arxiv" together with a
        # digit, i.e. the model echoed an identifier instead of a topic.
        if any("arxiv" in p.lower() and any(char.isdigit() for char in p) for p in cleaned):
            print(f"Skipped {record['chunk_id']} — phrase contains arXiv ID")
            continue

        # A chunk counts as usable only if every sampled output yielded a phrase.
        if len(cleaned) == NUM_RETURN_SEQUENCES:
            print(f"Extracted 3 clean phrases for chunk {record['chunk_id']}")
        else:
            print(f"Skipped {record['chunk_id']} — only {len(cleaned)} valid phrases")

print(f"\n{len(records)} valid chunks processed — just printed 3 clean phrases each (if valid).")


 20%|██        | 1/5 [00:01<00:04,  1.22s/it]


Raw model outputs for chunk 0704.0001v2_1 (Paper: 0704.0001v2):
[Output 1]: LHC ; p.c. - arXiv 0704.0001v2 hep ph 24 Jul 2007
[Output 2]: ANL HEP PR 07 12, arXiv 0704.0001
[Output 3]: CAL ULATION OF PRODU TION OF M ase of DIPOZONES PERFORMED AT CHAL 
Skipped 0704.0001v2_1 — phrase contains arXiv ID

Raw model outputs for chunk 0704.0001v2_10 (Paper: 0704.0001v2):
[Output 1]: QCD singularities: implications for resummation
[Output 2]: In the case of the ollinear val
[Output 3]: To s ales of order Q, the resummed ross se tion b e omes appro 
Extracted 3 clean phrases for chunk 0704.0001v2_10

Raw model outputs for chunk 0704.0001v2_11 (Paper: 0704.0001v2):
[Output 1]: Single photon t w o fragmen tation 1
[Output 2]: Single photon on tribution : F eynman diagrams of one and t w o fragmen 
[Output 3]: s ale s lif fic iion t w o fun tions D z, One hard photo
Extracted 3 clean phrases for chunk 0704.0001v2_11

Raw model outputs for chunk 0704.0001v2_12 (Paper: 0704.0001v2):
[Output 1]: Does

 40%|████      | 2/5 [00:02<00:03,  1.22s/it]


Raw model outputs for chunk 0704.0001v2_13 (Paper: 0704.0001v2):
[Output 1]: T w e op ert ross se tion with f ull dire t nl 
[Output 2]: o pres riptions yield iden ti al predi tions outside of this restri ted region,
[Output 3]: T eoren tal la gmentation leptonics
Extracted 3 clean phrases for chunk 0704.0001v2_13

Raw model outputs for chunk 0704.0001v2_14 (Paper: 0704.0001v2):
[Output 1]: T ev atron Aion
[Output 2]: LHC ellision t e and depolarizing atoms QT i .
[Output 3]: NLO rate from dire t qg and fracmen tation ross se tions in this QT range.
Extracted 3 clean phrases for chunk 0704.0001v2_14

Raw model outputs for chunk 0704.0001v2_15 (Paper: 0704.0001v2):
[Output 1]: A. Results for Run 2 at the T evatron 1 Kinemati onstr aints
[Output 2]: A. Results for Run 2 at the T evatron 1. Kinemati onstr aints
[Output 3]: Recorded results for Run 2 at the T evatron at F ermilab CDF
Extracted 3 clean phrases for chunk 0704.0001v2_15

Raw model outputs for chunk 0704.0001v2_16 (Paper: 070

 60%|██████    | 3/5 [00:03<00:02,  1.25s/it]


Raw model outputs for chunk 0704.0001v2_17 (Paper: 0704.0001v2):
[Output 1]: t ev atron
[Output 2]: rease the rate a
[Output 3]: in on trast to the xed order dashed urv e P in Fig. 5 b , the
Extracted 3 clean phrases for chunk 0704.0001v2_17

Raw model outputs for chunk 0704.0001v2_18 (Paper: 0704.0001v2):
[Output 1]: small Q fragmen tation of this kind is not implemen ty in the theoretial mo dels. Therefore
[Output 2]: We measure the Qs of the largest logarithm logarithm on the tributions Q 27, 28 , a small in
[Output 3]: A description of the formation in a dense rhododendron of small w o logarithmi singularities
Extracted 3 clean phrases for chunk 0704.0001v2_18

Raw model outputs for chunk 0704.0001v2_19 (Paper: 0704.0001v2):
[Output 1]: xed order qq qg on tribution agrees w ell with the dire t on tribution
[Output 2]: DIPHO X qq qg q g qq qg q g  QT 
[Output 3]: Cdf.
Extracted 3 clean phrases for chunk 0704.0001v2_19

Raw model outputs for chunk 0704.0001v2_2 (Paper: 0704.0001v2):

 80%|████████  | 4/5 [00:04<00:01,  1.23s/it]


Raw model outputs for chunk 0704.0001v2_20 (Paper: 0704.0001v2):
[Output 1]: T ev atron data, in fermediate QT asso iated with the frag men tation on
[Output 2]: ii , C . I C . I I C C
[Output 3]: I – C , if Eiso T is in reased in the al ulated from 1 to 4 Ge V
Extracted 3 clean phrases for chunk 0704.0001v2_20

Raw model outputs for chunk 0704.0001v2_21 (Paper: 0704.0001v2):
[Output 1]: dQT to shift to larger QT v alues. The shift of the p
[Output 2]: requ er ive pt det enden es of & #x0a0#/d&#
[Output 3]: pp ollisions at the LH-1
Extracted 3 clean phrases for chunk 0704.0001v2_21

Raw model outputs for chunk 0704.0001v2_22 (Paper: 0704.0001v2):
[Output 1]: C quark to Q = q q and logarithmi terms = QT logarithmic terms
[Output 2]: QCT derivates from the LHC accelerators, as well as the Higgs b oson, in vert er
[Output 3]: QT distributions
Extracted 3 clean phrases for chunk 0704.0001v2_22

Raw model outputs for chunk 0704.0001v2_23 (Paper: 0704.0001v2):
[Output 1]: Examin ment of the

100%|██████████| 5/5 [00:05<00:00,  1.17s/it]


Raw model outputs for chunk 0704.0001v2_24 (Paper: 0704.0001v2):
[Output 1]: gqS on tribution
[Output 2]: resummed, NLO, and DIPHO X rate at all Q
[Output 3]: w the DIPHO X rate at all Q at the LHC
Extracted 3 clean phrases for chunk 0704.0001v2_24

Raw model outputs for chunk 0704.0001v2_25 (Paper: 0704.0001v2):
[Output 1]: Higgs b oson signal and corresponding diphoton background distributions
[Output 2]: b oson and a QCD ba
[Output 3]: Higgs b oson mass , QCD ba kground
Extracted 3 clean phrases for chunk 0704.0001v2_25

Raw model outputs for chunk 0704.0001v2_26 (Paper: 0704.0001v2):
[Output 1]: Inciden tional effect on QT distributions
[Output 2]: -  -      
[Output 3]: Eq erence of QC distributions
Skipped 0704.0001v2_26 — only 2 valid phrases

Raw model outputs for chunk 0704.0001v2_27 (Paper: 0704.0001v2):
[Output 1]: QT distributions for g ba kground pro esses of y 2
[Output 2]: resummed 4T and QT distributions.
[Output 3]: resum ed QT distributions
Extracted 3 clean phrases 




In [8]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sqlalchemy import create_engine, text
from tqdm import tqdm
import pandas as pd
import torch
import json
import os

# Prompt experiment 4: same pipeline as the previous experiment (cleaning plus
# arXiv-ID filter), with a "You are professor" role prefix added to the prompt.

# --- Configuration -----------------------------------------------------------
model_name = "google/flan-t5-large"
batch_size = 4             # chunks sent to generate() per call
NUM_RETURN_SEQUENCES = 3   # sampled candidates per chunk; also the grouping stride below
MIN_CHUNK_WORDS = 30       # chunks with fewer whitespace-separated words are skipped
SEED = 42                  # fixes the sampling RNG so reruns are comparable

torch.manual_seed(SEED)

# --- Database ----------------------------------------------------------------
# The connection URL embeds the database password, so it is read from the
# environment instead of being stored in the notebook. This runs before the
# model is loaded so a missing setting fails fast.
db_url = os.environ.get("ARXIV_DB_URL")
if not db_url:
    raise RuntimeError(
        "Set the ARXIV_DB_URL environment variable to the PostgreSQL "
        "connection URL (postgresql://user:password@host:port/dbname)."
    )
engine = create_engine(db_url, pool_size=10, max_overflow=0, pool_timeout=30)

query = """
    SELECT paper_id, chunk_id, chunk_data
    FROM arxiv_chunks_training_3
    ORDER BY chunk_id
    LIMIT 20
"""
df = pd.read_sql(query, engine)

# Keep only rows whose text is long enough to prompt on. The isinstance guard
# skips NULL chunk_data values, which would otherwise crash on `.split()`.
records = [
    row
    for _, row in df.iterrows()
    if isinstance(row["chunk_data"], str)
    and len(row["chunk_data"].split()) >= MIN_CHUNK_WORDS
]

# --- Model -------------------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    device_map="auto",
    # Half precision is only used on GPU; fall back to float32 on CPU-only hosts.
    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
)
model.eval()

# --- Generation --------------------------------------------------------------
for i in tqdm(range(0, len(records), batch_size)):
    batch = records[i:i+batch_size]

    prompts = [
        f"You are professor,List 3 short search phrases (not questions) that are relevant for this scientific text(Please complete words):\n\n{r['chunk_data']}"
        for r in batch
    ]

    inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=1024)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=30,
            num_return_sequences=NUM_RETURN_SEQUENCES,
            do_sample=True,
            temperature=0.95,
        )

    decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    # generate() returns the candidates for each prompt contiguously, so slicing
    # with a stride of NUM_RETURN_SEQUENCES regroups them per chunk.
    grouped_outputs = [
        decoded_outputs[j:j+NUM_RETURN_SEQUENCES]
        for j in range(0, len(decoded_outputs), NUM_RETURN_SEQUENCES)
    ]

    for record, phrases in zip(batch, grouped_outputs):
        print(f"\nRaw model outputs for chunk {record['chunk_id']} (Paper: {record['paper_id']}):")
        for idx, p in enumerate(phrases, 1):
            print(f"[Output {idx}]: {p}")

        # From each sampled output keep the first line that contains a letter
        # and is longer than 3 characters; outputs with no such line add nothing.
        cleaned = []
        for phrase in phrases:
            for line in phrase.split("\n"):
                line = line.strip()
                if any(c.isalpha() for c in line) and len(line) > 3:
                    cleaned.append(line)
                    break

        # Reject the whole chunk when a phrase mentions "arxiv" together with a
        # digit, i.e. the model echoed an identifier instead of a topic.
        if any("arxiv" in p.lower() and any(char.isdigit() for char in p) for p in cleaned):
            print(f"Skipped {record['chunk_id']} — phrase contains arXiv ID")
            continue

        # A chunk counts as usable only if every sampled output yielded a phrase.
        if len(cleaned) == NUM_RETURN_SEQUENCES:
            print(f"Extracted 3 clean phrases for chunk {record['chunk_id']}")
        else:
            print(f"Skipped {record['chunk_id']} — only {len(cleaned)} valid phrases")

print(f"\n{len(records)} valid chunks processed — just printed 3 clean phrases each (if valid).")


 20%|██        | 1/5 [00:01<00:04,  1.24s/it]


Raw model outputs for chunk 0704.0001v2_1 (Paper: 0704.0001v2):
[Output 1]: CERN Large Hadron Collider Higgs boson p oel p eak symmetry breaking
[Output 2]: arXiv 0704.0001v2 [Article title] Cal ulation of prompt diphoton produ tion ros
[Output 3]: arXiv 0704.0001v2
Skipped 0704.0001v2_1 — phrase contains arXiv ID

Raw model outputs for chunk 0704.0001v2_10 (Paper: 0704.0001v2):
[Output 1]: quarks and deuterium
[Output 2]:  qg q p1 g p2 p3 p4 q p5 Fig. 1
[Output 3]: h w e r e s e n een
Extracted 3 clean phrases for chunk 0704.0001v2_10

Raw model outputs for chunk 0704.0001v2_11 (Paper: 0704.0001v2):
[Output 1]: X al ulation 14
[Output 2]: Fragmentation Ontribution Fun Ction and Fragmentation
[Output 3]: In trodu e analysis of dissine ts fract entation s ale nly a nl
Extracted 3 clean phrases for chunk 0704.0001v2_11

Raw model outputs for chunk 0704.0001v2_12 (Paper: 0704.0001v2):
[Output 1]: An Il lustration with Light Quotient P A Q A Q, QT, Y, Q, DIPHO ( J.
[Output 2]: Integration

 40%|████      | 2/5 [00:02<00:03,  1.22s/it]


Raw model outputs for chunk 0704.0001v2_13 (Paper: 0704.0001v2):
[Output 1]: L , ow Q diphoton fr agmentation, logarithmi singularity, de s rib 
[Output 2]: Q, QT, q q qg and gg gqS hannes
[Output 3]: p erturbative expression P Q, QT, y, in the q q qg hannels
Extracted 3 clean phrases for chunk 0704.0001v2_13

Raw model outputs for chunk 0704.0001v2_14 (Paper: 0704.0001v2):
[Output 1]: p erfec TION from dire t qg and fragmen tation ross se tions in this
[Output 2]: J en ta tus w ef ter ri te j y v er
[Output 3]: Quantum fluctuation, lhc prediction and QT ross snattering
Extracted 3 clean phrases for chunk 0704.0001v2_14

Raw model outputs for chunk 0704.0001v2_15 (Paper: 0704.0001v2):
[Output 1]: T ev atron Run 2 Attitudes to kertanium emission from T ev atron
[Output 2]: a. Results for Run 2 at the T evatron 1. Kinemati onstr aints
[Output 3]: t
Skipped 0704.0001v2_15 — only 2 valid phrases

Raw model outputs for chunk 0704.0001v2_16 (Paper: 0704.0001v2):
[Output 1]: resum tion on ma

 60%|██████    | 3/5 [00:03<00:02,  1.21s/it]


Raw model outputs for chunk 0704.0001v2_17 (Paper: 0704.0001v2):
[Output 1]: ap eren t resummed W Y on tribution solid
[Output 2]: resummation logarithmi term  y > full resummed wave n logarithmi term
[Output 3]: Xed order predi tion P dashed urv e P b the full resummed ros
Extracted 3 clean phrases for chunk 0704.0001v2_17

Raw model outputs for chunk 0704.0001v2_18 (Paper: 0704.0001v2):
[Output 1]: Indicators on tribulations under oscillation QT Q i n
[Output 2]: SINGLE PHOTON A fragmen tation FUNCE D z
[Output 3]: small q fragmen tation of this kind is not implemen ted y et in theoreti al mo de
Extracted 3 clean phrases for chunk 0704.0001v2_18

Raw model outputs for chunk 0704.0001v2_19 (Paper: 0704.0001v2):
[Output 1]: d m q s l k t q qg L q q qg L q
[Output 2]: xed order q q qg on tribution agrees w ell with the dire t on tribu
[Output 3]: sam e in v arian t mass and transv er se momen tum distributions of pairs
Extracted 3 clean phrases for chunk 0704.0001v2_19

Raw model outpu

 80%|████████  | 4/5 [00:04<00:01,  1.20s/it]


Raw model outputs for chunk 0704.0001v2_20 (Paper: 0704.0001v2):
[Output 1]: resummation formalism
[Output 2]: DIFF e rENCE STRONGLY OBSERVES CONVERSATION TO STRONGNESS IN QT DATA
[Output 3]: I I C
Extracted 3 clean phrases for chunk 0704.0001v2_20

Raw model outputs for chunk 0704.0001v2_21 (Paper: 0704.0001v2):
[Output 1]: in fer a tive to have an in terve ntial study
[Output 2]: dQT to shift to larger QT values. The shift of the p eak ma y or ma
[Output 3]: pp ollision t e e n alertation QTEV QT GeV
Extracted 3 clean phrases for chunk 0704.0001v2_21

Raw model outputs for chunk 0704.0001v2_22 (Paper: 0704.0001v2):
[Output 1]: ref io r o y eu n t o e resummation dep o
[Output 2]: the LHC
[Output 3]: Quarks-proton interactions p T uts
Extracted 3 clean phrases for chunk 0704.0001v2_22

Raw model outputs for chunk 0704.0001v2_23 (Paper: 0704.0001v2):
[Output 1]: In v arian LHC  DIPHO X v ara tional isolation and omp arison With
[Output 2]: Determining qua l state fragmen tation on tri

100%|██████████| 5/5 [00:06<00:00,  1.21s/it]


Raw model outputs for chunk 0704.0001v2_24 (Paper: 0704.0001v2):
[Output 1]: DIPHOX rate at all Q. The largest di eren e o urs at the lo w est v al
[Output 2]: in v arian t mass Q, the transv er er ere momen tum QT , and
[Output 3]: LHC, l eaching and DIPHO X distributions at the LHC, with the. the gg gq
Extracted 3 clean phrases for chunk 0704.0001v2_24

Raw model outputs for chunk 0704.0001v2_25 (Paper: 0704.0001v2):
[Output 1]: LHC physics  higgs boson s ray distributions
[Output 2]: qgs s attering, quantum physics, qt, e sotias f 
[Output 3]: X. ross emin - tation at Mt hp a b y photon fragmen
Extracted 3 clean phrases for chunk 0704.0001v2_25

Raw model outputs for chunk 0704.0001v2_26 (Paper: 0704.0001v2):
[Output 1]: Ontrol of the nal state ollinear on tribution on QT distributions
[Output 2]: f u er s in luding Eiso T and R
[Output 3]: QCD
Skipped 0704.0001v2_26 — only 2 valid phrases

Raw model outputs for chunk 0704.0001v2_27 (Paper: 0704.0001v2):
[Output 1]: A qualitativ e 




In [1]:
# Sanity check: evaluates to True while a global named `model` is still bound
# in this kernel, and False otherwise.
'model' in globals()


False

In [2]:
import gc
import torch

# Unbind the large inference objects from the notebook namespace (names that
# are not currently defined are ignored), then collect garbage and release
# PyTorch's cached CUDA memory.
for name in ("model", "tokenizer", "pipe", "generator"):
    globals().pop(name, None)

gc.collect()
torch.cuda.empty_cache()
