In [None]:
!pip install torch torchvision tqdm numpy pandas sqlalchemy


In [2]:
import os
import torch
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import numpy as np
from sqlalchemy import create_engine, text
import pandas as pd
import tarfile
import shutil
import re
import unicodedata
from tqdm import tqdm 

In [None]:
!pip install psycopg2-binary


In [21]:
# Connection URL for the cleaned-metadata Postgres database.
# It can be overridden through the ARXIV_DB_URL environment variable so that
# credentials do not have to live in the notebook. The literal is kept only as
# a backward-compatible fallback.
# NOTE(review): the fallback embeds a username/password; rotate it and drop the
# literal once ARXIV_DB_URL is set wherever this notebook runs.
DB_URL = os.environ.get(
    "ARXIV_DB_URL",
    'postgresql://rg5073:rg5073pass@129.114.26.75:30002/cleaned_meta_data_db',
)

engine = create_engine(
    DB_URL,
    pool_size=10,     # connections kept open in the pool
    max_overflow=0,   # no extra connections beyond pool_size
    pool_timeout=30,  # seconds to wait for a free connection before giving up
)


Preview the first rows of each training table to inspect the data and its schema.

In [None]:
# Peek at five rows of the `_1` training table to see its columns and values.
query_preview = "SELECT * FROM arxiv_chunks_training_initial_1 LIMIT 5;"
preview = pd.read_sql(sql=query_preview, con=engine)
print(" Data:", preview, sep="\n")

In [None]:
# Peek at five rows of the base training table to see its columns and values.
query_preview = "SELECT * FROM arxiv_chunks_training_initial LIMIT 5;"
preview = pd.read_sql(sql=query_preview, con=engine)
print(" Data:", preview, sep="\n")

In [None]:
import json
import pandas as pd

# Inspect every chunk that already has generated search phrases stored in the
# `query` column (the generation cells write it as a JSON-encoded list).
check_query = """
    SELECT paper_id, chunk_id, query
    FROM arxiv_chunks_training_initial_1
    WHERE query IS NOT NULL
"""
df = pd.read_sql(check_query, engine)

# Drop blank strings before decoding. `.copy()` detaches the filtered frame so
# that adding a column below does not raise SettingWithCopyWarning.
df = df[df["query"].str.strip().ne("")].copy()
df["query_list"] = df["query"].apply(json.loads)

# itertuples avoids building a Series per row, which iterrows does.
for row in df.itertuples(index=False):
    print(f"\n Paper ID: {row.paper_id}")
    print(f" Chunk ID: {row.chunk_id}")
    print("Queries:")
    for i, q in enumerate(row.query_list, 1):
        print(f"  {i}. {q}")


In [None]:
!pip install transformers sqlalchemy tqdm pandas torch


In [None]:
import json
import os

import pandas as pd
import torch
from sqlalchemy import create_engine, text
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Trial run: sample search phrases with FLAN-T5 for the first 20 chunks and
# store them, JSON-encoded, in the `query` column of the training table.

# --- Tunables ----------------------------------------------------------------
model_name = "google/flan-t5-small"
batch_size = 8
MIN_CHUNK_WORDS = 30  # chunks with fewer whitespace-separated words are skipped
# Sequences sampled per chunk AND the number of phrases required before a row
# is written. One constant keeps generate(), the grouping and the check in sync.
# The prompt text and the log messages below still say "3" literally.
NUM_PHRASES = 3

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# The URL can be overridden via ARXIV_DB_URL so credentials need not live in the
# notebook; the literal is only a backward-compatible fallback.
# NOTE(review): the fallback embeds a username/password; rotate and remove it.
engine = create_engine(
    os.environ.get(
        "ARXIV_DB_URL",
        'postgresql://rg5073:rg5073pass@129.114.26.75:30002/cleaned_meta_data_db',
    ),
    pool_size=10, max_overflow=0, pool_timeout=30
)

query = """
    SELECT paper_id, chunk_id, chunk_data
    FROM arxiv_chunks_training_initial_1
    ORDER BY chunk_id
    LIMIT 20
"""
df = pd.read_sql(query, engine)

# Keep only rows whose text is long enough to prompt on. The isinstance guard
# skips NULL chunk_data, which would otherwise crash on `.split()`.
records = [
    row
    for _, row in df.iterrows()
    if isinstance(row["chunk_data"], str)
    and len(row["chunk_data"].split()) >= MIN_CHUNK_WORDS
]

for i in tqdm(range(0, len(records), batch_size)):
    batch = records[i:i+batch_size]
    prompts = [
        f"List 3 short search phrases (not questions) that are relevant for this scientific text:\n\n{r['chunk_data']}"
        for r in batch
    ]

    inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # NOTE: sampling is unseeded, so phrases differ between runs; call
    # torch.manual_seed(...) before this loop if reproducibility is needed.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            num_return_sequences=NUM_PHRASES,
            do_sample=True,
            temperature=0.95,
        )

    decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    # generate() returns the NUM_PHRASES samples of each prompt consecutively.
    grouped_outputs = [
        decoded_outputs[j:j+NUM_PHRASES]
        for j in range(0, len(decoded_outputs), NUM_PHRASES)
    ]

    # One transaction per batch: committed on success, rolled back on error.
    with engine.begin() as connection:
        for record, phrases in zip(batch, grouped_outputs):
            print(f"\n Raw model outputs for chunk {record['chunk_id']} (Paper: {record['paper_id']}):")
            for idx, p in enumerate(phrases, 1):
                print(f"[Output {idx}]: {p}")

            # From each sampled output keep the first line that contains a
            # letter and is longer than 3 characters; drop outputs with none.
            cleaned = []
            for phrase in phrases:
                stripped_lines = (ln.strip() for ln in phrase.split("\n"))
                first_valid = next(
                    (ln for ln in stripped_lines
                     if len(ln) > 3 and any(c.isalpha() for c in ln)),
                    None,
                )
                if first_valid is not None:
                    cleaned.append(first_valid)

            if len(cleaned) == NUM_PHRASES:
                connection.execute(text("""
                    UPDATE arxiv_chunks_training_initial_1
                    SET query = :query_data
                    WHERE paper_id = :pid AND chunk_id = :cid
                """), {
                    "query_data": json.dumps(cleaned),
                    "pid": record["paper_id"],
                    "cid": record["chunk_id"]
                })
                print(f" Stored  3 clean phrases for chunk {record['chunk_id']}")
            else:
                print(f" Skiped {record['chunk_id']} — only {len(cleaned)} valid phrases")

print(f"\n{len(records)} valid chunks processed — stored 3 clean phrases each (if valid).")


In [None]:
import json
import os

import pandas as pd
import torch
from sqlalchemy import create_engine, text
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Full run: sample search phrases with FLAN-T5 for the first 2000 chunks and
# store them, JSON-encoded, in the `query` column of the training table.
# This cell is self-contained (own imports, model and engine) so it can be run
# on a fresh kernel without the earlier cells.

# --- Tunables ----------------------------------------------------------------
model_name = "google/flan-t5-small"
batch_size = 8
MIN_CHUNK_WORDS = 30  # chunks with fewer whitespace-separated words are skipped
# Sequences sampled per chunk AND the number of phrases required before a row
# is written. One constant keeps generate(), the grouping and the check in sync.
# The prompt text and the log messages below still say "3" literally.
NUM_PHRASES = 3

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# The URL can be overridden via ARXIV_DB_URL so credentials need not live in the
# notebook; the literal is only a backward-compatible fallback.
# NOTE(review): the fallback embeds a username/password; rotate and remove it.
engine = create_engine(
    os.environ.get(
        "ARXIV_DB_URL",
        'postgresql://rg5073:rg5073pass@129.114.26.75:30002/cleaned_meta_data_db',
    ),
    pool_size=10, max_overflow=0, pool_timeout=30
)

query = """
    SELECT paper_id, chunk_id, chunk_data
    FROM arxiv_chunks_training_initial_1
    ORDER BY chunk_id
    LIMIT 2000
"""
df = pd.read_sql(query, engine)

# Keep only rows whose text is long enough to prompt on. The isinstance guard
# skips NULL chunk_data, which would otherwise crash on `.split()`.
records = [
    row
    for _, row in df.iterrows()
    if isinstance(row["chunk_data"], str)
    and len(row["chunk_data"].split()) >= MIN_CHUNK_WORDS
]

for i in tqdm(range(0, len(records), batch_size)):
    batch = records[i:i+batch_size]
    prompts = [
        f"List 3 short search phrases (not questions) that are relevant for this scientific text:\n\n{r['chunk_data']}"
        for r in batch
    ]

    inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # NOTE: sampling is unseeded, so phrases differ between runs; call
    # torch.manual_seed(...) before this loop if reproducibility is needed.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            num_return_sequences=NUM_PHRASES,
            do_sample=True,
            temperature=0.95,
        )

    decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    # generate() returns the NUM_PHRASES samples of each prompt consecutively.
    grouped_outputs = [
        decoded_outputs[j:j+NUM_PHRASES]
        for j in range(0, len(decoded_outputs), NUM_PHRASES)
    ]

    # One transaction per batch: committed on success, rolled back on error.
    with engine.begin() as connection:
        for record, phrases in zip(batch, grouped_outputs):
            print(f"\n Raw model outputs for chunk {record['chunk_id']} (Paper: {record['paper_id']}):")
            for idx, p in enumerate(phrases, 1):
                print(f"[Output {idx}]: {p}")

            # From each sampled output keep the first line that contains a
            # letter and is longer than 3 characters; drop outputs with none.
            cleaned = []
            for phrase in phrases:
                stripped_lines = (ln.strip() for ln in phrase.split("\n"))
                first_valid = next(
                    (ln for ln in stripped_lines
                     if len(ln) > 3 and any(c.isalpha() for c in ln)),
                    None,
                )
                if first_valid is not None:
                    cleaned.append(first_valid)

            if len(cleaned) == NUM_PHRASES:
                connection.execute(text("""
                    UPDATE arxiv_chunks_training_initial_1
                    SET query = :query_data
                    WHERE paper_id = :pid AND chunk_id = :cid
                """), {
                    "query_data": json.dumps(cleaned),
                    "pid": record["paper_id"],
                    "cid": record["chunk_id"]
                })
                print(f" Stored  3 clean phrases for chunk {record['chunk_id']}")
            else:
                print(f" Skiped {record['chunk_id']} — only {len(cleaned)} valid phrases")

print(f"\n{len(records)} valid chunks processed — stored 3 clean phrases each (if valid).")
