In [2]:
import os
import numpy as np
from sqlalchemy import create_engine, text
import pandas as pd
import tarfile
import shutil
import re
import unicodedata
from sqlalchemy import text
from sqlalchemy.dialects.postgresql import ARRAY, TEXT
from tqdm import tqdm
from multiprocessing import Pool
import psycopg2
from sqlalchemy import inspect
import json
import random
from collections import defaultdict

In [2]:
# The connection URL is taken from the environment so that no database
# credentials are stored in the notebook or in any shared copy of it.
# Expected form:
#   postgresql+psycopg2://<user>:<password>@<host>:5432/cleaned_meta_data_db
DB_URL = os.environ.get("CLEANED_META_DB_URL")
if not DB_URL:
    raise RuntimeError(
        "Set the CLEANED_META_DB_URL environment variable to the SQLAlchemy "
        "URL of the cleaned_meta_data_db PostgreSQL database."
    )

engine = create_engine(
    DB_URL,
    pool_size=12,     # connections kept open in the pool
    max_overflow=0,   # never open connections beyond pool_size
    pool_timeout=30,  # seconds to wait for a free connection before failing
)

In [3]:
# DDL for the table that receives the training chunks; IF NOT EXISTS makes
# the cell safe to re-run against a database that already has the table.
create_sql = """
CREATE TABLE IF NOT EXISTS arxiv_metadata_training_data (
    paper_id TEXT,
    chunk_no INT,
    chunk_id TEXT,
    txt_filename TEXT,
    query TEXT,
    chunk_data TEXT,
    query_phrases TEXT
);
"""

# engine.begin() opens a transaction that is committed when the block exits
# normally and rolled back if the statement raises.
with engine.begin() as conn:
    training_table_ddl = text(create_sql)
    conn.execute(training_table_ddl)

In [4]:
# Both input CSV files live in the same work directory.
work_dir = '/home/jovyan/work'
training_data_csv_path = os.path.join(work_dir, 'arxiv_chunks_training_4_phrases1.csv')
all_data_csv_path = os.path.join(work_dir, 'arxiv_cleaned_v5.csv')

In [5]:
# Read the training chunks and report how many rows were loaded.
df_training_data = pd.read_csv(training_data_csv_path)
n_training_rows = len(df_training_data)
print("Our training data contains: ", n_training_rows, " records")
# Keep this as the last expression so the notebook renders the preview.
df_training_data.head()

Our training data contains:  6043  records


Unnamed: 0,paper_id,chunk_no,chunk_id,txt_filename,query,chunk_data,query_phrases
0,0704.0107v1,,0704.0107v1_5,0704.0107v1.txt,"[""What is the Parzen's estimator formula used ...","lim N 1 N N X x qi, , which we consider as the...","[""parzen estimator formula"", ""redundancy numbe..."
1,0704.0107v1,,0704.0107v1_6,0704.0107v1.txt,"[""How to adapt the model Eq. 19 to experimenta...",model relative to experimentally estimated fT ...,"[""adapt model eq"", ""growing pruning methods"", ..."
2,0704.0076v2,,0704.0076v2_12,0704.0076v2.txt,"[""CP asymmetry"", ""Amplitude C and T"", ""SU rela...",the CP asymmetry sum rule predicts ACP B0 K0 0...,"[""cp asymmetry"", ""amplitude"", ""su relation""]"
3,0704.0107v1,,0704.0107v1_7,0704.0107v1.txt,"[""What is the effect of increasing T on the mo...","a new term with the parameters xT , , x g x xT...","[""pdf new experimental"", ""annihilation process..."
4,0704.0674v2,,0704.0674v2_1,0704.0674v2.txt,"[""Galaxy alignment types"", ""Galaxy group catal...",arXiv 0704.0674v2 astro ph 8 Jun 2007 Draft ve...,"[""galaxy alignment types"", ""galaxy group catal..."


In [6]:
# raw_conn = engine.raw_connection()
# cur = raw_conn.cursor()

# with open(training_data_csv_path, 'r') as f:
#     cur.copy_expert(f"""
#         COPY arxiv_metadata_training_data
#         FROM STDIN
#         WITH CSV HEADER
#     """, f)

# raw_conn.commit()
# cur.close()

In [7]:
# Read the full cleaned corpus metadata and report how many rows were loaded.
df_all_data = pd.read_csv(all_data_csv_path)
n_all_rows = len(df_all_data)
print("Our all contains: ", n_all_rows, " records")
# df_all_data.head()


Our all contains:  281931  records


In [8]:
# Distinct, non-null text filenames found in the full corpus.
all_filename_column = df_all_data["txt_filename"]
unique_filenames = all_filename_column.dropna().unique()

# A paper ID is the filename with its final extension removed.
unique_paper_ids_all = []
for filename in unique_filenames:
    stem, _extension = os.path.splitext(filename)
    unique_paper_ids_all.append(stem)

print(f"{len(unique_paper_ids_all)} unique paper IDs:")
print(unique_paper_ids_all[:5])  # preview of the first five IDs


281931 unique paper IDs:
['0704.0001v2', '0704.0002v2', '0704.0003v3', '0704.0004v1', '0704.0005v1']


In [9]:
# Distinct, non-null text filenames found in the training data.
training_filename_column = df_training_data["txt_filename"]
unique_filenames_training = training_filename_column.dropna().unique()

# A paper ID is the filename with its final extension removed.
unique_paper_ids_training = []
for filename in unique_filenames_training:
    stem, _extension = os.path.splitext(filename)
    unique_paper_ids_training.append(stem)

print(f"{len(unique_paper_ids_training)} unique paper IDs:")
print(unique_paper_ids_training[:5])  # preview of the first five IDs


812 unique paper IDs:
['0704.0107v1', '0704.0076v2', '0704.0674v2', '0704.0360v1', '0704.0717v2']


In [10]:
# Set versions of both ID lists, for fast membership tests.
unique_paper_ids_training_set = set(unique_paper_ids_training)
unique_paper_ids_all_set = set(unique_paper_ids_all)

# Report each set's size followed by an (arbitrarily ordered) sample of five.
training_preview = list(unique_paper_ids_training_set)[:5]
print(f"{len(unique_paper_ids_training_set)} unique paper IDs in training set:")
print(training_preview)

all_preview = list(unique_paper_ids_all_set)[:5]
print(f"{len(unique_paper_ids_all_set)} unique paper IDs in all set:")
print(all_preview)

812 unique paper IDs in training set:
['0704.0002v2', '0704.0357v3', '0704.0726v2', '0704.0784v2', '0704.0018v2']
281931 unique paper IDs in all set:
['0808.2912v2', '1108.2604v2', '0712.1697v1', '0807.2824v1', '0809.5023v1']


In [11]:
# Load the citation map; later cells iterate it as
# {citing paper ID: iterable of cited paper IDs}.
with open("/home/jovyan/work/data/meta-data/internal-citations.json") as f:
    citation_dict = json.load(f)

In [14]:
def strip_version(paper_id):
    """Return ``paper_id`` without its trailing version suffix (``v<digits>``).

    Only a suffix at the very end of the string is removed, so an identifier
    that contains the letter ``v`` elsewhere (for example the old-style
    ``solv-int/9901001v2``) keeps its prefix intact. An identifier that has no
    version suffix is returned unchanged.

    Args:
        paper_id: Paper identifier string, with or without a version suffix.

    Returns:
        The identifier with the trailing ``v<digits>`` removed.
    """
    return re.sub(r'v\d+$', '', paper_id)

# --- Step 1: version-less ID sets, plus a base-ID -> full-ID lookup ---
base_to_full = defaultdict(list)

training_ids_base = set()
for full_id in unique_paper_ids_training:
    stem = strip_version(full_id)
    training_ids_base.add(stem)
    base_to_full[stem].append(full_id)

all_ids_base = set()
for full_id in unique_paper_ids_all:
    stem = strip_version(full_id)
    all_ids_base.add(stem)
    base_to_full[stem].append(full_id)

# --- Step 2: keep only papers that are in the corpus but not in training ---
# The same rule is applied to the citing paper and to every paper it cites.
citations_by_base = {}
for citing, cited_ids in citation_dict.items():
    citing_base = strip_version(citing)
    if citing_base not in all_ids_base or citing_base in training_ids_base:
        continue
    kept_cites = []
    for cited in cited_ids:
        cited_base = strip_version(cited)
        if cited_base in all_ids_base and cited_base not in training_ids_base:
            kept_cites.append(cited_base)
    citations_by_base[citing_base] = kept_cites

# Papers left with no citations after filtering are dropped.
intermediate_filtered = {}
for base_id, kept_cites in citations_by_base.items():
    if kept_cites:
        intermediate_filtered[base_id] = kept_cites

# --- Step 3: Map base IDs back to full versions (choose latest version) ---
def get_latest_version(paper_list):
    """Return the identifier with the highest version number in ``paper_list``.

    The version is the integer in a trailing ``v<digits>`` suffix. Only that
    suffix is parsed, so identifiers that contain ``v`` elsewhere (for example
    ``solv-int/9901001v3``) are handled correctly. An identifier without a
    version suffix is ranked as version 0 instead of raising. When several
    entries share the highest version, the first one in the list is returned.

    Args:
        paper_list: Non-empty sequence of identifiers of the same paper.

    Returns:
        The identifier carrying the highest version number.

    Raises:
        IndexError: If ``paper_list`` is empty.
    """
    if not paper_list:
        raise IndexError("get_latest_version() requires a non-empty paper list")

    def version_number(paper_id):
        # Rank by the numeric suffix, not lexically: v10 is newer than v2.
        match = re.search(r'v(\d+)$', paper_id)
        return int(match.group(1)) if match else 0

    return max(paper_list, key=version_number)

# Re-key the base-ID citation map by full (versioned) IDs, choosing the
# latest known version for the citing paper and for every cited paper.
filtered = {
    get_latest_version(base_to_full[citing_base]): [
        get_latest_version(base_to_full[cited_base]) for cited_base in cited_bases
    ]
    for citing_base, cited_bases in intermediate_filtered.items()
}

# --- Step 4: flatten both mappings to one (citing, cited) row per citation ---
base_rows = []
for citing_base, cited_bases in intermediate_filtered.items():
    for cited_base in cited_bases:
        base_rows.append((citing_base, cited_base))
df_base = pd.DataFrame(base_rows, columns=["base_paper_id", "cited_base_paper_id"])

full_rows = []
for citing_full, cited_fulls in filtered.items():
    for cited_full in cited_fulls:
        full_rows.append((citing_full, cited_full))
df_final = pd.DataFrame(full_rows, columns=["full_paper_id", "cited_full_paper_id"])


In [15]:
# Materialise the base-ID mapping as (paper, citations) pairs and preview it.
filtered_list_imm = list(intermediate_filtered.items())
print(f"\nFiltered citation pairs: {len(intermediate_filtered)}\n")

# Print entries until the running count exceeds 10 (at most 11 lines).
temp_count = 0
for temp_count, (paper, cites) in enumerate(filtered_list_imm, start=1):
    print(f"{paper}: {cites}")
    if temp_count > 10:
        break


Filtered citation pairs: 261150

0911.4482: ['0907.4750', '0911.4482']
0911.4991: ['0805.1882', '0911.4991', '0911.4877', '0911.4845', '0911.4881']
0911.5072: ['0910.3220', '0911.5072', '0812.4140', '0803.0982', '0910.0326', '0808.2243']
0911.3671: ['0911.3671']
0911.1402: ['0905.4267', '0911.1401', '0911.1402']
0911.3198: ['0911.3198']
0911.4124: ['0911.4124', '0905.1823', '0812.3471']
0911.1560: ['0906.0302', '0812.1202', '0812.1368', '0809.1229', '0903.3598', '0901.2599', '0812.4265', '0710.5136', '0905.3947', '0801.1817', '0706.1726', '0804.0473', '0911.1560', '0906.4728']
0911.2332: ['0911.2332']
0911.1134: ['0903.1115', '0710.2486', '1001.3884', '0709.0980', '0911.1134', '0711.2741', '0907.1922', '0801.4554', '0908.0857', '0705.4387']
0911.3141: ['0911.3141', '0807.0265']


In [18]:
# Materialise the full-ID mapping as (paper, citations) pairs and preview it.
filtered_list = list(filtered.items())
print(f"\nFiltered citation pairs: {len(filtered_list)}\n")

# Print entries until the running count exceeds 10 (at most 11 lines).
temp_count = 0
for temp_count, (paper, cites) in enumerate(filtered_list, start=1):
    print(f"{paper}: {cites}")
    if temp_count > 10:
        break


Filtered citation pairs: 261150

0911.4482v2: ['0907.4750v1', '0911.4482v2']
0911.4991v3: ['0805.1882v1', '0911.4991v3', '0911.4877v2', '0911.4845v2', '0911.4881v3']
0911.5072v4: ['0910.3220v2', '0911.5072v4', '0812.4140v3', '0803.0982v1', '0910.0326v1', '0808.2243v2']
0911.3671v1: ['0911.3671v1']
0911.1402v2: ['0905.4267v2', '0911.1401v1', '0911.1402v2']
0911.3198v3: ['0911.3198v3']
0911.4124v1: ['0911.4124v1', '0905.1823v1', '0812.3471v3']
0911.1560v1: ['0906.0302v2', '0812.1202v1', '0812.1368v1', '0809.1229v1', '0903.3598v2', '0901.2599v2', '0812.4265v1', '0710.5136v2', '0905.3947v3', '0801.1817v1', '0706.1726v2', '0804.0473v1', '0911.1560v1', '0906.4728v1']
0911.2332v1: ['0911.2332v1']
0911.1134v2: ['0903.1115v2', '0710.2486v1', '1001.3884v1', '0709.0980v1', '0911.1134v2', '0711.2741v2', '0907.1922v2', '0801.4554v2', '0908.0857v2', '0705.4387v2']
0911.3141v1: ['0911.3141v1', '0807.0265v1']


In [96]:
def to_pg_array(lst):
    """Render ``lst`` as a PostgreSQL array literal such as ``{"a","b"}``.

    Every element is converted with ``str()`` and wrapped in double quotes.
    Backslashes and double quotes inside an element are backslash-escaped, as
    PostgreSQL requires for quoted array elements, so such values can no
    longer produce a malformed literal.

    Args:
        lst: Iterable of elements to place in the array.

    Returns:
        The array literal as a string; ``"{}"`` for an empty iterable.
    """
    def quote(item):
        # Escape backslashes first so the backslashes added for quotes
        # are not themselves doubled.
        escaped = str(item).replace("\\", "\\\\").replace('"', '\\"')
        return f'"{escaped}"'

    return "{" + ",".join(quote(item) for item in lst) + "}"

# One row per citing paper (base IDs); the citation list is stored as a
# PostgreSQL array literal. This rebinds df_base to the per-paper layout.
base_rows = []
for base_paper, cited in filtered_list_imm:
    base_rows.append((base_paper, to_pg_array(cited)))
df_base = pd.DataFrame(base_rows, columns=["base_paper_id", "cited_base_paper_id"])
df_base.to_csv("citation_filtered_base.csv", index=False)

# Same layout for the full (versioned) IDs; rebinds df_final likewise.
final_rows = []
for full_paper, cites in filtered_list:
    final_rows.append((full_paper, to_pg_array(cites)))
df_final = pd.DataFrame(final_rows, columns=["full_paper_id", "cited_full_paper_id"])
df_final.to_csv("citation_filtered_final.csv", index=False)

print("CSV files saved with correct PostgreSQL array format.")

CSV files saved with correct PostgreSQL array format.


In [97]:
# Statements that rebuild both citation tables from scratch.
drop_sql_base = "DROP TABLE IF EXISTS citation_filtered_base;"
drop_sql_final = "DROP TABLE IF EXISTS citation_filtered_final;"

create_sql_base = """
CREATE TABLE citation_filtered_base (
    base_paper_id TEXT,
    cited_base_paper_id TEXT []
);
"""

create_sql_final = """
CREATE TABLE citation_filtered_final (
    full_paper_id TEXT,
    cited_full_paper_id TEXT []
);
"""

# Run the drops before the creates, all inside one transaction.
rebuild_statements = (drop_sql_base, drop_sql_final, create_sql_base, create_sql_final)
with engine.begin() as conn:
    for statement in rebuild_statements:
        conn.execute(text(statement))

In [98]:
# Bulk-load both CSV exports with COPY ... FROM STDIN through the raw DBAPI
# connection (copy_expert is a psycopg2 cursor method).
copy_jobs = [
    ("citation_filtered_base.csv", """
        COPY citation_filtered_base(base_paper_id, cited_base_paper_id)
        FROM STDIN
        WITH CSV HEADER
    """),
    ("citation_filtered_final.csv", """
        COPY citation_filtered_final(full_paper_id, cited_full_paper_id)
        FROM STDIN
        WITH CSV HEADER
    """),
]

raw_conn = engine.raw_connection()
try:
    cur = raw_conn.cursor()
    try:
        for csv_path, copy_sql in copy_jobs:
            with open(csv_path, 'r') as f:
                cur.copy_expert(copy_sql, f)
    finally:
        cur.close()
    # Commit only after both files loaded, so the two tables stay in step.
    raw_conn.commit()
except Exception:
    # Undo a partial load; the original error is re-raised unchanged.
    raw_conn.rollback()
    raise
finally:
    # Always release the connection: the engine's pool is capped, so a leaked
    # connection after a failed load would eventually block later cells.
    raw_conn.close()

In [24]:
import random
from collections import defaultdict

# ------------------------------------------------------------
#  Size window for the single bucket of paper IDs built below.
#  NOTE(review): the code below reads `filtered_ok`, which must already exist
#  in the kernel; no assignment to it is visible in this part of the
#  notebook -- confirm where it is built before relying on Restart & Run All.
# ------------------------------------------------------------
TARGET     = 15_000                     # ideal number of IDs in the bucket
TOLERANCE  = 5                          # allowed deviation from TARGET, either side
MAX_SIZE   = TARGET + TOLERANCE        # 15 005
MIN_SIZE   = TARGET - TOLERANCE        # 14 995

# 1⃣  build Union-Find components (same as before) -------------
# Union-Find forest: maps each node to its parent; a root maps to itself.
parent = {}
def find(x):
    """Return the root of the set containing ``x``, registering ``x`` if new.

    Implemented iteratively so that a long parent chain (possible because
    sets are merged without rank or size balancing) cannot exceed Python's
    recursion limit. Every node visited on the way is re-pointed directly at
    the root (path compression), exactly as the recursive form did.

    Args:
        x: Hashable node identifier.

    Returns:
        The root node of x's set.
    """
    # Pass 1: walk up to the root; unseen nodes become their own root.
    root = x
    while parent.setdefault(root, root) != root:
        root = parent[root]

    # Pass 2: path compression -- point every node on the path at the root.
    while parent[x] != root:
        parent[x], x = root, parent[x]
    return root

def union(a, b):
    """Merge the sets containing ``a`` and ``b``.

    When the two roots differ, the root of ``b`` is attached beneath the root
    of ``a``; when they are already the same, nothing changes.
    """
    root_a = find(a)
    root_b = find(b)
    if root_a == root_b:
        return
    parent[root_b] = root_a

# Link every citing paper with each paper it cites.
for citing, cited_ids in filtered_ok.items():
    for cited in cited_ids:
        union(citing, cited)

# Group nodes by root: all citing papers first, then all cited papers.
components = defaultdict(set)
for citing in list(filtered_ok.keys()):
    components[find(citing)].add(citing)
for cited_ids in filtered_ok.values():
    for cited in cited_ids:
        components[find(cited)].add(cited)

comp_list = list(components.values())
# Fixed seed: the same input order always yields the same shuffled order.
random.Random(42).shuffle(comp_list)

# 2⃣  fill ONE bucket ------------------------------------------
bucket = set()

# Greedy fill: take whole components, in shuffled order, while they still fit.
for comp in comp_list:
    current_size = len(bucket)

    # Stop as soon as the bucket size falls inside the target window.
    if MIN_SIZE <= current_size <= MAX_SIZE:
        break

    # Skip any component that would push the bucket past the maximum.
    if current_size + len(comp) > MAX_SIZE:
        continue

    bucket |= comp

print(f"Final bucket size: {len(bucket)} (target window {MIN_SIZE}–{MAX_SIZE})")


Final bucket size: 13629 (target window 14995–15005)


In [36]:
import json, os

# Intended output directory (created if missing).
# NOTE(review): the three files below are opened by bare filename, so they
# land in the current working directory; `out_dir` / `cit_path` are computed
# but not used for writing. The bare names are kept because a later cell
# reads "bucket_citations.json" the same way -- confirm which location is
# intended before changing it.
out_dir = "/home/jovyan/work"
os.makedirs(out_dir, exist_ok=True)

# Sort the IDs once: iteration order of a set of strings can differ between
# kernel sessions, which would reorder the dumped files on every run.
bucket_ids = sorted(bucket)

# 1) dump the raw ID list
with open("bucket.json", "w") as f:
    json.dump(bucket_ids, f, indent=2)
print(f"✅  wrote {len(bucket_ids):,} IDs")

# 2) build & dump the citation map (IDs with no entry get an empty list)
bucket_citations = {pid: filtered_ok.get(pid, []) for pid in bucket_ids}
cit_path = os.path.join(out_dir, "bucket_citations.json")
with open("bucket_citations.json", "w") as f:
    json.dump(bucket_citations, f, indent=2)
print(f"✅  wrote citations for {len(bucket_citations):,} IDs")

# 3) dump both structures together in one file
combined_obj = {
    "ids": bucket_ids,             # same content as bucket.json
    "citations": bucket_citations  # same content as bucket_citations.json
}

with open("buckets-combined.json", "w") as f:
    json.dump(combined_obj, f, indent=2)

print("✅  wrote combined bucket+citations")


✅  wrote 13,629 IDs
✅  wrote citations for 13,629 IDs
✅  wrote combined bucket+citations


In [38]:
# DDL for the per-paper citation table; paper_id is the primary key.
create_sql = """
CREATE TABLE IF NOT EXISTS bucket_citations (
    paper_id   TEXT PRIMARY KEY,
    citations  TEXT[] NOT NULL
);
"""

with engine.begin() as conn:
    bucket_table_ddl = text(create_sql)
    conn.execute(bucket_table_ddl)
print("table bucket_citations is ready")

# Reload the citation map from the JSON file in the working directory.
with open("bucket_citations.json") as f:
    bucket_citations = json.load(f)

n_bucket_records = len(bucket_citations)
print(f"Loaded {n_bucket_records:,} records")



table bucket_citations is ready
Loaded 13,629 records


In [39]:
# Upsert: a paper_id that already exists has its citations overwritten.
insert_sql = """
INSERT INTO bucket_citations (paper_id, citations)
VALUES (:paper_id, :citations)
ON CONFLICT (paper_id)
DO UPDATE SET citations = EXCLUDED.citations;
"""

# One parameter dict per paper, matching the named binds in the statement.
payload = []
for pid, cites in bucket_citations.items():
    payload.append({"paper_id": pid, "citations": cites})

# Passing the whole list executes the statement once per dict, in a single
# transaction.
with engine.begin() as conn:
    upsert_statement = text(insert_sql)
    conn.execute(upsert_statement, payload)

print("all rows written to PostgreSQL")

all rows written to PostgreSQL


In [40]:
# Create the evaluation table that the INSERT ... SELECT cell writes to.
# That insert targets arxiv_chunks_eval_data_v2, so the same name is created
# here; creating the _v1 name left the insert dependent on a table that had
# to exist already. IF NOT EXISTS makes this a no-op when the table is
# already present.
create_sql = """
CREATE TABLE IF NOT EXISTS arxiv_chunks_eval_data_v2 (
    paper_id TEXT,
    chunk_id TEXT,
    txt_filename TEXT,
    query TEXT,
    chunk_data TEXT,
    paper_cited TEXT[]
);
"""
with engine.begin() as conn:
    conn.execute(text(create_sql))

In [41]:
# Copy chunks into the evaluation table, attaching each paper's citation
# array. The inner JOIN keeps only chunks whose paper_id has a row in
# bucket_citations. The target table must exist before this runs.
# NOTE(review): this is a plain INSERT, so re-running the cell presumably
# appends the same rows again -- confirm against the target table's
# constraints.
insert_sql = """
INSERT INTO arxiv_chunks_eval_data_v2 (
    paper_id, chunk_id, txt_filename, query, chunk_data, paper_cited
)
SELECT
    ac.paper_id,
    ac.chunk_id,
    ac.txt_filename,
    ac.query,
    ac.chunk_data,
    bc.citations                       -- TEXT[] from the bucket table
FROM arxiv_chunks_backup     AS ac
JOIN bucket_citations        AS bc   ON bc.paper_id = ac.paper_id;
"""

with engine.begin() as conn:
    eval_insert_statement = text(insert_sql)
    conn.execute(eval_insert_statement)

print("✅  rows inserted (only bucket papers)")

✅  rows inserted (only bucket papers)
