In [None]:
# runs in jupyter container 
import os
import numpy as np
from sqlalchemy import create_engine, text
import pandas as pd
import tarfile
import shutil
import re
import unicodedata
from sqlalchemy import text
from sqlalchemy.dialects.postgresql import ARRAY, TEXT
from tqdm import tqdm
from multiprocessing import Pool

In [37]:
meta_data_dir = os.getenv("META_DATA_DIR", "/mnt/metadata")
print(os.listdir(meta_data_dir))

['all_files_list.txt', 'metadata', 'raw-data', 'text-files-data']


In [38]:
text_files_data_path = os.path.join(meta_data_dir, "text-files-data")

# List the contents of that subfolder
tar_files_list = os.listdir(text_files_data_path)
print(tar_files_list)

['text_arxiv_0801.tar', 'text_arxiv_0802.tar', 'text_arxiv_0803.tar', 'text_arxiv_0804.tar', 'text_arxiv_0805.tar', 'text_arxiv_0806.tar', 'text_arxiv_0807.tar', 'text_arxiv_0808.tar', 'text_arxiv_0809.tar', 'text_arxiv_0810.tar', 'text_arxiv_0811.tar', 'text_arxiv_0812.tar', 'text_arxiv_0901.tar', 'text_arxiv_0902.tar', 'text_arxiv_0903.tar', 'text_arxiv_0904.tar', 'text_arxiv_0905.tar', 'text_arxiv_0906.tar', 'text_arxiv_0907.tar', 'text_arxiv_0908.tar', 'text_arxiv_0909.tar', 'text_arxiv_0910.tar', 'text_arxiv_0911.tar', 'text_arxiv_0912.tar', 'text_arxiv_1001.tar', 'text_arxiv_1002.tar', 'text_arxiv_1004.tar', 'text_arxiv_1005.tar', 'text_arxiv_1006.tar', 'text_arxiv_1007.tar', 'text_arxiv_1008.tar', 'text_arxiv_1009.tar', 'text_arxiv_1010.tar', 'text_arxiv_1011.tar', 'text_arxiv_1012.tar', 'text_arxiv_1101.tar', 'text_arxiv_1102.tar', 'text_arxiv_1103.tar', 'text_arxiv_1104.tar', 'text_arxiv_1105.tar', 'text_arxiv_1106.tar', 'text_arxiv_1107.tar']


In [51]:
engine = create_engine(
    'postgresql+psycopg2://rg5073:rg5073pass@cleaned_meta_data_postgres:5432/cleaned_meta_data_db',
    pool_size=10,
    max_overflow=0,
    pool_timeout=30,
)

In [52]:
total_files = 0
fail_count = 0

In [53]:
# ‚ö°Ô∏è Load existing filenames (once
def load_existing_pdf_filenames():
    query = text("SELECT txt_filename FROM arxiv_metadata;")
    with engine.connect() as conn:
        result = conn.execute(query)
        pdf_filenames = {row[0] for row in result.fetchall()}
    return pdf_filenames
existing_pdf_filenames = load_existing_pdf_filenames()
print("Total text files to process:", len(existing_pdf_filenames))

Total text files to process: 227494


In [54]:
# ‚≠ê Chunking function
def chunk_text(text, chunk_size_words=650):
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size_words):
        chunks.append(" ".join(words[i:i+chunk_size_words]))
    return chunks

In [55]:
def simple_clean_text_remove_references(text):
    # 1. Remove null bytes
    text = text.replace('\x00', '')

    # 2. Normalize Unicode (like Ô¨Å ‚Üí fi)
    text = unicodedata.normalize("NFKD", text)

    # 3. Remove inline LaTeX/math ($...$) and LaTeX commands (\command{...})
    text = re.sub(r'\$.*?\$', ' ', text)                 # Remove math in $
    text = re.sub(r'\\[a-zA-Z]+\{.*?\}', ' ', text)       # Remove \commands{...}
    
    # 4. Remove Unicode math symbols and special symbols (sets, operators, etc)
    text = re.sub(r'[\u2200-\u22FF\u2300-\u23FF]', ' ', text)

    # 5. Remove anything between ‚ü®...‚ü© (angle brackets)
    text = re.sub(r'‚ü®.*?‚ü©', ' ', text)

    # 6. Remove references like [1], [2,5,10]
    text = re.sub(r'\[\d+(,\s*\d+)*\]', ' ', text)

    # 7. Remove numbered equations like (123), (4.5)
    text = re.sub(r'\(\d+(\.\d+)?\)', ' ', text)

    # 8. Remove any remaining weird LaTeX leftovers like {some text}
    text = re.sub(r'\{.*?\}', ' ', text)

    # 9. Remove equations written like "E = mc^2" (detect common formula style)
    text = re.sub(r'([A-Za-z0-9]+\s*[=<>]\s*[A-Za-z0-9^+\-*/\s]+)', ' ', text)

    # 10. Remove all non-ASCII except basic punctuations
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)

    # 11. Remove any special characters except basic word characters and sentence punctuation
    text = re.sub(r'[^a-zA-Z0-9\s\.\,\?\!]', ' ', text)

    # 12. Remove extra hyphenated line breaks
    text = re.sub(r'-\n\s*', '', text)

    # 13. Collapse multiple spaces
    text = re.sub(r'\s+', ' ', text)

    # 14. üö® Remove the References section completely
    text = re.split(r'\bReferences\b', text, flags=re.IGNORECASE)[0]

    # 15. Final strip
    return text.strip()

In [56]:
# ‚≠ê Per-file processing (for parallel chunking)
def process_single_file(task):
    file_path, txt_filename = task
    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            text_content = f.read()
        cleaned_text = simple_clean_text_remove_references(text_content)
        chunks = chunk_text(cleaned_text)
        paper_id = txt_filename.replace(".txt", "")
        return (paper_id, txt_filename, chunks)
    except Exception as e:
        print(f"‚ùå Chunking failed for {txt_filename}: {e}")
        return None

In [57]:
# ‚≠ê Parallel INSERT into arxiv_chunks
def insert_chunks(conn, entries):
    """
    entries: list of (paper_id, txt_filename, chunks_list)
    """
    inserts = []
    for paper_id, txt_filename, chunks in entries:
        for idx, chunk in enumerate(chunks, start=1):
            inserts.append({
                "paper_id": paper_id,
                "chunk_id": idx,
                "txt_filename": txt_filename,
                "query": "",  # optional: you can populate later
                "chunk_data": chunk
            })

    if not inserts:
        return

    insert_stmt = text("""
        INSERT INTO arxiv_chunks (paper_id, chunk_id, txt_filename, query, chunk_data)
        VALUES (:paper_id, :chunk_id, :txt_filename, :query, :chunk_data)
    """)

    batch_size = 500  # Adjust based on memory
    for i in tqdm(range(0, len(inserts), batch_size), desc="Inserting into arxiv_chunks"):
        batch = inserts[i:i+batch_size]
        conn.execute(insert_stmt, batch)

In [58]:
# ‚≠ê Per-tar processing
def process_tar_file(tar_filename):
    global total_files, fail_count

    tar_path = os.path.join(text_files_data_path, tar_filename)

    if not os.path.exists(tar_path):
        print(f"‚ùå Tar file not found: {tar_path}")
        return

    print(f"\nüì¶ Processing tar: {tar_filename}...")

    with tarfile.open(tar_path, "r") as tar:
        tar.extractall(path=workspace_dir)

    extracted_folder_name = tar_filename.replace(".tar", "")
    extracted_folder_path = os.path.join(workspace_dir, extracted_folder_name)

    if not os.path.exists(extracted_folder_path):
        print(f"‚ùå Extracted folder missing: {extracted_folder_path}")
        return

    print(f"üîç Extracted to: {extracted_folder_path}")

    txt_files_list = os.listdir(extracted_folder_path)
    print(f"üìÑ Found {len(txt_files_list)} text files.")

    tasks = []
    for filename in txt_files_list:
        if filename.endswith(".txt") and filename in existing_pdf_filenames:
            file_path = os.path.join(extracted_folder_path, filename)
            tasks.append((file_path, filename))

    if not tasks:
        print(f"‚ö†Ô∏è No matching text files found in {tar_filename}")
        shutil.rmtree(extracted_folder_path)
        return

    # ‚≠ê Process all files in parallel
    with Pool(processes=8) as pool:
        results = list(tqdm(
            pool.imap_unordered(process_single_file, tasks),
            total=len(tasks),
            desc=f"Chunking {tar_filename}",
            dynamic_ncols=True
        ))

    processed_entries = [r for r in results if r is not None]

    print(f"‚úÖ Processed {len(processed_entries)} / {len(tasks)} files ready for DB insert.")

    with engine.begin() as conn:
        try:
            insert_chunks(conn, processed_entries)
            total_files += len(processed_entries)
        except Exception as e:
            fail_count += len(processed_entries)
            print(f"‚ùå Insert failed for {tar_filename}: {e}")

    shutil.rmtree(extracted_folder_path)
    print(f"üßπ Deleted extracted folder: {extracted_folder_path}")

In [59]:
for tar_filename in tar_files_list:
    process_tar_file(tar_filename)

print("\nüéâ All tar files processed!")
print(f"üî• Total papers inserted: {total_files}")
print(f"‚ùó Total failed inserts: {fail_count}")


üì¶ Processing tar: text_arxiv_0801.tar...


  tar.extractall(path=workspace_dir)


üîç Extracted to: /home/jovyan/work/text_arxiv_0801
üìÑ Found 7683 text files.


Chunking text_arxiv_0801.tar: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4935/4935 [00:08<00:00, 584.85it/s]


‚úÖ Processed 4935 / 4935 files ready for DB insert.


Inserting into arxiv_chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 84/84 [00:11<00:00,  7.41it/s]


üßπ Deleted extracted folder: /home/jovyan/work/text_arxiv_0801

üì¶ Processing tar: text_arxiv_0802.tar...
üîç Extracted to: /home/jovyan/work/text_arxiv_0802
üìÑ Found 6934 text files.


Chunking text_arxiv_0802.tar: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4431/4431 [00:07<00:00, 579.09it/s]


‚úÖ Processed 4431 / 4431 files ready for DB insert.


Inserting into arxiv_chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 78/78 [00:10<00:00,  7.42it/s]


üßπ Deleted extracted folder: /home/jovyan/work/text_arxiv_0802

üì¶ Processing tar: text_arxiv_0803.tar...
üîç Extracted to: /home/jovyan/work/text_arxiv_0803
üìÑ Found 7123 text files.


Chunking text_arxiv_0803.tar: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4489/4489 [00:08<00:00, 558.66it/s]


‚úÖ Processed 4489 / 4489 files ready for DB insert.


Inserting into arxiv_chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 80/80 [00:10<00:00,  7.59it/s]


üßπ Deleted extracted folder: /home/jovyan/work/text_arxiv_0803

üì¶ Processing tar: text_arxiv_0804.tar...
üîç Extracted to: /home/jovyan/work/text_arxiv_0804
üìÑ Found 7748 text files.


Chunking text_arxiv_0804.tar: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4862/4862 [00:08<00:00, 571.02it/s]


‚úÖ Processed 4862 / 4862 files ready for DB insert.


Inserting into arxiv_chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 90/90 [00:11<00:00,  7.79it/s]


üßπ Deleted extracted folder: /home/jovyan/work/text_arxiv_0804

üì¶ Processing tar: text_arxiv_0805.tar...
üîç Extracted to: /home/jovyan/work/text_arxiv_0805
üìÑ Found 7564 text files.


Chunking text_arxiv_0805.tar: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4805/4805 [00:08<00:00, 573.08it/s]


‚úÖ Processed 4805 / 4805 files ready for DB insert.


Inserting into arxiv_chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 83/83 [00:10<00:00,  7.67it/s]


üßπ Deleted extracted folder: /home/jovyan/work/text_arxiv_0805

üì¶ Processing tar: text_arxiv_0806.tar...
üîç Extracted to: /home/jovyan/work/text_arxiv_0806
üìÑ Found 7996 text files.


Chunking text_arxiv_0806.tar: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4936/4936 [00:08<00:00, 576.70it/s]


‚úÖ Processed 4936 / 4936 files ready for DB insert.


Inserting into arxiv_chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 85/85 [00:11<00:00,  7.36it/s]


üßπ Deleted extracted folder: /home/jovyan/work/text_arxiv_0806

üì¶ Processing tar: text_arxiv_0807.tar...
üîç Extracted to: /home/jovyan/work/text_arxiv_0807
üìÑ Found 8193 text files.


Chunking text_arxiv_0807.tar: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5090/5090 [00:09<00:00, 536.47it/s]


‚úÖ Processed 5090 / 5090 files ready for DB insert.


Inserting into arxiv_chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 89/89 [00:12<00:00,  7.42it/s]


üßπ Deleted extracted folder: /home/jovyan/work/text_arxiv_0807

üì¶ Processing tar: text_arxiv_0808.tar...
üîç Extracted to: /home/jovyan/work/text_arxiv_0808
üìÑ Found 6551 text files.


Chunking text_arxiv_0808.tar: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4143/4143 [00:07<00:00, 558.05it/s]


‚úÖ Processed 4143 / 4143 files ready for DB insert.


Inserting into arxiv_chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 74/74 [00:09<00:00,  7.73it/s]


üßπ Deleted extracted folder: /home/jovyan/work/text_arxiv_0808

üì¶ Processing tar: text_arxiv_0809.tar...
üîç Extracted to: /home/jovyan/work/text_arxiv_0809
üìÑ Found 8427 text files.


Chunking text_arxiv_0809.tar: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5254/5254 [00:09<00:00, 564.90it/s]


‚úÖ Processed 5254 / 5254 files ready for DB insert.


Inserting into arxiv_chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 93/93 [00:13<00:00,  7.06it/s]


üßπ Deleted extracted folder: /home/jovyan/work/text_arxiv_0809

üì¶ Processing tar: text_arxiv_0810.tar...
üîç Extracted to: /home/jovyan/work/text_arxiv_0810
üìÑ Found 8807 text files.


Chunking text_arxiv_0810.tar: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5736/5736 [00:09<00:00, 584.67it/s]


‚úÖ Processed 5736 / 5736 files ready for DB insert.


Inserting into arxiv_chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 97/97 [00:12<00:00,  7.48it/s]


üßπ Deleted extracted folder: /home/jovyan/work/text_arxiv_0810

üì¶ Processing tar: text_arxiv_0811.tar...
üîç Extracted to: /home/jovyan/work/text_arxiv_0811
üìÑ Found 7451 text files.


Chunking text_arxiv_0811.tar: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4739/4739 [00:08<00:00, 576.40it/s]


‚úÖ Processed 4739 / 4739 files ready for DB insert.


Inserting into arxiv_chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 82/82 [00:10<00:00,  7.57it/s]


üßπ Deleted extracted folder: /home/jovyan/work/text_arxiv_0811

üì¶ Processing tar: text_arxiv_0812.tar...
üîç Extracted to: /home/jovyan/work/text_arxiv_0812
üìÑ Found 8161 text files.


Chunking text_arxiv_0812.tar: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5087/5087 [00:08<00:00, 568.64it/s]


‚úÖ Processed 5087 / 5087 files ready for DB insert.


Inserting into arxiv_chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 91/91 [00:12<00:00,  7.43it/s]


üßπ Deleted extracted folder: /home/jovyan/work/text_arxiv_0812

üì¶ Processing tar: text_arxiv_0901.tar...
üîç Extracted to: /home/jovyan/work/text_arxiv_0901
üìÑ Found 8000 text files.


Chunking text_arxiv_0901.tar: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4931/4931 [00:08<00:00, 571.15it/s]


‚úÖ Processed 4931 / 4931 files ready for DB insert.


Inserting into arxiv_chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 85/85 [00:11<00:00,  7.27it/s]


üßπ Deleted extracted folder: /home/jovyan/work/text_arxiv_0901

üì¶ Processing tar: text_arxiv_0902.tar...
üîç Extracted to: /home/jovyan/work/text_arxiv_0902
üìÑ Found 7665 text files.


Chunking text_arxiv_0902.tar: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4865/4865 [00:08<00:00, 569.38it/s]


‚úÖ Processed 4865 / 4865 files ready for DB insert.


Inserting into arxiv_chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 84/84 [00:11<00:00,  7.46it/s]


üßπ Deleted extracted folder: /home/jovyan/work/text_arxiv_0902

üì¶ Processing tar: text_arxiv_0903.tar...
üîç Extracted to: /home/jovyan/work/text_arxiv_0903
üìÑ Found 8974 text files.


Chunking text_arxiv_0903.tar: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5506/5506 [00:09<00:00, 550.61it/s]


‚úÖ Processed 5506 / 5506 files ready for DB insert.


Inserting into arxiv_chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 102/102 [00:13<00:00,  7.43it/s]


üßπ Deleted extracted folder: /home/jovyan/work/text_arxiv_0903

üì¶ Processing tar: text_arxiv_0904.tar...
üîç Extracted to: /home/jovyan/work/text_arxiv_0904
üìÑ Found 7897 text files.


Chunking text_arxiv_0904.tar: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4899/4899 [00:08<00:00, 547.22it/s]


‚úÖ Processed 4899 / 4899 files ready for DB insert.


Inserting into arxiv_chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 88/88 [00:12<00:00,  7.32it/s]


üßπ Deleted extracted folder: /home/jovyan/work/text_arxiv_0904

üì¶ Processing tar: text_arxiv_0905.tar...
üîç Extracted to: /home/jovyan/work/text_arxiv_0905
üìÑ Found 7874 text files.


Chunking text_arxiv_0905.tar: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4912/4912 [00:08<00:00, 558.30it/s]


‚úÖ Processed 4912 / 4912 files ready for DB insert.


Inserting into arxiv_chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 87/87 [00:11<00:00,  7.56it/s]


üßπ Deleted extracted folder: /home/jovyan/work/text_arxiv_0905

üì¶ Processing tar: text_arxiv_0906.tar...
üîç Extracted to: /home/jovyan/work/text_arxiv_0906
üìÑ Found 8864 text files.


Chunking text_arxiv_0906.tar: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5578/5578 [00:10<00:00, 541.69it/s]


‚úÖ Processed 5578 / 5578 files ready for DB insert.


Inserting into arxiv_chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 101/101 [00:13<00:00,  7.32it/s]


üßπ Deleted extracted folder: /home/jovyan/work/text_arxiv_0906

üì¶ Processing tar: text_arxiv_0907.tar...
üîç Extracted to: /home/jovyan/work/text_arxiv_0907
üìÑ Found 9076 text files.


Chunking text_arxiv_0907.tar: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5563/5563 [00:10<00:00, 553.26it/s]


‚úÖ Processed 5563 / 5563 files ready for DB insert.


Inserting into arxiv_chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 98/98 [00:13<00:00,  7.42it/s]


üßπ Deleted extracted folder: /home/jovyan/work/text_arxiv_0907

üì¶ Processing tar: text_arxiv_0908.tar...
üîç Extracted to: /home/jovyan/work/text_arxiv_0908
üìÑ Found 7279 text files.


Chunking text_arxiv_0908.tar: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4559/4559 [00:08<00:00, 552.28it/s]


‚úÖ Processed 4559 / 4559 files ready for DB insert.


Inserting into arxiv_chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 81/81 [00:10<00:00,  7.38it/s]


üßπ Deleted extracted folder: /home/jovyan/work/text_arxiv_0908

üì¶ Processing tar: text_arxiv_0909.tar...
üîç Extracted to: /home/jovyan/work/text_arxiv_0909
üìÑ Found 8725 text files.


Chunking text_arxiv_0909.tar: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5656/5656 [00:10<00:00, 532.81it/s]


‚úÖ Processed 5656 / 5656 files ready for DB insert.


Inserting into arxiv_chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 98/98 [00:12<00:00,  7.59it/s]


üßπ Deleted extracted folder: /home/jovyan/work/text_arxiv_0909

üì¶ Processing tar: text_arxiv_0910.tar...
üîç Extracted to: /home/jovyan/work/text_arxiv_0910
üìÑ Found 9162 text files.


Chunking text_arxiv_0910.tar: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5907/5907 [00:10<00:00, 569.43it/s]


‚úÖ Processed 5907 / 5907 files ready for DB insert.


Inserting into arxiv_chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 102/102 [00:13<00:00,  7.49it/s]


üßπ Deleted extracted folder: /home/jovyan/work/text_arxiv_0910

üì¶ Processing tar: text_arxiv_0911.tar...
üîç Extracted to: /home/jovyan/work/text_arxiv_0911
üìÑ Found 9115 text files.


Chunking text_arxiv_0911.tar: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5685/5685 [00:10<00:00, 559.61it/s]


‚úÖ Processed 5685 / 5685 files ready for DB insert.


Inserting into arxiv_chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:13<00:00,  7.21it/s]


üßπ Deleted extracted folder: /home/jovyan/work/text_arxiv_0911

üì¶ Processing tar: text_arxiv_0912.tar...
üîç Extracted to: /home/jovyan/work/text_arxiv_0912
üìÑ Found 8792 text files.


Chunking text_arxiv_0912.tar: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5477/5477 [00:10<00:00, 543.88it/s]


‚úÖ Processed 5477 / 5477 files ready for DB insert.


Inserting into arxiv_chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 97/97 [00:12<00:00,  7.69it/s]


üßπ Deleted extracted folder: /home/jovyan/work/text_arxiv_0912

üì¶ Processing tar: text_arxiv_1001.tar...
üîç Extracted to: /home/jovyan/work/text_arxiv_1001
üìÑ Found 8541 text files.


Chunking text_arxiv_1001.tar: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5423/5423 [00:09<00:00, 559.57it/s]


‚úÖ Processed 5423 / 5423 files ready for DB insert.


Inserting into arxiv_chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 91/91 [00:12<00:00,  7.49it/s]


üßπ Deleted extracted folder: /home/jovyan/work/text_arxiv_1001

üì¶ Processing tar: text_arxiv_1002.tar...
üîç Extracted to: /home/jovyan/work/text_arxiv_1002
üìÑ Found 7982 text files.


Chunking text_arxiv_1002.tar: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4987/4987 [00:09<00:00, 544.49it/s]


‚úÖ Processed 4987 / 4987 files ready for DB insert.


Inserting into arxiv_chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 90/90 [00:11<00:00,  7.56it/s]


üßπ Deleted extracted folder: /home/jovyan/work/text_arxiv_1002

üì¶ Processing tar: text_arxiv_1004.tar...
üîç Extracted to: /home/jovyan/work/text_arxiv_1004
üìÑ Found 8955 text files.


Chunking text_arxiv_1004.tar: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5543/5543 [00:10<00:00, 511.54it/s]


‚úÖ Processed 5543 / 5543 files ready for DB insert.


Inserting into arxiv_chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 101/101 [00:13<00:00,  7.71it/s]


üßπ Deleted extracted folder: /home/jovyan/work/text_arxiv_1004

üì¶ Processing tar: text_arxiv_1005.tar...
üîç Extracted to: /home/jovyan/work/text_arxiv_1005
üìÑ Found 9192 text files.


Chunking text_arxiv_1005.tar: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5683/5683 [00:10<00:00, 549.76it/s]


‚úÖ Processed 5683 / 5683 files ready for DB insert.


Inserting into arxiv_chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 99/99 [00:12<00:00,  7.69it/s]


üßπ Deleted extracted folder: /home/jovyan/work/text_arxiv_1005

üì¶ Processing tar: text_arxiv_1006.tar...
üîç Extracted to: /home/jovyan/work/text_arxiv_1006
üìÑ Found 9428 text files.


Chunking text_arxiv_1006.tar: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5877/5877 [00:10<00:00, 549.60it/s]


‚úÖ Processed 5877 / 5877 files ready for DB insert.


Inserting into arxiv_chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 103/103 [00:13<00:00,  7.56it/s]


üßπ Deleted extracted folder: /home/jovyan/work/text_arxiv_1006

üì¶ Processing tar: text_arxiv_1007.tar...
üîç Extracted to: /home/jovyan/work/text_arxiv_1007
üìÑ Found 8817 text files.


Chunking text_arxiv_1007.tar: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5462/5462 [00:09<00:00, 552.11it/s]


‚úÖ Processed 5462 / 5462 files ready for DB insert.


Inserting into arxiv_chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 98/98 [00:13<00:00,  7.22it/s]


üßπ Deleted extracted folder: /home/jovyan/work/text_arxiv_1007

üì¶ Processing tar: text_arxiv_1008.tar...
üîç Extracted to: /home/jovyan/work/text_arxiv_1008
üìÑ Found 8633 text files.


Chunking text_arxiv_1008.tar: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5337/5337 [00:09<00:00, 543.09it/s]


‚úÖ Processed 5337 / 5337 files ready for DB insert.


Inserting into arxiv_chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 96/96 [00:12<00:00,  7.41it/s]


üßπ Deleted extracted folder: /home/jovyan/work/text_arxiv_1008

üì¶ Processing tar: text_arxiv_1009.tar...
üîç Extracted to: /home/jovyan/work/text_arxiv_1009
üìÑ Found 9724 text files.


Chunking text_arxiv_1009.tar: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6172/6172 [00:11<00:00, 540.62it/s]


‚úÖ Processed 6172 / 6172 files ready for DB insert.


Inserting into arxiv_chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 111/111 [00:14<00:00,  7.70it/s]


üßπ Deleted extracted folder: /home/jovyan/work/text_arxiv_1009

üì¶ Processing tar: text_arxiv_1010.tar...
üîç Extracted to: /home/jovyan/work/text_arxiv_1010
üìÑ Found 9806 text files.


Chunking text_arxiv_1010.tar: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6212/6212 [00:11<00:00, 544.84it/s]


‚úÖ Processed 6212 / 6212 files ready for DB insert.


Inserting into arxiv_chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 110/110 [00:14<00:00,  7.45it/s]


üßπ Deleted extracted folder: /home/jovyan/work/text_arxiv_1010

üì¶ Processing tar: text_arxiv_1011.tar...
üîç Extracted to: /home/jovyan/work/text_arxiv_1011
üìÑ Found 10562 text files.


Chunking text_arxiv_1011.tar: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6606/6606 [00:12<00:00, 539.35it/s]


‚úÖ Processed 6606 / 6606 files ready for DB insert.


Inserting into arxiv_chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 118/118 [00:14<00:00,  7.87it/s]


üßπ Deleted extracted folder: /home/jovyan/work/text_arxiv_1011

üì¶ Processing tar: text_arxiv_1012.tar...
üîç Extracted to: /home/jovyan/work/text_arxiv_1012
üìÑ Found 9581 text files.


Chunking text_arxiv_1012.tar: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5976/5976 [00:10<00:00, 551.83it/s]


‚úÖ Processed 5976 / 5976 files ready for DB insert.


Inserting into arxiv_chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 105/105 [00:14<00:00,  7.43it/s]


üßπ Deleted extracted folder: /home/jovyan/work/text_arxiv_1012

üì¶ Processing tar: text_arxiv_1101.tar...
üîç Extracted to: /home/jovyan/work/text_arxiv_1101
üìÑ Found 9775 text files.


Chunking text_arxiv_1101.tar: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6012/6012 [00:11<00:00, 543.76it/s]


‚úÖ Processed 6012 / 6012 files ready for DB insert.


Inserting into arxiv_chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 106/106 [00:14<00:00,  7.40it/s]


üßπ Deleted extracted folder: /home/jovyan/work/text_arxiv_1101

üì¶ Processing tar: text_arxiv_1102.tar...
üîç Extracted to: /home/jovyan/work/text_arxiv_1102
üìÑ Found 9227 text files.


Chunking text_arxiv_1102.tar: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5715/5715 [00:10<00:00, 549.99it/s]


‚úÖ Processed 5715 / 5715 files ready for DB insert.


Inserting into arxiv_chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 103/103 [00:13<00:00,  7.50it/s]


üßπ Deleted extracted folder: /home/jovyan/work/text_arxiv_1102

üì¶ Processing tar: text_arxiv_1103.tar...
üîç Extracted to: /home/jovyan/work/text_arxiv_1103
üìÑ Found 10151 text files.


Chunking text_arxiv_1103.tar: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6215/6215 [00:11<00:00, 538.39it/s]


‚úÖ Processed 6215 / 6215 files ready for DB insert.


Inserting into arxiv_chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 114/114 [00:15<00:00,  7.59it/s]


üßπ Deleted extracted folder: /home/jovyan/work/text_arxiv_1103

üì¶ Processing tar: text_arxiv_1104.tar...
üîç Extracted to: /home/jovyan/work/text_arxiv_1104
üìÑ Found 9289 text files.


Chunking text_arxiv_1104.tar: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5652/5652 [00:10<00:00, 521.80it/s]


‚úÖ Processed 5652 / 5652 files ready for DB insert.


Inserting into arxiv_chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 104/104 [00:14<00:00,  7.18it/s]


üßπ Deleted extracted folder: /home/jovyan/work/text_arxiv_1104

üì¶ Processing tar: text_arxiv_1105.tar...
üîç Extracted to: /home/jovyan/work/text_arxiv_1105
üìÑ Found 10321 text files.


Chunking text_arxiv_1105.tar: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6311/6311 [00:11<00:00, 548.00it/s]


‚úÖ Processed 6311 / 6311 files ready for DB insert.


Inserting into arxiv_chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 113/113 [00:15<00:00,  7.30it/s]


üßπ Deleted extracted folder: /home/jovyan/work/text_arxiv_1105

üì¶ Processing tar: text_arxiv_1106.tar...
üîç Extracted to: /home/jovyan/work/text_arxiv_1106
üìÑ Found 10267 text files.


Chunking text_arxiv_1106.tar: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6280/6280 [00:12<00:00, 517.06it/s]


‚úÖ Processed 6280 / 6280 files ready for DB insert.


Inserting into arxiv_chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 117/117 [00:16<00:00,  7.12it/s]


üßπ Deleted extracted folder: /home/jovyan/work/text_arxiv_1106

üì¶ Processing tar: text_arxiv_1107.tar...
üîç Extracted to: /home/jovyan/work/text_arxiv_1107
üìÑ Found 9756 text files.


Chunking text_arxiv_1107.tar: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5986/5986 [00:11<00:00, 516.68it/s]


‚úÖ Processed 5986 / 5986 files ready for DB insert.


Inserting into arxiv_chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 111/111 [00:15<00:00,  7.31it/s]


üßπ Deleted extracted folder: /home/jovyan/work/text_arxiv_1107

üéâ All tar files processed!
üî• Total papers inserted: 227494
‚ùó Total failed inserts: 0


In [63]:
query_count = "SELECT COUNT(*) FROM arxiv_chunks;"
count = pd.read_sql(query_count, engine)
print("‚úÖ Total rows in arxiv_chunks:", count.iloc[0, 0])

‚úÖ Total rows in arxiv_chunks: 2008087


In [64]:
query_preview = "SELECT * FROM arxiv_chunks LIMIT 5;"
preview = pd.read_sql(query_preview, engine)
print("‚úÖ Preview of data:")
print(preview)

‚úÖ Preview of data:
      paper_id  chunk_id     txt_filename query  \
0  0801.3173v1         1  0801.3173v1.txt         
1  0801.3173v1         2  0801.3173v1.txt         
2  0801.1596v1         1  0801.1596v1.txt         
3  0801.1596v1         2  0801.1596v1.txt         
4  0801.1596v1         3  0801.1596v1.txt         

                                                                                                                                                                                                chunk_data  
0  arXiv 0801.3173v1 astro ph 21 Jan 2008 The impact of encounters on the members of Local Group Analogs. A view from GALEX Buson, L. M.1, Bettoni, D.1, Bianchi, L.2, Buzzoni, A.3, Marino, A.1 and Ra...  
1  c? 14.18 1101 18.01 0.01 18.51 0.03 0.50 0.03 NGC 3455 R SAB rs b 12.83 1102 14.719 0.01 NGC 3507 SB s b 11.73 979 16.90 0.03 18.21 0.08 1.31 0.09 UGC 5947 Im pec. 14.75 1251 UGC 6035 IBm 14.30 10...  
2  arXiv 0801.1596v1 cond mat.mes hall 10 Jan 2008 Transp

In [62]:
query_nulls = """
SELECT COUNT(*) 
FROM arxiv_metadata 
WHERE id IS NULL OR txt_filename IS NULL;
"""
nulls = pd.read_sql(query_nulls, engine)
print("‚úÖ Rows with NULL id or txt_filename:", nulls.iloc[0, 0])

‚úÖ Rows with NULL id or txt_filename: 0


In [20]:
paper_id = "0801.0001v1"

query = f"""
SELECT paper_id, chunk_id, txt_filename, query, chunk_data
FROM arxiv_chunks
WHERE paper_id = '{paper_id}'
ORDER BY chunk_id
LIMIT 10;
"""

# üîç Load into a DataFrame
df = pd.read_sql(query, engine)

# üñºÔ∏è Display nicely
pd.set_option('display.max_colwidth', 200)  # So chunk_data isn't truncated badly
df

Unnamed: 0,paper_id,chunk_id,txt_filename,query,chunk_data
0,0801.0001v1,1,0801.0001v1.txt,,"arXiv:0801.0001v1 [math.NT] 2 Jan 2008 LINEAR FORMS AND COMPLEMENTING SETS OF INTEGERS MELVYN B. NATHANSON Abstract. Let œÜ(x1, . . . , xh, y) = u1x1 + ¬∑ ¬∑ ¬∑ + uhxh + vy be a linear form with nonze..."
1,0801.0001v1,2,0801.0001v1.txt,,", ah, b1, . . . , bl) A1 √ó ¬∑ ¬∑ ¬∑ √ó Ah √ó B1 √ó ¬∑ ¬∑ ¬∑ √ó Bl: œÜ(a1, . . . , ah, b1, . . . , bl) n (mod m)}) . If l= 1 and B = (B), then we write œÜ(A, B) = œÜ(A, B), R(œÜ) A,B(n) = R(œÜ) A,B(n), and R(œÜ) A..."
2,0801.0001v1,3,0801.0001v1.txt,,"case l= 1. Suppose that œÜ(x1, . . . , xh, y) = œà(x1, . . . , xh)+vy is a linear form with nonzero integer coefficients, and that A is an h-tuple of finite sets of integers and B is a set of intege..."
3,0801.0001v1,4,0801.0001v1.txt,,"< œà(a1, . . . , ah) gmax for all h-tuples (a1, . . . , ah) /Gmin, it follows that 0 < 1 v œà(a1, . . . , ah) gmin uh gmax gmin uh . Similarly, replacing n by vn + gmax, we obtain the identity |Gmax..."
4,0801.0001v1,5,0801.0001v1.txt,,"l=0 X iIl R(œà) A (l+ im)zl+im. Since zLF(z) = m1 X l=0 X iIl R(œà) A (l+ im)zl+L+im is a polynomial, it follows that l+ L + im 0 for all l{0, 1, . . ., m 1} and i Il. Applying the division algorith..."
5,0801.0001v1,6,0801.0001v1.txt,,"LN N for all N 1, we can assume without loss of generality that LN = N. Consider the linear form œà(x1, . . . , xh) = u1x1 + ¬∑ ¬∑ ¬∑ + uhxh Then œÜ(a1, . . . , ah, b) = œà(a1, . . . , a) + vb for all i..."
6,0801.0001v1,7,0801.0001v1.txt,,"n IN. Then there exists a set B such that RA,B(n) = t for all n Z. Proof. For every integer N 1, there is an integer cN such that IN = [cN LN, cN + LN] Z. Replace the set BN with the set BN cN and..."
