In [None]:
# LLM HACKATHON 2025
# Data Cleaning Script for Alloy Datasets
# Author: Staradutt
import fitz  # PyMuPDF
import os
import hashlib

# Function to extract raw text from one PDF
def extract_text_from_pdf(pdf_path):
    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        print(f"Skipping {pdf_path}: {e}")
        return None  # Return None if opening fails

    text = ""
    for page in doc:
        text += page.get_text("text") + "\n"
    return text

# Deduplication helper (hash content to detect duplicates)
def get_text_hash(text):
    return hashlib.md5(text.encode("utf-8")).hexdigest()

# Main loop: load all PDFs into dict
folder = "pdfs/all"  # folder containing your 3000+ open access PDFs
corpus = {}
seen_hashes = set()

for file in os.listdir(folder):
    if file.lower().endswith(".pdf"):
        pdf_path = os.path.join(folder, file)
        text = extract_text_from_pdf(pdf_path)
        if text is None:
            continue  # Skip this file due to error
        text_hash = get_text_hash(text)
        if text_hash not in seen_hashes:  # skip duplicates
            corpus[file] = text
            seen_hashes.add(text_hash)

print(f"Successfully processed {len(corpus)} unique PDFs.")


Skipping pdfs/all\054116_1_5.0137117.pdf: Failed to open file 'pdfs/all\\054116_1_5.0137117.pdf'.
MuPDF error: format error: cannot find page 99 in page tree

MuPDF error: format error: cannot find page 104 in page tree

MuPDF error: format error: cannot find page 108 in page tree

MuPDF error: format error: cannot find page 110 in page tree

MuPDF error: format error: cannot find page 112 in page tree

MuPDF error: format error: cannot find page 112 in page tree

MuPDF error: format error: cannot find page 112 in page tree

MuPDF error: format error: cannot find page 113 in page tree

MuPDF error: format error: cannot find page 113 in page tree

MuPDF error: format error: cannot find page 114 in page tree

MuPDF error: format error: cannot find page 117 in page tree

MuPDF error: format error: cannot find page 122 in page tree

MuPDF error: format error: cannot find page 124 in page tree

MuPDF error: format error: cannot find page 126 in page tree

MuPDF error: format error: cannot f

In [2]:
import re

def extract_sections(text):
    sections = {}
    patterns = {
        "abstract": r"(?i)(abstract)(.*?)(?=\n\s*(introduction|1\.|I\.))",
        "introduction": r"(?i)(introduction)(.*?)(?=\n\s*(methods|experimental|2\.|II\.))",
        "methods": r"(?i)(methods|experimental)(.*?)(?=\n\s*(results|discussion|3\.|III\.))",
        "conclusion": r"(?i)(conclusion|summary)(.*?)(?=\n\s*references|\Z)",
    }
    for name, pattern in patterns.items():
        match = re.search(pattern, text, re.S)
        if match:
            sections[name] = match.group(0)
    return sections

# Example: extract sections for one PDF
sections = extract_sections(list(corpus.values())[0])
print(sections.keys())



dict_keys(['abstract', 'introduction', 'methods', 'conclusion'])


In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,   # adjust if needed
    chunk_overlap=50,
    separators=["\n\n", "\n", ".", " "]
)

chunks = []
for file, text in corpus.items():
    sections = extract_sections(text)
    for sec_name, sec_text in sections.items():
        # include filename + section for traceability
        split = splitter.split_text(sec_text)
        chunks.extend([f"{file} - {sec_name}: {chunk}" for chunk in split])

print("Total chunks:", len(chunks))


Total chunks: 253733


In [4]:
import pickle

# Save all chunks
with open("chunks_new.pkl", "wb") as f:
    pickle.dump(chunks, f)