# Milestone 2 — PDF Text Extraction, Section-wise Parsing & Cross-Paper Analysis

**Weeks 3–4 Objective:** Build on Milestone 1 outputs (downloaded PDFs + metadata) to extract text, organize section-wise content, perform key-finding extraction, compare across papers, and validate extraction quality.


## Cell 1 — Install dependencies

In [None]:

# Install the third-party packages imported by this notebook
# (PyMuPDF is the package that provides the `fitz` module).
# The leading "!" hands each line to the shell instead of the Python kernel.
!pip install --upgrade pip
!pip install PyMuPDF pandas tqdm scikit-learn nltk
print("Dependencies installed.")


Dependencies installed.


## Cell 2 — Imports and folder setup

In [None]:

import fitz  # PyMuPDF
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK "punkt" tokenizer data (a no-op when it is already up to date).
nltk.download('punkt')

# Paths (continuing from Milestone 1)
ROOT_DIR = Path("/content/semantic_scholar_results")
PAPERS_DIR = ROOT_DIR / "papers"            # input: PDFs downloaded in Milestone 1
TEXT_DIR = ROOT_DIR / "extracted_text"      # output: one .txt per PDF
SECTION_DIR = ROOT_DIR / "sectioned_text"   # output: one .txt per detected section

# parents=True: on a fresh runtime ROOT_DIR may not exist yet, and a plain
# mkdir() would then raise FileNotFoundError for the missing parent.
for output_dir in (TEXT_DIR, SECTION_DIR):
    output_dir.mkdir(parents=True, exist_ok=True)

print("PDF source:", PAPERS_DIR)
print("Text output:", TEXT_DIR)
print("Section-wise output:", SECTION_DIR)


PDF source: /content/semantic_scholar_results/papers
Text output: /content/semantic_scholar_results/extracted_text
Section-wise output: /content/semantic_scholar_results/sectioned_text


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Cell 3 — PDF text extraction module

In [None]:

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path (str or ``pathlib.Path``) of the PDF to open with PyMuPDF.

    Returns:
        The concatenated page text with leading/trailing whitespace removed;
        an empty string when no page yields any text.
    """
    # The context manager closes the document even if a page fails to parse,
    # so file handles are not leaked while looping over many PDFs.
    with fitz.open(pdf_path) as doc:
        page_texts = [page.get_text() for page in doc]
    # join() builds the result in one pass instead of repeated string +=.
    return "".join(page_texts).strip()

# Extract text from all PDFs
# Sorting makes the row order reproducible (glob order is filesystem-dependent),
# and a concrete list lets tqdm show a real total.
pdf_paths = sorted(PAPERS_DIR.glob("*.pdf"))
if not pdf_paths:
    # Without this the cell finishes silently and every later cell works on nothing.
    print(f"⚠️ No PDF files found in {PAPERS_DIR}.")

records = []
for pdf in tqdm(pdf_paths, desc="Extracting text"):
    text = extract_text_from_pdf(pdf)
    out_file = TEXT_DIR / f"{pdf.stem}.txt"
    out_file.write_text(text, encoding="utf-8")
    records.append({
        "paper": pdf.name,
        "text_length": len(text)
    })

# Explicit columns keep the schema stable even when no PDF was processed,
# so the summary CSV written later still has a header row.
df_text = pd.DataFrame(records, columns=["paper", "text_length"])
df_text


Extracting text: 0it [00:00, ?it/s]

## Cell 4 — Section-wise text extraction

In [None]:

SECTION_HEADERS = [
    "abstract", "introduction", "related work", "methodology",
    "methods", "results", "discussion", "conclusion", "future work"
]

# Matches a line that consists only of a known header, optionally preceded by
# a section number ("1.", "2.3", "IV.") and optionally followed by a plural
# "s" and/or a colon. Anchoring to the whole line keeps ordinary sentences
# that merely end with e.g. "results" from being mistaken for a heading.
_HEADER_ALTERNATION = "|".join(
    r"[ \t]+".join(re.escape(word) for word in header.split())
    for header in SECTION_HEADERS
)
_SECTION_HEADER_RE = re.compile(
    r"^[ \t]*"
    r"(?:(?:\d+(?:\.\d+)*|[ivxlc]+)[.)]?[ \t]+)?"
    rf"({_HEADER_ALTERNATION})s?"
    r"[ \t]*:?[ \t\r]*$",
    re.IGNORECASE | re.MULTILINE | re.ASCII,
)


def split_into_sections(text):
    """Split paper text into the sections named in ``SECTION_HEADERS``.

    A section starts after the first line that is recognised as its heading
    and ends where the next recognised heading (of a different section)
    begins, or at the end of the text for the last one.

    Args:
        text: Full plain text of a paper.

    Returns:
        Dict mapping each detected header (lower-case, as spelled in
        ``SECTION_HEADERS``) to its stripped body text. Keys follow the order
        of ``SECTION_HEADERS``; headers that are not found are omitted.
    """
    # Search the original text case-insensitively rather than a lower-cased
    # copy: lower() can change the length of some Unicode strings, which
    # would misalign the offsets used for slicing.
    first_seen = {}
    for match in _SECTION_HEADER_RE.finditer(text):
        header = " ".join(match.group(1).lower().split())
        # Only the first occurrence of each heading is used.
        first_seen.setdefault(header, (match.start(), match.end()))

    heading_starts = sorted(start for start, _ in first_seen.values())

    sections = {}
    for header in SECTION_HEADERS:
        if header not in first_seen:
            continue
        heading_start, body_start = first_seen[header]
        # Stop at the next heading so sections no longer overlap.
        body_end = min(
            (start for start in heading_starts if start > heading_start),
            default=len(text),
        )
        sections[header] = text[body_start:body_end].strip()
    return sections

section_records = []

# Sorting makes the row order reproducible (glob order is filesystem-dependent),
# and a concrete list lets tqdm show a real total.
txt_paths = sorted(TEXT_DIR.glob("*.txt"))

for txt_file in tqdm(txt_paths, desc="Sectioning"):
    # errors="ignore" drops undecodable bytes instead of aborting the run.
    text = txt_file.read_text(encoding="utf-8", errors="ignore")
    sections = split_into_sections(text)
    for sec, content in sections.items():
        # One output file per (paper, section) pair.
        sec_file = SECTION_DIR / f"{txt_file.stem}_{sec}.txt"
        sec_file.write_text(content, encoding="utf-8")
        section_records.append({
            "paper": txt_file.stem,
            "section": sec,
            "length": len(content)
        })

# Explicit columns keep the schema stable even when no section was detected,
# so the summary CSV written later still has a header row.
df_sections = pd.DataFrame(section_records, columns=["paper", "section", "length"])
df_sections


Sectioning: 0it [00:00, ?it/s]

## Cell 5 — Key-finding extraction logic (TF-IDF)

In [None]:
# Cell 5: Keyword extraction using TF-IDF (ROBUST VERSION)

def clean_text(text):
    """Normalise text to lower-case ASCII letters separated by single spaces.

    Every character other than a-z, A-Z or a space is replaced by a space,
    whitespace runs are collapsed, and the result is trimmed and lower-cased.
    """
    letters_only = re.sub(r"[^a-zA-Z ]", " ", text)
    single_spaced = re.sub(r"\s+", " ", letters_only)
    return single_spaced.lower().strip()

def extract_keywords(texts, top_k=10):
    """Rank terms across ``texts`` by their mean TF-IDF weight.

    Each text is normalised with ``clean_text``; texts with 20 or fewer words
    after cleaning are discarded.

    Returns:
        Up to ``top_k`` ``(term, score)`` pairs sorted by descending score, or
        an empty list when no text survives the filter or the vectorizer
        raises ``ValueError``.
    """
    normalised = (clean_text(raw) for raw in texts)
    # Keep only documents long enough to be meaningful.
    corpus = [doc for doc in normalised if len(doc.split()) > 20]

    if not corpus:
        print("⚠️ No valid text available for keyword extraction.")
        return []

    vectorizer = TfidfVectorizer(stop_words="english", max_features=500)

    try:
        weights = vectorizer.fit_transform(corpus)
        vocabulary = vectorizer.get_feature_names_out()
        # Average each term's weight over all documents.
        mean_weights = weights.mean(axis=0).A1
        ranked = sorted(
            zip(vocabulary, mean_weights),
            key=lambda pair: pair[1],
            reverse=True,
        )
    except ValueError:
        print("⚠️ TF-IDF failed due to empty vocabulary.")
        return []

    return ranked[:top_k]

# Load every extracted text file, skipping (near-)empty extractions.
paper_texts, paper_names = [], []

for txt_path in TEXT_DIR.glob("*.txt"):
    raw_text = txt_path.read_text(encoding="utf-8", errors="ignore")
    if len(raw_text.strip()) <= 100:
        # Too short to be a real extraction.
        continue
    paper_texts.append(raw_text)
    paper_names.append(txt_path.stem)

# Keywords ranked over the whole collection of papers.
global_keywords = extract_keywords(paper_texts, top_k=15)

keywords_df = pd.DataFrame(global_keywords, columns=["keyword", "score"])
keywords_df


⚠️ No valid text available for keyword extraction.


Unnamed: 0,keyword,score


## Cell 6 — Cross-paper comparison module

In [None]:
# One row per paper: its name and the top-10 keyword terms (scores dropped).
# NOTE(review): extract_keywords is called with a single document per paper,
# so the ranking is computed from that paper alone — confirm this is intended.
comparison_data = [
    {
        "paper": name,
        "keywords": [term for term, _ in extract_keywords([body], top_k=10)],
    }
    for name, body in zip(paper_names, paper_texts)
]

comparison_df = pd.DataFrame(comparison_data)
comparison_df


## Cell 7 — Validation of correctness & completeness

In [None]:
# Flag each extraction as valid when it produced more than 500 characters
# (a minimal sanity check, not a quality measure).
validation_records = [
    {
        "paper": entry.paper,
        "text_length": entry.text_length,
        "valid_extraction": entry.text_length > 500,
    }
    for entry in df_text.itertuples()
]

validation_df = pd.DataFrame(validation_records)
validation_df


## Cell 8 — Save outputs and milestone completion summary

In [None]:
# Write each summary table to ROOT_DIR as CSV (without the index column).
csv_outputs = {
    "text_extraction_summary.csv": df_text,
    "section_extraction_summary.csv": df_sections,
    "global_keywords.csv": keywords_df,
    "cross_paper_keywords.csv": comparison_df,
    "validation_report.csv": validation_df,
}
for filename, frame in csv_outputs.items():
    frame.to_csv(ROOT_DIR / filename, index=False)

print("All Milestone 2 outputs saved.")
print("Milestone 2 completed successfully.")
