# Milestone 2 – PDF Text Extraction, Section-wise Analysis & Validation

**Weeks 3–4 Deliverables**
- PDF text extraction
- Section-wise parsing
- Key information extraction
- Cross-paper comparison
- Validation of extracted content

This notebook is a **GitHub-safe version** (no widget metadata).
**Prerequisite:** PDFs from Milestone 1 must exist in `/content/semantic_scholar_results/papers/` (the `PAPERS_DIR` configured in Cell 2).


## Cell 1 — Install required dependencies

In [None]:
!pip install --upgrade pip
!pip install PyMuPDF pandas scikit-learn nltk
print('Dependencies installed successfully')

## Cell 2 — Imports and folder setup

In [None]:
import fitz
import pandas as pd
from pathlib import Path
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK 'punkt' resource.
nltk.download('punkt')

# All inputs and outputs of this notebook live under a single root folder.
ROOT_DIR = Path('/content/semantic_scholar_results')
PAPERS_DIR = ROOT_DIR.joinpath('papers')            # source PDFs
TEXT_DIR = ROOT_DIR.joinpath('extracted_text')      # one .txt per PDF
SECTION_DIR = ROOT_DIR.joinpath('sectioned_text')   # one .txt per detected section

# Create the root first, then each sub-folder; existing folders are left untouched.
for folder in (ROOT_DIR, PAPERS_DIR, TEXT_DIR, SECTION_DIR):
    folder.mkdir(parents=True, exist_ok=True)

print('Folders initialized')

## Cell 3 — PDF text extraction

In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the concatenated plain text of every page in a PDF.

    Parameters
    ----------
    pdf_path : str or pathlib.Path
        Location of the PDF; passed unchanged to ``fitz.open``.

    Returns
    -------
    str
        The text of all pages joined in page order, with leading and
        trailing whitespace stripped.
    """
    # The context manager closes the document (releasing its file handle)
    # even when a page fails to parse; the original never closed it.
    with fitz.open(pdf_path) as doc:
        return ''.join(page.get_text() for page in doc).strip()

# Extract the text of every PDF in PAPERS_DIR, save it next to the others in
# TEXT_DIR, and record how many characters each paper yielded.
records = []
# Sorted so rows come out in a reproducible order (glob order depends on the filesystem).
for pdf in sorted(PAPERS_DIR.glob('*.pdf')):
    try:
        content = extract_text_from_pdf(pdf)
    except Exception as exc:
        # One unreadable/corrupt PDF must not abort the whole batch. It is
        # still recorded, with length 0, so it stays visible in the summary.
        print(f'Could not extract text from {pdf.name}: {exc}')
        content = ''
    else:
        out = TEXT_DIR / f'{pdf.stem}.txt'
        out.write_text(content, encoding='utf-8')
    records.append({'paper': pdf.name, 'text_length': len(content)})

# Explicit columns keep the schema (and the CSV header) stable when no PDFs were found.
df_text = pd.DataFrame(records, columns=['paper', 'text_length'])
df_text

## Cell 4 — Section-wise text extraction

In [None]:
SECTION_HEADERS = ['abstract','introduction','related work','methodology','methods','results','discussion','conclusion','future work']

# Optional heading numbering in front of a header: "1 ", "2.1 ", "3. " or
# roman numerals followed by a dot such as "IV. ".
_HEADING_NUMBER = r'(?:(?:\d+(?:\.\d+)*\.?|[ivxlcdm]+\.)[ \t]+)?'


def split_into_sections(text, max_chars=3000):
    """Split paper text into sections keyed by the headers in SECTION_HEADERS.

    A header is recognised when it stands alone on a line (any letter case),
    optionally preceded by heading numbering and surrounded by spaces/tabs.
    Only the first occurrence of each header is used.

    Parameters
    ----------
    text : str
        Full text of one paper.
    max_chars : int, optional
        Upper bound on the number of characters kept per section
        (default 3000).

    Returns
    -------
    dict[str, str]
        Maps each detected header (as spelled in SECTION_HEADERS) to the text
        that follows it, ending at the next detected header or after
        ``max_chars`` characters, whichever comes first. Headers that are not
        found are absent from the result.
    """
    # Match case-insensitively on the ORIGINAL text. Searching text.lower()
    # and slicing text with those offsets is unsafe because lowercasing can
    # change the length of a string (e.g. U+0130 becomes two code points).
    flags = re.IGNORECASE | re.MULTILINE

    found = {}
    for header in SECTION_HEADERS:
        # Tolerate runs of spaces/tabs between the words of a header.
        words = r'[ \t]+'.join(re.escape(word) for word in header.split())
        # '^' (with MULTILINE) also lets a header on the very first line match.
        pattern = rf'^[ \t]*{_HEADING_NUMBER}{words}[ \t]*\r?\n'
        match = re.search(pattern, text, flags)
        if match:
            found[header] = match

    # Start offsets of all detected heading lines, used to stop a section
    # where the next one begins instead of bleeding into it.
    heading_starts = sorted(match.start() for match in found.values())

    sections = {}
    for header, match in found.items():
        start = match.end()
        next_heading = next((pos for pos in heading_starts if pos >= start), len(text))
        sections[header] = text[start:min(start + max_chars, next_heading)]
    return sections

# Split every extracted text file into sections, write each section to its own
# file in SECTION_DIR, and record the length of each section.
section_records = []
# Sorted so rows come out in a reproducible order (glob order depends on the filesystem).
for txt in sorted(TEXT_DIR.glob('*.txt')):
    text = txt.read_text(encoding='utf-8', errors='ignore')
    sections = split_into_sections(text)
    for sec, content in sections.items():
        out = SECTION_DIR / f'{txt.stem}_{sec}.txt'
        out.write_text(content, encoding='utf-8')
        section_records.append({'paper': txt.stem, 'section': sec, 'length': len(content)})

# Explicit columns keep the schema (and the CSV header) stable when no section was detected.
df_sections = pd.DataFrame(section_records, columns=['paper', 'section', 'length'])
df_sections

## Cell 5 — Key information extraction

In [None]:
def clean_text(text):
    """Normalise text to lowercase ASCII words separated by single spaces.

    Every character that is not an ASCII letter acts as a word separator, so
    digits, punctuation, newlines and non-ASCII characters are dropped.
    """
    ascii_words = re.findall(r'[a-zA-Z]+', text)
    return ' '.join(ascii_words).lower()

# Build the corpus: one cleaned string per paper, skipping papers whose
# cleaned text has 20 words or fewer.
texts, names = [], []
# Sorted so `names`/`texts` (and everything derived from them) have a reproducible order.
for txt in sorted(TEXT_DIR.glob('*.txt')):
    t = clean_text(txt.read_text(encoding='utf-8', errors='ignore'))
    if len(t.split()) > 20:
        texts.append(t)
        names.append(txt.stem)

if texts:
    vectorizer = TfidfVectorizer(stop_words='english', max_features=500)
    tfidf = vectorizer.fit_transform(texts)
    features = vectorizer.get_feature_names_out()
    # Average TF-IDF weight of each term across all papers.
    scores = tfidf.mean(axis=0).A1
    # Keep the 15 terms with the highest average weight.
    keywords = sorted(zip(features, scores), key=lambda x: x[1], reverse=True)[:15]
else:
    # Fitting a vectorizer on an empty corpus raises; report no keywords instead.
    print('No usable text found - skipping keyword extraction.')
    keywords = []

keywords_df = pd.DataFrame(keywords, columns=['keyword','score'])
keywords_df

## Cell 6 — Cross-paper comparison

In [None]:
# Per-paper keywords: the terms with the highest TF-IDF weight in each paper.
#
# The vectorizer is fitted on the WHOLE corpus so the IDF part can down-weight
# terms that every paper shares; fitted on a single document, IDF is constant.
# Terms are then ranked by weight: get_feature_names_out() is in alphabetical
# order, so slicing it directly yields the first terms of the alphabet rather
# than the most important ones.
TOP_KEYWORDS_PER_PAPER = 10

comparison = []
if texts:
    vec = TfidfVectorizer(stop_words='english', max_features=500)
    matrix = vec.fit_transform(texts)
    vocab = vec.get_feature_names_out()
    for idx, name in enumerate(names):
        weights = matrix[idx].toarray().ravel()
        ranked = sorted(zip(vocab, weights), key=lambda pair: pair[1], reverse=True)
        # Drop zero-weight terms (absent from this paper) and convert NumPy
        # string scalars to plain str so the CSV shows plain words.
        kws = [str(term) for term, weight in ranked[:TOP_KEYWORDS_PER_PAPER] if weight > 0]
        comparison.append({'paper': name, 'keywords': kws})

# Explicit columns keep the schema stable when there are no papers to compare.
comparison_df = pd.DataFrame(comparison, columns=['paper', 'keywords'])
comparison_df

## Cell 7 — Validation

In [None]:
# An extraction is flagged as valid when it produced more than 500 characters of text.
validation = [
    {
        'paper': row.paper,
        'text_length': row.text_length,
        'valid_extraction': row.text_length > 500,
    }
    for row in df_text.itertuples()
]

validation_df = pd.DataFrame(validation)
validation_df

## Cell 8 — Save outputs

In [None]:
# Write each summary table to a CSV file under ROOT_DIR, without the index column.
outputs = (
    ('text_extraction_summary.csv', df_text),
    ('section_extraction_summary.csv', df_sections),
    ('global_keywords.csv', keywords_df),
    ('cross_paper_keywords.csv', comparison_df),
    ('validation_report.csv', validation_df),
)
for filename, frame in outputs:
    frame.to_csv(ROOT_DIR / filename, index=False)

print('Milestone 2 completed successfully.')