<a href="https://colab.research.google.com/github/adeshsingh5505/pdfExtract/blob/main/task1(b).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pdfplumber scikit-learn pandas

Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
import pdfplumber
import pandas as pd
import re

def extract_blocks_features_to_csv(pdf_path, output_csv):
    blocks = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, 1):
            last_bottom = 0
            words = page.extract_words(extra_attrs=["fontname", "size", "adv", "upright"])

            # Group by Y coordinate (approximate lines)
            lines = {}
            for w in words:
                top_rounded = round(w['top'] / 3) * 3  # lines within ~3pt
                lines.setdefault(top_rounded, []).append(w)

            for top in sorted(lines):
                line_words = lines[top]
                text = " ".join(w["text"] for w in line_words)
                font_sizes = [w["size"] for w in line_words if w.get("size")]
                fontnames = [w["fontname"] for w in line_words if w.get("fontname")]
                is_bold_list = [1 if ("Bold" in str(f) or "bold" in str(f).lower()) else 0 for f in fontnames]

                font_size = max(font_sizes) if font_sizes else 0
                font_name = fontnames[0] if fontnames else ""
                is_bold = max(is_bold_list) if is_bold_list else 0

                stripped_text = text.strip()
                if stripped_text.isupper():
                    text_case = "ALLCAPS"
                elif stripped_text.istitle():
                    text_case = "Title"
                elif stripped_text.islower():
                    text_case = "lower"
                else:
                    text_case = "mixed"

                # Heuristic: numbered prefix pattern
                prefix_pattern = 0
                if re.match(r"^(\d+\.)+(\s|$)", stripped_text):
                    prefix_pattern = 1

                whitespace_above = max(0, top - last_bottom) if last_bottom else top
                line_length = len(stripped_text.split())

                blocks.append({
                    "page_num": page_num,
                    "text": text,
                    "font_size": font_size,
                    "font_name": font_name,
                    "is_bold": is_bold,
                    "y0": top,
                    "whitespace_above": whitespace_above,
                    "prefix_pattern": prefix_pattern,
                    "text_case": text_case,
                    "line_length": line_length,
                    "label": ""  # fill this manually ("Title", "H1", "H2", etc.) after export
                })

                last_bottom = top

    df = pd.DataFrame(blocks)
    df.to_csv(output_csv, index=False)
    print(f"Extracted {len(df)} rows to {output_csv}")

# Usage
extract_blocks_features_to_csv("/content/Gmail - Application Submit.pdf", "train_blocks_unlabeled.csv")




Extracted 37 rows to train_blocks_unlabeled.csv


**Feature Extraction**

In [2]:
import pdfplumber
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import pickle

# Corrected extract_blocks_features function
def extract_blocks_features(pdf_path):
    blocks = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            last_y0 = 0  # Track last y0 for whitespace calculation
            for page_num, page in enumerate(pdf.pages):
                for obj in page.extract_words(extra_attrs=["fontname", "size"]):
                    text = obj["text"].strip()
                    if not text:  # Skip empty text
                        continue
                    block = {
                        "text": text,
                        "font_size": obj["size"],
                        "font_name": obj["fontname"],
                        "is_bold": 1 if "Bold" in obj["fontname"] or "bold" in obj["fontname"].lower() else 0,
                        "y0": obj["top"],
                        "page_num": page_num + 1,
                        "line_length": len(text.split()),
                        "whitespace_above": obj["top"] - last_y0 if blocks else obj["top"],
                    }
                    # Prefix pattern
                    block["prefix_pattern"] = 1 if text.split() and text.split()[0].rstrip(".").replace(".", "").isdigit() else 0
                    # Text case
                    block["text_case"] = ("ALLCAPS" if text.isupper() else
                                         "Title" if text.istitle() else
                                         "lower" if text.islower() else "mixed")
                    blocks.append(block)
                    last_y0 = obj["top"]
        return pd.DataFrame(blocks)
    except FileNotFoundError:
        print(f"Error: PDF file {pdf_path} not found.")
        return pd.DataFrame()
    except Exception as e:
        print(f"Error processing PDF: {e}")
        return pd.DataFrame()

# Load and train model (assuming CSV is correctly formatted)
try:
    df = pd.read_csv("/content/train_blocks_unlabeled (4).csv")
    df = pd.get_dummies(df, columns=["font_name", "text_case"])
    X = df.drop(["label", "text"], axis=1)
    y = df["label"]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    dtree = DecisionTreeClassifier(max_depth=6, min_samples_leaf=8, random_state=42)
    dtree.fit(X_train, y_train)
    print(f"Validation Accuracy: {dtree.score(X_test, y_test):.2f}")

    # Save training columns for inference
    with open("training_columns.pkl", "wb") as f:
        pickle.dump(X.columns.tolist(), f)

except FileNotFoundError:
    print("Error: Training CSV file not found.")
    exit()
except Exception as e:
    print(f"Error during training: {e}")
    exit()

# Inference on new PDF
try:
    new_df = extract_blocks_features("/content/file05.pdf")
    if new_df.empty:
        print("Error: No data extracted from PDF.")
        exit()

    new_df = pd.get_dummies(new_df, columns=["font_name", "text_case"])

    # Load training columns
    with open("training_columns.pkl", "rb") as f:
        training_columns = pickle.load(f)

    # Align columns
    new_X = new_df.reindex(columns=training_columns, fill_value=0)
    predicted_labels = dtree.predict(new_X)

    results = new_df[["page_num", "text"]].copy()
    results["predicted_label"] = predicted_labels
    print(results)

except Exception as e:
    print(f"Error during prediction: {e}")

Validation Accuracy: 0.96
    page_num              text predicted_label
0          1          ADDRESS:           Other
1          1           TOPJUMP           Other
2          1              3735           Other
3          1           PARKWAY           Other
4          1            PIGEON           Other
5          1            FORGE,           Other
6          1                TN           Other
7          1             37863           Other
8          1             (NEAR           Other
9          1             DIXIE           Other
10         1          STAMPEDE           Other
11         1                ON           Other
12         1               THE           Other
13         1          PARKWAY)           Other
14         1             RSVP:           Other
15         1  ----------------              H2
16         1            CLOSED           Other
17         1              TOED           Other
18         1             SHOES           Other
19         1               ARE    

**Training**

In [4]:
import pandas as pd
df = pd.read_csv("/content/train_blocks_unlabeled (4).csv")  # Hand-labeled dataset

# Encode categorical variables
df = pd.get_dummies(df, columns=["font_name", "text_case"])
X = df.drop(["label", "text"], axis=1)
y = df["label"]


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y,stratify=y, random_state=42)

dtree = DecisionTreeClassifier(max_depth=6, min_samples_leaf=8, random_state=42)
dtree.fit(X_train, y_train)

print(f"Validation Accuracy: {dtree.score(X_test, y_test):.2f}")


Validation Accuracy: 0.96


**Extraction on new files**

In [8]:
# For inference on new PDF
new_df = extract_blocks_features("/content/file02.pdf")
new_df = pd.get_dummies(new_df, columns=["font_name", "text_case"])

# Ensure columns align with training
new_X = new_df.reindex(columns=X.columns, fill_value=0)
predicted_labels = dtree.predict(new_X)

results = new_df[["page_num", "text"]].copy()
results["predicted_label"] = predicted_labels
print(results)


      page_num            text predicted_label
0            1        Overview              H3
1            1      Foundation              H3
2            1           Level              H3
3            1      Extensions              H3
4            1         Version           Other
...        ...             ...             ...
2381        12   International           Other
2382        12        Software           Other
2383        12         Testing           Other
2384        12  Qualifications           Other
2385        12           Board           Other

[2386 rows x 3 columns]
