In [1]:
import pdfplumber
import pandas as pd
import re
from pathlib import Path


In [2]:
# Source PDF, given relative to the notebook's working directory.
PDF_PATH = Path("../data/raw/NCO_2015_stripped.pdf")

# Fail fast with an explicit error before any extraction work starts.
if PDF_PATH.exists():
    print("PDF found:", PDF_PATH)
else:
    raise FileNotFoundError(f"PDF not found at: {PDF_PATH}")


PDF found: ..\data\raw\NCO_2015_stripped.pdf


In [5]:
# One record per page that yielded text: {"page_no": <1-based page number>, "text": <page text>}.
all_pages_text = []

with pdfplumber.open(PDF_PATH) as pdf:
    total_pages = len(pdf.pages)
    print("Total Pages:", total_pages)

    # `i` is the 0-based position of the page; the stored page number is i + 1.
    for i, page in enumerate(pdf.pages):
        text = page.extract_text()

        # Pages for which extract_text() returns nothing (None or "") are skipped,
        # so "Extracted pages" below may be smaller than "Total Pages".
        if not text:
            continue

        all_pages_text.append({"page_no": i + 1, "text": text})

print("Extracted pages:", len(all_pages_text))


Total Pages: 1486
Extracted pages: 1486


In [6]:
# Build the page-level table from the extracted records.
# The columns are named explicitly so the frame always exposes "page_no" and
# "text", even when `all_pages_text` is empty (otherwise an empty list would
# produce a frame with no columns and later column lookups would raise KeyError).
df_raw_pages = pd.DataFrame(all_pages_text, columns=["page_no", "text"])
df_raw_pages.head()


Unnamed: 0,page_no,text
0,1,NATIONAL CLASSIFICATION OF OCCUPATIONS - 2015\...
1,2,NATIONAL CLASSIFICATION OF OCCUPATIONS - 2015\...
2,3,NATIONAL CLASSIFICATION OF OCCUPATIONS - 2015\...
3,4,NATIONAL CLASSIFICATION OF OCCUPATIONS - 2015\...
4,5,NATIONAL CLASSIFICATION OF OCCUPATIONS - 2015\...


In [7]:
# Destination CSV for the raw per-page text, relative to the notebook's working directory.
OUTPUT_PATH = Path("../data/processed/nco_raw_pages.csv")

# Create the target directory (and any missing parents); no error if it already exists.
OUTPUT_PATH.parent.mkdir(exist_ok=True, parents=True)

# index=False: write only the frame's columns, not the DataFrame index.
df_raw_pages.to_csv(path_or_buf=OUTPUT_PATH, index=False)
print("Saved raw extracted pages to:", OUTPUT_PATH)


Saved raw extracted pages to: ..\data\processed\nco_raw_pages.csv


In [8]:
sample_page = 5  # change if you want

# Select the text of the row(s) whose page_no equals the requested page.
sample_matches = df_raw_pages.loc[df_raw_pages["page_no"] == sample_page, "text"]

# A page number with no matching row would otherwise surface as an opaque
# "index 0 is out of bounds" error; raise the same exception type with a
# message that names the page instead.
if sample_matches.empty:
    raise IndexError(
        f"No extracted text for page {sample_page}; "
        "choose a page_no that is present in df_raw_pages."
    )

# Print only the first 2000 characters of the first match to keep the output short.
print(sample_matches.iloc[0][:2000])


NATIONAL CLASSIFICATION OF OCCUPATIONS - 2015
NCO 2015 NCO 2004
1213.0101 Quality Assurance Standards In-Charge
1213.0102 Manager – Customer Quality
1213.0200 Manager, Automotive Service Station 1239.20
1213.0201 Area Service Manager
1213.0202 Territory Service Manager
1213.9900 Managers, Other Services 1239.90
Family 1219 Business Services and Administration Managers
Not Elsewhere Classified
1219.0100 Manager, Import and Export 1227.30
1219.0101 Export Manager
1219.0200 Manager, Health Club/Manager, Fitness 1228.10
Club/Centre/Gym
Group 122 Sales, Marketing and Development Managers
Family 1221 Sales and Marketing Managers
1221.0100 Sales Manager (Wholesale Trade) 1233.10
1221.0200 Sales Manager (Retail Trade) 1233.20
1221.0301 Home Delivery Manager
1221.0401 Territory Sales Manager (Broadband)
1221.0501 Territory Sales Manager (Prepaid)
1221.0601 Sales Co-Ordinator (Media Org)
1221.0602 Sales Manager (Media Org)
1221.9900 Sales and Marketing Managers, Other 1233.90
Family 1222 Adver