In [1]:
import pandas as pd
import re
from pathlib import Path

RAW_PATH = Path("../data/processed/nco_raw_pages.csv")

df_pages = pd.read_csv(RAW_PATH)
print("Loaded pages:", df_pages.shape)

df_pages.head()


Loaded pages: (1486, 2)


Unnamed: 0,page_no,text
0,1,NATIONAL CLASSIFICATION OF OCCUPATIONS - 2015\...
1,2,NATIONAL CLASSIFICATION OF OCCUPATIONS - 2015\...
2,3,NATIONAL CLASSIFICATION OF OCCUPATIONS - 2015\...
3,4,NATIONAL CLASSIFICATION OF OCCUPATIONS - 2015\...
4,5,NATIONAL CLASSIFICATION OF OCCUPATIONS - 2015\...


In [2]:
full_text = "\n".join(df_pages["text"].dropna().tolist())

print("Total characters:", len(full_text))
print(full_text[:2000])


Total characters: 3414214
NATIONAL CLASSIFICATION OF OCCUPATIONS - 2015
NCO 2015 NCO 2004
Division 1 Managers
Sub- 11 Chief Executives, Senior Officials and Legislators
Division
Group 111 Legislators and Senior Officials
Family 1111 Legislators
1111.0100 Elected Official, Union Government 1111.10
1111.0200 Elected Official, State Government 1112.10
1111.0300 Elected Official, Local Bodies 1113.10
1111.9900 Legislators, Other 1119.90
Family 1112 Senior Government Officials
1112.0100 Administrative Official, Union Government 1121.10
1112.0200 Diplomat 1121.20
1112.0300 Executive Officials, Union Government 1121.30
1112.0400 Administrative Official, State Government 1122.10
1112.0500 Executive Official, State Government 1122.20
1112.0600 Administrative and Executive Officials, Quasi 1123.10
Government (Central)
1112.0700 Administrative & Executive Officials, Quasi 1123.20
Government (State)
1112.0800 Administrative and Executive Officials, Local 1124.10
Bodies
1112.9900 Senior Government 

In [3]:
def clean_noise(text: str) -> str:
    text = re.sub(r"NATIONAL CLASSIFICATION OF OCCUPATIONS\s*-\s*2015", " ", text)
    text = re.sub(r"\n+\s*\n+", "\n", text)
    text = re.sub(r"[ ]{2,}", " ", text)
    return text.strip()

cleaned_text = clean_noise(full_text)

print("Cleaned characters:", len(cleaned_text))
print(cleaned_text[:2000])


Cleaned characters: 3345858
NCO 2015 NCO 2004
Division 1 Managers
Sub- 11 Chief Executives, Senior Officials and Legislators
Division
Group 111 Legislators and Senior Officials
Family 1111 Legislators
1111.0100 Elected Official, Union Government 1111.10
1111.0200 Elected Official, State Government 1112.10
1111.0300 Elected Official, Local Bodies 1113.10
1111.9900 Legislators, Other 1119.90
Family 1112 Senior Government Officials
1112.0100 Administrative Official, Union Government 1121.10
1112.0200 Diplomat 1121.20
1112.0300 Executive Officials, Union Government 1121.30
1112.0400 Administrative Official, State Government 1122.10
1112.0500 Executive Official, State Government 1122.20
1112.0600 Administrative and Executive Officials, Quasi 1123.10
Government (Central)
1112.0700 Administrative & Executive Officials, Quasi 1123.20
Government (State)
1112.0800 Administrative and Executive Officials, Local 1124.10
Bodies
1112.9900 Senior Government Officials, Other 1129.90
Family 1113 Traditi

In [4]:
OUTPUT_TXT = Path("../data/processed/nco_cleaned_text.txt")
OUTPUT_TXT.write_text(cleaned_text, encoding="utf-8")

print("Saved cleaned text to:", OUTPUT_TXT)


Saved cleaned text to: ..\data\processed\nco_cleaned_text.txt


In [5]:
# Try to find 4-digit codes (common in NCO format)
codes_found = re.findall(r"\b\d{4}\b", cleaned_text)

print("Total 4-digit code matches:", len(codes_found))
print("First 50 codes:", codes_found[:50])


Total 4-digit code matches: 32278
First 50 codes: ['2015', '2004', '1111', '1111', '0100', '1111', '1111', '0200', '1112', '1111', '0300', '1113', '1111', '9900', '1119', '1112', '1112', '0100', '1121', '1112', '0200', '1121', '1112', '0300', '1121', '1112', '0400', '1122', '1112', '0500', '1122', '1112', '0600', '1123', '1112', '0700', '1123', '1112', '0800', '1124', '1112', '9900', '1129', '1113', '1114', '2015', '2004', '1114', '0100', '1141']


In [6]:
def show_context(text, code="1111", window=500):
    idx = text.find(code)
    if idx == -1:
        print("Code not found")
        return
    start = max(0, idx - window)
    end = min(len(text), idx + window)
    print(text[start:end])

show_context(cleaned_text, "1111", 400)


NCO 2015 NCO 2004
Division 1 Managers
Sub- 11 Chief Executives, Senior Officials and Legislators
Division
Group 111 Legislators and Senior Officials
Family 1111 Legislators
1111.0100 Elected Official, Union Government 1111.10
1111.0200 Elected Official, State Government 1112.10
1111.0300 Elected Official, Local Bodies 1113.10
1111.9900 Legislators, Other 1119.90
Family 1112 Senior Government Officials
1112.0100 Administrative Official, Union Government 1121.10
1112.0200 Diplomat 1121.20
1112.0300 Executive Officials, Union Government 1121.30
1112.040


In [7]:
show_context(cleaned_text, "1121", 400)


tives, Senior Officials and Legislators
Division
Group 111 Legislators and Senior Officials
Family 1111 Legislators
1111.0100 Elected Official, Union Government 1111.10
1111.0200 Elected Official, State Government 1112.10
1111.0300 Elected Official, Local Bodies 1113.10
1111.9900 Legislators, Other 1119.90
Family 1112 Senior Government Officials
1112.0100 Administrative Official, Union Government 1121.10
1112.0200 Diplomat 1121.20
1112.0300 Executive Officials, Union Government 1121.30
1112.0400 Administrative Official, State Government 1122.10
1112.0500 Executive Official, State Government 1122.20
1112.0600 Administrative and Executive Officials, Quasi 1123.10
Government (Central)
1112.0700 Administrative & Executive Officials, Quasi 1123.20
Government (State)
1112.0800 Administrative and


In [8]:
show_context(cleaned_text, "1141", 400)



Government (Central)
1112.0700 Administrative & Executive Officials, Quasi 1123.20
Government (State)
1112.0800 Administrative and Executive Officials, Local 1124.10
Bodies
1112.9900 Senior Government Officials, Other 1129.90
Family 1113 Traditional Chiefs and Heads of Villages
Family 1114 Senior Officials of Special Interest Organizations
VOLUME I 33
NCO 2015 NCO 2004
1114.0100 Political Worker 1141.10
1114.0200 Senior Officials of Employers, Workers and Other 1142.00
Economic Interest Organizations
1114.0300 Director, Disaster Management Services 1143.10
1114.9900 Senior Officials of Special Interest Organizations, 1143.90
Other
Group 112 Managing Directors and Chief Executives
Family 1120 Managing Directors and Chief Executives
1120.0100 Working Proprietor, Electricity 1211.10
1120.020


In [9]:
pattern = r"(\d{4}\.\d{4})\s+(.+?)\s+(\d{4}\.\d{2})"

matches = re.findall(pattern, cleaned_text)

print("Total occupation records found:", len(matches))
print("First 10 records:\n", matches[:10])


Total occupation records found: 3599
First 10 records:
 [('1111.0100', 'Elected Official, Union Government', '1111.10'), ('1111.0200', 'Elected Official, State Government', '1112.10'), ('1111.0300', 'Elected Official, Local Bodies', '1113.10'), ('1111.9900', 'Legislators, Other', '1119.90'), ('1112.0100', 'Administrative Official, Union Government', '1121.10'), ('1112.0200', 'Diplomat', '1121.20'), ('1112.0300', 'Executive Officials, Union Government', '1121.30'), ('1112.0400', 'Administrative Official, State Government', '1122.10'), ('1112.0500', 'Executive Official, State Government', '1122.20'), ('1112.0600', 'Administrative and Executive Officials, Quasi', '1123.10')]


In [10]:
df_occ = pd.DataFrame(matches, columns=["nco_2015_code", "title", "nco_2004_code"])

df_occ.head(20)


Unnamed: 0,nco_2015_code,title,nco_2004_code
0,1111.01,"Elected Official, Union Government",1111.1
1,1111.02,"Elected Official, State Government",1112.1
2,1111.03,"Elected Official, Local Bodies",1113.1
3,1111.99,"Legislators, Other",1119.9
4,1112.01,"Administrative Official, Union Government",1121.1
5,1112.02,Diplomat,1121.2
6,1112.03,"Executive Officials, Union Government",1121.3
7,1112.04,"Administrative Official, State Government",1122.1
8,1112.05,"Executive Official, State Government",1122.2
9,1112.06,"Administrative and Executive Officials, Quasi",1123.1


In [11]:
df_occ["title"] = df_occ["title"].str.strip()
df_occ = df_occ.drop_duplicates()

print("After duplicates removed:", df_occ.shape)
df_occ.head()


After duplicates removed: (3598, 3)


Unnamed: 0,nco_2015_code,title,nco_2004_code
0,1111.01,"Elected Official, Union Government",1111.1
1,1111.02,"Elected Official, State Government",1112.1
2,1111.03,"Elected Official, Local Bodies",1113.1
3,1111.99,"Legislators, Other",1119.9
4,1112.01,"Administrative Official, Union Government",1121.1


In [12]:
OUTPUT_CSV = Path("../data/processed/nco_structured.csv")
df_occ.to_csv(OUTPUT_CSV, index=False)

print("Saved structured occupation dataset to:", OUTPUT_CSV)


Saved structured occupation dataset to: ..\data\processed\nco_structured.csv
