In [1]:
# ==============================
# Notebook 1 — Resume Ingestion & Cleaning 
# ==============================

# --- Paths & imports (your originals) ---
BASE_DIR = r"D:\Projects\ResumeJobRecommender"
RAW_DIR  = BASE_DIR + r"\data\raw"
PROC_DIR = BASE_DIR + r"\data\processed"

PDF_DIR  = RAW_DIR + r"\resumes_pdf"                  # subfolders per category
IT_CSV   = RAW_DIR + r"\UpdatedResumeDataset.csv"     # has Category + Resume
OUT_CSV  = PROC_DIR + r"\resume_cleaned.csv"          # final minimal output

import os, re, unicodedata
from pathlib import Path
import pandas as pd
import numpy as np
import pdfplumber

os.makedirs(PROC_DIR, exist_ok=True)
pd.set_option("display.max_colwidth", 200)

print(PDF_DIR)
print(IT_CSV)


D:\Projects\ResumeJobRecommender\data\raw\resumes_pdf
D:\Projects\ResumeJobRecommender\data\raw\UpdatedResumeDataset.csv


#### Load PDF resumes (categories from subfolders)

In [2]:
# ==============================
# Step 1 — PDFs -> pdf_df (resume_id, category, text)
# ==============================

pdf_paths = list(Path(PDF_DIR).rglob("*.pdf"))
print("PDF files found:", len(pdf_paths))

rows = []
i = 1
for p in pdf_paths:
    category_value = p.parent.name
    text_value = ""
    pdf = pdfplumber.open(str(p))
    for page in pdf.pages:
        t = page.extract_text()
        if t is not None:
            text_value += "\n" + t
    pdf.close()
    rows.append({
        "resume_id": f"PDF_{i:06d}",
        "category": category_value,
        "text": (text_value or "").strip()
    })
    i += 1

pdf_df = pd.DataFrame(rows)
print("PDF resumes loaded:", pdf_df.shape)
display(pdf_df.head(3))

# Save
pdf_df.to_csv(PROC_DIR + r"\pdf_resumes_raw.csv", index=False)
print("Saved:", PROC_DIR + r"\pdf_resumes_raw.csv")


PDF files found: 1196
PDF resumes loaded: (1196, 3)


Unnamed: 0,resume_id,category,text
0,PDF_000001,ACCOUNTANT,"ACCOUNTANT\nSummary\nAccountant for a Medium sized Company\nExperience\n01/2009 to Current\nAccountant Company Name ï¼​ City , State\nHired by their CPA firm to handle all accounting and job cost ..."
1,PDF_000002,ACCOUNTANT,"ACCOUNTANT\nInterests\nBuffalo Creek Golf Club, Rockwall, TX May 2012-August 2012 *Maintain golf carts and driving range\nExperience\n03/2016 to 03/2018\nAccountant Company Name ï¼​ City , State\n..."
2,PDF_000003,ACCOUNTANT,"ACCOUNTANT\nSummary\nIf you need someone who delivers sharp results, I can help. Well qualified and results oriented Accounting Professional with over fourteen years of\nsuccessful experience in p..."


Saved: D:\Projects\ResumeJobRecommender\data\processed\pdf_resumes_raw.csv


#### Load the IT CSV (UpdatedResumeDatset.csv)

In [3]:
# ==============================
# Step 2 — IT CSV -> it_raw (resume_id, category, text)
# ==============================

# Load and normalize column names
it_raw = pd.read_csv(IT_CSV, encoding="utf-8", encoding_errors="ignore")
display(it_raw.head(2))

# Make sure we have exactly 'category' and 'text'
it_raw.rename(columns={"Category": "category", "Resume": "text"}, inplace=True)

# Add resume_id for IT rows
it_raw["resume_id"] = ["IT_" + str(k+1).zfill(6) for k in range(len(it_raw))]

# Keep only the three columns and types
it_raw = it_raw[["resume_id","category","text"]].copy()
it_raw["resume_id"] = it_raw["resume_id"].astype(str)
it_raw["category"]  = it_raw["category"].astype(str)
it_raw["text"]      = it_raw["text"].astype(str)

print("IT resumes loaded:", it_raw.shape)
display(it_raw.head(3))

# Save intermediate
it_raw.to_csv(PROC_DIR + r"\it_resumes_raw.csv", index=False)
print("Saved:", PROC_DIR + r"\it_resumes_raw.csv")


Unnamed: 0,Category,Resume
0,Data Science,"Skills * Programming Languages: Python (pandas, numpy, scipy, scikit-learn, matplotlib), Sql, Java, JavaScript/JQuery. * Machine learning: Regression, SVM, NaÃ¯ve Bayes, KNN, Random Forest, Decisi..."
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E UIT-RGPV\r\nData Scientist \r\n\r\nData Scientist - Matelabs\r\nSkill Details \r\nPython- Exprience - Less than 1 year months\r\nStatsmodels- Expri...


IT resumes loaded: (962, 3)


Unnamed: 0,resume_id,category,text
0,IT_000001,Data Science,"Skills * Programming Languages: Python (pandas, numpy, scipy, scikit-learn, matplotlib), Sql, Java, JavaScript/JQuery. * Machine learning: Regression, SVM, NaÃ¯ve Bayes, KNN, Random Forest, Decisi..."
1,IT_000002,Data Science,Education Details \r\nMay 2013 to May 2017 B.E UIT-RGPV\r\nData Scientist \r\n\r\nData Scientist - Matelabs\r\nSkill Details \r\nPython- Exprience - Less than 1 year months\r\nStatsmodels- Expri...
2,IT_000003,Data Science,"Areas of Interest Deep Learning, Control System Design, Programming in-Python, Electric Machinery, Web Development, Analytics Technical Activities q Hindustan Aeronautics Limited, Bangalore - For ..."


Saved: D:\Projects\ResumeJobRecommender\data\processed\it_resumes_raw.csv


#### Combine both pdf data and loaded IT resumes data

In [5]:
# ==============================
# Step 3 — Combine sources (keep resume_id, category, text)
# ==============================

resume_raw = pd.concat(
    [pdf_df[["resume_id","category","text"]], it_raw[["resume_id","category","text"]]],
    ignore_index=True
)

# Enforce dtypes
resume_raw["resume_id"] = resume_raw["resume_id"].astype(str)
resume_raw["category"]  = resume_raw["category"].astype(str)
resume_raw["text"]      = resume_raw["text"].astype(str)

print("Combined resumes:", resume_raw.shape)
display(resume_raw.sample(5, random_state=42))

# Save combined raw
resume_raw.to_csv(PROC_DIR + r"\resumes_combined_raw.csv", index=False)
print("Saved:", PROC_DIR + r"\resumes_combined_raw.csv")


Combined resumes: (2158, 3)


Unnamed: 0,resume_id,category,text
997,PDF_000998,PUBLIC-RELATIONS,"OFFICE ADMINISTRATOR\nProfessional Profile\nSkilled and highly organized professional, leveraging operational and interpersonal abilities developed during education and experience to excel in\nadm..."
361,PDF_000362,CONSULTANT,CONSULTANT\nSummary\nCollege graduate with interest in insurance claims work. Proven record of success in all endeavors. Strong work ethic and focus on the details.\nExcited to begin career in inv...
416,PDF_000417,DIGITAL-MEDIA,DIGITAL MEDIA BUYER\nProfessional Summary\nVersatile digital marketerÂ bringing\nHighlights\nSproutSocial\nPay Per Click (PPC)\nHootsuite\nGoogle Adwords\nMarin Software\nGoogle Analytics\nDrupal\...
1112,PDF_001113,SALES,SALES MANAGER\nHighlights\nMS Office proficiency\nTeam leadership\nExceptional time management\nMicrosoft Outlook\nGoal-oriented\nLotus Notes\nAccomplishments\nManaged a successful sales team of 1...
485,PDF_000486,DIGITAL-MEDIA,SR DIGITAL ANALYTICS MANAGER\nSummary\nMindful innovator with a proven track record in delivering digital analytics solutions to configure fundamentally sound framework for multiple\nwebsites\nPre...


Saved: D:\Projects\ResumeJobRecommender\data\processed\resumes_combined_raw.csv


In [27]:
# ==============================
# Step 4 — Clean text (preserve raw + add clean) and Save Final
# ==============================

# Keep raw
resumes_clean = resume_raw.copy()
resumes_clean.rename(columns={"text": "text_raw"}, inplace=True)

# Build text_clean (mojibake fixes, strip HTML/URLs/punct, lowercase)
clean_list = []
raw_list = resumes_clean["text_raw"].tolist()

for s in raw_list:
    t = str(s)
    # common mojibake fixes seen in dataset
    t = t.replace("Ã¢Â€Â¢", "•").replace("Ã¢Â€Â“", "-").replace("Ã¢Â€Â™", "'")
    t = t.replace("NaÃ¯ve", "Naive").replace("Ã©", "e")
    # normalize, strip html, remove urls, punctuation, extra spaces
    t = unicodedata.normalize("NFKC", t)
    t = re.sub(r"<[^>]+>", " ", t)
    t = re.sub(r"http\S+|www\S+|https\S+", " ", t)
    t = re.sub(r"[^\w\s]", " ", t)
    t = re.sub(r"[\x00-\x1f\x7f-\x9f]", " ", t)
    t = re.sub(r"\s+", " ", t).strip().lower()
    clean_list.append(t)

resumes_clean["text_clean"] = clean_list

print("Cleaned data preview:")
display(resumes_clean.head(3))

# Save single final file
resumes_clean.to_csv(PROC_DIR + r"\resume_cleaned.csv", index=False)
print(" Final cleaned data saved:", PROC_DIR + r"\resume_cleaned.csv")


Cleaned data preview:


Unnamed: 0,resume_id,category,text_raw,text_clean
0,PDF_000001,ACCOUNTANT,"ACCOUNTANT\nSummary\nAccountant for a Medium sized Company\nExperience\n01/2009 to Current\nAccountant Company Name ï¼​ City , State\nHired by their CPA firm to handle all accounting and job cost ...",accountant summary accountant for a medium sized company experience 01 2009 to current accountant company name ï1 4 city state hired by their cpa firm to handle all accounting and job cost reporti...
1,PDF_000002,ACCOUNTANT,"ACCOUNTANT\nInterests\nBuffalo Creek Golf Club, Rockwall, TX May 2012-August 2012 *Maintain golf carts and driving range\nExperience\n03/2016 to 03/2018\nAccountant Company Name ï¼​ City , State\n...",accountant interests buffalo creek golf club rockwall tx may 2012 august 2012 maintain golf carts and driving range experience 03 2016 to 03 2018 accountant company name ï1 4 city state reconcile ...
2,PDF_000003,ACCOUNTANT,"ACCOUNTANT\nSummary\nIf you need someone who delivers sharp results, I can help. Well qualified and results oriented Accounting Professional with over fourteen years of\nsuccessful experience in p...",accountant summary if you need someone who delivers sharp results i can help well qualified and results oriented accounting professional with over fourteen years of successful experience in positi...


 Final cleaned data saved: D:\Projects\ResumeJobRecommender\data\processed\resume_cleaned.csv


In [28]:

print("Final cleaned resume dataset:", resumes_clean.shape)
display(resumes_clean.head(1))


Final cleaned resume dataset: (2158, 4)


Unnamed: 0,resume_id,category,text_raw,text_clean
0,PDF_000001,ACCOUNTANT,"ACCOUNTANT\nSummary\nAccountant for a Medium sized Company\nExperience\n01/2009 to Current\nAccountant Company Name ï¼​ City , State\nHired by their CPA firm to handle all accounting and job cost ...",accountant summary accountant for a medium sized company experience 01 2009 to current accountant company name ï1 4 city state hired by their cpa firm to handle all accounting and job cost reporti...


In [11]:
print("\nSample processed text:\n", resumes_clean["text_clean"].iloc[0][:5000])


Sample processed text:
 accountant summary accountant for a medium sized company experience 01 2009 to current accountant company name ï1 4 city state hired by their cpa firm to handle all accounting and job cost reporting 01 2007 to 01 2009 accountant company name ï1 4 city state hired by their cpa firm to handle all accounting functions 01 1997 to 01 2007 accountant company name ï1 4 city state installed new peachtree accounting system installed new computer system using a local area network and added a web site education and training 1974 b s business administration accounting university of cincinnati ï1 4 city state business administration accounting interests annapolis amblers walking club president trailmaster maryland volkssport assn president chesapeake civil war roundtable skills accounting cpa local area network peachtree accounting reporting web site additional information interests annapolis amblers walking club president trailmaster maryland volkssport assn president ches

In [18]:
# ==============================
# Checking structure and nulls for resumes_clean
# ==============================

# basic shape & missing values
print("rows, cols:", resumes_clean.shape)
print("\nMissing values per column:")
print(resumes_clean.isna().sum())

rows, cols: (2158, 4)

Missing values per column:
resume_id     0
category      0
text_raw      0
text_clean    0
dtype: int64


In [19]:
# category balance
print("\nTop 20 categories:")
cat_counts = resumes_clean["category"].value_counts()
print(cat_counts.head(20))


Top 20 categories:
category
HR                        154
BUSINESS-DEVELOPMENT      120
INFORMATION-TECHNOLOGY    120
ENGINEERING               118
FINANCE                   118
SALES                     116
BANKING                   115
CONSULTANT                115
PUBLIC-RELATIONS          111
DIGITAL-MEDIA              96
Java Developer             84
Testing                    70
ACCOUNTANT                 57
DevOps Engineer            55
Python Developer           48
Web Designing              45
Hadoop                     42
Operations Manager         40
Mechanical Engineer        40
Sales                      40
Name: count, dtype: int64


In [21]:
# text length diagnostics
resumes_clean["len_raw"] = resumes_clean["text_raw"].astype(str).str.len()
resumes_clean["len_clean"] = resumes_clean["text_clean"].astype(str).str.len()

print("\nRaw text length summary:")
print(resumes_clean["len_raw"].describe())
print("\nClean text length summary:")
print(resumes_clean["len_clean"].describe())




Raw text length summary:
count     2158.000000
mean      4771.951344
std       3113.190469
min          0.000000
25%       2367.000000
50%       4876.500000
75%       6267.750000
max      35115.000000
Name: len_raw, dtype: float64

Clean text length summary:
count     2158.000000
mean      4587.708526
std       3025.100898
min          0.000000
25%       2229.000000
50%       4758.000000
75%       6085.500000
max      34087.000000
Name: len_clean, dtype: float64


In [22]:
# duplicates check by cleaned text
dup_mask = resumes_clean.duplicated(subset=["text_clean"], keep=False)
num_dups = dup_mask.sum()
print(f"\nPossible duplicate resumes (by cleaned text): {num_dups}")




Possible duplicate resumes (by cleaned text): 960


In [25]:
# one cleaned sample per category
sample_view = resumes_clean.groupby("category", as_index=False).head(1)[
    ["resume_id", "category", "text_clean"]
]
sample_view.head()

Unnamed: 0,resume_id,category,text_clean
0,PDF_000001,ACCOUNTANT,accountant summary accountant for a medium sized company experience 01 2009 to current accountant company name ï1 4 city state hired by their cpa firm to handle all accounting and job cost reporti...
57,PDF_000058,BANKING,registered client service associate summary to obtain a position where my years of experience in the client support environment and proven track record of maintaining and developing new businesses...
172,PDF_000173,BUSINESS-DEVELOPMENT,business development manager staffing manager professional summary innovative manager seeks position offering opportunities for new professional and personal challenges self starter with a positiv...
292,PDF_000293,CONSULTANT,consultant professional summary astute and innovative attorney with strong ability to develop and implement effective litigation case strategies and write persuasive motions proven ability to reso...
407,PDF_000408,DIGITAL-MEDIA,media activities specialist summary multi tasking media relations results oriented strategic initiatives event planning writer editor manager supervisor flexibility adaptable highlights greatly im...


In [26]:
resumes_clean.head()

Unnamed: 0,resume_id,category,text_raw,text_clean,len_raw,len_clean
0,PDF_000001,ACCOUNTANT,"ACCOUNTANT\nSummary\nAccountant for a Medium sized Company\nExperience\n01/2009 to Current\nAccountant Company Name ï¼​ City , State\nHired by their CPA firm to handle all accounting and job cost ...",accountant summary accountant for a medium sized company experience 01 2009 to current accountant company name ï1 4 city state hired by their cpa firm to handle all accounting and job cost reporti...,1030,1002
1,PDF_000002,ACCOUNTANT,"ACCOUNTANT\nInterests\nBuffalo Creek Golf Club, Rockwall, TX May 2012-August 2012 *Maintain golf carts and driving range\nExperience\n03/2016 to 03/2018\nAccountant Company Name ï¼​ City , State\n...",accountant interests buffalo creek golf club rockwall tx may 2012 august 2012 maintain golf carts and driving range experience 03 2016 to 03 2018 accountant company name ï1 4 city state reconcile ...,1952,1895
2,PDF_000003,ACCOUNTANT,"ACCOUNTANT\nSummary\nIf you need someone who delivers sharp results, I can help. Well qualified and results oriented Accounting Professional with over fourteen years of\nsuccessful experience in p...",accountant summary if you need someone who delivers sharp results i can help well qualified and results oriented accounting professional with over fourteen years of successful experience in positi...,2649,2572
3,PDF_000004,ACCOUNTANT,"ACCOUNTANT\nSummary\nExperienced, detail-oriented Accountant who effectively manages multiple projects, and possesses superior organizational and communication\nskills is seeking a challenging pos...",accountant summary experienced detail oriented accountant who effectively manages multiple projects and possesses superior organizational and communication skills is seeking a challenging position...,3394,3311
4,PDF_000005,ACCOUNTANT,"ACCOUNTANT\nSummary\nAccountant with over a decade of diverse professional experience including corporate and small business accounting, tax preparation and\naccounting services. Dedicated and rel...",accountant summary accountant with over a decade of diverse professional experience including corporate and small business accounting tax preparation and accounting services dedicated and reliable...,3430,3348
