In [12]:
# ==============================
# Notebook 2 — Resume TF-IDF
# ==============================
# Setup cell: imports, project directories, pandas display options.

import os, re, json, unicodedata
import numpy as np
import pandas as pd

# Project layout: everything hangs off BASE_DIR.
BASE_DIR = r"D:\Projects\ResumeJobRecommender"
RAW_DIR  = rf"{BASE_DIR}\data\raw"
PROC_DIR = rf"{BASE_DIR}\data\processed"

# Show up to 200 characters per cell when DataFrames are displayed.
pd.set_option("display.max_colwidth", 200)

# Create the processed-data folder if it is not there yet (no error if it is).
os.makedirs(PROC_DIR, exist_ok=True)

print("PROC_DIR:", PROC_DIR)


PROC_DIR: D:\Projects\ResumeJobRecommender\data\processed


In [13]:
# Read resume_cleaned.csv from PROC_DIR into `resumes`, report its
# (rows, columns) shape, and display the first three rows.
resume_csv_path = PROC_DIR + r"\resume_cleaned.csv"
resumes = pd.read_csv(resume_csv_path)
print("Loaded:", resumes.shape)
resumes.head(3)


Loaded: (2158, 4)


Unnamed: 0,resume_id,category,text_raw,text_clean
0,PDF_000001,ACCOUNTANT,"ACCOUNTANT\nSummary\nAccountant for a Medium sized Company\nExperience\n01/2009 to Current\nAccountant Company Name ï¼​ City , State\nHired by their CPA firm to handle all accounting and job cost ...",accountant summary accountant for a medium sized company experience 01 2009 to current accountant company name ï1 4 city state hired by their cpa firm to handle all accounting and job cost reporti...
1,PDF_000002,ACCOUNTANT,"ACCOUNTANT\nInterests\nBuffalo Creek Golf Club, Rockwall, TX May 2012-August 2012 *Maintain golf carts and driving range\nExperience\n03/2016 to 03/2018\nAccountant Company Name ï¼​ City , State\n...",accountant interests buffalo creek golf club rockwall tx may 2012 august 2012 maintain golf carts and driving range experience 03 2016 to 03 2018 accountant company name ï1 4 city state reconcile ...
2,PDF_000003,ACCOUNTANT,"ACCOUNTANT\nSummary\nIf you need someone who delivers sharp results, I can help. Well qualified and results oriented Accounting Professional with over fourteen years of\nsuccessful experience in p...",accountant summary if you need someone who delivers sharp results i can help well qualified and results oriented accounting professional with over fourteen years of successful experience in positi...


In [14]:
# ==============================
# STEP 2 — Clean Category Labels (IN PLACE)
# ==============================
# Normalises resumes["category"] so labels are readable and consistent.
# The column is overwritten; no second column is kept.

category_labels = resumes["category"].astype(str).str.strip()

# Separators first: runs of "_" / "-" become a space, then whitespace runs collapse.
for pattern, replacement in ((r"[_\-]+", " "), (r"\s+", " ")):
    category_labels = category_labels.str.replace(pattern, replacement, regex=True)

# Title Case, then restore the acronyms that Title Case turns into "Hr" / "It".
category_labels = category_labels.str.title()
for pattern, replacement in ((r"\bHr\b", "HR"), (r"\bIt\b", "IT")):
    category_labels = category_labels.str.replace(pattern, replacement, regex=True)

resumes["category"] = category_labels

# Quick peek at the normalised labels
resumes[["resume_id","category"]].head(5)


Unnamed: 0,resume_id,category
0,PDF_000001,Accountant
1,PDF_000002,Accountant
2,PDF_000003,Accountant
3,PDF_000004,Accountant
4,PDF_000005,Accountant


In [15]:
# ==============================
# STEP 3 — (Optional) Exact Dedup by Cleaned Text
# ==============================
# Counts rows whose text_clean repeats an earlier row's text_clean.
# By default the duplicates are only reported; flip DO_DEDUP to drop them.

DO_DEDUP = False  # <= set to True if you want to drop exact duplicates

# True for every repeat occurrence; the first occurrence of each text stays False.
dup_mask = resumes.duplicated(subset=["text_clean"], keep="first")
n_exact_dups = int(dup_mask.sum())
print("Exact duplicate rows (by text_clean):", n_exact_dups)

if DO_DEDUP:
    # Keep only first occurrences and renumber the index from 0.
    resumes = resumes[~dup_mask].reset_index(drop=True)
    print("After dedup:", resumes.shape)


Exact duplicate rows (by text_clean): 797


In [16]:
# ==============================
# STEP 4 — Stopword Removal -> processed_text
# ==============================
# Removes NLTK English stopwords from text_clean, stores the result in
# resumes["processed_text"], and writes a checkpoint CSV to PROC_DIR.
# (TF-IDF also downweights frequent words, but this helps sharpen vocabulary.)

import nltk
from nltk.corpus import stopwords

nltk.download("stopwords")
STOP = set(stopwords.words("english"))

# fillna("") first: astype(str) alone would turn a missing text_clean into the
# literal token "nan", which would then leak into the vocabulary.
proc_texts = [
    " ".join(w for w in s.split() if w not in STOP)
    for s in resumes["text_clean"].fillna("").astype(str)
]
resumes["processed_text"] = proc_texts

# Save checkpoint with processed_text
resumes.to_csv(PROC_DIR + r"\resumes_with_processed_text.csv", index=False)
print("Saved:", PROC_DIR + r"\resumes_with_processed_text.csv")

# The preview has to be the LAST expression of the cell: Jupyter only renders
# the final expression, so a bare DataFrame in the middle is silently discarded.
print("Processed text preview:")
resumes[["resume_id","category","processed_text"]].head(3)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saira\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Processed text preview:
Saved: D:\Projects\ResumeJobRecommender\data\processed\resumes_with_processed_text.csv


In [17]:
# ==============================
# STEP 5 — TF-IDF Vectorization (unigram + bigram)
# ==============================
# Fits a TF-IDF vectorizer on resumes["processed_text"] and produces a sparse
# matrix X (rows = resumes, cols = terms).
# The fitted artifacts are written to PROC_DIR:
#   resume_tfidf_matrix.npz    sparse TF-IDF matrix
#   resume_tfidf_vocab.json    term -> column index
#   resume_tfidf_idf.npy       IDF weight per column
#   resume_tfidf_terms.csv     terms in column order
#   resume_tfidf_rowindex.csv  resume_id -> matrix row
# The vocabulary and IDF are saved so the same weighting can be rebuilt when
# job postings are vectorized later (that step is not part of this cell).

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
import json
import numpy as np
import pandas as pd

# ---- 5.1 Vectorizer settings
tfidf_params = dict(
    lowercase=False,       # text_clean is already lowercased
    analyzer="word",
    ngram_range=(1, 2),    # single words and 2-word phrases ("project management")
    min_df=3,              # ignore terms found in fewer than 3 resumes
    max_df=0.90,           # ignore terms found in more than 90% of resumes
    max_features=50000,    # cap the vocabulary size
)
tfidf = TfidfVectorizer(**tfidf_params)

# ---- 5.2 Fit and transform in one pass
corpus = resumes["processed_text"].astype(str).tolist()
X = tfidf.fit_transform(corpus)
print("TF-IDF matrix shape:", X.shape)  # rows = resumes, cols = terms

# ---- 5.3 Write the artifacts
# (a) Sparse matrix
matrix_path = PROC_DIR + r"\resume_tfidf_matrix.npz"
sparse.save_npz(matrix_path, X)
print("Saved:", matrix_path)

# (b) Vocabulary (term -> column index). Indices are cast to plain Python int
#     so json can serialize them.
vocab_np = tfidf.vocabulary_
vocab_py = dict((term, int(idx)) for term, idx in vocab_np.items())

vocab_path = PROC_DIR + r"\resume_tfidf_vocab.json"
with open(vocab_path, "w", encoding="utf-8") as f:
    json.dump(vocab_py, f)
print("Saved:", vocab_path)

# (c) IDF weights (one float per column)
idf_path = PROC_DIR + r"\resume_tfidf_idf.npy"
np.save(idf_path, tfidf.idf_)
print("Saved:", idf_path)

# (d) Terms in column order, one per CSV row, for inspection and debugging
terms = tfidf.get_feature_names_out().tolist()
terms_path = PROC_DIR + r"\resume_tfidf_terms.csv"
pd.DataFrame({"term": terms}).to_csv(terms_path, index=False)
print("Saved:", terms_path)

# (e) resume_id -> matrix row index, in the current row order of `resumes`
rowindex_path = PROC_DIR + r"\resume_tfidf_rowindex.csv"
row_index = resumes[["resume_id"]].assign(row_idx=np.arange(len(resumes)))
row_index.to_csv(rowindex_path, index=False)
print("Saved:", rowindex_path)


TF-IDF matrix shape: (2158, 50000)
Saved: D:\Projects\ResumeJobRecommender\data\processed\resume_tfidf_matrix.npz
Saved: D:\Projects\ResumeJobRecommender\data\processed\resume_tfidf_vocab.json
Saved: D:\Projects\ResumeJobRecommender\data\processed\resume_tfidf_idf.npy
Saved: D:\Projects\ResumeJobRecommender\data\processed\resume_tfidf_terms.csv
Saved: D:\Projects\ResumeJobRecommender\data\processed\resume_tfidf_rowindex.csv


#### Quick EDA

In [18]:
from scipy import sparse
import numpy as np
import pandas as pd

# Reload the saved TF-IDF matrix from disk and report how sparse it is.
# Note: the path is written out in full here rather than built from PROC_DIR.
X = sparse.load_npz(r"D:\Projects\ResumeJobRecommender\data\processed\resume_tfidf_matrix.npz")

n_rows, n_cols = X.shape
nnz = X.nnz                  # number of stored (non-zero) entries
total = n_rows * n_cols      # number of cells in the dense equivalent
density = nnz / total

for label, value in (
    ("shape:", X.shape),
    ("nonzeros:", nnz),
    ("density (should be tiny):", density),
    ("avg nonzero features per resume:", nnz / n_rows),
):
    print(label, value)


shape: (2158, 50000)
nonzeros: 967174
density (should be tiny): 0.008963614457831325
avg nonzero features per resume: 448.1807228915663


In [19]:
import json

# Reload the ordered term list and the term -> column-index vocabulary.
#
# dtype=str + keep_default_na=False are required here: with pandas' default NA
# handling, any vocabulary term spelled like a missing-value marker (e.g.
# "null", "nan") is read back as float NaN instead of the original string,
# which silently corrupts the term list used by the cells below.
terms = pd.read_csv(
    r"D:\Projects\ResumeJobRecommender\data\processed\resume_tfidf_terms.csv",
    dtype=str,
    keep_default_na=False,
)["term"].tolist()
with open(r"D:\Projects\ResumeJobRecommender\data\processed\resume_tfidf_vocab.json","r",encoding="utf-8") as f:
    vocab = json.load(f)

# terms are in column order; vocab keeps the insertion order of the JSON file.
print("first 20 terms:", terms[:20])
print("sample vocab lookups:", {k:vocab[k] for k in list(vocab)[:10]})


first 20 terms: ['00', '00 gpa', '00 per', '000', '000 00', '000 000', '000 additional', '000 annual', '000 annually', '000 employees', '000 lending', '000 million', '000 month', '000 monthly', '000 new', '000 people', '000 per', '000 platinum', '000 revenue', '000 sq']
sample vocab lookups: {'accountant': 2050, 'summary': 43686, 'medium': 27763, 'sized': 41263, 'experience': 17339, '01': 23, '2009': 927, 'current': 11286, 'name': 29244, 'ï1': 49985}


In [20]:
# Pick a few random rows and show their strongest TF-IDF tokens.
# A seeded generator is used so the same rows are shown on every
# Restart & Run All; change SAMPLE_SEED to look at different resumes.
SAMPLE_SEED = 42
sample_rng = np.random.default_rng(SAMPLE_SEED)

# Clamp the sample size so a matrix with fewer than 3 rows does not raise
# (sampling without replacement cannot draw more rows than exist).
rng_rows = sample_rng.choice(X.shape[0], size=min(3, X.shape[0]), replace=False)
terms_arr = np.array(terms)

for i in rng_rows:
    row = X.getrow(i)
    # row.indices are the column ids of this row's stored entries and
    # row.data their TF-IDF weights, aligned position by position.
    idx, data = row.indices, row.data
    order = np.argsort(data)[::-1]      # positions sorted by weight, highest first
    top_idx = idx[order][:10]           # column ids of the 10 heaviest terms
    top_terms = terms_arr[top_idx]
    print(f"\nResume row {i} top terms:")
    print(", ".join(top_terms))



Resume row 1874 top terms:
configuration, network, cisco, servers, cisco router, server, r2, router, dhcp, switches

Resume row 1081 top terms:
sales, selling, ohio, sold, client list, billing, kentucky, territories, territory, inc

Resume row 1027 top terms:
social media, media, hostess, social, pr, name city, managed inventory, hootsuite, media platforms, write


In [21]:
# Reload the resume_id -> row_idx mapping and compare the number of distinct
# row indices with the number of rows in the TF-IDF matrix X.
map_df = pd.read_csv(r"D:\Projects\ResumeJobRecommender\data\processed\resume_tfidf_rowindex.csv")
print(map_df.head(5))

n_unique_rows = map_df["row_idx"].nunique()
n_matrix_rows = X.shape[0]
print("unique rows vs. matrix rows:", n_unique_rows, "vs", n_matrix_rows)


    resume_id  row_idx
0  PDF_000001        0
1  PDF_000002        1
2  PDF_000003        2
3  PDF_000004        3
4  PDF_000005        4
unique rows vs. matrix rows: 2158 vs 2158


In [22]:
# Rank terms by how many rows (resumes) have a stored non-zero entry for them.
# In CSC layout, consecutive indptr values delimit one column each, so their
# differences are the per-column non-zero counts.
csc_indptr = X.tocsc().indptr
col_nnz = np.diff(csc_indptr)

# Ascending argsort reversed -> columns from most to least frequent; keep 20.
descending_cols = np.argsort(col_nnz)[::-1]
top_cols = descending_cols[:20]

print("Top 20 frequent terms overall:")
for j, n_docs in zip(top_cols, col_nnz[top_cols]):
    print(terms[j], int(n_docs))


Top 20 frequent terms overall:
skills 1715
management 1546
university 1458
team 1451
experience 1427
state 1349
name 1349
new 1271
project 1256
business 1238
company name 1208
work 1206
development 1176
city 1171
skill 1145
city state 1125
system 1104
customer 1089
details 1065
information 1062
