In [10]:
# ==============================
# NOTEBOOK 4 — Job TF-IDF Representation
# ==============================
# GOAL:
# Transform cleaned job descriptions into TF-IDF vectors
# using the same vocabulary and IDF values learned from resumes.


In [11]:
# ==============================
# STEP 0 — Setup & Imports
# ==============================
# --- standard library ---
import json
import os
import re

# --- third-party ---
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer

# Project locations. These are Windows-style paths; other cells build file
# names by appending r"\<file>" to PROC_DIR.
BASE_DIR = r"D:\Projects\ResumeJobRecommender"
PROC_DIR = BASE_DIR + r"\data\processed"

# Make sure the NLTK stopword corpus is available (quiet=True suppresses the
# download log output).
nltk.download('stopwords', quiet=True)

# Show up to 150 characters per cell when displaying DataFrames.
pd.set_option("display.max_colwidth", 150)
print("PROC_DIR:", PROC_DIR)


PROC_DIR: D:\Projects\ResumeJobRecommender\data\processed


In [12]:
# ==============================
# STEP 1 — Load cleaned job postings
# ==============================
# Read the cleaned postings table and report its size and column names.
job_path = PROC_DIR + r"\job_postings_cleaned.csv"
job_df = pd.read_csv(job_path)
print("Jobs loaded:", job_df.shape)
print(list(job_df.columns))

# Plain Python list of the cleaned description text, one string per posting.
# NOTE(review): astype(str) renders missing values as the text "nan" — confirm
# whether the cleaned CSV can contain empty descriptions.
job_texts = list(job_df["job_description_clean"].astype(str))


Jobs loaded: (15851, 15)
['job_id', 'job_title', 'job_description', 'location', 'experience_level', 'work_type', 'min_salary', 'max_salary', 'pay_period', 'currency', 'remote_allowed', 'sponsored', 'job_posting_url', 'job_title_clean', 'job_description_clean']


In [13]:
# ==============================
# STEP 2 — Load resume TF-IDF vocab + IDF
# ==============================
# Two artifacts are read from PROC_DIR (presumably exported by the resume
# TF-IDF notebook — confirm): a JSON vocabulary and a NumPy array of IDF values.
vocab_path = PROC_DIR + r"\resume_tfidf_vocab.json"
idf_path   = PROC_DIR + r"\resume_tfidf_idf.npy"

# Vocabulary: parsed straight from JSON.
with open(vocab_path, "r", encoding="utf-8") as vocab_file:
    vocab = json.load(vocab_file)

# IDF weights: stored as a .npy array.
idf_values = np.load(idf_path)

# The two sizes printed below are expected to agree (one IDF value per term).
print("Loaded vocab terms:", len(vocab))
print("Loaded IDF shape:", idf_values.shape)


Loaded vocab terms: 50000
Loaded IDF shape: (50000,)


In [14]:
# ==============================
# STEP 3 — Light token cleanup & stopword removal
# ==============================
# English stopwords from NLTK, held in a set for fast membership tests.
stop_words = {*stopwords.words("english")}

def preprocess_text(s, stopword_set=None):
    """Lowercase, strip punctuation, and drop stopwords from one text value.

    Parameters
    ----------
    s : object
        The text to clean. Non-string values are converted with ``str``.
        Missing values (``None`` or a float NaN, which is how pandas
        represents an empty CSV cell) yield an empty string instead of the
        literal token "nan"/"none".
    stopword_set : collection of str, optional
        Words to remove. When omitted, the notebook-level ``stop_words`` set
        is used, so existing single-argument calls behave as before.

    Returns
    -------
    str
        The remaining tokens joined by single spaces, with no leading or
        trailing whitespace.
    """
    # A NaN is the only float that is not equal to itself.
    if s is None or (isinstance(s, float) and s != s):
        return ""
    if stopword_set is None:
        stopword_set = stop_words

    # Replace every character that is neither a word character nor whitespace
    # with a space, then split on whitespace. split() already collapses runs
    # of whitespace and ignores leading/trailing whitespace.
    tokens = re.sub(r"[^\w\s]", " ", str(s).lower()).split()
    return " ".join(tok for tok in tokens if tok not in stopword_set)

# Clean every job description and keep the result as a new column.
job_df["job_text_processed"] = job_df["job_description_clean"].map(preprocess_text)


In [15]:

# ==============================
# STEP 4 — Build TF-IDF for jobs using fixed resume vocab/IDF (version-safe)
# ==============================
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from scipy.sparse import spdiags

# --- Raw term counts -------------------------------------------------------
# The vocabulary is fixed to the one loaded from the resume artifacts, so the
# job matrix has the same columns as that vocabulary. Text is already
# lowercased, hence lowercase=False.
# NOTE(review): ngram_range=(1, 2) is meant to mirror the resume vectorizer —
# confirm against the notebook that produced the vocabulary.
count_vect = CountVectorizer(
    analyzer="word",
    lowercase=False,
    ngram_range=(1, 2),
    vocabulary=vocab,
)
job_counts = count_vect.transform(job_df["job_text_processed"])

# --- Re-use the stored IDF weights ----------------------------------------
# The transformer is never fitted; the loaded IDF vector is injected instead.
tfidf_transformer = TfidfTransformer(norm="l2", use_idf=True, smooth_idf=False, sublinear_tf=False)
tfidf_transformer.idf_ = idf_values
# Also populate the private diagonal IDF matrix explicitly.
# NOTE(review): which sklearn versions actually read _idf_diag should be
# confirmed; this touches a private attribute.
n_terms = len(tfidf_transformer.idf_)
tfidf_transformer._idf_diag = spdiags(tfidf_transformer.idf_, diags=0, m=n_terms, n=n_terms)

# --- Weighted, L2-normalised job vectors ----------------------------------
job_tfidf = tfidf_transformer.transform(job_counts)
print("Job TF-IDF shape:", job_tfidf.shape)

Job TF-IDF shape: (15851, 50000)


In [16]:
# ==============================
# STEP 5 — QA: Sparsity & content checks
# ==============================
# Sparsity summary of the job TF-IDF matrix.
nonzeros = job_tfidf.nnz
# Multiply the dimensions as plain Python ints so the cell count cannot
# overflow a fixed-width NumPy integer on large matrices.
n_cells = job_tfidf.shape[0] * job_tfidf.shape[1]
density = nonzeros / n_cells
avg_feats = nonzeros / job_tfidf.shape[0]

print(f"Nonzeros: {nonzeros}")
print(f"Density: {density:.6f}")
print(f"Avg nonzero features per job: {avg_feats:.2f}")

# Column index -> term lookup. It comes from count_vect, the vectorizer that
# built job_counts, so it is defined on a fresh kernel and is guaranteed to be
# aligned with the columns of job_tfidf.
terms = count_vect.get_feature_names_out()
print("First 20 terms:", terms[:20])

# example: show top 10 terms for one random job
row_idx = np.random.randint(0, job_tfidf.shape[0])
row_vector = job_tfidf.getrow(row_idx).toarray().ravel()
top_indices = row_vector.argsort()[-10:][::-1]
print(f"\nJob row {row_idx} top terms:")
# Skip zero-weight columns so a posting with fewer than 10 matched terms does
# not list terms it never contained.
print([terms[i] for i in top_indices if row_vector[i] > 0])


Nonzeros: 4221263
Density: 0.005326
Avg nonzero features per job: 266.31
First 20 terms: ['00' '00 gpa' '00 per' '000' '000 00' '000 000' '000 additional'
 '000 annual' '000 annually' '000 employees' '000 lending' '000 million'
 '000 month' '000 monthly' '000 new' '000 people' '000 per' '000 platinum'
 '000 revenue' '000 sq']

Job row 6060 top terms:
['procurement', 'head', 'function', 'acquisitions', 'margins', 'must', 'across', 'supplier', 'expected', 'enable']


In [18]:
# ==============================
# STEP 6 — Save TF-IDF artifacts
# ==============================
# Output locations: the sparse matrix and its row-index lookup table.
matrix_path = PROC_DIR + r"\job_tfidf_matrix.npz"
rowindex_path = PROC_DIR + r"\job_tfidf_rowindex.csv"

# Sparse job TF-IDF matrix.
sparse.save_npz(matrix_path, job_tfidf)

# Lookup table: positional row number in the matrix -> job_id.
job_ids = job_df[["job_id"]].reset_index(drop=True)
row_index_map = job_ids.reset_index().rename(columns={"index":"row_idx"})
row_index_map.to_csv(rowindex_path, index=False)

for saved_path in (matrix_path, rowindex_path):
    print("Saved:", saved_path)


Saved: D:\Projects\ResumeJobRecommender\data\processed\job_tfidf_matrix.npz
Saved: D:\Projects\ResumeJobRecommender\data\processed\job_tfidf_rowindex.csv


In [20]:
import scipy.sparse as sp, numpy as np
from scipy import sparse

# Sanity check: the resume and job TF-IDF matrices must have the same number
# of columns, otherwise the two cannot be compared term by term.
# The path is derived from PROC_DIR, like every other artifact in this
# notebook, instead of repeating the absolute location.
X_res = sparse.load_npz(PROC_DIR + r"\resume_tfidf_matrix.npz")
print(X_res.shape[1], job_tfidf.shape[1])  # both should be 50000

# Fail loudly rather than relying on a visual comparison of the printed values.
assert X_res.shape[1] == job_tfidf.shape[1], (
    f"Feature dimension mismatch: resumes have {X_res.shape[1]} columns, "
    f"jobs have {job_tfidf.shape[1]}"
)


50000 50000
