In [1]:
# ==============================
# NOTEBOOK 5 — Job Recommender models
# ==============================


In [21]:
# ==============================
# STEP 0 — Setup & Imports
# ==============================
BASE_DIR = r"D:\Projects\ResumeJobRecommender"
PROC_DIR = BASE_DIR + r"\data\processed"

import os
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances

os.makedirs(PROC_DIR, exist_ok=True)
pd.set_option("display.max_colwidth", 180)

TOP_N = 100      # no. of jobs to return per resume
BATCH = 2000    # job-batch size
print("PROC_DIR:", PROC_DIR, "| TOP_N:", TOP_N, "| BATCH:", BATCH)


PROC_DIR: D:\Projects\ResumeJobRecommender\data\processed | TOP_N: 100 | BATCH: 2000


In [22]:
# ==============================
# STEP 1 — Load artifacts
# ==============================
# TF-IDF matrices
X_res = sparse.load_npz(PROC_DIR + r"\resume_tfidf_matrix.npz")
X_job = sparse.load_npz(PROC_DIR + r"\job_tfidf_matrix.npz")
print("X_res shape:", X_res.shape)  
print("X_job shape:", X_job.shape)  

# Row index maps
res_map = pd.read_csv(PROC_DIR + r"\resume_tfidf_rowindex.csv")   
job_map = pd.read_csv(PROC_DIR + r"\job_tfidf_rowindex.csv")      
print("res_map:", res_map.shape, "| job_map:", job_map.shape)

# Job metadata 
jobs_meta = pd.read_csv(PROC_DIR + r"\job_postings_cleaned.csv")
keep_cols = [c for c in ["job_id","job_title","job_posting_url"] if c in jobs_meta.columns]
jobs_meta = jobs_meta[keep_cols].drop_duplicates(subset=["job_id"]).copy()
print("jobs_meta:", jobs_meta.shape, "| cols:", jobs_meta.columns.tolist())

assert X_res.shape[1] == X_job.shape[1], "Resume and Job matrices must have the same number of columns (vocab)."


X_res shape: (2158, 50000)
X_job shape: (15851, 50000)
res_map: (2158, 2) | job_map: (15851, 2)
jobs_meta: (15851, 3) | cols: ['job_id', 'job_title', 'job_posting_url']


### Model-1: TF-IDF-Cosine Similarity

In [23]:
# ==============================
# STEP 2 — Cosine similarity (batched) → Top-N per resume
# ==============================
n_res, n_jobs = X_res.shape[0], X_job.shape[0]

# store best N scores/indices per resume as we scan job batches
best_scores = np.full((n_res, TOP_N), -np.inf, dtype=np.float32)
best_index  = np.full((n_res, TOP_N), -1, dtype=np.int32)

start = 0
while start < n_jobs:
    end = min(start + BATCH, n_jobs)
    # cosine similarity block: (n_res x (end-start))
    block = cosine_similarity(X_res, X_job[start:end])
    
    # update per-resume top-N
    for r in range(n_res):
        row = block[r]
        # candidate top indices within this block
        if TOP_N >= row.size:
            top_idx = np.argsort(-row)  # all
        else:
            top_part = np.argpartition(-row, TOP_N)[:TOP_N]
            top_idx = top_part[np.argsort(-row[top_part])]
        cand_scores = row[top_idx]
        cand_idx    = start + top_idx  # convert to global job indices

        # merge with existing best
        all_scores = np.concatenate([best_scores[r], cand_scores])
        all_idx    = np.concatenate([best_index[r],  cand_idx])
        if TOP_N >= all_scores.size:
            sel = np.argsort(-all_scores)
        else:
            sel_part = np.argpartition(-all_scores, TOP_N)[:TOP_N]
            sel = sel_part[np.argsort(-all_scores[sel_part])]
        best_scores[r] = all_scores[sel]
        best_index[r]  = all_idx[sel]
    
    print(f"Processed jobs {start}..{end-1}")
    start = end

rows = []
for r in range(n_res):
    for k in range(TOP_N):
        j = int(best_index[r, k])
        s = float(best_scores[r, k])
        rows.append([r, j, s])

cosine_df = pd.DataFrame(rows, columns=["resume_row","job_row","score_cosine"])
print("cosine_df shape:", cosine_df.shape)
cosine_df.head(3)


Processed jobs 0..1999
Processed jobs 2000..3999
Processed jobs 4000..5999
Processed jobs 6000..7999
Processed jobs 8000..9999
Processed jobs 10000..11999
Processed jobs 12000..13999
Processed jobs 14000..15850
cosine_df shape: (215800, 3)


Unnamed: 0,resume_row,job_row,score_cosine
0,0,14363,0.197336
1,0,46,0.180194
2,0,11615,0.179163


In [24]:
print("block shape:", block.shape)


block shape: (2158, 1851)


In [25]:
print(X_res.shape)
cosine_df.head(10)

(2158, 50000)


Unnamed: 0,resume_row,job_row,score_cosine
0,0,14363,0.197336
1,0,46,0.180194
2,0,11615,0.179163
3,0,11529,0.173348
4,0,4093,0.164817
5,0,9029,0.163207
6,0,5227,0.160355
7,0,6085,0.159319
8,0,606,0.155456
9,0,4130,0.154237


In [26]:
# ==============================
# STEP 3 — Attach IDs, titles, URLs, and rank
# ==============================
# Map row indices back to IDs
out = cosine_df.merge(res_map, left_on="resume_row", right_on="row_idx", how="left") \
               .merge(job_map,   left_on="job_row",   right_on="row_idx", how="left",
                      suffixes=("_res","_job"))

# Keep only needed columns
out = out.rename(columns={"resume_id":"resume_id", "job_id":"job_id"})[["resume_id","job_id","score_cosine"]]

# Add job metadata
out = out.merge(jobs_meta, on="job_id", how="left")

# Rank within each resume by score 
out["rank"] = out.groupby("resume_id")["score_cosine"].rank(method="first", ascending=False).astype(int)

# Order columns
out = out[["resume_id","rank","job_id","job_title","job_posting_url","score_cosine"]].sort_values(["resume_id","rank"])
print("Final cosine recommendations:", out.shape)
display(out.head(10))


Final cosine recommendations: (215800, 6)


Unnamed: 0,resume_id,rank,job_id,job_title,job_posting_url,score_cosine
119600,IT_000001,1,3697341474,Data Scientist,https://www.linkedin.com/jobs/view/3697341474/?trk=jobs_biz_prem_srch,0.150907
119601,IT_000001,2,3694121166,Natural Language Processing (NLP) Data Scientist.,https://www.linkedin.com/jobs/view/3694121166/?trk=jobs_biz_prem_srch,0.147311
119602,IT_000001,3,3693044920,Splunk Developer,https://www.linkedin.com/jobs/view/3693044920/?trk=jobs_biz_prem_srch,0.146477
119603,IT_000001,4,3699082085,Special Investigations Unit (SIU) Investigator-1 year healthcare fraud experience,https://www.linkedin.com/jobs/view/3699082085/?trk=jobs_biz_prem_srch,0.141117
119604,IT_000001,5,3693584453,Senior Data Scientist,https://www.linkedin.com/jobs/view/3693584453/?trk=jobs_biz_prem_srch,0.140981
119605,IT_000001,6,3693581630,Senior Data Scientist,https://www.linkedin.com/jobs/view/3693581630/?trk=jobs_biz_prem_srch,0.140981
119606,IT_000001,7,3693598848,Digital media fraud expert,https://www.linkedin.com/jobs/view/3693598848/?trk=jobs_biz_prem_srch,0.134763
119607,IT_000001,8,3701198711,Mentor - Machine Learning Career Track (Part-time/Contract),https://www.linkedin.com/jobs/view/3701198711/?trk=jobs_biz_prem_srch,0.131573
119608,IT_000001,9,3693049684,Senior Manager,https://www.linkedin.com/jobs/view/3693049684/?trk=jobs_biz_prem_srch,0.130229
119609,IT_000001,10,3693056342,"Senior Backend Software Engineer, Data",https://www.linkedin.com/jobs/view/3693056342/?trk=jobs_biz_prem_srch,0.128884


In [27]:
# ==============================
# STEP 4 — Save results
# ==============================
save_path = PROC_DIR + r"\matches_tfidf_cosine.csv"
out.to_csv(save_path, index=False)
print(" Cosine Sim data Saved:", save_path)


 Cosine Sim data Saved: D:\Projects\ResumeJobRecommender\data\processed\matches_tfidf_cosine.csv


In [28]:
# ==============================
# STEP 5 — Displaying data
# ==============================

print("\nScore summary:")
print(out["score_cosine"].describe())

# Load resume labels (category) from resume_cleaned.csv
resume_label = {}
try:
    resume_meta = pd.read_csv(PROC_DIR + r"\resume_cleaned.csv",
                              usecols=["resume_id", "category"])
    resume_label = dict(zip(resume_meta["resume_id"], resume_meta["category"]))
    print("Loaded resume labels from resume_cleaned.csv")
except Exception as e:
    print("Could not load resume labels from resume_cleaned.csv; using '(unknown)'.")
    print("Error was:", e)

def show_top_matches(resume_id, top_k=10):
    """Print header with category label and show top-k jobs (no resume_id column)."""
    label = resume_label.get(resume_id, "(unknown)")
    print(f"\nTop matches for {resume_id} — '{label}':")
    df = (out[out["resume_id"] == resume_id]
          .sort_values("rank")
          .head(top_k)[["rank", "job_title", "job_posting_url", "score_cosine"]]
          .rename(columns={"score_cosine": "cosine_score"}))
    display(df)

# Example: two random resumes
for rid in out["resume_id"].drop_duplicates().sample(2, random_state=42):
    show_top_matches(rid, top_k=10)




Score summary:
count    215800.000000
mean          0.128337
std           0.055578
min           0.000000
25%           0.086764
50%           0.126046
75%           0.162360
max           0.599797
Name: score_cosine, dtype: float64
Loaded resume labels from resume_cleaned.csv

Top matches for PDF_000036 — 'ACCOUNTANT':


Unnamed: 0,rank,job_title,job_posting_url,cosine_score
3500,1,Senior Accountant,https://www.linkedin.com/jobs/view/3701317370/?trk=jobs_biz_prem_srch,0.250362
3501,2,Controller,https://www.linkedin.com/jobs/view/3701302783/?trk=jobs_biz_prem_srch,0.22263
3502,3,Senior Accountant,https://www.linkedin.com/jobs/view/3693043970/?trk=jobs_biz_prem_srch,0.217808
3503,4,Finance Manager / Controller,https://www.linkedin.com/jobs/view/3326588170/?trk=jobs_biz_prem_srch,0.211776
3504,5,Assistant Controller,https://www.linkedin.com/jobs/view/3701319219/?trk=jobs_biz_prem_srch,0.211627
3505,6,Senior Technical Accounting Specialist,https://www.linkedin.com/jobs/view/3697389157/?trk=jobs_biz_prem_srch,0.208189
3506,7,Controller,https://www.linkedin.com/jobs/view/3693075211/?trk=jobs_biz_prem_srch,0.205926
3507,8,Accounting Manager,https://www.linkedin.com/jobs/view/3701321533/?trk=jobs_biz_prem_srch,0.202339
3508,9,Accounting Clerk,https://www.linkedin.com/jobs/view/3693050858/?trk=jobs_biz_prem_srch,0.200532
3509,10,Accounting Assistant,https://www.linkedin.com/jobs/view/3693049719/?trk=jobs_biz_prem_srch,0.20019



Top matches for IT_000362 — 'Java Developer':


Unnamed: 0,rank,job_title,job_posting_url,cosine_score
155700,1,Java Developer (Only W2),https://www.linkedin.com/jobs/view/3693044444/?trk=jobs_biz_prem_srch,0.203076
155701,2,Java Software Engineer,https://www.linkedin.com/jobs/view/3699431844/?trk=jobs_biz_prem_srch,0.155274
155702,3,Java Consultant,https://www.linkedin.com/jobs/view/3693047709/?trk=jobs_biz_prem_srch,0.145052
155703,4,Senior Test Engineer with ISTQB,https://www.linkedin.com/jobs/view/3693049352/?trk=jobs_biz_prem_srch,0.140697
155704,5,java developer W2,https://www.linkedin.com/jobs/view/3693046636/?trk=jobs_biz_prem_srch,0.133923
155705,6,Java Software Engineer,https://www.linkedin.com/jobs/view/3693044773/?trk=jobs_biz_prem_srch,0.133744
155706,7,Java Software Engineer,https://www.linkedin.com/jobs/view/3693067545/?trk=jobs_biz_prem_srch,0.133744
155707,8,Senior Associate Software Engineer,https://www.linkedin.com/jobs/view/3701366243/?trk=jobs_biz_prem_srch,0.118695
155708,9,Information Technology Operations Specialist,https://www.linkedin.com/jobs/view/3699086528/?trk=jobs_biz_prem_srch,0.115492
155709,10,Java Developer - W2 ONLY,https://www.linkedin.com/jobs/view/3701323681/?trk=jobs_biz_prem_srch,0.106506


### Model-2: TF-IDF Euclidean

In [29]:
# ============================================
# MODEL 2 — TF-IDF + Euclidean similarity
# ============================================

from sklearn.metrics import pairwise_distances
import numpy as np

TOP_K = 100        
BLOCK_SIZE = 2000  

euclid_rows = []

print("\n=== MODEL 2: TF-IDF + Euclidean similarity ===")
print("X_res:", X_res.shape, "| X_job:", X_job.shape)

for j_start in range(0, X_job.shape[0], BLOCK_SIZE):
    j_end = min(j_start + BLOCK_SIZE, X_job.shape[0])
    X_job_block = X_job[j_start:j_end]

    print(f"Processed jobs {j_start}..{j_end-1}")

    # Distances: shape = (n_resumes, block_size)
    dist_block = pairwise_distances(
        X_res,
        X_job_block,
        metric="euclidean",
        n_jobs=-1
    )

    # Convert distance -> similarity (closer = higher)
    # similarity in (0, 1]; distance 0 -> sim 1
    sim_block = 1.0 / (1.0 + dist_block)

    # For each resume, keep top_k within this block
    n_res, n_block_jobs = sim_block.shape
    for res_idx in range(n_res):
        row_sim = sim_block[res_idx]

        # If row is all zeros, skip
        if np.all(row_sim == 0):
            continue

        # indices of top_k highest similarities in this block
        if n_block_jobs <= TOP_K:
            top_local_idx = np.argsort(-row_sim)  # descending
        else:
            # argpartition for efficiency, then sort those
            top_local_idx = np.argpartition(-row_sim, TOP_K - 1)[:TOP_K]
            top_local_idx = top_local_idx[np.argsort(-row_sim[top_local_idx])]

        # Collect rows (global job index = j_start + local index)
        for j_local in top_local_idx:
            euclid_rows.append((
                res_idx,                 # resume_row
                j_start + j_local,       # job_row
                float(row_sim[j_local])  # score_euclid
            ))



=== MODEL 2: TF-IDF + Euclidean similarity ===
X_res: (2158, 50000) | X_job: (15851, 50000)
Processed jobs 0..1999
Processed jobs 2000..3999
Processed jobs 4000..5999
Processed jobs 6000..7999
Processed jobs 8000..9999
Processed jobs 10000..11999
Processed jobs 12000..13999
Processed jobs 14000..15850


In [30]:
# --------------------------------------------
# STEP 2 — Build euclid_df and keep global top-k per resume
# --------------------------------------------
euclid_df = pd.DataFrame(euclid_rows,
                         columns=["resume_row", "job_row", "score_euclid"])

print("Raw euclid_df shape (before global top-k):", euclid_df.shape)

# Keep overall top_k per resume across all blocks
euclid_df["tmp_rank"] = euclid_df.groupby("resume_row")["score_euclid"].rank(
    method="first", ascending=False
)

euclid_df = euclid_df[euclid_df["tmp_rank"] <= TOP_K].drop(columns=["tmp_rank"])
print("euclid_df shape (after global top-k):", euclid_df.shape)

Raw euclid_df shape (before global top-k): (1726400, 3)
euclid_df shape (after global top-k): (215800, 3)


In [31]:
# ==============================
# STEP 3 — Attach IDs, titles, URLs, and rank (Euclidean)
# ==============================

out_euclid = euclid_df.merge(
    res_map, left_on="resume_row", right_on="row_idx", how="left"
).merge(
    job_map, left_on="job_row", right_on="row_idx", how="left",
    suffixes=("_res", "_job")
)

# Keep only needed ID and score columns
out_euclid = out_euclid.rename(
    columns={"resume_id": "resume_id", "job_id": "job_id"}
)[["resume_id", "job_id", "score_euclid"]]

# Add job metadata
out_euclid = out_euclid.merge(jobs_meta, on="job_id", how="left")

# Rank within each resume by Euclidean similarity
out_euclid["rank"] = out_euclid.groupby("resume_id")["score_euclid"].rank(
    method="first", ascending=False
).astype(int)

# Order columns
out_euclid = out_euclid[
    ["resume_id", "rank", "job_id", "job_title", "job_posting_url", "score_euclid"]
].sort_values(["resume_id", "rank"])

print("Final Euclidean recommendations:", out_euclid.shape)
display(out_euclid.head(10))



Final Euclidean recommendations: (215800, 6)


Unnamed: 0,resume_id,rank,job_id,job_title,job_posting_url,score_euclid
110593,IT_000001,1,3694158424,LinkedIn Test Vacancy 24/08/2023,https://www.linkedin.com/jobs/view/3694158424/?trk=jobs_biz_prem_srch,0.5
110594,IT_000001,2,3697341474,Data Scientist,https://www.linkedin.com/jobs/view/3697341474/?trk=jobs_biz_prem_srch,0.434189
110595,IT_000001,3,3694121166,Natural Language Processing (NLP) Data Scientist.,https://www.linkedin.com/jobs/view/3694121166/?trk=jobs_biz_prem_srch,0.43367
18715,IT_000001,4,3693044920,Splunk Developer,https://www.linkedin.com/jobs/view/3693044920/?trk=jobs_biz_prem_srch,0.43355
155435,IT_000001,5,3699082085,Special Investigations Unit (SIU) Investigator-1 year healthcare fraud experience,https://www.linkedin.com/jobs/view/3699082085/?trk=jobs_biz_prem_srch,0.432781
83166,IT_000001,6,3693581630,Senior Data Scientist,https://www.linkedin.com/jobs/view/3693581630/?trk=jobs_biz_prem_srch,0.432762
83167,IT_000001,7,3693584453,Senior Data Scientist,https://www.linkedin.com/jobs/view/3693584453/?trk=jobs_biz_prem_srch,0.432762
83168,IT_000001,8,3693598848,Digital media fraud expert,https://www.linkedin.com/jobs/view/3693598848/?trk=jobs_biz_prem_srch,0.431877
178198,IT_000001,9,3701198711,Mentor - Machine Learning Career Track (Part-time/Contract),https://www.linkedin.com/jobs/view/3701198711/?trk=jobs_biz_prem_srch,0.431426
18716,IT_000001,10,3693049684,Senior Manager,https://www.linkedin.com/jobs/view/3693049684/?trk=jobs_biz_prem_srch,0.431236


In [32]:
# ==============================
# STEP 4 — Save results (Euclidean)
# ==============================
save_path_euclid = PROC_DIR + r"\matches_tfidf_euclid.csv"
out_euclid.to_csv(save_path_euclid, index=False)
print(" Euclidean Sim data Saved:", save_path_euclid)


 Euclidean Sim data Saved: D:\Projects\ResumeJobRecommender\data\processed\matches_tfidf_euclid.csv


In [33]:
# ==============================
# STEP 5 — Results - Euclidean
# ==============================

print("\nEuclidean similarity score summary:")
print(out_euclid["score_euclid"].describe())


def show_top_matches_euclid(resume_id, top_k=10):
    """Top-k jobs by Euclidean similarity for a given resume_id."""
    label = resume_label.get(resume_id, "(unknown)")
    print(f"\n[Euclidean] Top matches for {resume_id} — '{label}':")
    df = (
        out_euclid[out_euclid["resume_id"] == resume_id]
        .sort_values("rank")
        .head(top_k)[["rank", "job_title", "job_posting_url", "score_euclid"]]
        .rename(columns={"score_euclid": "euclid_score"})
    )
    df["euclid_score"] = df["euclid_score"].round(3)
    display(df)

# Recommendations for two random resumes
for rid in out["resume_id"].drop_duplicates().sample(2):
    show_top_matches_euclid(rid, top_k=10)



Euclidean similarity score summary:
count    215800.000000
mean          0.432006
std           0.010685
min           0.417025
25%           0.425364
50%           0.430786
75%           0.436108
max           1.000000
Name: score_euclid, dtype: float64

[Euclidean] Top matches for IT_000856 — 'Blockchain':


Unnamed: 0,rank,job_title,job_posting_url,euclid_score
120639,1,LinkedIn Test Vacancy 24/08/2023,https://www.linkedin.com/jobs/view/3694158424/?trk=jobs_biz_prem_srch,0.5
34913,2,Java Software Engineer,https://www.linkedin.com/jobs/view/3693044773/?trk=jobs_biz_prem_srch,0.433
64939,3,Java Software Engineer,https://www.linkedin.com/jobs/view/3693067545/?trk=jobs_biz_prem_srch,0.433
213927,4,Senior Marketing Communications Manager,https://www.linkedin.com/jobs/view/3701372766/?trk=jobs_biz_prem_srch,0.431
213928,5,Full Stack Engineer,https://www.linkedin.com/jobs/view/3701322395/?trk=jobs_biz_prem_srch,0.429
64940,6,Java Script Developer,https://www.linkedin.com/jobs/view/3693063901/?trk=jobs_biz_prem_srch,0.429
187556,7,Web Application Developer Intern (Unpaid),https://www.linkedin.com/jobs/view/3701308126/?trk=jobs_biz_prem_srch,0.428
64941,8,Software Engineer,https://www.linkedin.com/jobs/view/3693069022/?trk=jobs_biz_prem_srch,0.428
34914,9,Full Stack Developer (Java and Angular),https://www.linkedin.com/jobs/view/3693044508/?trk=jobs_biz_prem_srch,0.428
34915,10,Java financial application developer,https://www.linkedin.com/jobs/view/3693048211/?trk=jobs_biz_prem_srch,0.427



[Euclidean] Top matches for PDF_000425 — 'DIGITAL-MEDIA':


Unnamed: 0,rank,job_title,job_posting_url,euclid_score
100580,1,LinkedIn Test Vacancy 24/08/2023,https://www.linkedin.com/jobs/view/3694158424/?trk=jobs_biz_prem_srch,0.5
42046,2,Senior Animator [72220],https://www.linkedin.com/jobs/view/3693052308/?trk=jobs_biz_prem_srch,0.47
126717,3,Senior Gameplay Animator - XDefiant,https://www.linkedin.com/jobs/view/3697396559/?trk=jobs_biz_prem_srch,0.436
72161,4,3D Character Animator,https://www.linkedin.com/jobs/view/3693599396/?trk=jobs_biz_prem_srch,0.435
6477,5,"UX Designer (Motion Design) for Global Computer Technology Company in Redmond, WA",https://www.linkedin.com/jobs/view/3693047713/?trk=jobs_biz_prem_srch,0.434
100581,6,Adobe Experience Manager,https://www.linkedin.com/jobs/view/3697358122/?trk=jobs_biz_prem_srch,0.434
6478,7,Package Designer,https://www.linkedin.com/jobs/view/3693044420/?trk=jobs_biz_prem_srch,0.434
146646,8,"Head of Art, Games",https://www.linkedin.com/jobs/view/3699088403/?trk=jobs_biz_prem_srch,0.433
72162,9,AEM Technical Team Lead and Architect,https://www.linkedin.com/jobs/view/3693588018/?trk=jobs_biz_prem_srch,0.432
42047,10,Presentation Specialist,https://www.linkedin.com/jobs/view/3693070393/?trk=jobs_biz_prem_srch,0.431


In [34]:
show_top_matches("IT_000362", top_k=10)          # cosine 
show_top_matches_euclid("IT_000362", top_k=10)   # euclidean


Top matches for IT_000362 — 'Java Developer':


Unnamed: 0,rank,job_title,job_posting_url,cosine_score
155700,1,Java Developer (Only W2),https://www.linkedin.com/jobs/view/3693044444/?trk=jobs_biz_prem_srch,0.203076
155701,2,Java Software Engineer,https://www.linkedin.com/jobs/view/3699431844/?trk=jobs_biz_prem_srch,0.155274
155702,3,Java Consultant,https://www.linkedin.com/jobs/view/3693047709/?trk=jobs_biz_prem_srch,0.145052
155703,4,Senior Test Engineer with ISTQB,https://www.linkedin.com/jobs/view/3693049352/?trk=jobs_biz_prem_srch,0.140697
155704,5,java developer W2,https://www.linkedin.com/jobs/view/3693046636/?trk=jobs_biz_prem_srch,0.133923
155705,6,Java Software Engineer,https://www.linkedin.com/jobs/view/3693044773/?trk=jobs_biz_prem_srch,0.133744
155706,7,Java Software Engineer,https://www.linkedin.com/jobs/view/3693067545/?trk=jobs_biz_prem_srch,0.133744
155707,8,Senior Associate Software Engineer,https://www.linkedin.com/jobs/view/3701366243/?trk=jobs_biz_prem_srch,0.118695
155708,9,Information Technology Operations Specialist,https://www.linkedin.com/jobs/view/3699086528/?trk=jobs_biz_prem_srch,0.115492
155709,10,Java Developer - W2 ONLY,https://www.linkedin.com/jobs/view/3701323681/?trk=jobs_biz_prem_srch,0.106506



[Euclidean] Top matches for IT_000362 — 'Java Developer':


Unnamed: 0,rank,job_title,job_posting_url,euclid_score
115167,1,LinkedIn Test Vacancy 24/08/2023,https://www.linkedin.com/jobs/view/3694158424/?trk=jobs_biz_prem_srch,0.5
24997,2,Java Developer (Only W2),https://www.linkedin.com/jobs/view/3693044444/?trk=jobs_biz_prem_srch,0.442
181798,3,Java Software Engineer,https://www.linkedin.com/jobs/view/3699431844/?trk=jobs_biz_prem_srch,0.435
24998,4,Java Consultant,https://www.linkedin.com/jobs/view/3693047709/?trk=jobs_biz_prem_srch,0.433
24999,5,Senior Test Engineer with ISTQB,https://www.linkedin.com/jobs/view/3693049352/?trk=jobs_biz_prem_srch,0.433
25000,6,java developer W2,https://www.linkedin.com/jobs/view/3693046636/?trk=jobs_biz_prem_srch,0.432
25001,7,Java Software Engineer,https://www.linkedin.com/jobs/view/3693044773/?trk=jobs_biz_prem_srch,0.432
56753,8,Java Software Engineer,https://www.linkedin.com/jobs/view/3693067545/?trk=jobs_biz_prem_srch,0.432
207339,9,Senior Associate Software Engineer,https://www.linkedin.com/jobs/view/3701366243/?trk=jobs_biz_prem_srch,0.43
158874,10,Information Technology Operations Specialist,https://www.linkedin.com/jobs/view/3699086528/?trk=jobs_biz_prem_srch,0.429


### Model-3 : Dot Product

In [35]:
# ============================================
# MODEL 3 — TF-IDF + Dot Product Similarity
# ============================================

import numpy as np

dot_rows = []

print("\n=== MODEL 3: TF-IDF + Dot Product ===")
print("X_res:", X_res.shape, "| X_job:", X_job.shape)

for j_start in range(0, X_job.shape[0], BLOCK_SIZE):
    j_end = min(j_start + BLOCK_SIZE, X_job.shape[0])
    X_job_block = X_job[j_start:j_end]

    print(f"Processed jobs {j_start}..{j_end-1}")

    # Dot product similarity
    sim_block = (X_res @ X_job_block.T).toarray()   # shape = (n_res, n_jobs_block)

    n_res, n_block_jobs = sim_block.shape

    for res_idx in range(n_res):
        row_sim = sim_block[res_idx]

        if np.all(row_sim == 0):
            continue

        # Top-k indices in this block
        if n_block_jobs <= TOP_K:
            top_local_idx = np.argsort(-row_sim)
        else:
            top_local_idx = np.argpartition(-row_sim, TOP_K - 1)[:TOP_K]
            top_local_idx = top_local_idx[np.argsort(-row_sim[top_local_idx])]

        for j_local in top_local_idx:
            dot_rows.append((
                res_idx,
                j_start + j_local,
                float(row_sim[j_local])
            ))


=== MODEL 3: TF-IDF + Dot Product ===
X_res: (2158, 50000) | X_job: (15851, 50000)
Processed jobs 0..1999
Processed jobs 2000..3999
Processed jobs 4000..5999
Processed jobs 6000..7999
Processed jobs 8000..9999
Processed jobs 10000..11999
Processed jobs 12000..13999
Processed jobs 14000..15850


In [36]:
# ---- Build dataframe ----
dot_df = pd.DataFrame(dot_rows, columns=["resume_row", "job_row", "score_dot"])
print("Raw dot_df shape:", dot_df.shape)

dot_df["tmp_rank"] = dot_df.groupby("resume_row")["score_dot"].rank(
    method="first", ascending=False
)
dot_df = dot_df[dot_df["tmp_rank"] <= TOP_K].drop(columns=["tmp_rank"])
print("dot_df shape after global top-k:", dot_df.shape)

Raw dot_df shape: (1725600, 3)
dot_df shape after global top-k: (215700, 3)


In [37]:
# ---- Attach metadata ----
out_dot = dot_df.merge(res_map, left_on="resume_row", right_on="row_idx", how="left") \
               .merge(job_map, left_on="job_row", right_on="row_idx", how="left",
                      suffixes=("_res","_job"))

out_dot = out_dot.rename(columns={"resume_id": "resume_id", "job_id": "job_id"}) \
                 [["resume_id", "job_id", "score_dot"]]

out_dot = out_dot.merge(jobs_meta, on="job_id", how="left")

out_dot["rank"] = out_dot.groupby("resume_id")["score_dot"].rank(
    method="first", ascending=False
).astype(int)

out_dot = out_dot[
    ["resume_id","rank","job_id","job_title","job_posting_url","score_dot"]
].sort_values(["resume_id","rank"])

print("Final dot-product recommendations:", out_dot.shape)
display(out_dot.head(10))

# ---- Save ----
save_path_dot = PROC_DIR + r"\matches_tfidf_dot.csv"
out_dot.to_csv(save_path_dot, index=False)
print("Dot-product Sim Saved:", save_path_dot)


def show_top_matches_dot(resume_id, top_k=10):
    label = resume_label.get(resume_id, "(unknown)")
    print(f"\n[Dot] Top matches for {resume_id} — '{label}':")
    df = (
        out_dot[out_dot["resume_id"] == resume_id]
        .sort_values("rank")
        .head(top_k)[["rank","job_title","job_posting_url","score_dot"]]
        .rename(columns={"score_dot":"dot_score"})
    )
    df["dot_score"] = df["dot_score"].round(6)
    display(df)


Final dot-product recommendations: (215700, 6)


Unnamed: 0,resume_id,rank,job_id,job_title,job_posting_url,score_dot
110326,IT_000001,1,3697341474,Data Scientist,https://www.linkedin.com/jobs/view/3697341474/?trk=jobs_biz_prem_srch,0.150907
110327,IT_000001,2,3694121166,Natural Language Processing (NLP) Data Scientist.,https://www.linkedin.com/jobs/view/3694121166/?trk=jobs_biz_prem_srch,0.147311
18833,IT_000001,3,3693044920,Splunk Developer,https://www.linkedin.com/jobs/view/3693044920/?trk=jobs_biz_prem_srch,0.146477
154732,IT_000001,4,3699082085,Special Investigations Unit (SIU) Investigator-1 year healthcare fraud experience,https://www.linkedin.com/jobs/view/3699082085/?trk=jobs_biz_prem_srch,0.141117
83825,IT_000001,5,3693581630,Senior Data Scientist,https://www.linkedin.com/jobs/view/3693581630/?trk=jobs_biz_prem_srch,0.140981
83826,IT_000001,6,3693584453,Senior Data Scientist,https://www.linkedin.com/jobs/view/3693584453/?trk=jobs_biz_prem_srch,0.140981
83827,IT_000001,7,3693598848,Digital media fraud expert,https://www.linkedin.com/jobs/view/3693598848/?trk=jobs_biz_prem_srch,0.134763
177733,IT_000001,8,3701198711,Mentor - Machine Learning Career Track (Part-time/Contract),https://www.linkedin.com/jobs/view/3701198711/?trk=jobs_biz_prem_srch,0.131573
18834,IT_000001,9,3693049684,Senior Manager,https://www.linkedin.com/jobs/view/3693049684/?trk=jobs_biz_prem_srch,0.130229
52176,IT_000001,10,3693056342,"Senior Backend Software Engineer, Data",https://www.linkedin.com/jobs/view/3693056342/?trk=jobs_biz_prem_srch,0.128884


Dot-product Sim Saved: D:\Projects\ResumeJobRecommender\data\processed\matches_tfidf_dot.csv


In [38]:
# Recommendations for two random resumes
for rid in out["resume_id"].drop_duplicates().sample(2):
    show_top_matches_dot(rid, top_k=10)



[Dot] Top matches for PDF_000287 — 'BUSINESS-DEVELOPMENT':


Unnamed: 0,rank,job_title,job_posting_url,dot_score
4342,1,Aerospace VP of Sales and Marketing,https://www.linkedin.com/jobs/view/3693048760/?trk=jobs_biz_prem_srch,0.23573
191640,2,Full Time Substance Abuse Counselor,https://www.linkedin.com/jobs/view/3701323409/?trk=jobs_biz_prem_srch,0.233024
124157,3,Director of Sales,https://www.linkedin.com/jobs/view/3699053997/?trk=jobs_biz_prem_srch,0.231032
144052,4,Executive Director Sales,https://www.linkedin.com/jobs/view/3699087242/?trk=jobs_biz_prem_srch,0.201005
144053,5,Market Area Sales Manager - Midwest,https://www.linkedin.com/jobs/view/3699413939/?trk=jobs_biz_prem_srch,0.200765
167318,6,National Sales Director (Community),https://www.linkedin.com/jobs/view/3701199851/?trk=jobs_biz_prem_srch,0.197128
144054,7,"District Sales Manager - North Austin, TX",https://www.linkedin.com/jobs/view/3699079086/?trk=jobs_biz_prem_srch,0.193575
99478,8,Head of Growth (NY Platform),https://www.linkedin.com/jobs/view/3694112276/?trk=jobs_biz_prem_srch,0.192588
124158,9,"RVP, Sales, SMB, CX",https://www.linkedin.com/jobs/view/3697388313/?trk=jobs_biz_prem_srch,0.183106
99479,10,Head of Sales and Marketing,https://www.linkedin.com/jobs/view/3694107242/?trk=jobs_biz_prem_srch,0.181706



[Dot] Top matches for PDF_000402 — 'CONSULTANT':


Unnamed: 0,rank,job_title,job_posting_url,dot_score
125466,1,Oracle Database Administrator (7121),https://www.linkedin.com/jobs/view/3697381432/?trk=jobs_biz_prem_srch,0.41751
6096,2,Oracle Database Administrator,https://www.linkedin.com/jobs/view/3693043851/?trk=jobs_biz_prem_srch,0.293091
72341,3,Oracle Database Administrator,https://www.linkedin.com/jobs/view/3693072016/?trk=jobs_biz_prem_srch,0.287589
72342,4,Oracle Finance Functional Consultant,https://www.linkedin.com/jobs/view/3693589038/?trk=jobs_biz_prem_srch,0.286598
42066,5,Oracle Database Developer,https://www.linkedin.com/jobs/view/3693057254/?trk=jobs_biz_prem_srch,0.286039
193079,6,Oracle Functional Consultant,https://www.linkedin.com/jobs/view/3701332199/?trk=jobs_biz_prem_srch,0.275377
42067,7,OIPA System Admin (Remote),https://www.linkedin.com/jobs/view/3693055172/?trk=jobs_biz_prem_srch,0.262264
6097,8,oracle SCM TECHNO CLOUD FUNCTIONAL,https://www.linkedin.com/jobs/view/3669525119/?trk=jobs_biz_prem_srch,0.251376
6098,9,Database Developer,https://www.linkedin.com/jobs/view/3693046734/?trk=jobs_biz_prem_srch,0.249416
168670,10,Senior Software Engineer (Oracle Cloud Team),https://www.linkedin.com/jobs/view/3701196834/?trk=jobs_biz_prem_srch,0.232769


In [39]:
print(out_dot.shape)

print(out_dot["score_dot"].describe())

print(show_top_matches_dot("IT_000362", 10))

(215700, 6)
count    215700.000000
mean          0.128396
std           0.055522
min           0.022878
25%           0.086828
50%           0.126069
75%           0.162376
max           0.599797
Name: score_dot, dtype: float64

[Dot] Top matches for IT_000362 — 'Java Developer':


Unnamed: 0,rank,job_title,job_posting_url,dot_score
25168,1,Java Developer (Only W2),https://www.linkedin.com/jobs/view/3693044444/?trk=jobs_biz_prem_srch,0.203076
181385,2,Java Software Engineer,https://www.linkedin.com/jobs/view/3699431844/?trk=jobs_biz_prem_srch,0.155274
25169,3,Java Consultant,https://www.linkedin.com/jobs/view/3693047709/?trk=jobs_biz_prem_srch,0.145052
25170,4,Senior Test Engineer with ISTQB,https://www.linkedin.com/jobs/view/3693049352/?trk=jobs_biz_prem_srch,0.140697
25171,5,java developer W2,https://www.linkedin.com/jobs/view/3693046636/?trk=jobs_biz_prem_srch,0.133923
25172,6,Java Software Engineer,https://www.linkedin.com/jobs/view/3693044773/?trk=jobs_biz_prem_srch,0.133744
57167,7,Java Software Engineer,https://www.linkedin.com/jobs/view/3693067545/?trk=jobs_biz_prem_srch,0.133744
207192,8,Senior Associate Software Engineer,https://www.linkedin.com/jobs/view/3701366243/?trk=jobs_biz_prem_srch,0.118695
158198,9,Information Technology Operations Specialist,https://www.linkedin.com/jobs/view/3699086528/?trk=jobs_biz_prem_srch,0.115492
207193,10,Java Developer - W2 ONLY,https://www.linkedin.com/jobs/view/3701323681/?trk=jobs_biz_prem_srch,0.106506


None
