### Model 4 - BM25

In [26]:
import os
import numpy as np
import pandas as pd
import scipy.sparse as sp
from sklearn.feature_extraction.text import CountVectorizer

# ==============================
# CONFIG
# ==============================
BASE_DIR = r"D:\Projects\ResumeJobRecommender"
PROC_DIR = BASE_DIR + r"\data\processed"
TOP_K = 100
BLOCK_SIZE = 2000

# ==============================
# STEP 1 — Load resumes & jobs
# ==============================
resumes = pd.read_csv(os.path.join(PROC_DIR, "resume_cleaned.csv"))
job_df  = pd.read_csv(os.path.join(PROC_DIR, "job_postings_cleaned.csv"))

print("Resumes shape:", resumes.shape)
print(resumes.columns)
print("\nJobs shape:", job_df.shape)
print(job_df.columns)

# Text columns 
RES_TEXT_COL = "text_clean"
JOB_TITLE_COL = "job_title_clean"
JOB_DESC_COL  = "job_description_clean"

# Create a combined job text column: title + description
job_df["job_text_bm25"] = (
    job_df[JOB_TITLE_COL].fillna("").astype(str) + " " +
    job_df[JOB_DESC_COL].fillna("").astype(str)
)

# Quick sanity
print("\nSample resume text:", resumes[RES_TEXT_COL].iloc[0][:200])
print("\nSample job text:", job_df["job_text_bm25"].iloc[0][:200])

# ==============================
# STEP 2 — Row index maps & metadata
# ==============================
res_map = resumes[["resume_id"]].assign(row_idx=np.arange(len(resumes)))
job_map = job_df[["job_id"]].assign(row_idx=np.arange(len(job_df)))

jobs_meta = job_df[["job_id", "job_title", "job_posting_url"]]

# Resume label dict for pretty headers
resume_label = dict(zip(resumes["resume_id"], resumes["category"]))

# ==============================
# STEP 3 — Build CountVectorizer vocab (resumes + jobs)
# ==============================
print("\n=== BM25: Fitting CountVectorizer on resumes + jobs ===")

all_text = pd.concat(
    [
        resumes[RES_TEXT_COL].astype(str),
        job_df["job_text_bm25"].astype(str)
    ],
    ignore_index=True
)

count_vect = CountVectorizer(
    lowercase=False,
    analyzer="word",
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.90,
    max_features=50000
)

count_vect.fit(all_text)
vocab_size = len(count_vect.vocabulary_)
print("Vocab size:", vocab_size)

# Transform separately
X_res_counts = count_vect.transform(resumes[RES_TEXT_COL].astype(str))
X_job_counts = count_vect.transform(job_df["job_text_bm25"].astype(str))

print("X_res_counts shape:", X_res_counts.shape)  # (2158, V)
print("X_job_counts shape:", X_job_counts.shape)  # (15851, V)

# ==============================
# STEP 4 — Compute BM25 weights for job documents
# ==============================
print("\n=== BM25: Computing BM25 weights for job documents ===")

X_job_csr = X_job_counts.tocsr()
N_docs = X_job_csr.shape[0]

# Document length per job and average length
dl = X_job_csr.sum(axis=1).A1
avgdl = dl.mean()

# Document frequency per term
df = (X_job_csr > 0).astype(int).sum(axis=0).A1  # shape (V,)

# BM25 IDF
idf_bm25 = np.log(1.0 + (N_docs - df + 0.5) / (df + 0.5))

# BM25 parameters
k1 = 1.5
b = 0.75

rows, cols = X_job_csr.nonzero()
tf = X_job_csr.data

norm = tf + k1 * (1.0 - b + b * dl[rows] / avgdl)
bm25_data = idf_bm25[cols] * (tf * (k1 + 1.0) / norm)

X_job_bm25 = sp.csr_matrix((bm25_data, (rows, cols)), shape=X_job_csr.shape)
print("X_job_bm25 shape:", X_job_bm25.shape)

# ==============================
# STEP 5 — Model 4: BM25 scoring (jobs as docs, resumes as queries)
# ==============================
bm25_rows = []

print("\n=== MODEL 4: BM25 ===")
print("X_res_counts:", X_res_counts.shape, "| X_job_bm25:", X_job_bm25.shape)

for j_start in range(0, X_job_bm25.shape[0], BLOCK_SIZE):
    j_end = min(j_start + BLOCK_SIZE, X_job_bm25.shape[0])
    X_job_bm25_block = X_job_bm25[j_start:j_end]

    print(f"Processed jobs {j_start}..{j_end-1}")

    # Scores = query counts * BM25 doc weights^T
    # (n_resumes, V) x (V, block_jobs) -> (n_resumes, block_jobs)
    sim_block = (X_res_counts @ X_job_bm25_block.T).toarray()

    n_res, n_block_jobs = sim_block.shape

    for res_idx in range(n_res):
        row_sim = sim_block[res_idx]
        if np.all(row_sim == 0):
            continue

        if n_block_jobs <= TOP_K:
            top_local_idx = np.argsort(-row_sim)
        else:
            top_local_idx = np.argpartition(-row_sim, TOP_K - 1)[:TOP_K]
            top_local_idx = top_local_idx[np.argsort(-row_sim[top_local_idx])]

        for j_local in top_local_idx:
            bm25_rows.append((
                res_idx,
                j_start + j_local,
                float(row_sim[j_local])
            ))

bm25_df = pd.DataFrame(bm25_rows, columns=["resume_row", "job_row", "score_bm25"])
print("Raw bm25_df shape (before global top-k):", bm25_df.shape)

bm25_df["tmp_rank"] = bm25_df.groupby("resume_row")["score_bm25"].rank(
    method="first", ascending=False
)
bm25_df = bm25_df[bm25_df["tmp_rank"] <= TOP_K].drop(columns=["tmp_rank"])
print("bm25_df shape (after global top-k):", bm25_df.shape)

# ==============================
# STEP 6 — Attach IDs, titles, URLs + rank
# ==============================
out_bm25 = bm25_df.merge(
    res_map,
    left_on="resume_row",
    right_on="row_idx",
    how="left"
).merge(
    job_map,
    left_on="job_row",
    right_on="row_idx",
    how="left",
    suffixes=("_res", "_job")
)

out_bm25 = out_bm25.rename(
    columns={"resume_id": "resume_id", "job_id": "job_id"}
)[["resume_id", "job_id", "score_bm25"]]

out_bm25 = out_bm25.merge(jobs_meta, on="job_id", how="left")

out_bm25["rank"] = out_bm25.groupby("resume_id")["score_bm25"].rank(
    method="first", ascending=False
).astype(int)

out_bm25 = out_bm25[
    ["resume_id", "rank", "job_id", "job_title", "job_posting_url", "score_bm25"]
].sort_values(["resume_id", "rank"])

print("Final BM25 recommendations:", out_bm25.shape)
display(out_bm25.head(10))

# ==============================
# STEP 7 — Save + helper display
# ==============================
save_path_bm25 = os.path.join(PROC_DIR, "matches_bm25.csv")
out_bm25.to_csv(save_path_bm25, index=False)
print("BM25 data saved:", save_path_bm25)

print("\nBM25 score summary:")
print(out_bm25["score_bm25"].describe())

def show_top_matches_bm25(resume_id, top_k=10):
    label = resume_label.get(resume_id, "(unknown)")
    print(f"\n[BM25] Top matches for {resume_id} — '{label}':")
    df = (
        out_bm25[out_bm25["resume_id"] == resume_id]
        .sort_values("rank")
        .head(top_k)[["rank", "job_title", "job_posting_url", "score_bm25"]]
        .rename(columns={"score_bm25": "bm25_score"})
    )
    df["bm25_score"] = df["bm25_score"].round(3)
    display(df)

# Example usage after this cell:
# show_top_matches_bm25("IT_000362", 10)
# show_top_matches_bm25("PDF_000036", 10)


Resumes shape: (2158, 4)
Index(['resume_id', 'category', 'text_raw', 'text_clean'], dtype='object')

Jobs shape: (15851, 15)
Index(['job_id', 'job_title', 'job_description', 'location',
       'experience_level', 'work_type', 'min_salary', 'max_salary',
       'pay_period', 'currency', 'remote_allowed', 'sponsored',
       'job_posting_url', 'job_title_clean', 'job_description_clean'],
      dtype='object')

Sample resume text: accountant summary accountant for a medium sized company experience 01 2009 to current accountant company name ï1 4 city state hired by their cpa firm to handle all accounting and job cost reporting 0

Sample job text: licensed insurance agent while many industries were hurt by the last few years people still need insurance this position is an amazing opportunity for someone who wants a career in the insurance indus

=== BM25: Fitting CountVectorizer on resumes + jobs ===
Vocab size: 50000
X_res_counts shape: (2158, 50000)
X_job_counts shape: (15851, 50000)

===

Unnamed: 0,resume_id,rank,job_id,job_title,job_posting_url,score_bm25
109572,IT_000001,1,3697341474,Data Scientist,https://www.linkedin.com/jobs/view/3697341474/...,473.006602
178916,IT_000001,2,3701198711,Mentor - Machine Learning Career Track (Part-t...,https://www.linkedin.com/jobs/view/3701198711/...,432.913558
82204,IT_000001,3,3693581630,Senior Data Scientist,https://www.linkedin.com/jobs/view/3693581630/...,431.35196
82205,IT_000001,4,3693584453,Senior Data Scientist,https://www.linkedin.com/jobs/view/3693584453/...,431.35196
109573,IT_000001,5,3694121166,Natural Language Processing (NLP) Data Scientist.,https://www.linkedin.com/jobs/view/3694121166/...,425.561823
133840,IT_000001,6,3697375802,"Field Design & Evaluation, Associate Director",https://www.linkedin.com/jobs/view/3697375802/...,412.257768
50013,IT_000001,7,3693056342,"Senior Backend Software Engineer, Data",https://www.linkedin.com/jobs/view/3693056342/...,387.507986
155272,IT_000001,8,3699408896,Senior Data Scientist,https://www.linkedin.com/jobs/view/3699408896/...,381.51211
82206,IT_000001,9,3693598802,Treasury ALM Analyst,https://www.linkedin.com/jobs/view/3693598802/...,379.793145
18410,IT_000001,10,3693050174,Product Manager,https://www.linkedin.com/jobs/view/3693050174/...,377.546253


BM25 data saved: D:\Projects\ResumeJobRecommender\data\processed\matches_bm25.csv

BM25 score summary:
count    215700.000000
mean        395.908665
std         262.206405
min          14.641961
25%         195.432428
50%         378.998091
75%         527.614149
max        5143.305266
Name: score_bm25, dtype: float64


In [27]:
# Example usage after this cell:
show_top_matches_bm25("IT_000362", 10)
show_top_matches_bm25("PDF_000036", 10)


[BM25] Top matches for IT_000362 — 'Java Developer':


Unnamed: 0,rank,job_title,job_posting_url,bm25_score
23345,1,java developer W2,https://www.linkedin.com/jobs/view/3693046636/...,81.273
207874,2,Java Developer,https://www.linkedin.com/jobs/view/3701330202/...,73.326
207875,3,Senior Associate Software Engineer,https://www.linkedin.com/jobs/view/3701366243/...,70.905
23346,4,Senior Java Developer,https://www.linkedin.com/jobs/view/3693049156/...,69.73
23347,5,Senior Java Developer,https://www.linkedin.com/jobs/view/3693048252/...,69.123
55048,6,Java Developer,https://www.linkedin.com/jobs/view/3693067587/...,68.551
55049,7,Java Developer,https://www.linkedin.com/jobs/view/3693066747/...,68.551
182824,8,Java Software Engineer,https://www.linkedin.com/jobs/view/3699431844/...,66.553
23348,9,Java Developer (Only W2),https://www.linkedin.com/jobs/view/3693044444/...,66.335
23349,10,Java Software Engineer,https://www.linkedin.com/jobs/view/3693044773/...,65.238



[BM25] Top matches for PDF_000036 — 'ACCOUNTANT':


Unnamed: 0,rank,job_title,job_posting_url,bm25_score
189373,1,Senior Accountant,https://www.linkedin.com/jobs/view/3701317370/...,562.294
166433,2,Budget Analyst,https://www.linkedin.com/jobs/view/3701312625/...,560.073
189374,3,Assistant Controller,https://www.linkedin.com/jobs/view/3701319219/...,552.512
94431,4,Senior Accounting Analyst (Liabilities/AP),https://www.linkedin.com/jobs/view/3694153900/...,548.703
94432,5,Finance Manager,https://www.linkedin.com/jobs/view/3694103132/...,528.736
595,6,Senior Accountant,https://www.linkedin.com/jobs/view/3693043970/...,514.193
596,7,Accountant II,https://www.linkedin.com/jobs/view/3693050303/...,512.804
65708,8,Financial Controller,https://www.linkedin.com/jobs/view/3693582938/...,504.335
65709,9,Senior Construction Accountant (Req #: 116),https://www.linkedin.com/jobs/view/3693582916/...,500.833
65710,10,Senior Construction Accountant (Req: 116),https://www.linkedin.com/jobs/view/3693588010/...,499.394


### Model 5 - Word2Vec

In [28]:
# ============================================
# MODEL 5 — Word2Vec Embeddings + Cosine Similarity
# ============================================

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

#!pip install gensim
from gensim.models import Word2Vec

print("\n=== MODEL 5: Word2Vec + Cosine ===")

# ------------------------------------------------
# STEP 1 — Build training corpus (resumes + jobs)
# ------------------------------------------------

# We already have:
#  - resumes["text_clean"]
#  - job_df["job_text_bm25"] = job_title_clean + job_description_clean

def simple_tokenize(text):
    # text_clean and job_text_bm25 should already be lowercased & cleaned
    # so split on whitespace is usually enough
    return str(text).split()

res_tokens = resumes["text_clean"].astype(str).apply(simple_tokenize).tolist()
job_tokens = job_df["job_text_bm25"].astype(str).apply(simple_tokenize).tolist()

corpus_tokens = res_tokens + job_tokens
print(f"Corpus sentences for Word2Vec: {len(corpus_tokens)}")

# ------------------------------------------------
# STEP 2 — Train Word2Vec model
# ------------------------------------------------

w2v_vector_size = 100
w2v_window = 5
w2v_min_count = 2

w2v_model = Word2Vec(
    sentences=corpus_tokens,
    vector_size=w2v_vector_size,
    window=w2v_window,
    min_count=w2v_min_count,
    workers=4,
    sg=1,              # skip-gram 
    epochs=10
)

print("Word2Vec vocab size:", len(w2v_model.wv))

# ------------------------------------------------
# STEP 3 — Build document embeddings (mean of word vectors)
# ------------------------------------------------

def doc_to_vec(tokens, model, vector_size):
    """Average word vectors for a list of tokens; return zero vector if no tokens in vocab."""
    valid_vecs = [model.wv[word] for word in tokens if word in model.wv]
    if not valid_vecs:
        return np.zeros(vector_size, dtype=np.float32)
    return np.mean(valid_vecs, axis=0)

# Resume embeddings
res_emb = np.vstack([
    doc_to_vec(toks, w2v_model, w2v_vector_size) for toks in res_tokens
])
print("res_emb shape:", res_emb.shape)  # (n_resumes, vector_size)

# Job embeddings
job_emb = np.vstack([
    doc_to_vec(toks, w2v_model, w2v_vector_size) for toks in job_tokens
])
print("job_emb shape:", job_emb.shape)  # (n_jobs, vector_size)

# ------------------------------------------------
# STEP 4 — Compute cosine similarities block-wise and keep top-k
# ------------------------------------------------

w2v_rows = []

print("\nScoring resumes vs jobs with Word2Vec cosine...")

for j_start in range(0, job_emb.shape[0], BLOCK_SIZE):
    j_end = min(j_start + BLOCK_SIZE, job_emb.shape[0])
    job_block = job_emb[j_start:j_end]   # (block_jobs, dim)

    print(f"Processed jobs {j_start}..{j_end-1}")

    # (n_res, dim) vs (block_jobs, dim) -> (n_res, block_jobs)
    sim_block = cosine_similarity(res_emb, job_block)

    n_res, n_block_jobs = sim_block.shape

    for res_idx in range(n_res):
        row_sim = sim_block[res_idx]

        if np.all(row_sim == 0):
            continue

        if n_block_jobs <= TOP_K:
            top_local_idx = np.argsort(-row_sim)
        else:
            top_local_idx = np.argpartition(-row_sim, TOP_K - 1)[:TOP_K]
            top_local_idx = top_local_idx[np.argsort(-row_sim[top_local_idx])]

        for j_local in top_local_idx:
            w2v_rows.append((
                res_idx,
                j_start + j_local,
                float(row_sim[j_local])   # score_w2v
            ))

# ------------------------------------------------
# STEP 5 — Build DataFrame + global top-k per resume
# ------------------------------------------------

w2v_df = pd.DataFrame(
    w2v_rows,
    columns=["resume_row", "job_row", "score_w2v"]
)

print("Raw w2v_df shape (before global top-k):", w2v_df.shape)

w2v_df["tmp_rank"] = w2v_df.groupby("resume_row")["score_w2v"].rank(
    method="first",
    ascending=False
)

w2v_df = w2v_df[w2v_df["tmp_rank"] <= TOP_K].drop(columns=["tmp_rank"])
print("w2v_df shape (after global top-k):", w2v_df.shape)

# ------------------------------------------------
# STEP 6 — Attach IDs, titles, URLs, and rank
# ------------------------------------------------

out_w2v = w2v_df.merge(
    res_map,
    left_on="resume_row",
    right_on="row_idx",
    how="left"
).merge(
    job_map,
    left_on="job_row",
    right_on="row_idx",
    how="left",
    suffixes=("_res", "_job")
)

out_w2v = out_w2v.rename(
    columns={"resume_id": "resume_id", "job_id": "job_id"}
)[["resume_id", "job_id", "score_w2v"]]

out_w2v = out_w2v.merge(jobs_meta, on="job_id", how="left")

out_w2v["rank"] = out_w2v.groupby("resume_id")["score_w2v"].rank(
    method="first",
    ascending=False
).astype(int)

out_w2v = out_w2v[
    ["resume_id", "rank", "job_id", "job_title", "job_posting_url", "score_w2v"]
].sort_values(["resume_id", "rank"])

print("Final Word2Vec recommendations:", out_w2v.shape)
display(out_w2v.head(10))

# ------------------------------------------------
# STEP 7 — Save + helper display
# ------------------------------------------------

save_path_w2v = os.path.join(PROC_DIR, "matches_w2v_cosine.csv")
out_w2v.to_csv(save_path_w2v, index=False)
print("Word2Vec cosine data saved:", save_path_w2v)

print("\nWord2Vec cosine score summary:")
print(out_w2v["score_w2v"].describe())

def show_top_matches_w2v(resume_id, top_k=10):
    """Top-k jobs by Word2Vec cosine similarity for a given resume_id."""
    label = resume_label.get(resume_id, "(unknown)")
    print(f"\n[Word2Vec] Top matches for {resume_id} — '{label}':")
    df = (
        out_w2v[out_w2v["resume_id"] == resume_id]
        .sort_values("rank")
        .head(top_k)[["rank", "job_title", "job_posting_url", "score_w2v"]]
        .rename(columns={"score_w2v": "w2v_score"})
    )
    df["w2v_score"] = df["w2v_score"].round(3)
    display(df)



=== MODEL 5: Word2Vec + Cosine ===
Corpus sentences for Word2Vec: 18009
Word2Vec vocab size: 53786
res_emb shape: (2158, 100)
job_emb shape: (15851, 100)

Scoring resumes vs jobs with Word2Vec cosine...
Processed jobs 0..1999
Processed jobs 2000..3999
Processed jobs 4000..5999
Processed jobs 6000..7999
Processed jobs 8000..9999
Processed jobs 10000..11999
Processed jobs 12000..13999
Processed jobs 14000..15850
Raw w2v_df shape (before global top-k): (1725600, 3)
w2v_df shape (after global top-k): (215700, 3)
Final Word2Vec recommendations: (215700, 6)


Unnamed: 0,resume_id,rank,job_id,job_title,job_posting_url,score_w2v
20877,IT_000001,1,3693046152,Senior Data Engineer,https://www.linkedin.com/jobs/view/3693046152/...,0.960867
20878,IT_000001,2,3693047136,Senior Data Engineer,https://www.linkedin.com/jobs/view/3693047136/...,0.960867
20879,IT_000001,3,3693041960,Senior Data Engineer,https://www.linkedin.com/jobs/view/3693041960/...,0.960867
20880,IT_000001,4,3693047167,Senior Data Engineer,https://www.linkedin.com/jobs/view/3693047167/...,0.960867
20881,IT_000001,5,3693043742,Senior Data Engineer,https://www.linkedin.com/jobs/view/3693043742/...,0.960867
61191,IT_000001,6,3693052292,NLP Engineer with Java,https://www.linkedin.com/jobs/view/3693052292/...,0.959906
61192,IT_000001,7,3693056059,NLP Engineer,https://www.linkedin.com/jobs/view/3693056059/...,0.95956
20882,IT_000001,8,3693047544,Data Scientists / AIML Engineer,https://www.linkedin.com/jobs/view/3693047544/...,0.958766
119702,IT_000001,9,3697341474,Data Scientist,https://www.linkedin.com/jobs/view/3697341474/...,0.953128
61193,IT_000001,10,3693067787,"Assistant Vice President, Quantitative Finance...",https://www.linkedin.com/jobs/view/3693067787/...,0.951936


Word2Vec cosine data saved: D:\Projects\ResumeJobRecommender\data\processed\matches_w2v_cosine.csv

Word2Vec cosine score summary:
count    215700.000000
mean          0.927419
std           0.052012
min           0.653431
25%           0.914089
50%           0.946911
75%           0.961346
max           0.990508
Name: score_w2v, dtype: float64


In [29]:
# Example comparisons (after cell runs):
show_top_matches_bm25("IT_000362", 10)    # BM25
show_top_matches_w2v("IT_000362", 10)     # Word2Vec cosine


[BM25] Top matches for IT_000362 — 'Java Developer':


Unnamed: 0,rank,job_title,job_posting_url,bm25_score
23345,1,java developer W2,https://www.linkedin.com/jobs/view/3693046636/...,81.273
207874,2,Java Developer,https://www.linkedin.com/jobs/view/3701330202/...,73.326
207875,3,Senior Associate Software Engineer,https://www.linkedin.com/jobs/view/3701366243/...,70.905
23346,4,Senior Java Developer,https://www.linkedin.com/jobs/view/3693049156/...,69.73
23347,5,Senior Java Developer,https://www.linkedin.com/jobs/view/3693048252/...,69.123
55048,6,Java Developer,https://www.linkedin.com/jobs/view/3693067587/...,68.551
55049,7,Java Developer,https://www.linkedin.com/jobs/view/3693066747/...,68.551
182824,8,Java Software Engineer,https://www.linkedin.com/jobs/view/3699431844/...,66.553
23348,9,Java Developer (Only W2),https://www.linkedin.com/jobs/view/3693044444/...,66.335
23349,10,Java Software Engineer,https://www.linkedin.com/jobs/view/3693044773/...,65.238



[Word2Vec] Top matches for IT_000362 — 'Java Developer':


Unnamed: 0,rank,job_title,job_posting_url,w2v_score
209313,1,Java Developer,https://www.linkedin.com/jobs/view/3701330202/...,0.844
159617,2,Python Developer,https://www.linkedin.com/jobs/view/3699084912/...,0.837
28576,3,"Murex BO/MXML Consultant : Chicago, IL/Toronto...",https://www.linkedin.com/jobs/view/3693048443/...,0.823
28577,4,java developer W2,https://www.linkedin.com/jobs/view/3693046636/...,0.822
159618,5,Jr. DevOps,https://www.linkedin.com/jobs/view/3699059418/...,0.813
66871,6,Java Script Developer,https://www.linkedin.com/jobs/view/3693063901/...,0.808
99097,7,Java Full Stack developer,https://www.linkedin.com/jobs/view/3693073635/...,0.807
209314,8,Java Developer with Payment,https://www.linkedin.com/jobs/view/3701332088/...,0.806
209315,9,Mechanical Engineer,https://www.linkedin.com/jobs/view/3701371856/...,0.805
159619,10,Information Technology Operations Specialist,https://www.linkedin.com/jobs/view/3699086528/...,0.805


### Model 6: Doc2Vec

In [34]:
# ============================================
# MODEL 6 — Doc2Vec Document Embeddings + Cosine Similarity
# ============================================

import os
import numpy as np
import pandas as pd

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.metrics.pairwise import cosine_similarity

# CONFIG
TOP_K = 100
BLOCK_SIZE = 2000 

print("Resumes shape:", resumes.shape)
print("Jobs shape:", job_df.shape)

# --------------------------------------------
# STEP 1 — Ensure combined job text exists
# --------------------------------------------

if "job_text_bm25" not in job_df.columns:
    job_df["job_text_bm25"] = (
        job_df["job_title_clean"].fillna("").astype(str) + " " +
        job_df["job_description_clean"].fillna("").astype(str)
    )

# --------------------------------------------
# STEP 2 — Tokenization helper
# --------------------------------------------

def simple_tokenize(text: str):
    return str(text).split()

res_tokens = resumes["text_clean"].astype(str).apply(simple_tokenize).tolist()
job_tokens = job_df["job_text_bm25"].astype(str).apply(simple_tokenize).tolist()

print("\nBuilding TaggedDocument corpus for Doc2Vec...")
tagged_docs = []

# Tag resumes as RES_0, RES_1, ...
for i, tokens in enumerate(res_tokens):
    tagged_docs.append(TaggedDocument(words=tokens, tags=[f"RES_{i}"]))

# Tag jobs as JOB_0, JOB_1, ...
for j, tokens in enumerate(job_tokens):
    tagged_docs.append(TaggedDocument(words=tokens, tags=[f"JOB_{j}"]))

print("Total tagged docs (resumes + jobs):", len(tagged_docs))

# --------------------------------------------
# STEP 3 — Train Doc2Vec model
# --------------------------------------------

doc2vec_vector_size = 100
doc2vec_window = 5
doc2vec_min_count = 2
doc2vec_epochs = 15

print("\n=== MODEL 6: Doc2Vec + Cosine ===")
print(f"Training Doc2Vec: vector_size={doc2vec_vector_size}, window={doc2vec_window}, "
      f"min_count={doc2vec_min_count}, epochs={doc2vec_epochs}")

doc2vec_model = Doc2Vec(
    vector_size=doc2vec_vector_size,
    window=doc2vec_window,
    min_count=doc2vec_min_count,
    workers=4,
    dm=1,            # PV-DM (Distributed Memory)
    epochs=doc2vec_epochs
)

doc2vec_model.build_vocab(tagged_docs)
print("Doc2Vec vocab size:", len(doc2vec_model.wv))

doc2vec_model.train(
    tagged_docs,
    total_examples=len(tagged_docs),
    epochs=doc2vec_epochs
)

print("Doc2Vec training complete.")

# --------------------------------------------
# STEP 4 — Extract resume & job embeddings from model
# --------------------------------------------

n_resumes = len(resumes)
n_jobs = len(job_df)

res_emb_doc2 = np.vstack([
    doc2vec_model.dv[f"RES_{i}"] for i in range(n_resumes)
])
job_emb_doc2 = np.vstack([
    doc2vec_model.dv[f"JOB_{j}"] for j in range(n_jobs)
])

print("res_emb_doc2 shape:", res_emb_doc2.shape)
print("job_emb_doc2 shape:", job_emb_doc2.shape)

# --------------------------------------------
# STEP 5 — Compute cosine similarities block-wise & keep top-k
# --------------------------------------------

doc2_rows = []

print("\nScoring resumes vs jobs with Doc2Vec cosine...")

for j_start in range(0, job_emb_doc2.shape[0], BLOCK_SIZE):
    j_end = min(j_start + BLOCK_SIZE, job_emb_doc2.shape[0])
    job_block = job_emb_doc2[j_start:j_end]   # (block_jobs, dim)

    print(f"Processed jobs {j_start}..{j_end-1}")

    # (n_res, dim) vs (block_jobs, dim) -> (n_res, block_jobs)
    sim_block = cosine_similarity(res_emb_doc2, job_block)

    n_res, n_block_jobs = sim_block.shape

    for res_idx in range(n_res):
        row_sim = sim_block[res_idx]
        if np.all(row_sim == 0):
            continue

        if n_block_jobs <= TOP_K:
            top_local_idx = np.argsort(-row_sim)
        else:
            top_local_idx = np.argpartition(-row_sim, TOP_K - 1)[:TOP_K]
            top_local_idx = top_local_idx[np.argsort(-row_sim[top_local_idx])]

        for j_local in top_local_idx:
            doc2_rows.append((
                res_idx,
                j_start + j_local,
                float(row_sim[j_local])   # score_doc2vec
            ))

doc2_df = pd.DataFrame(
    doc2_rows,
    columns=["resume_row", "job_row", "score_doc2vec"]
)

print("Raw doc2_df shape (before global top-k):", doc2_df.shape)

# Global top-k per resume
doc2_df["tmp_rank"] = doc2_df.groupby("resume_row")["score_doc2vec"].rank(
    method="first",
    ascending=False
)
doc2_df = doc2_df[doc2_df["tmp_rank"] <= TOP_K].drop(columns=["tmp_rank"])
print("doc2_df shape (after global top-k):", doc2_df.shape)

# --------------------------------------------
# STEP 6 — Attach IDs, titles, URLs, rank
# --------------------------------------------

# Row index maps 
res_map = resumes[["resume_id"]].assign(row_idx=np.arange(len(resumes)))
job_map = job_df[["job_id"]].assign(row_idx=np.arange(len(job_df)))
jobs_meta = job_df[["job_id", "job_title", "job_posting_url"]]

out_doc2 = doc2_df.merge(
    res_map,
    left_on="resume_row",
    right_on="row_idx",
    how="left"
).merge(
    job_map,
    left_on="job_row",
    right_on="row_idx",
    how="left",
    suffixes=("_res", "_job")
)

out_doc2 = out_doc2.rename(
    columns={"resume_id": "resume_id", "job_id": "job_id"}
)[["resume_id", "job_id", "score_doc2vec"]]

out_doc2 = out_doc2.merge(jobs_meta, on="job_id", how="left")

out_doc2["rank"] = out_doc2.groupby("resume_id")["score_doc2vec"].rank(
    method="first",
    ascending=False
).astype(int)

out_doc2 = out_doc2[
    ["resume_id", "rank", "job_id", "job_title", "job_posting_url", "score_doc2vec"]
].sort_values(["resume_id", "rank"])

print("Final Doc2Vec recommendations:", out_doc2.shape)
display(out_doc2.head(10))

# --------------------------------------------
# STEP 7 — Save + helper display
# --------------------------------------------

save_path_doc2 = os.path.join(PROC_DIR, "matches_doc2vec_cosine.csv")
out_doc2.to_csv(save_path_doc2, index=False)
print("\nDoc2Vec cosine data saved:", save_path_doc2)

print("\nDoc2Vec cosine score summary:")
print(out_doc2["score_doc2vec"].describe())

# Build resume_label dict
if "resume_label" not in globals():
    resume_label = dict(zip(resumes["resume_id"], resumes["category"]))

def show_top_matches_doc2vec(resume_id, top_k=10):
    """Top-k jobs by Doc2Vec cosine similarity for a given resume_id."""
    label = resume_label.get(resume_id, "(unknown)")
    print(f"\n[Doc2Vec] Top matches for {resume_id} — '{label}':")
    df = (
        out_doc2[out_doc2["resume_id"] == resume_id]
        .sort_values("rank")
        .head(top_k)[["rank", "job_title", "job_posting_url", "score_doc2vec"]]
        .rename(columns={"score_doc2vec": "doc2vec_score"})
    )
    df["doc2vec_score"] = df["doc2vec_score"].round(3)
    display(df)

# Example:
# show_top_matches_doc2vec("IT_000362", 10)
# show_top_matches_doc2vec("PDF_000036", 10)


Resumes shape: (2158, 4)
Jobs shape: (15851, 16)

Building TaggedDocument corpus for Doc2Vec...
Total tagged docs (resumes + jobs): 18009

=== MODEL 6: Doc2Vec + Cosine ===
Training Doc2Vec: vector_size=100, window=5, min_count=2, epochs=15
Doc2Vec vocab size: 53786
Doc2Vec training complete.
res_emb_doc2 shape: (2158, 100)
job_emb_doc2 shape: (15851, 100)

Scoring resumes vs jobs with Doc2Vec cosine...
Processed jobs 0..1999
Processed jobs 2000..3999
Processed jobs 4000..5999
Processed jobs 6000..7999
Processed jobs 8000..9999
Processed jobs 10000..11999
Processed jobs 12000..13999
Processed jobs 14000..15850
Raw doc2_df shape (before global top-k): (1726400, 3)
doc2_df shape (after global top-k): (215800, 3)
Final Doc2Vec recommendations: (215800, 6)


Unnamed: 0,resume_id,rank,job_id,job_title,job_posting_url,score_doc2vec
113852,IT_000001,1,3697387964,Data Scientist,https://www.linkedin.com/jobs/view/3697387964/...,0.654657
91987,IT_000001,2,3694110451,"Product Manager II, Developer Experience",https://www.linkedin.com/jobs/view/3694110451/...,0.632915
113853,IT_000001,3,3697390531,Data Scientist,https://www.linkedin.com/jobs/view/3697390531/...,0.626228
91988,IT_000001,4,3694109438,"Product Manager II, Developer Experience",https://www.linkedin.com/jobs/view/3694109438/...,0.620586
91989,IT_000001,5,3694108504,"Product Manager II, Developer Experience",https://www.linkedin.com/jobs/view/3694108504/...,0.616725
91990,IT_000001,6,3694105798,"Product Manager II, Developer Experience",https://www.linkedin.com/jobs/view/3694105798/...,0.603794
91991,IT_000001,7,3694109439,"Product Manager II, Developer Experience",https://www.linkedin.com/jobs/view/3694109439/...,0.603669
91992,IT_000001,8,3694107528,"Product Manager II, Developer Experience",https://www.linkedin.com/jobs/view/3694107528/...,0.603622
91993,IT_000001,9,3694107526,"Product Manager II, Developer Experience",https://www.linkedin.com/jobs/view/3694107526/...,0.596445
113854,IT_000001,10,3697386905,Data Architect (Visualization),https://www.linkedin.com/jobs/view/3697386905/...,0.595859



Doc2Vec cosine data saved: D:\Projects\ResumeJobRecommender\data\processed\matches_doc2vec_cosine.csv

Doc2Vec cosine score summary:
count    215800.000000
mean          0.588123
std           0.120274
min           0.206484
25%           0.503052
50%           0.556961
75%           0.646655
max           0.963917
Name: score_doc2vec, dtype: float64


In [35]:
show_top_matches_doc2vec("IT_000362", 10)      # Java Developer
show_top_matches_doc2vec("PDF_000036", 10)     # Accountant



[Doc2Vec] Top matches for IT_000362 — 'Java Developer':


Unnamed: 0,rank,job_title,job_posting_url,doc2vec_score
167562,1,Sales Director [Owner/Operator],https://www.linkedin.com/jobs/view/3701310277/...,0.923
167563,2,Sales Director [Owner/Operator],https://www.linkedin.com/jobs/view/3701308805/...,0.918
167564,3,Sales Director [Owner/Operator],https://www.linkedin.com/jobs/view/3701308719/...,0.915
167565,4,Sales Director [Owner/Operator],https://www.linkedin.com/jobs/view/3701307269/...,0.915
167566,5,Sales Director [Owner/Operator],https://www.linkedin.com/jobs/view/3701307305/...,0.914
167567,6,Sales Director [Owner/Operator],https://www.linkedin.com/jobs/view/3701307319/...,0.914
167568,7,Sales Director [Owner/Operator],https://www.linkedin.com/jobs/view/3701309012/...,0.914
167569,8,Sales Director [Owner/Operator],https://www.linkedin.com/jobs/view/3701310292/...,0.912
167570,9,Sales Director [Owner/Operator],https://www.linkedin.com/jobs/view/3701308393/...,0.911
167571,10,Sales Director [Owner/Operator],https://www.linkedin.com/jobs/view/3701307149/...,0.911



[Doc2Vec] Top matches for PDF_000036 — 'ACCOUNTANT':


Unnamed: 0,rank,job_title,job_posting_url,doc2vec_score
186032,1,Accounts Payable Specialist,https://www.linkedin.com/jobs/view/3701317870/...,0.62
77150,2,Payroll Manager,https://www.linkedin.com/jobs/view/3694110354/...,0.617
437,3,Payroll and Benefits Specialist,https://www.linkedin.com/jobs/view/3693044530/...,0.598
47585,4,Payroll Specialist 2,https://www.linkedin.com/jobs/view/3693071386/...,0.598
186033,5,Senior Revenue Accountant,https://www.linkedin.com/jobs/view/3701313654/...,0.596
77151,6,Controller,https://www.linkedin.com/jobs/view/3694103934/...,0.584
119125,7,Corporate Accountant,https://www.linkedin.com/jobs/view/3699087357/...,0.583
139291,8,Accounting Manager,https://www.linkedin.com/jobs/view/3701197445/...,0.581
139292,9,Payroll Specialist - (ADP Vantage),https://www.linkedin.com/jobs/view/3701302395/...,0.578
119126,10,Accounting Representative,https://www.linkedin.com/jobs/view/3699061031/...,0.575


### Model 7 : SBERT

In [36]:
!pip install torch sentence-transformers




In [39]:
import os
os.environ["TRANSFORMERS_NO_TF"] = "1"            # disable TensorFlow
os.environ["TOKENIZERS_PARALLELISM"] = "false"    # avoid tokenizer warnings

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import os


In [40]:
# ============================================
# MODEL 7 — Sentence-BERT (MiniLM) + Cosine Similarity
# ============================================

print("\n=== MODEL 6: Sentence-BERT + Cosine ===")

# ------------------------------------------------
# STEP 1 — Prepare input texts
# ------------------------------------------------
# We’ll use:
#   - resumes["text_clean"]
#   - job_df["job_text_bm25"] = title + description (already created for BM25)

res_texts = resumes["text_clean"].astype(str).tolist()
job_texts = job_df["job_text_bm25"].astype(str).tolist()

print(f"# resumes: {len(res_texts)}, # jobs: {len(job_texts)}")
TOP_K = 100
BLOCK_SIZE = 2000

# ------------------------------------------------
# STEP 2 — Load Sentence-BERT model
# ------------------------------------------------
sbert_model_name = "sentence-transformers/all-MiniLM-L6-v2"
sbert = SentenceTransformer(sbert_model_name)

print("Loaded SBERT model:", sbert_model_name)

# ------------------------------------------------
# STEP 3 — Encode resumes & jobs into embeddings
# ------------------------------------------------
# We normalize embeddings so cosine ≈ dot product

res_emb_bert = sbert.encode(
    res_texts,
    batch_size=64,
    show_progress_bar=False,   # <- TURN OFF Progress Bar
    convert_to_numpy=True,
    normalize_embeddings=True
)
print("res_emb_bert shape:", res_emb_bert.shape)

job_emb_bert = sbert.encode(
    job_texts,
    batch_size=64,
    show_progress_bar=False,   # <- TURN OFF Progress Bar
    convert_to_numpy=True,
    normalize_embeddings=True
)
print("job_emb_bert shape:", job_emb_bert.shape)

# ------------------------------------------------
# STEP 4 — Compute cosine similarities block-wise and keep top-k
# ------------------------------------------------

bert_rows = []

print("\nScoring resumes vs jobs with SBERT cosine...")

for j_start in range(0, job_emb_bert.shape[0], BLOCK_SIZE):
    j_end = min(j_start + BLOCK_SIZE, job_emb_bert.shape[0])
    job_block = job_emb_bert[j_start:j_end]   # (block_jobs, dim)

    print(f"Processed jobs {j_start}..{j_end-1}")

    # (n_res, dim) vs (block_jobs, dim) -> (n_res, block_jobs)
    sim_block = cosine_similarity(res_emb_bert, job_block)

    n_res, n_block_jobs = sim_block.shape

    for res_idx in range(n_res):
        row_sim = sim_block[res_idx]

        if np.all(row_sim == 0):
            continue

        if n_block_jobs <= TOP_K:
            top_local_idx = np.argsort(-row_sim)   # descending
        else:
            top_local_idx = np.argpartition(-row_sim, TOP_K - 1)[:TOP_K]
            top_local_idx = top_local_idx[np.argsort(-row_sim[top_local_idx])]

        for j_local in top_local_idx:
            bert_rows.append((
                res_idx,
                j_start + j_local,
                float(row_sim[j_local])   # score_bert
            ))

# ------------------------------------------------
# STEP 5 — Build DataFrame + global top-k per resume
# ------------------------------------------------

bert_df = pd.DataFrame(
    bert_rows,
    columns=["resume_row", "job_row", "score_bert"]
)

print("Raw bert_df shape (before global top-k):", bert_df.shape)

bert_df["tmp_rank"] = bert_df.groupby("resume_row")["score_bert"].rank(
    method="first",
    ascending=False
)
bert_df = bert_df[bert_df["tmp_rank"] <= TOP_K].drop(columns=["tmp_rank"])
print("bert_df shape (after global top-k):", bert_df.shape)

# ------------------------------------------------
# STEP 6 — Attach IDs, titles, URLs, and rank
# ------------------------------------------------

out_bert = bert_df.merge(
    res_map,
    left_on="resume_row",
    right_on="row_idx",
    how="left"
).merge(
    job_map,
    left_on="job_row",
    right_on="row_idx",
    how="left",
    suffixes=("_res", "_job")
)

out_bert = out_bert.rename(
    columns={"resume_id": "resume_id", "job_id": "job_id"}
)[["resume_id", "job_id", "score_bert"]]

out_bert = out_bert.merge(jobs_meta, on="job_id", how="left")

out_bert["rank"] = out_bert.groupby("resume_id")["score_bert"].rank(
    method="first",
    ascending=False
).astype(int)

out_bert = out_bert[
    ["resume_id", "rank", "job_id", "job_title", "job_posting_url", "score_bert"]
].sort_values(["resume_id", "rank"])

print("Final SBERT recommendations:", out_bert.shape)
display(out_bert.head(10))

# ------------------------------------------------
# STEP 7 — Save + helper display
# ------------------------------------------------

save_path_bert = os.path.join(PROC_DIR, "matches_bert_cosine.csv")
out_bert.to_csv(save_path_bert, index=False)
print("SBERT cosine data saved:", save_path_bert)

print("\nSBERT cosine score summary:")
print(out_bert["score_bert"].describe())

def show_top_matches_bert(resume_id, top_k=10):
    """Top-k jobs by SBERT cosine similarity for a given resume_id."""
    label = resume_label.get(resume_id, "(unknown)")
    print(f"\n[SBERT] Top matches for {resume_id} — '{label}':")
    df = (
        out_bert[out_bert["resume_id"] == resume_id]
        .sort_values("rank")
        .head(top_k)[["rank", "job_title", "job_posting_url", "score_bert"]]
        .rename(columns={"score_bert": "bert_score"})
    )
    df["bert_score"] = df["bert_score"].round(3)
    display(df)

# Example usage after this cell:
# show_top_matches_bert("IT_000362", 10)
# show_top_matches_bert("PDF_000036", 10)



=== MODEL 6: Sentence-BERT + Cosine ===
# resumes: 2158, # jobs: 15851
Loaded SBERT model: sentence-transformers/all-MiniLM-L6-v2
res_emb_bert shape: (2158, 384)
job_emb_bert shape: (15851, 384)

Scoring resumes vs jobs with SBERT cosine...
Processed jobs 0..1999
Processed jobs 2000..3999
Processed jobs 4000..5999
Processed jobs 6000..7999
Processed jobs 8000..9999
Processed jobs 10000..11999
Processed jobs 12000..13999
Processed jobs 14000..15850
Raw bert_df shape (before global top-k): (1726400, 3)
bert_df shape (after global top-k): (215800, 3)
Final SBERT recommendations: (215800, 6)


Unnamed: 0,resume_id,rank,job_id,job_title,job_posting_url,score_bert
57382,IT_000001,1,3693052292,NLP Engineer with Java,https://www.linkedin.com/jobs/view/3693052292/...,0.607991
205827,IT_000001,2,3701330352,Director Data Architect,https://www.linkedin.com/jobs/view/3701330352/...,0.584993
154383,IT_000001,3,3699408120,Metadata Administrator,https://www.linkedin.com/jobs/view/3699408120/...,0.584568
57383,IT_000001,4,3693056059,NLP Engineer,https://www.linkedin.com/jobs/view/3693056059/...,0.573859
20701,IT_000001,5,3693045603,Data Scientist,https://www.linkedin.com/jobs/view/3693045603/...,0.570285
57384,IT_000001,6,3693056161,Data Analytics Engineer,https://www.linkedin.com/jobs/view/3693056161/...,0.564042
205828,IT_000001,7,3701315556,Data Engineer (W2 ONLY),https://www.linkedin.com/jobs/view/3701315556/...,0.562951
57385,IT_000001,8,3693067760,Python Developer,https://www.linkedin.com/jobs/view/3693067760/...,0.561258
57386,IT_000001,9,3693065757,Performance Engineer/Architect,https://www.linkedin.com/jobs/view/3693065757/...,0.555793
20702,IT_000001,10,3693047544,Data Scientists / AIML Engineer,https://www.linkedin.com/jobs/view/3693047544/...,0.55498


SBERT cosine data saved: D:\Projects\ResumeJobRecommender\data\processed\matches_bert_cosine.csv

SBERT cosine score summary:
count    215800.000000
mean          0.589153
std           0.069733
min           0.149208
25%           0.546733
50%           0.594225
75%           0.637348
max           0.841269
Name: score_bert, dtype: float64


In [41]:
show_top_matches_bert("IT_000362", 10)
show_top_matches_bert("PDF_000036", 10)



[SBERT] Top matches for IT_000362 — 'Java Developer':


Unnamed: 0,rank,job_title,job_posting_url,bert_score
29126,1,Java Consultant,https://www.linkedin.com/jobs/view/3693047709/...,0.614
29127,2,Senior Backend Developer (Java),https://www.linkedin.com/jobs/view/3693050123/...,0.602
29128,3,java developer W2,https://www.linkedin.com/jobs/view/3693046636/...,0.58
29129,4,Java Software Engineer,https://www.linkedin.com/jobs/view/3693045676/...,0.578
209736,5,Embedded Developer,https://www.linkedin.com/jobs/view/3701314304/...,0.576
91106,6,Java Software Engineer,https://www.linkedin.com/jobs/view/3693074456/...,0.559
29130,7,Sr Java Developer,https://www.linkedin.com/jobs/view/3693043740/...,0.544
185414,8,Senior Java Developer,https://www.linkedin.com/jobs/view/3701310988/...,0.537
29131,9,C++ Software Engineer,https://www.linkedin.com/jobs/view/3693044310/...,0.534
209737,10,Lead Senior Web Developer (2824),https://www.linkedin.com/jobs/view/3701322390/...,0.523



[SBERT] Top matches for PDF_000036 — 'ACCOUNTANT':


Unnamed: 0,rank,job_title,job_posting_url,bert_score
714,1,Senior Accountant,https://www.linkedin.com/jobs/view/3693041917/...,0.781
715,2,Accountant II,https://www.linkedin.com/jobs/view/3693050303/...,0.777
72555,3,Staff Accountant,https://www.linkedin.com/jobs/view/3693596617/...,0.774
123387,4,Senior Technical Accounting Specialist,https://www.linkedin.com/jobs/view/3697389157/...,0.773
123388,5,Accountant,https://www.linkedin.com/jobs/view/3699052941/...,0.771
716,6,Accountant,https://www.linkedin.com/jobs/view/3693048365/...,0.768
43059,7,Senior Accountant,https://www.linkedin.com/jobs/view/3693051168/...,0.764
194051,8,Staff Accountant,https://www.linkedin.com/jobs/view/3701325800/...,0.759
98810,9,Corporate Accountant,https://www.linkedin.com/jobs/view/3694153334/...,0.757
717,10,Senior Staff Accountant,https://www.linkedin.com/jobs/view/3693046986/...,0.755


### Model 8: Fuzzy Logic

In [42]:
# ============================================
# MODEL 8 — Fuzzy Title Matching (category vs job_title_clean)
# ============================================

import os
import numpy as np
import pandas as pd
from difflib import SequenceMatcher

# CONFIG
TOP_K = 100   # top jobs per resume
N_RES_FUZZY = None   
N_JOBS_FUZZY = None 


print("Resumes shape:", resumes.shape)
print("Jobs shape:", job_df.shape)

# Basic sanity
assert "category" in resumes.columns, "resumes must have a 'category' column"
assert "job_title_clean" in job_df.columns, "job_df must have 'job_title_clean'"

# --------------------------------------------
# STEP 1 — sampling
# --------------------------------------------
if N_RES_FUZZY is not None and N_RES_FUZZY < len(resumes):
    resumes_fuzzy = resumes.sample(n=N_RES_FUZZY, random_state=42).reset_index(drop=True)
else:
    resumes_fuzzy = resumes.reset_index(drop=True)

if N_JOBS_FUZZY is not None and N_JOBS_FUZZY < len(job_df):
    job_df_fuzzy = job_df.sample(n=N_JOBS_FUZZY, random_state=42).reset_index(drop=True)
else:
    job_df_fuzzy = job_df.reset_index(drop=True)

print("\nFuzzy matching on:")
print("  resumes_fuzzy shape:", resumes_fuzzy.shape)
print("  job_df_fuzzy shape:", job_df_fuzzy.shape)

# --------------------------------------------
# STEP 2 — Helper: title similarity
# --------------------------------------------
def title_similarity(a: str, b: str) -> float:
    """
    Fuzzy similarity between two titles (0..1).
    Uses SequenceMatcher; you can later swap for rapidfuzz if desired.
    """
    a = (a or "").lower()
    b = (b or "").lower()
    if not a or not b:
        return 0.0
    return SequenceMatcher(None, a, b).ratio()

# --------------------------------------------
# STEP 3 — Compute fuzzy scores and keep top-K per resume
# --------------------------------------------
fuzzy_rows = []

job_titles = job_df_fuzzy["job_title_clean"].astype(str).tolist()
job_ids    = job_df_fuzzy["job_id"].tolist()

print("\n=== MODEL 7: Fuzzy Title Matching (category vs job_title_clean) ===")

for r_idx, r in resumes_fuzzy.iterrows():
    resume_id = r["resume_id"]
    resume_cat = str(r["category"]) if pd.notna(r["category"]) else ""
    if not resume_cat:
        # Skip if no category
        continue

    if r_idx % 100 == 0:
        print(f"Processing resume {r_idx}/{len(resumes_fuzzy)} — {resume_id} — '{resume_cat}'")

    # Compute similarity against all job titles
    scores = []
    for j_idx, (job_id, job_title) in enumerate(zip(job_ids, job_titles)):
        sim = title_similarity(resume_cat, job_title)
        scores.append((job_id, sim))

    # Sort and keep top-K
    scores.sort(key=lambda x: x[1], reverse=True)
    top_scores = scores[:TOP_K]

    for rank, (job_id, sim) in enumerate(top_scores, start=1):
        fuzzy_rows.append((resume_id, job_id, rank, float(sim)))

# --------------------------------------------
# STEP 4 — Build DataFrame, attach job metadata, save
# --------------------------------------------
fuzzy_df = pd.DataFrame(
    fuzzy_rows,
    columns=["resume_id", "job_id", "rank", "score_fuzzy"]
)

print("\nRaw fuzzy_df shape:", fuzzy_df.shape)

# Attach job metadata (title + URL) from full job_df 
jobs_meta = job_df[["job_id", "job_title", "job_posting_url"]]

out_fuzzy = fuzzy_df.merge(jobs_meta, on="job_id", how="left")

# Re-order columns
out_fuzzy = out_fuzzy[
    ["resume_id", "rank", "job_id", "job_title", "job_posting_url", "score_fuzzy"]
].sort_values(["resume_id", "rank"])

print("Final fuzzy title recommendations:", out_fuzzy.shape)
display(out_fuzzy.head(10))

# --------------------------------------------
# STEP 5 — Save + helper display
# --------------------------------------------
save_path_fuzzy = os.path.join(PROC_DIR, "matches_fuzzy_title.csv")
out_fuzzy.to_csv(save_path_fuzzy, index=False)
print("\nFuzzy title data saved:", save_path_fuzzy)

print("\nFuzzy title score summary:")
print(out_fuzzy["score_fuzzy"].describe())

# Build resume label dict if not already created
if "resume_label" not in globals():
    resume_label = dict(zip(resumes["resume_id"], resumes["category"]))

def show_top_matches_fuzzy(resume_id, top_k=10):
    """Top-k jobs by fuzzy title similarity for a given resume_id."""
    label = resume_label.get(resume_id, "(unknown)")
    print(f"\n[Fuzzy] Top matches for {resume_id} — '{label}':")
    df = (
        out_fuzzy[out_fuzzy["resume_id"] == resume_id]
        .sort_values("rank")
        .head(top_k)[["rank", "job_title", "job_posting_url", "score_fuzzy"]]
        .rename(columns={"score_fuzzy": "fuzzy_score"})
    )
    df["fuzzy_score"] = df["fuzzy_score"].round(3)
    display(df)

# Example:
# show_top_matches_fuzzy("IT_000362", 10)
# show_top_matches_fuzzy("PDF_000036", 10)


Resumes shape: (2158, 4)
Jobs shape: (15851, 16)

Fuzzy matching on:
  resumes_fuzzy shape: (2158, 4)
  job_df_fuzzy shape: (15851, 16)

=== MODEL 7: Fuzzy Title Matching (category vs job_title_clean) ===
Processing resume 0/2158 — PDF_000001 — 'ACCOUNTANT'
Processing resume 100/2158 — PDF_000101 — 'BANKING'
Processing resume 200/2158 — PDF_000201 — 'BUSINESS-DEVELOPMENT'
Processing resume 300/2158 — PDF_000301 — 'CONSULTANT'
Processing resume 400/2158 — PDF_000401 — 'CONSULTANT'
Processing resume 500/2158 — PDF_000501 — 'DIGITAL-MEDIA'
Processing resume 600/2158 — PDF_000601 — 'ENGINEERING'
Processing resume 700/2158 — PDF_000701 — 'FINANCE'
Processing resume 800/2158 — PDF_000801 — 'HR'
Processing resume 900/2158 — PDF_000901 — 'INFORMATION-TECHNOLOGY'
Processing resume 1000/2158 — PDF_001001 — 'PUBLIC-RELATIONS'
Processing resume 1100/2158 — PDF_001101 — 'SALES'
Processing resume 1200/2158 — IT_000005 — 'Data Science'
Processing resume 1300/2158 — IT_000105 — 'Arts'
Processing resum

Unnamed: 0,resume_id,rank,job_id,job_title,job_posting_url,score_fuzzy
119600,IT_000001,1,3693045603,Data Scientist,https://www.linkedin.com/jobs/view/3693045603/...,0.769231
119601,IT_000001,2,3693052150,Data Scientist,https://www.linkedin.com/jobs/view/3693052150/...,0.769231
119602,IT_000001,3,3693053166,Data Scientist,https://www.linkedin.com/jobs/view/3693053166/...,0.769231
119603,IT_000001,4,3693583847,Data Scientist,https://www.linkedin.com/jobs/view/3693583847/...,0.769231
119604,IT_000001,5,3697341474,Data Scientist,https://www.linkedin.com/jobs/view/3697341474/...,0.769231
119605,IT_000001,6,3697387964,Data Scientist,https://www.linkedin.com/jobs/view/3697387964/...,0.769231
119606,IT_000001,7,3697388794,Data Scientist,https://www.linkedin.com/jobs/view/3697388794/...,0.769231
119607,IT_000001,8,3697390531,Data Scientist,https://www.linkedin.com/jobs/view/3697390531/...,0.769231
119608,IT_000001,9,3697391430,Data Scientist,https://www.linkedin.com/jobs/view/3697391430/...,0.769231
119609,IT_000001,10,3699085103,Data Scientist,https://www.linkedin.com/jobs/view/3699085103/...,0.769231



Fuzzy title data saved: D:\Projects\ResumeJobRecommender\data\processed\matches_fuzzy_title.csv

Fuzzy title score summary:
count    215800.000000
mean          0.595068
std           0.157904
min           0.285714
25%           0.476190
50%           0.583333
75%           0.717949
max           1.000000
Name: score_fuzzy, dtype: float64


In [43]:
show_top_matches_fuzzy("IT_000362", 10)      # Java Developer
show_top_matches_fuzzy("PDF_000036", 10)     # Accountant



[Fuzzy] Top matches for IT_000362 — 'Java Developer':


Unnamed: 0,rank,job_title,job_posting_url,fuzzy_score
155700,1,Java developer,https://www.linkedin.com/jobs/view/3693050150/...,1.0
155701,2,Java Developer,https://www.linkedin.com/jobs/view/3693066747/...,1.0
155702,3,Java Developer,https://www.linkedin.com/jobs/view/3693067587/...,1.0
155703,4,Java Developer,https://www.linkedin.com/jobs/view/3697364133/...,1.0
155704,5,Java Developer,https://www.linkedin.com/jobs/view/3701307311/...,1.0
155705,6,Java Developer,https://www.linkedin.com/jobs/view/3701330202/...,1.0
155706,7,Sr Java Developer,https://www.linkedin.com/jobs/view/3693043740/...,0.903
155707,8,java developer W2,https://www.linkedin.com/jobs/view/3693046636/...,0.903
155708,9,Sr. Java Developer,https://www.linkedin.com/jobs/view/3699412829/...,0.903
155709,10,Sr. Java Developers,https://www.linkedin.com/jobs/view/3693043951/...,0.875



[Fuzzy] Top matches for PDF_000036 — 'ACCOUNTANT':


Unnamed: 0,rank,job_title,job_posting_url,fuzzy_score
3500,1,Accountant,https://www.linkedin.com/jobs/view/3674585247/...,1.0
3501,2,Accountant,https://www.linkedin.com/jobs/view/3693048365/...,1.0
3502,3,Accountant,https://www.linkedin.com/jobs/view/3693052011/...,1.0
3503,4,Accountant,https://www.linkedin.com/jobs/view/3693071158/...,1.0
3504,5,Accountant,https://www.linkedin.com/jobs/view/3693073456/...,1.0
3505,6,Accountant,https://www.linkedin.com/jobs/view/3693589113/...,1.0
3506,7,Accountant,https://www.linkedin.com/jobs/view/3694102651/...,1.0
3507,8,Accountant,https://www.linkedin.com/jobs/view/3694108167/...,1.0
3508,9,Accountant,https://www.linkedin.com/jobs/view/3697386744/...,1.0
3509,10,Accountant,https://www.linkedin.com/jobs/view/3699052941/...,1.0


### Ensemble

In [44]:
    # ============================================
    # Ensemble (SBERT + BM25 + Fuzzy)
    # ============================================
    
    import os
    import numpy as np
    import pandas as pd
    
    PROC_DIR = r"D:\Projects\ResumeJobRecommender\data\processed"
    
    # --------------------------
    # STEP 1 — Load model outputs
    # --------------------------
    bm25_path   = os.path.join(PROC_DIR, "matches_bm25.csv")
    bert_path   = os.path.join(PROC_DIR, "matches_bert_cosine.csv")
    fuzzy_path  = os.path.join(PROC_DIR, "matches_fuzzy_title.csv")
    
    bm25_df  = pd.read_csv(bm25_path)
    bert_df  = pd.read_csv(bert_path)
    fuzzy_df = pd.read_csv(fuzzy_path)
    
    print("BM25:", bm25_df.shape)
    print("SBERT:", bert_df.shape)
    print("Fuzzy:", fuzzy_df.shape)
    
    # BM25: use score_bm25 and rename to bm25_score
    bm25_df_small = bm25_df[["resume_id", "job_id", "score_bm25"]].rename(
        columns={"score_bm25": "bm25_score"}
    )
    
    # SBERT: score_bert 
    bert_df_small = bert_df[["resume_id", "job_id", "score_bert"]]
    
    # Fuzzy: score_fuzzy 
    fuzzy_df_small = fuzzy_df[["resume_id", "job_id", "score_fuzzy"]]
    
    # --------------------------
    # STEP 2 — Merge them
    # --------------------------
    ensemble = bm25_df_small.merge(
        bert_df_small, on=["resume_id", "job_id"], how="outer"
    ).merge(
        fuzzy_df_small, on=["resume_id", "job_id"], how="outer"
    )
    
    print("\nEnsemble merged table:", ensemble.shape)
    
    # --------------------------
    # STEP 3 — Normalize scores to [0, 1]
    # --------------------------
    def minmax_norm(series):
        s = series.astype(float)
        s_min = s.min()
        s_max = s.max()
        if pd.isna(s_min) or s_max == s_min:
            return pd.Series(np.zeros(len(s)), index=series.index)
        return (s - s_min) / (s_max - s_min)
    
    ensemble["bm25_norm"]  = minmax_norm(ensemble["bm25_score"])
    ensemble["bert_norm"]  = minmax_norm(ensemble["score_bert"])
    ensemble["fuzzy_norm"] = minmax_norm(ensemble["score_fuzzy"])
    
    # If any missing scores, treat as 0 contribution
    for col in ["bm25_norm", "bert_norm", "fuzzy_norm"]:
        ensemble[col] = ensemble[col].fillna(0.0)
    
    # --------------------------
    # STEP 4 — Weighted final score
    # --------------------------
    w_bert  = 0.5
    w_bm25  = 0.3
    w_fuzzy = 0.2
    
    ensemble["final_score"] = (
        w_bert  * ensemble["bert_norm"] +
        w_bm25  * ensemble["bm25_norm"] +
        w_fuzzy * ensemble["fuzzy_norm"]
    )
    
    # --------------------------
    # STEP 5 — Final rank within each resume
    # --------------------------
    ensemble["final_rank"] = (
        ensemble.groupby("resume_id")["final_score"]
        .rank(method="first", ascending=False)
    )
    
    print("\nEnsemble table with scores & ranks:", ensemble.shape)
    display(ensemble.head(10))
    
    # --------------------------
    # STEP 6 — Attach job metadata and save
    # --------------------------
    jobs = pd.read_csv(os.path.join(PROC_DIR, "job_postings_cleaned.csv"))
    jobs_meta = jobs[["job_id", "job_title", "job_posting_url"]]
    
    ensemble_final = ensemble.merge(jobs_meta, on="job_id", how="left")
    ensemble_final = ensemble_final.sort_values(["resume_id", "final_rank"])
    
    save_path = os.path.join(PROC_DIR, "matches_ensemble_weighted.csv")
    ensemble_final.to_csv(save_path, index=False)
    
    print("\nEnsemble (SBERT + BM25 + Fuzzy) saved:", save_path)
    
    print("\nFinal score summary:")
    print(ensemble_final["final_score"].describe())
    
    # --------------------------
    # STEP 7 — show top matches for a resume
    # --------------------------
    def show_top_matches_ensemble(resume_id, top_k=10):
        df = (
            ensemble_final[ensemble_final["resume_id"] == resume_id]
            .sort_values("final_rank")
            .head(top_k)[[
                "final_rank", "job_title", "job_posting_url",
                "final_score", "bert_norm", "bm25_norm", "fuzzy_norm"
            ]]
        )
        df["final_score"] = df["final_score"].round(4)
        df["bert_norm"]   = df["bert_norm"].round(3)
        df["bm25_norm"]   = df["bm25_norm"].round(3)
        df["fuzzy_norm"]  = df["fuzzy_norm"].round(3)
        print(f"\n[Ensemble] Top matches for {resume_id}:")
        display(df)


BM25: (215700, 6)
SBERT: (215800, 6)
Fuzzy: (215800, 6)

Ensemble merged table: (581423, 5)

Ensemble table with scores & ranks: (581423, 10)


Unnamed: 0,resume_id,job_id,bm25_score,score_bert,score_fuzzy,bm25_norm,bert_norm,fuzzy_norm,final_score,final_rank
0,IT_000001,3682818140,,0.497878,,0.0,0.503814,0.0,0.251907,83.0
1,IT_000001,3690869468,,0.507638,,0.0,0.517916,0.0,0.258958,63.0
2,IT_000001,3691795980,,0.504148,0.64,0.0,0.512875,0.496,0.355637,7.0
3,IT_000001,3692302089,314.220899,,0.52381,0.058413,0.0,0.333333,0.08419,144.0
4,IT_000001,3693037942,,,0.540541,0.0,0.0,0.356757,0.071351,169.0
5,IT_000001,3693040943,,,0.516129,0.0,0.0,0.322581,0.064516,187.0
6,IT_000001,3693043839,371.681652,,,0.069617,0.0,0.0,0.020885,197.0
7,IT_000001,3693043881,,0.521092,,0.0,0.537358,0.0,0.268679,42.0
8,IT_000001,3693044267,313.640317,,,0.058299,0.0,0.0,0.01749,238.0
9,IT_000001,3693044446,,,0.518519,0.0,0.0,0.325926,0.065185,186.0



Ensemble (SBERT + BM25 + Fuzzy) saved: D:\Projects\ResumeJobRecommender\data\processed\matches_ensemble_weighted.csv

Final score summary:
count    581423.000000
mean          0.158396
std           0.143660
min           0.000000
25%           0.030414
50%           0.099200
75%           0.303676
max           0.708342
Name: final_score, dtype: float64


In [45]:
show_top_matches_ensemble("IT_000362", 10)
show_top_matches_ensemble("PDF_000036", 10)
show_top_matches_ensemble("IT_000417", 10)


[Ensemble] Top matches for IT_000362:


Unnamed: 0,final_rank,job_title,job_posting_url,final_score,bert_norm,bm25_norm,fuzzy_norm
97533,1.0,java developer W2,https://www.linkedin.com/jobs/view/3693046636/...,0.4879,0.622,0.013,0.865
97674,2.0,Java Developer,https://www.linkedin.com/jobs/view/3701307311/...,0.4716,0.538,0.009,1.0
97699,3.0,Java Developer,https://www.linkedin.com/jobs/view/3701330202/...,0.4666,0.526,0.011,1.0
97503,4.0,Sr Java Developer,https://www.linkedin.com/jobs/view/3693043740/...,0.4607,0.571,0.007,0.865
97558,5.0,Java developer,https://www.linkedin.com/jobs/view/3693050150/...,0.4501,0.495,0.008,1.0
97677,6.0,Senior Java Developer,https://www.linkedin.com/jobs/view/3701310988/...,0.4264,0.56,0.008,0.72
97651,7.0,AEM Developer,https://www.linkedin.com/jobs/view/3699062042/...,0.3978,0.497,0.004,0.741
97697,8.0,Java Developer - W2 ONLY,https://www.linkedin.com/jobs/view/3701323681/...,0.3957,0.51,0.009,0.689
97507,9.0,Sr. Java Developers,https://www.linkedin.com/jobs/view/3693043951/...,0.3934,0.455,0.004,0.825
97543,10.0,Senior Java Developer,https://www.linkedin.com/jobs/view/3693048252/...,0.3873,0.48,0.011,0.72



[Ensemble] Top matches for PDF_000036:


Unnamed: 0,final_rank,job_title,job_posting_url,final_score,bert_norm,bm25_norm,fuzzy_norm
265366,1.0,Accountant,https://www.linkedin.com/jobs/view/3699052941/...,0.6717,0.898,0.075,1.0
265236,2.0,Accountant,https://www.linkedin.com/jobs/view/3693048365/...,0.647,0.894,0.0,1.0
265246,3.0,Accountant II,https://www.linkedin.com/jobs/view/3693050303/...,0.6464,0.908,0.097,0.817
265425,4.0,Accountant,https://www.linkedin.com/jobs/view/3701332708/...,0.6423,0.838,0.078,1.0
265334,5.0,Accountant,https://www.linkedin.com/jobs/view/3694108167/...,0.6264,0.853,0.0,1.0
265420,6.0,Tax Accountant,https://www.linkedin.com/jobs/view/3701327001/...,0.6044,0.855,0.078,0.767
265418,7.0,Staff Accountant,https://www.linkedin.com/jobs/view/3701325800/...,0.5995,0.881,0.078,0.677
265250,8.0,Senior Accountant,https://www.linkedin.com/jobs/view/3693051168/...,0.5969,0.889,0.084,0.637
265309,9.0,Staff Accountant,https://www.linkedin.com/jobs/view/3693596617/...,0.5871,0.903,0.0,0.677
265213,10.0,Senior Accountant,https://www.linkedin.com/jobs/view/3693041917/...,0.5841,0.913,0.0,0.637



[Ensemble] Top matches for IT_000417:


Unnamed: 0,final_rank,job_title,job_posting_url,final_score,bert_norm,bm25_norm,fuzzy_norm
110656,1.0,Business Analyst,https://www.linkedin.com/jobs/view/3701199444/...,0.5568,0.714,0.0,1.0
110715,2.0,Finance Analyst,https://www.linkedin.com/jobs/view/3701367033/...,0.5088,0.78,0.0,0.594
110508,3.0,Business Operations Analyst,https://www.linkedin.com/jobs/view/3693055170/...,0.5029,0.749,0.0,0.642
110561,4.0,Business System Analyst,https://www.linkedin.com/jobs/view/3694102333/...,0.5029,0.706,0.0,0.749
110673,5.0,Business System Analyst,https://www.linkedin.com/jobs/view/3701305414/...,0.4972,0.695,0.0,0.749
110598,6.0,Sr. Technical Business Analyst,https://www.linkedin.com/jobs/view/3697308696/...,0.4947,0.751,0.0,0.596
110442,7.0,Enterprise Business Analyst,https://www.linkedin.com/jobs/view/3690835135/...,0.4737,0.691,0.0,0.642
110689,8.0,Budget Analyst,https://www.linkedin.com/jobs/view/3701312625/...,0.4732,0.696,0.0,0.627
110458,9.0,Jr. Financial Analyst,https://www.linkedin.com/jobs/view/3693046191/...,0.4039,0.808,0.0,0.0
110505,10.0,Staff Accountant,https://www.linkedin.com/jobs/view/3693053268/...,0.3833,0.767,0.0,0.0


### Model 9 – Resume-Resume Nearest-Neighbor Recommender

In [3]:
# ============================================================
# MODEL 9 — Resume–Resume Nearest-Neighbor Recommender
# ============================================================

import os
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.neighbors import NearestNeighbors

# -----------------------------
# Paths & basic config
# -----------------------------
BASE_DIR = r"D:\Projects\ResumeJobRecommender"
PROC_DIR = BASE_DIR + r"\data\processed"
    
NEIGH_K = 10          # number of similar resumes to use
TOP_BASE_JOBS = 50    # No. of top jobs per neighbor to pull from ensemble
TOP_FINAL_K = 100     # no. of jobs per resume to keep in this model

print("PROC_DIR:", PROC_DIR)

# -----------------------------
# 1. Load resumes + jobs metadata
# -----------------------------
# Adjust filenames if yours differ
resumes = pd.read_csv(PROC_DIR + r"\resume_cleaned.csv")
jobs = pd.read_csv(PROC_DIR + r"\job_postings_cleaned.csv")

print("Resumes shape:", resumes.shape)
print("Jobs shape:", jobs.shape)
print(resumes.columns)
print(jobs.columns)

# Retaining job_id, title, URL for attaching metadata later
jobs_meta = jobs[["job_id", "job_title", "job_posting_url"]].drop_duplicates("job_id")

# -----------------------------
# 2. Load resume TF-IDF matrix + row index map
# -----------------------------
X_res = sparse.load_npz(PROC_DIR + r"\resume_tfidf_matrix.npz")
print("X_res shape:", X_res.shape)

res_map = pd.read_csv(PROC_DIR + r"\resume_tfidf_rowindex.csv")
print("res_map head:")
print(res_map.head())

# sort by row_idx so row order matches matrix rows
res_map = res_map.sort_values("row_idx").reset_index(drop=True)

n_resumes = X_res.shape[0]

# -----------------------------
# 3. Fit NearestNeighbors on resume TF-IDF
# -----------------------------
print("\nFitting NearestNeighbors on resume TF-IDF...")
nn = NearestNeighbors(
    n_neighbors=NEIGH_K + 1,   # +1 to include self, which we'll drop
    metric="cosine",
    n_jobs=-1
)
nn.fit(X_res)

# For all resumes at once
distances, indices = nn.kneighbors(X_res, return_distance=True)
# distances, indices shape: (n_resumes, NEIGH_K+1)

# -----------------------------
# 4. Build neighbor dataframe (resume_id -> neighbor_resume_id, sim)
# -----------------------------
rows = []
for i in range(n_resumes):
    # Skip the first neighbor, which is the resume itself (distance ~ 0)
    for j in range(1, NEIGH_K + 1):
        src_row = i
        nbr_row = indices[i, j]
        dist = distances[i, j]
        sim = 1.0 - dist   # cosine distance -> cosine similarity
        rows.append((src_row, nbr_row, sim))

neighbors_df = pd.DataFrame(rows, columns=["resume_row", "neighbor_row", "sim"])
print("\nNeighbors df shape:", neighbors_df.shape)
print(neighbors_df.head())

# Attach resume_id and neighbor_resume_id via res_map
neighbors_df = neighbors_df.merge(
    res_map[["row_idx", "resume_id"]].rename(
        columns={"row_idx": "resume_row", "resume_id": "resume_id"}
    ),
    on="resume_row",
    how="left"
)

neighbors_df = neighbors_df.merge(
    res_map[["row_idx", "resume_id"]].rename(
        columns={"row_idx": "neighbor_row", "resume_id": "neighbor_resume_id"}
    ),
    on="neighbor_row",
    how="left"
)

print("\nNeighbors with IDs:")
print(neighbors_df.head())

# -----------------------------
# 5. Load base model (ensemble) recommendations
#    We will "propagate" these via neighbors
# -----------------------------
ensemble_path = PROC_DIR + r"\matches_ensemble_weighted.csv"
ens = pd.read_csv(ensemble_path)
print("\nEnsemble shape:", ens.shape)
print(ens.columns)

# Keep only top TOP_BASE_JOBS per resume from ensemble, to reduce noise
if "final_score" in ens.columns:
    score_col = "final_score"
elif "score" in ens.columns:
    score_col = "score"
else:
    raise ValueError("Could not find a score column in ensemble file.")

ens_base = (
    ens.sort_values(["resume_id", score_col], ascending=[True, False])
       .groupby("resume_id")
       .head(TOP_BASE_JOBS)
       [["resume_id", "job_id", score_col]]
       .rename(columns={score_col: "base_score"})
)

print("\nEnsemble base (top jobs per resume):", ens_base.shape)
print(ens_base.head())

# -----------------------------
# 6. Join neighbors with base jobs
#    For each target resume, we look at its neighbor resumes and
#    take the jobs that were good for those neighbors.
# -----------------------------
# neighbors_df: resume_id, neighbor_resume_id, sim
# ens_base: resume_id, job_id, base_score  (resume_id here is neighbor)

neighbor_jobs = neighbors_df.merge(
    ens_base,
    left_on="neighbor_resume_id",
    right_on="resume_id",
    how="left",
    suffixes=("", "_nbr")
)

# Remove the extra 'resume_id_nbr' if it appears:
if "resume_id_nbr" in neighbor_jobs.columns:
    neighbor_jobs = neighbor_jobs.drop(columns=["resume_id_nbr"])

# Keep only rows where we actually had base jobs
neighbor_jobs = neighbor_jobs.dropna(subset=["base_score"])
print("\nNeighbor-jobs shape:", neighbor_jobs.shape)
print(neighbor_jobs.head())

# -----------------------------
# 7. Compute weighted scores:
#    final_score = sum_over_neighbors( sim * base_score )
# -----------------------------
neighbor_jobs["weighted_score"] = neighbor_jobs["sim"] * neighbor_jobs["base_score"]

agg = (
    neighbor_jobs
    .groupby(["resume_id", "job_id"], as_index=False)
    .agg(
        score_neighbor=("weighted_score", "sum"),
        max_sim=("sim", "max")
    )
)

print("\nAggregated neighbor-based scores shape:", agg.shape)
print(agg.head())

# -----------------------------
# 8. Attach job metadata and rank per resume
# -----------------------------
agg = agg.merge(jobs_meta, on="job_id", how="left")

# Rank within each resume by neighbor score
agg["rank"] = agg.groupby("resume_id")["score_neighbor"] \
                 .rank(method="first", ascending=False) \
                 .astype(int)

agg_sorted = agg.sort_values(["resume_id", "rank"])

# restrict to TOP_FINAL_K per resume
agg_sorted = (
    agg_sorted[agg_sorted["rank"] <= TOP_FINAL_K]
    .reset_index(drop=True)
)

model9 = agg_sorted[[
    "resume_id",
    "rank",
    "job_id",
    "job_title",
    "job_posting_url",
    "score_neighbor",
    "max_sim"
]]

print("\nFinal Model 9 (neighbor-based) recommendations:", model9.shape)
print(model9.head(10))

# -----------------------------
# 9. Save results
# -----------------------------
save_path_model9 = PROC_DIR + r"\matches_neighbor_model9.csv"
model9.to_csv(save_path_model9, index=False)
print("\nModel 9 neighbor recommendations saved to:", save_path_model9)

# -----------------------------
# 10. Helper function to inspect top matches
# -----------------------------
# Build a label dict from resumes (category)
label_dict = dict(zip(resumes["resume_id"], resumes["category"]))

def show_top_matches_neighbor(resume_id, top_k=10):
    """Pretty-print top-k jobs from neighbor-based model."""
    label = label_dict.get(resume_id, "(unknown)")
    print(f"\n[Model 9 — Neighbor] Top matches for {resume_id} — '{label}':")
    df = (
        model9[model9["resume_id"] == resume_id]
        .sort_values("rank")
        .head(top_k)
        [["rank", "job_title", "job_posting_url", "score_neighbor", "max_sim"]]
    )
    df["score_neighbor"] = df["score_neighbor"].round(4)
    df["max_sim"] = df["max_sim"].round(4)
    display(df)

# Example checks 
show_top_matches_neighbor("IT_000362", 10)     # Java Developer
show_top_matches_neighbor("PDF_000036", 10)    # Accountant


PROC_DIR: D:\Projects\ResumeJobRecommender\data\processed
Resumes shape: (2158, 4)
Jobs shape: (15851, 15)
Index(['resume_id', 'category', 'text_raw', 'text_clean'], dtype='object')
Index(['job_id', 'job_title', 'job_description', 'location',
       'experience_level', 'work_type', 'min_salary', 'max_salary',
       'pay_period', 'currency', 'remote_allowed', 'sponsored',
       'job_posting_url', 'job_title_clean', 'job_description_clean'],
      dtype='object')
X_res shape: (2158, 50000)
res_map head:
    resume_id  row_idx
0  PDF_000001        0
1  PDF_000002        1
2  PDF_000003        2
3  PDF_000004        3
4  PDF_000005        4

Fitting NearestNeighbors on resume TF-IDF...

Neighbors df shape: (21580, 3)
   resume_row  neighbor_row       sim
0           0            49  0.267571
1           0           660  0.241334
2           0             7  0.235363
3           0           734  0.215179
4           0             2  0.205692

Neighbors with IDs:
   resume_row  neighbor_ro

Unnamed: 0,rank,job_title,job_posting_url,score_neighbor,max_sim
33092,1,java developer W2,https://www.linkedin.com/jobs/view/3693046636/...,2.9651,1.0
33093,2,Java Developer,https://www.linkedin.com/jobs/view/3701307311/...,2.9,1.0
33094,3,Sr Java Developer,https://www.linkedin.com/jobs/view/3693043740/...,2.8101,1.0
33095,4,Java Developer,https://www.linkedin.com/jobs/view/3701330202/...,2.805,1.0
33096,5,Java developer,https://www.linkedin.com/jobs/view/3693050150/...,2.7141,1.0
33097,6,Senior Java Developer,https://www.linkedin.com/jobs/view/3701310988/...,2.6273,1.0
33098,7,Sr. Java Developers,https://www.linkedin.com/jobs/view/3693043951/...,2.4433,1.0
33099,8,Java Developer - W2 ONLY,https://www.linkedin.com/jobs/view/3701323681/...,2.3923,1.0
33100,9,Senior Java Developer,https://www.linkedin.com/jobs/view/3693048252/...,2.3862,1.0
33101,10,Java Full Stack Developer,https://www.linkedin.com/jobs/view/3693584825/...,2.329,1.0



[Model 9 — Neighbor] Top matches for PDF_000036 — 'ACCOUNTANT':


Unnamed: 0,rank,job_title,job_posting_url,score_neighbor,max_sim
89534,1,Staff Accountant,https://www.linkedin.com/jobs/view/3696373506/...,1.5013,0.2918
89535,2,Staff Accountant,https://www.linkedin.com/jobs/view/3693596617/...,1.4466,0.2918
89536,3,Staff Accountant,https://www.linkedin.com/jobs/view/3693069265/...,1.4292,0.2918
89537,4,Accountant II,https://www.linkedin.com/jobs/view/3693050303/...,1.3927,0.2918
89538,5,Senior Accountant,https://www.linkedin.com/jobs/view/3693051168/...,1.3072,0.2814
89539,6,Senior Accountant,https://www.linkedin.com/jobs/view/3696374362/...,1.287,0.2918
89540,7,Staff Accountant,https://www.linkedin.com/jobs/view/3700365455/...,1.2085,0.2918
89541,8,Staff Accountant,https://www.linkedin.com/jobs/view/3701314485/...,1.1937,0.2918
89542,9,Staff Accountant,https://www.linkedin.com/jobs/view/3693048156/...,1.1834,0.2918
89543,10,Staff Accountant,https://www.linkedin.com/jobs/view/3701325800/...,1.1615,0.2814
