In [None]:
import pandas as pd
import numpy as np
import torch

### Objective:
Build a classification model that predicts whether a given resume matches a job description, using text processing, embeddings, and machine learning techniques. This project is highly relevant for HR-AI applications, helping automate and improve candidate screening.

In [2]:
df = pd.read_csv('resume_job_matching.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   JD      12 non-null     object
 1   Resume  12 non-null     object
 2   Label   12 non-null     int64 
dtypes: int64(1), object(2)
memory usage: 416.0+ bytes


In [3]:
df.head()

Unnamed: 0,JD,Resume,Label
0,Looking for a Python developer with experience...,"Experienced Python engineer skilled in ML, clo...",1
1,"Seeking a data analyst skilled in SQL, Tableau...","Data scientist with Tableau knowledge, SQL exp...",1
2,Hiring Android developer with Java and Kotlin ...,Android programmer with Java and Swift expertise.,0
3,Need a content writer familiar with SEO and Wo...,"Copywriter experienced in articles, SEO optimi...",1
4,Looking for a React.js frontend developer.,Frontend developer skilled in Angular and Vue.js.,0


In [25]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer

# Load your CSV
df = pd.read_csv('resume_job_matching.csv')

# Basic cleaning function
def clean_text(text):
    text = str(text).lower()  # Lowercase
    text = re.sub(r'[^a-z0-9\s]', '', text)  # Remove punctuation/special chars
    return text

# Apply cleaning to both columns
df['JD_clean'] = df['JD'].apply(clean_text)
df['Resume_clean'] = df['Resume'].apply(clean_text)



In [26]:
tfidf = TfidfVectorizer()
# Combine JD and Resume for vectorization (optional, can be separate too)
combined_text = df['JD_clean'] + ' ' + df['Resume_clean']
features = tfidf.fit_transform(combined_text)


In [22]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
features = vectorizer.fit_transform(combined_text)


In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score




X = tfidf.fit_transform(combined_text)
y = df['Label']

# 3. Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# 4. Model training
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# 5. Evaluation
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))


Accuracy: 0.75
Precision: 0.75
Recall: 1.0
F1 Score: 0.8571428571428571


## Testing with new samples

In [37]:
# New test samples
new_samples = [
    {
        "JD": "Hiring a full-stack developer proficient in React and Node.js.",
        "Resume": "Full-stack engineer with 3 years of React, Node.js, and MongoDB experience."
    },
    {
        "JD": "Looking for a project manager with PMP certification and Agile experience.",
        "Resume": "Software developer experienced in Java and Python; no formal project management training."
    }
]

# Preprocess and vectorize
def prepare_text(jd, resume):
    text = f"{jd} {resume}".lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # (Optional) stopword removal if applied in training
    return text

texts = [prepare_text(s['JD'], s['Resume']) for s in new_samples]
X_new = tfidf.transform(texts)

# Predict
predictions = model.predict(X_new)
for i, pred in enumerate(predictions):
    label = "Match" if pred == 1 else "No Match"
    print(f"Sample {i+1}: {label}")


Sample 1: Match
Sample 2: Match


In [38]:
probs = model.predict_proba(X_new)[:,1]  # probability of “Match”
for i, p in enumerate(probs):
    print(f"Sample {i+1} match probability: {p:.2f}")


Sample 1 match probability: 0.61
Sample 2 match probability: 0.62


In [39]:
new_samples2=[{"JD": "Seeking a cybersecurity specialist with expertise in network security and penetration testing.",
"Resume": "Information security analyst skilled in network defense, vulnerability assessment, and ethical hacking."
}]

In [40]:
def prepare_text(jd, resume):
    text = f"{jd} {resume}".lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # (Optional) stopword removal if applied in training
    return text

texts = [prepare_text(s['JD'], s['Resume']) for s in new_samples2]
X_new = tfidf.transform(texts)

# Predict
predictions = model.predict(X_new)
for i, pred in enumerate(predictions):
    label = "Match" if pred == 1 else "No Match"
    print(f"Sample {i+1}: {label}")

Sample 1: Match
