In [None]:
import pandas as pd

# Read the cleaned resume-screening table; the path is relative to the
# current working directory.
cleaned_df = pd.read_csv(
    filepath_or_buffer="Dataset/Cleaned_Dataset/cleaned_resume_screening.csv"
)

In [None]:
# Build one text field per row: resume and job description joined by a
# ' [SEP] ' marker. Missing values are replaced with empty strings first,
# because concatenating a string with NaN yields NaN, and a NaN row cannot be
# tokenized later in the notebook.
cleaned_df['combined_text'] = (
    cleaned_df['Resume'].fillna('').astype(str)
    + ' [SEP] '
    + cleaned_df['Job_Description'].fillna('').astype(str)
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# `y='Decision'` draws horizontal bars: the classes sit on the y-axis and the
# counts run along the x-axis, so the axis labels must follow that layout.
sns.countplot(y='Decision', data=cleaned_df)
plt.title('Decision Class Distribution')
plt.xlabel('Count')
plt.ylabel('Decision')
plt.show()

In [None]:
from gensim.utils import simple_preprocess
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
import numpy as np


# Tokenize every combined resume/job-description text with gensim's
# simple_preprocess; the result is one list of tokens per row.
X_tokenized = [simple_preprocess(text) for text in cleaned_df['combined_text']]
y = cleaned_df['Decision']

# Train Word2Vec on the tokenized text.
# `seed` fixes the random initialisation. Per the gensim documentation, a fully
# deterministic run additionally needs workers=1 and a fixed PYTHONHASHSEED.
# NOTE(review): the embeddings are learned from every row, including rows that
# later land in the test split — consider training on the training rows only.
word2vec_model = Word2Vec(
    sentences=X_tokenized,
    vector_size=1000,
    window=5,
    min_count=1,
    workers=4,
    seed=42,
)

# Function to get the average word vector for a document
def get_average_word2vec(tokens_list, model, vector_size=None):
    """Return the mean of the word vectors of the tokens known to ``model``.

    Parameters
    ----------
    tokens_list : iterable of str
        Tokens of a single document.
    model : object
        Trained model exposing ``model.wv`` (supports ``token in model.wv`` and
        ``model.wv[token]``) and ``model.vector_size``.
    vector_size : int, optional
        Length of the zero vector returned when no token is in the vocabulary.
        Defaults to ``model.vector_size`` so the fallback always has the same
        length as a real averaged vector.

    Returns
    -------
    numpy.ndarray
        1-D array: the element-wise mean of the matching word vectors, or
        zeros of length ``vector_size`` when there is no match.
    """
    if vector_size is None:
        vector_size = model.vector_size

    keyed_vectors = model.wv
    vectors = [keyed_vectors[token] for token in tokens_list if token in keyed_vectors]
    if not vectors:  # No token is in the vocabulary (or the document is empty)
        return np.zeros(vector_size)
    return np.mean(vectors, axis=0)

# Convert the entire dataset into one averaged word vector per document
X_word2vec = np.array([get_average_word2vec(tokens, word2vec_model) for tokens in X_tokenized])

# Split into train and test sets after converting to word vectors.
# `stratify=y` keeps the Decision class proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X_word2vec, y, test_size=0.3, random_state=42, stratify=y
)

# Features and labels must stay aligned row-for-row in each split
assert X_train.shape[0] == y_train.shape[0]
assert X_test.shape[0] == y_test.shape[0]

# Check the shapes of the splits
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)


In [None]:
import pickle

# Bundle the four split objects under string keys and serialize them to one file.
vector_data = dict(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

with open("Dataset/vector_embeddings.pkl", mode="wb") as f:
    pickle.dump(vector_data, f)


In [None]:
import pickle


# Restore the split stored at Dataset/vector_embeddings.pkl.
# NOTE: unpickling can execute arbitrary code — only load files you created yourself.
with open("Dataset/vector_embeddings.pkl", mode="rb") as f:
    vector_embeddings = pickle.load(f)

X_train, X_test, y_train, y_test = (
    vector_embeddings[key] for key in ("X_train", "X_test", "y_train", "y_test")
)

# Print the shape of each restored object, one per line
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

In [None]:
# Display the combined resume/job-description text stored under row label 0.
cleaned_df.loc[0, 'combined_text']

In [None]:
from sklearn.linear_model import SGDClassifier

sgd_classifier = SGDClassifier(
    loss='log_loss',  # Logistic regression (log loss); this loss supports predict_proba
    penalty='l2',  # Regularization
    max_iter=1000,  # Max number of epochs
    tol=1e-3,  # Stopping criterion
    random_state=42,
)

# Use fit(), not partial_fit(): partial_fit() performs a single epoch per call
# and ignores max_iter / tol, so the settings above would have no effect.
# fit() also infers the classes from y_train, so no `classes=` argument is needed.
sgd_classifier.fit(X_train, y_train)

# Test-set predictions, reused by the evaluation cells further down
y_pred = sgd_classifier.predict(X_test)

# Mean accuracy on the test set (displayed as the cell output)
sgd_classifier.score(X_test, y_test)

In [None]:
from sklearn.model_selection import GridSearchCV


param_grid = {
    'alpha': [0.0001, 0.001, 0.01, 0.1],  # Regularization strength
    'penalty': ['l2', 'l1', 'elasticnet'],  # Regularization type
    'max_iter': [1000, 2000],  # Number of iterations
    'learning_rate': ['constant', 'optimal', 'invscaling'],  # Learning rate schedule
    'tol': [1e-4, 1e-3],  # Tolerance for stopping criteria
}

# Base estimator for the search. It gets its own name so the fitted
# `sgd_classifier` (which produced `y_pred` and is used by the evaluation
# cells below) is not replaced by an unfitted estimator.
# - loss='log_loss' matches the baseline model and supports predict_proba.
# - eta0 must be > 0 for the 'constant' and 'invscaling' schedules; with
#   eta0=0.0 those grid points fail to fit.
sgd_search_base = SGDClassifier(loss='log_loss', eta0=0.01, random_state=42)

# Initialize GridSearchCV to find the best hyperparameters (3-fold CV, all cores)
grid_search = GridSearchCV(sgd_search_base, param_grid, cv=3, n_jobs=-1, verbose=2)

# Perform Grid Search to find the best parameters
grid_search.fit(X_train, y_train)

# Get best parameters and best score; the refit model is grid_search.best_estimator_
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Count test-set outcomes (true label by row, predicted label by column).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

# NOTE(review): the tick labels are hard-coded in the order Reject, Hire —
# confirm this matches the sorted order of the values in `Decision`.
disp = ConfusionMatrixDisplay(cm, display_labels=['Reject', 'Hire'])
disp.plot(cmap='Blues')
plt.title("Confusion Matrix")
plt.show()

In [None]:
from sklearn.metrics import accuracy_score , r2_score


# scikit-learn metrics take (y_true, y_pred) in that order; pass them the same
# way to both functions.
print("Accuracy :-" , accuracy_score(y_test , y_pred))
# NOTE(review): R2 is a regression metric and requires numeric labels; for a
# classifier a classification report or F1 score is usually more informative.
print("R2 score :-" , r2_score(y_test , y_pred))



In [None]:
from sklearn.metrics import roc_auc_score, roc_curve

# Binary case: score each test row with the second probability column.
# NOTE(review): column 1 is treated as "Hire" — confirm against sgd_classifier.classes_.
y_prob = sgd_classifier.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_prob)
print(f"ROC-AUC Score: {auc:.4f}")

# Draw the ROC curve together with the diagonal reference line
fpr, tpr, _ = roc_curve(y_test, y_prob)
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f'ROC Curve (AUC = {auc:.3f})')
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend()
plt.show()

In [None]:

import pickle

# -------------------------------
# 3. PICKLE (SAVE) MODEL AND VECTORIZER
# -------------------------------
class Word2VecVectorizer:
    """Expose the trained Word2Vec model through a ``transform(texts)`` method.

    Each text is tokenized with ``simple_preprocess`` and averaged with
    ``get_average_word2vec`` — the same two steps the notebook uses to build
    the training matrix — so inference features match training features.
    """

    def __init__(self, model):
        # model: trained gensim Word2Vec instance
        self.model = model

    def transform(self, texts):
        """Return a 2-D array with one averaged word vector per input text."""
        return np.array([
            get_average_word2vec(
                simple_preprocess(text), self.model, vector_size=self.model.vector_size
            )
            for text in texts
        ])


# No `vectorizer` existed before this point; the notebook's featurizer is the
# Word2Vec model, so wrap it to give it the interface the inference code expects.
vectorizer = Word2VecVectorizer(word2vec_model)

model_data = {
    'vectorizer': vectorizer,
    'classifier': sgd_classifier
}

# NOTE: Word2VecVectorizer is pickled by reference. Loading this file in a fresh
# session requires the class and get_average_word2vec to be defined first.
with open('resume_screening_model.pkl', 'wb') as f:
    pickle.dump(model_data, f)

print("Model and vectorizer saved to 'resume_screening_model.pkl'")

# -------------------------------
# 4. UNPICKLE (LOAD) AND INFERENCE
# -------------------------------
# Simulate loading in a new session.
# NOTE: unpickling can execute arbitrary code — only load files you created yourself.
with open('resume_screening_model.pkl', 'rb') as f:
    loaded = pickle.load(f)

vectorizer = loaded['vectorizer']
classifier = loaded['classifier']

print("Model and vectorizer loaded successfully!")



In [None]:
def predict_hiring_decision(resume_text, job_description):
    """Predict the hiring decision for one resume / job-description pair.

    Uses the module-level ``vectorizer`` (``transform``) and ``classifier``
    (``predict`` / ``predict_proba``).

    Parameters
    ----------
    resume_text : str
        Raw resume text.
    job_description : str
        Raw job-description text.

    Returns
    -------
    dict
        ``'Decision'`` ("Hire" when the predicted label equals 1, otherwise
        "Reject"), ``'Probability (Hire)'`` (second probability column,
        rounded to 4 places) and ``'Confidence'`` (largest class probability,
        rounded to 4 places).
    """
    # Combine with the same ' [SEP] ' marker used when the training text was
    # built, so inference input has the same form as training input.
    combined_text = (resume_text + " [SEP] " + job_description).strip()

    # Vectorize (a batch containing a single document)
    text_vector = vectorizer.transform([combined_text])

    # Predict
    prediction = classifier.predict(text_vector)[0]
    probability = classifier.predict_proba(text_vector)[0]

    decision = "Hire" if prediction == 1 else "Reject"

    return {
        'Decision': decision,
        # NOTE(review): assumes classifier.classes_ is ordered [0, 1] so that
        # index 1 is the "Hire" class — confirm.
        'Probability (Hire)': round(probability[1], 4),
        'Confidence': round(max(probability), 4)
    }

sample_idx = 0
# X_test is a plain NumPy array and has no `.index`; y_test is a pandas Series
# that kept the row labels of `cleaned_df` through train_test_split, so use it
# to look up the matching source row.
sample_row = y_test.index[sample_idx]
sample_resume = cleaned_df['Resume'].loc[sample_row]
sample_job = cleaned_df['Job_Description'].loc[sample_row]
true_label = y_test.iloc[sample_idx]

print("\n" + "="*50)
print("INFERENCE EXAMPLE")
print("="*50)
print(f"True Label: {true_label} ({'Hire' if true_label == 1 else 'Reject'})")
print(f"Resume snippet: {sample_resume[:100]}...")
print(f"Job Desc snippet: {sample_job[:100]}...")

result = predict_hiring_decision(sample_resume, sample_job)
print("\nModel Prediction:")
for k, v in result.items():
    print(f"  {k}: {v}")

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Pre-trained Sentence-Transformers encoder used by classify_resume
model = SentenceTransformer('all-MiniLM-L6-v2')

def classify_resume(resume_text, job_desc_text):
    """Print and return the cosine similarity between a resume and a job description.

    Both texts are embedded with the module-level ``model``. The similarity is
    printed as a percentage and the raw score is returned. No threshold is
    applied here; interpreting the score is left to the caller.
    """
    resume_vec = model.encode(resume_text)
    job_vec = model.encode(job_desc_text)

    # cosine_similarity works on 2-D inputs, so each embedding becomes a 1-row matrix
    score_matrix = cosine_similarity(resume_vec.reshape(1, -1), job_vec.reshape(1, -1))
    similarity = score_matrix[0, 0]

    print(f"Cosine Similarity: {similarity * 100:.2f}%")
    return similarity

# Demo inputs: three resumes of decreasing relevance to a single job description.
resume_text_matching = """
    Experienced software engineer with a passion for building scalable systems and solving complex problems.
    Proficient in Python, Java, and cloud technologies like AWS.
    Strong background in algorithms, data structures, and software development methodologies.
"""
resume_text_slightly_matching = """
Experienced software engineer with a strong background in designing and developing software solutions. Proficient in programming languages like Python and Java. Skilled in problem-solving, algorithms, and data structures. I have worked on several projects involving web development and mobile app development. Familiar with Git for version control and experienced in working in agile teams. Looking to apply my technical skills in a new role.
"""

resume_text_drastically_matching = """
Highly skilled artist with a passion for digital painting and illustration. Experienced in Adobe Photoshop, Procreate, and other graphic design tools. Strong portfolio showcasing a variety of digital artwork, including character design and concept art. Avid gamer with a deep interest in fantasy art styles and storytelling through visual media. Looking to transition into a full-time role as a concept artist.
"""


job_desc_text = """
    We are looking for a software engineer with experience in cloud technologies and strong problem-solving skills.
    Proficiency in Python and Java is required, along with knowledge of AWS.
"""

# Score each resume against the job description, in the order listed
result_matching, result_slightly_matching, result_drastically_matching = (
    classify_resume(candidate_text, job_desc_text)
    for candidate_text in (
        resume_text_matching,
        resume_text_slightly_matching,
        resume_text_drastically_matching,
    )
)

# Report the three scores side by side
for result_name, score in (
    ("result_matching", result_matching),
    ("result_slightly_matching", result_slightly_matching),
    ("result_drastically_matching", result_drastically_matching),
):
    print(f"Similarity for {result_name} :-", score)


In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Within-cluster sum of squares (KMeans inertia) for K = 1..10
cluster_counts = range(1, 11)
wcss = []

for idx in cluster_counts:
    # Unsupervised: only the features are passed, never y_train
    km = KMeans(n_clusters=idx, random_state=42).fit(X_train)
    wcss.append(km.inertia_)

# Elbow curve: inertia against the number of clusters
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(cluster_counts, wcss, marker='o', linestyle='-', color='b')
ax.set_title("Elbow Method for Optimal K")
ax.set_xlabel("Number of clusters (K)")
ax.set_ylabel("WCSS (Inertia)")
ax.grid(True)
plt.show()


In [None]:
# NOTE: despite the variable name, this is K-Means clustering, not k-nearest
# neighbours. random_state makes the clustering repeatable between runs.
knn = KMeans(n_clusters=2, random_state=42)
knn.fit(X_train)

# Cluster ids are arbitrary (cluster 0 is not "class 0"), so comparing them
# directly with the labels is meaningless. Map each cluster to the most common
# training label among its members before scoring.
train_labels = np.asarray(y_train)
cluster_to_label = {
    cluster: pd.Series(train_labels[knn.labels_ == cluster]).mode().iloc[0]
    for cluster in range(knn.n_clusters)
}
y_pred = np.array([cluster_to_label[cluster] for cluster in knn.predict(X_test)])

print("Accuracy :-" , accuracy_score(y_test , y_pred))
# NOTE(review): R2 is a regression metric and requires numeric labels.
print("R2 score :-" , r2_score(y_test , y_pred))