# Resume Screening System - ML Model Training

**Project:** AI Resume Screening System  
**Dataset:** 42,106 resumes  
**Algorithm:** K-Nearest Neighbors (KNN)  
**Goal:** Classify resumes into job categories  

---

## Overview

This notebook demonstrates a complete machine learning pipeline:
1. Load dataset (42K resumes)
2. Preprocess text data
3. Split into training (80%) and test (20%) sets
4. Extract features using TF-IDF (fitted on the training set only)
5. Train KNN classifier
6. Evaluate model performance
7. Make predictions

## Step 1: Install Dependencies

In [None]:
# Install required libraries
!pip install pandas numpy scikit-learn matplotlib seaborn -q
print("✅ All libraries installed successfully!")

## Step 2: Import Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix
)
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning for the rest of the session (presumably to keep cell
# output uncluttered).
# NOTE(review): a blanket 'ignore' also hides deprecation warnings raised by
# the libraries imported above — confirm this is intended.
warnings.filterwarnings('ignore')

print("✅ All imports successful!")

## Step 3: Load Dataset

In [None]:
# Read the resume CSV into a DataFrame, then report its dimensions and
# column names before previewing the first rows.
DATASET_PATH = 'src/dataset/UpdatedResumeDataSet.csv'
df = pd.read_csv(DATASET_PATH)

dataset_shape = df.shape
column_names = list(df.columns)
print(f"Dataset shape: {dataset_shape}")
print(f"\nColumns: {column_names}")
print("\nFirst few rows:")
# Last expression of the cell: the notebook renders it as a table.
df.head()

## Step 4: Explore Data

In [None]:
# Count the missing entries in each column.
print("Missing values:")
null_counts = df.isna().sum()
print(null_counts)

# How many distinct job categories does the label column contain?
categories = df['Category']
n_categories = categories.nunique()
print(f"\nNumber of unique job categories: {n_categories}")

# Frequencies of the ten most common categories.
print("\nTop 10 job categories:")
top_ten = categories.value_counts().head(10)
print(top_ten)

In [None]:
# Horizontal bar chart of the 15 most frequent job categories.
top_categories = df['Category'].value_counts().head(15)

plt.figure(figsize=(12, 6))
ax = top_categories.plot.barh()  # draws on the figure created just above
ax.set_xlabel('Number of Resumes')
ax.set_ylabel('Job Category')
ax.set_title('Top 15 Job Categories in Dataset')
plt.tight_layout()
plt.show()

## Step 5: Preprocess Data

In [None]:
# Remove rows that lack a resume text or a category label.
# Restricting dropna() to these two columns avoids discarding otherwise
# usable rows whose only missing value sits in a column the model never reads.
REQUIRED_COLUMNS = ['Resume', 'Category']
df_clean = df.dropna(subset=REQUIRED_COLUMNS)
print(f"Dataset after removing nulls: {df_clean.shape}")

# Prepare features (the 'Resume' column) and labels (the 'Category' column)
# as NumPy arrays for the train/test split.
X = df_clean['Resume'].values
y = df_clean['Category'].values

print(f"Features (X) shape: {X.shape}")
print(f"Labels (y) shape: {y.shape}")

## Step 6: Train-Test Split

In [None]:
# Hold out 20% of the samples for testing. The split is stratified on the
# labels and uses a fixed random_state so it is repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

# Report the size of each partition and its share of the full dataset.
total_samples = len(X)
for label, part in (("Training set size", X_train), ("Test set size", X_test)):
    print(f"{label}: {len(part):,} ({len(part)/total_samples*100:.1f}%)")

## Step 7: Feature Extraction - TF-IDF Vectorization

In [None]:
# Configure the TF-IDF vectorizer.
tfidf_settings = {
    'max_features': 5000,       # keep at most 5000 terms
    'min_df': 2,                # minimum document frequency
    'max_df': 0.8,              # maximum document frequency
    'ngram_range': (1, 2),      # unigrams and bigrams
    'stop_words': 'english',    # drop common English words
}
vectorizer = TfidfVectorizer(**tfidf_settings)

# Learn the vocabulary from the training texts only, then apply the same
# fitted transformation to the test texts.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fetch the learned terms once and reuse them for both report lines.
feature_names = vectorizer.get_feature_names_out()
print(f"Vocabulary size: {len(feature_names)}")
print(f"Training matrix shape: {X_train_tfidf.shape}")
print(f"Test matrix shape: {X_test_tfidf.shape}")
print(f"\nSample features (words): {feature_names[:20]}")

## Step 8: Train KNN Classifier

In [None]:
# Build a 5-neighbour KNN classifier using the cosine metric (n_jobs=-1 is
# passed through as configured) and fit it on the TF-IDF training matrix.
knn_settings = {'n_neighbors': 5, 'metric': 'cosine', 'n_jobs': -1}
knn = KNeighborsClassifier(**knn_settings)

print("Training KNN classifier...")
knn.fit(X_train_tfidf, y_train)
print("✅ Training complete!")

## Step 9: Make Predictions

In [None]:
# Classify every vectorized test resume with the trained model.
y_pred = knn.predict(X_test_tfidf)
n_predicted = len(y_pred)
print(f"✅ Predictions made on {n_predicted} test samples")

## Step 10: Evaluate Model Performance

In [None]:
# Score the test-set predictions. Precision, recall and F1 all receive the
# same averaging options: average='weighted' and zero_division=0.
_weighted_opts = {'average': 'weighted', 'zero_division': 0}
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **_weighted_opts)
recall = recall_score(y_test, y_pred, **_weighted_opts)
f1 = f1_score(y_test, y_pred, **_weighted_opts)

# Print the scores framed by 60-character rules.
_rule = "=" * 60
print(_rule)
print("MODEL PERFORMANCE METRICS")
print(_rule)
print(f"Accuracy:  {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"Precision: {precision:.4f} ({precision*100:.2f}%)")
print(f"Recall:    {recall:.4f} ({recall*100:.2f}%)")
print(f"F1 Score:  {f1:.4f}")
print(_rule)

## Step 11: Detailed Classification Report

In [None]:
# Print scikit-learn's classification report for the test set.
print("\nDETAILED CLASSIFICATION REPORT:")
report_text = classification_report(y_test, y_pred, zero_division=0)
print(report_text)

## Step 12: Sample Predictions

In [None]:
# Side-by-side table of true vs. predicted category for the first test
# samples (at most 20), with a tick or cross marking agreement.
SAMPLE_COUNT = 20
divider = "-" * 70

print("\nSAMPLE PREDICTIONS (First 20 test samples):")
print(divider)
print(f"{'Actual Category':<30} {'Predicted':<30} {'Match'}")
print(divider)

for actual, predicted in zip(y_test[:SAMPLE_COUNT], y_pred[:SAMPLE_COUNT]):
    if actual == predicted:
        mark = "✅"
    else:
        mark = "❌"
    print(f"{actual:<30} {predicted:<30} {mark}")

## Step 13: Prediction on New Resume

In [None]:
# Example: classify a hand-written resume that is not part of the dataset.
new_resume = """
Java Developer with 5 years experience.
Skills: Java, Spring Boot, Hibernate, Maven, JUnit, REST API, Microservices
Education: B.Tech in Computer Science
Experience: Developed enterprise applications using Spring framework
"""

# Reuse the already-fitted vectorizer; transform() expects a sequence of
# documents, hence the single-element list.
new_resume_tfidf = vectorizer.transform([new_resume])

# predict() returns one label per input row; take the only one.
new_resume_labels = knn.predict(new_resume_tfidf)
predicted_category = new_resume_labels[0]

print("New Resume:")
print(new_resume)
print(f"\n✅ Predicted Category: {predicted_category}")

## Step 14: Model Summary

In [None]:
# Summary statistics.
# Hyperparameters are read back from the configured `vectorizer` and `knn`
# objects instead of being retyped as literals, so this report cannot drift
# from the settings that were actually used earlier in the notebook.
summary = {
    'Dataset': {
        'Total Resumes': len(df_clean),
        'Training Samples': len(X_train),
        'Test Samples': len(X_test),
        'Job Categories': df_clean['Category'].nunique()
    },
    'Feature Extraction': {
        'Method': 'TF-IDF',
        'Max Features': vectorizer.max_features,
        'N-gram Range': str(vectorizer.ngram_range),
        # str() guards against a non-string setting (e.g. None or a word list).
        'Stop Words': str(vectorizer.stop_words).capitalize()
    },
    'Model': {
        'Algorithm': 'K-Nearest Neighbors',
        'K Neighbors': knn.n_neighbors,
        # str() guards against a callable metric.
        'Distance Metric': str(knn.metric).capitalize()
    },
    'Performance': {
        'Accuracy': f"{accuracy*100:.2f}%",
        'Precision': f"{precision*100:.2f}%",
        'Recall': f"{recall*100:.2f}%",
        'F1 Score': f"{f1:.4f}"
    }
}

# Print the report: one heading per section, one bullet per entry, framed by
# 60-character rules.
summary_rule = "=" * 60
print("\n" + summary_rule)
print("PROJECT SUMMARY")
print(summary_rule)

for section, metrics in summary.items():
    print(f"\n{section}:")
    for key, value in metrics.items():
        print(f"  • {key}: {value}")

print("\n" + summary_rule)