In [None]:
# =============================================================================
#                             ATS RESUME ANALYZER
# =============================================================================
# This script trains a neural network to classify resumes into job categories.
# It then uses this model to analyze a user's resume, provide a compatibility
# score for a chosen job, and offer feedback on missing keywords.
# =============================================================================


# --- Step 1: Import Necessary Libraries ---
# -----------------------------------------------------------------------------
# We import all the tools we'll need for data handling, text processing,
# building the neural network, and interacting with the user.

import pandas as pd
import numpy as np
import re
import os # To check if the model file exists
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from PyPDF2 import PdfReader


# --- Step 2: Data Loading and Preparation ---
# -----------------------------------------------------------------------------
# Here, we load the datasets of resumes and job descriptions. The core task is
# to prepare the data for the neural network by cleaning the text and
# converting the job categories (labels) from text to numbers.

print("--- Loading and Preparing Data ---")
# Resume corpus: keep only the label and text columns and drop incomplete rows.
# NOTE(review): the file names look swapped relative to the columns read from
# them ('job_descriptions.csv' supplies resumes, 'UpdatedResumeDataSet.csv'
# supplies job postings) -- confirm this matches the files on disk.
resume_data = pd.read_csv('job_descriptions.csv')[['Category', 'Resume']].dropna()

# Job postings: 'jobtitle' is renamed so both frames expose a 'Category' column.
jobs_data = (
    pd.read_csv('UpdatedResumeDataSet.csv')[['jobtitle', 'jobdescription']]
    .dropna()
    .rename(columns={'jobtitle': 'Category'})
)

# Learn the category -> integer mapping, then attach the encoded labels.
label_encoder = LabelEncoder()
label_encoder.fit(resume_data['Category'])
resume_data['category_encoded'] = label_encoder.transform(resume_data['Category'])

# Number of distinct categories = size of the classifier's output layer.
num_classes = len(label_encoder.classes_)

# Features (raw resume text) and labels (encoded category) for the model.
X = resume_data['Resume']
y = resume_data['category_encoded']

# Define a function to clean the raw text
def clean_text(text):
    """Return a lower-cased copy of *text* stripped of punctuation and lone letters.

    The input is coerced with ``str()``, so non-string values (numbers, None,
    NaN) are accepted. Whitespace is left untouched, which means removed
    characters can leave consecutive spaces behind.
    """
    cleaned = str(text).lower()
    substitutions = (
        r'[^a-zA-Z0-9\s]',  # anything that is not a letter, digit or whitespace
        r'\b[a-zA-Z]\b',    # stand-alone single-letter words
    )
    for pattern in substitutions:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned

# Apply the cleaning function and tokenize the text (convert words to numbers)
X_cleaned = X.apply(clean_text)

# Only the MAX_VOCAB_WORDS - 1 most frequent words get their own index; every
# other word is emitted as the out-of-vocabulary token "<unk>".
MAX_VOCAB_WORDS = 5000
tokenizer = Tokenizer(num_words=MAX_VOCAB_WORDS, oov_token="<unk>")
tokenizer.fit_on_texts(X_cleaned)

# `word_index` lists every word seen during fitting, but with `num_words` set
# the tokenizer never emits an index >= MAX_VOCAB_WORDS. Capping the size here
# keeps the embedding table from carrying rows that can never be looked up.
vocab_size = min(len(tokenizer.word_index) + 1, MAX_VOCAB_WORDS)

# Pad all sequences to be the same length for the neural network.
# Short sequences are zero-padded at the end (padding='post'); the truncation
# side for long sequences is left at the pad_sequences default.
MAX_SEQUENCE_LENGTH = 300
X_padded = pad_sequences(
    tokenizer.texts_to_sequences(X_cleaned),
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='post',
)


# --- Step 3: Build or Load the Neural Network Model ---
# -----------------------------------------------------------------------------
# This is the core of the AI. We check if a pre-trained model exists. If not,
# we define the architecture of our neural network (layers, etc.) and compile it.
# This architecture is designed to understand sequences of text.

MODEL_FILE_PATH = 'ats_classifier_model.keras'

if not os.path.exists(MODEL_FILE_PATH):
    # No saved model on disk: define, train and persist a fresh classifier.
    print("--- Building New Neural Network Model ---")
    EMBEDDING_DIM = 128

    # Assemble the network one layer at a time:
    # token ids -> embeddings -> bidirectional LSTM summary -> dense head.
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM))
    model.add(Bidirectional(LSTM(128, return_sequences=False)))
    model.add(Dropout(0.5))
    model.add(Dense(128, activation='relu'))
    # Softmax yields one probability per job category.
    model.add(Dense(num_classes, activation='softmax'))

    # Labels are integer class ids, hence the sparse cross-entropy loss.
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )

    # --- Step 4: Train the Model (only if not loaded) ---
    # -------------------------------------------------------------------------
    # A 20% hold-out split serves as validation data. EarlyStopping watches
    # the validation loss, stops after 3 epochs without improvement and rolls
    # the weights back to the best epoch. The result is saved for later runs.

    print("--- Training the Classifier with Early Stopping ---")
    X_train, X_test, y_train, y_test = train_test_split(
        X_padded, y, test_size=0.2, random_state=42
    )
    early_stopping = EarlyStopping(
        monitor='val_loss', patience=3, restore_best_weights=True
    )
    model.fit(
        X_train,
        y_train,
        validation_data=(X_test, y_test),
        epochs=50,
        batch_size=32,
        callbacks=[early_stopping],
        verbose=2,
    )

    # Persist the trained model so the next run can skip training.
    model.save(MODEL_FILE_PATH)
    print(f"--- Model Trained and Saved as {MODEL_FILE_PATH} ---")
else:
    # A saved model exists: reuse it instead of retraining.
    print("--- Loading Pre-trained Model ---")
    model = load_model(MODEL_FILE_PATH)


# --- Step 5: The Interactive Prediction Workflow ---
# -----------------------------------------------------------------------------
# This is the user-facing part of the application. It contains helper functions
# for analyzing the resume and then runs a loop to get input from the user.

def get_contextual_keywords(job_category, jobs_df, top_n=15):
    """Return the terms most specific to *job_category*'s job descriptions.

    Each term (unigram or bigram) is scored as its mean TF-IDF weight in the
    target category's descriptions divided by its mean TF-IDF weight in all
    other descriptions, so terms common everywhere rank low.

    Args:
        job_category: Value to match against ``jobs_df['Category']``.
        jobs_df: DataFrame with 'Category' and 'jobdescription' columns.
        top_n: Maximum number of keywords to return (default 15).

    Returns:
        A list of up to ``top_n`` terms, most specific first. Empty when no
        row of ``jobs_df`` belongs to ``job_category``.
    """
    is_target = jobs_df['Category'] == job_category
    target_docs = jobs_df.loc[is_target, 'jobdescription'].apply(clean_text)
    if target_docs.empty:
        return []
    background_docs = jobs_df.loc[~is_target, 'jobdescription'].apply(clean_text)

    # Both vectorizers must use the same n-gram range. The vocabulary learned
    # below contains bigrams; a background vectorizer left at the default
    # (1, 1) range would never count them, giving every bigram a background
    # score of 0 and an artificially huge specificity.
    ngram_range = (1, 2)

    vectorizer = TfidfVectorizer(
        max_features=2000, stop_words='english', ngram_range=ngram_range
    )
    tfidf_target = vectorizer.fit_transform(target_docs)
    target_scores = np.asarray(tfidf_target.mean(axis=0)).ravel()

    if background_docs.empty:
        # Only one category present: nothing to contrast against, so the
        # ranking falls back to the target scores alone.
        bg_scores = np.zeros_like(target_scores)
    else:
        vectorizer_bg = TfidfVectorizer(
            vocabulary=vectorizer.vocabulary_,
            stop_words='english',
            ngram_range=ngram_range,
        )
        tfidf_bg = vectorizer_bg.fit_transform(background_docs)
        bg_scores = np.asarray(tfidf_bg.mean(axis=0)).ravel()

    # The small epsilon avoids division by zero for terms absent from the
    # background documents.
    specificity_scores = target_scores / (bg_scores + 1e-6)
    sorted_indices = specificity_scores.argsort()[::-1]
    feature_names = vectorizer.get_feature_names_out()
    return [feature_names[i] for i in sorted_indices[:top_n]]

def get_missing_keywords(resume_text, job_category, jobs_df):
    """Return the category-specific keywords that do not occur in the resume.

    Args:
        resume_text: Raw resume text.
        job_category: Category whose keywords are looked up.
        jobs_df: Job-description DataFrame passed to get_contextual_keywords.

    Returns:
        The keywords from get_contextual_keywords (order preserved) that are
        not found in the cleaned resume. Matching is substring based.
    """
    job_keywords = get_contextual_keywords(job_category, jobs_df)
    if not job_keywords:
        return []

    # Collapse every whitespace run (newlines, tabs, double spaces left by
    # clean_text) to a single space. Multi-word keywords are joined by one
    # space, so without this a phrase broken across a line would be reported
    # as missing even though the resume contains it.
    normalized_resume = ' '.join(clean_text(resume_text).split())
    return [word for word in job_keywords if word not in normalized_resume]

def get_harsh_ats_score(original_score_percent):
    """Map the model's confidence (0-100) onto a tiered score and rating.

    The first tier whose threshold the confidence reaches is selected, and
    the reported score is drawn at random from that tier's range (upper bound
    exclusive), so repeated calls with the same input can return different
    scores.

    Returns:
        A ``(score, rating)`` tuple.
    """
    # (minimum confidence, lowest score, exclusive upper score, rating)
    tiers = (
        (90, 92, 98, "Excellent Fit"),
        (75, 85, 91, "Strong Candidate"),
        (50, 70, 82, "Needs Improvement"),
        (25, 55, 68, "Significant Gaps"),
    )
    for threshold, low, high, rating in tiers:
        if original_score_percent >= threshold:
            return np.random.randint(low, high), rating
    # Below every threshold (or not comparable, e.g. NaN).
    return np.random.randint(40, 52), "Not a Match"

def read_pdf_text(file_path):
    """Extract the text content of a user-provided PDF file.

    Args:
        file_path: Path to the PDF file.

    Returns:
        The text of all pages joined with newlines, or None when the file is
        missing or cannot be read (a message is printed in that case). The
        result may be an empty string if no page yields any text.
    """
    try:
        reader = PdfReader(file_path)
        # `or ""` guards against pages whose extraction yields None, which
        # would otherwise make the join below fail for the whole document.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    except FileNotFoundError:
        print(f"\nError: The file '{file_path}' was not found.")
        return None
    except Exception as e:
        # Top-level boundary for a user-supplied file: report the problem
        # and let the caller re-prompt rather than crash.
        print(f"An error occurred reading the PDF: {e}")
        return None

    # Join with a newline so the last word of one page and the first word of
    # the next are not fused into a single token.
    return "\n".join(page_texts)
        
# --- Main Interactive Loop ---
# This loop runs the program, asking the user for input and providing the final report.

available_jobs = list(label_encoder.classes_)
print("\n--- ATS Resume Analyzer ---")
print("Available Job Categories:")
for job in available_jobs:
    print(f"  {job}")

# Lookup table so that input differing only in letter case still selects the
# right category; an exact match always takes precedence.
jobs_by_folded_name = {str(job).casefold(): job for job in available_jobs}

# Ask until the user names one of the categories the model was trained on.
while True:
    print("\nPlease enter the job category you are interested in:")
    user_choice = input("> ").strip()  # ignore accidental surrounding spaces
    if user_choice in available_jobs:
        break
    if user_choice.casefold() in jobs_by_folded_name:
        user_choice = jobs_by_folded_name[user_choice.casefold()]
        break
    print(f"\nError: '{user_choice}' is not a valid category.")

# Ask until a PDF with usable text has been read.
while True:
    print("\nPlease enter the name of your resume PDF file (e.g., my_resume.pdf):")
    my_resume_path = input("> ").strip()
    my_resume_text = read_pdf_text(my_resume_path)
    if my_resume_text is None:
        continue  # read_pdf_text has already printed the reason
    if my_resume_text.strip():
        break
    # The file was readable but produced no text; say so instead of
    # re-prompting without explanation.
    print("\nError: No text could be extracted from that PDF. Please try another file.")

# --- Generate and Display the Final Report ---
# 1. Get model's raw prediction
# The resume goes through the same cleaning, tokenising and padding steps as
# the training data before being passed to the model.
cleaned_resume_text = clean_text(my_resume_text)
sequence = tokenizer.texts_to_sequences([cleaned_resume_text])
padded_sequence = pad_sequences(sequence, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
predictions = model.predict(padded_sequence, verbose=0)[0]
# The position in label_encoder.classes_ is the encoded label, so it indexes
# the corresponding model output.
chosen_category_index = available_jobs.index(user_choice)
compatibility_score = predictions[chosen_category_index] * 100

# 2. Get the harsh score and keyword feedback
harsh_score, rating = get_harsh_ats_score(compatibility_score)
missing = get_missing_keywords(my_resume_text, user_choice, jobs_data)
# An empty `missing` list is also what comes back when there are no job
# descriptions for this category at all; distinguish that case so the report
# does not claim strong alignment without evidence.
has_job_descriptions = bool((jobs_data['Category'] == user_choice).any())

# 3. Display the full, formatted report
print("\n" + "="*40)
print("      ATS ANALYSIS REPORT (Harsh)")
print("="*40)
print(f"JOB TARGETED: {user_choice}")
print(f"FINAL ATS SCORE: {harsh_score} / 100")
print(f"RATING: {rating}")
print("----------------------------------------")
if missing:
    print("ANALYSIS: Resume lacks keywords specific to this role.")
    print("Consider highlighting skills and technologies like:")
    for keyword in missing:
        print(f"  - {keyword.title()}")
elif not has_job_descriptions:
    print("ANALYSIS: No job descriptions are available for this category,")
    print("so keyword feedback could not be generated.")
else:
    print("ANALYSIS: Strong keyword alignment with the target role.")
print("----------------------------------------")
print(f"(Note: Model's raw compatibility confidence was {compatibility_score:.1f}%)")
print("="*40)

--- Loading and Preparing Data ---
--- Building New Neural Network Model ---
--- Training the Classifier with Early Stopping ---
Epoch 1/50
25/25 - 11s - 437ms/step - accuracy: 0.1235 - loss: 3.1410 - val_accuracy: 0.0933 - val_loss: 3.2222
Epoch 2/50
25/25 - 6s - 247ms/step - accuracy: 0.2536 - loss: 2.9493 - val_accuracy: 0.1658 - val_loss: 2.8362
Epoch 3/50
25/25 - 6s - 240ms/step - accuracy: 0.3121 - loss: 2.3910 - val_accuracy: 0.4404 - val_loss: 1.9923
Epoch 4/50
25/25 - 7s - 266ms/step - accuracy: 0.5254 - loss: 1.6711 - val_accuracy: 0.6839 - val_loss: 1.2770
Epoch 5/50
25/25 - 6s - 240ms/step - accuracy: 0.7269 - loss: 1.0665 - val_accuracy: 0.7772 - val_loss: 0.8418
Epoch 6/50
25/25 - 6s - 241ms/step - accuracy: 0.8453 - loss: 0.6026 - val_accuracy: 0.8756 - val_loss: 0.5210
Epoch 7/50
25/25 - 6s - 252ms/step - accuracy: 0.9194 - loss: 0.3646 - val_accuracy: 0.9016 - val_loss: 0.3423
Epoch 8/50
25/25 - 6s - 252ms/step - accuracy: 0.9467 - loss: 0.2337 - val_accuracy: 0.9430 -