In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# 1. LOAD THE DATA
df = pd.read_csv('labeled_resumes.csv')

# 2. AUTOMATIC COLUMN CHECK (Fixes the KeyError)
# This looks for 'Label', 'label', or 'quality_score' automatically
if 'Label' in df.columns:
    target_col = 'Label'
elif 'label' in df.columns:
    target_col = 'label'
else:
    # If neither exists, we use the quality score to re-create them
    print("Warning: Label column not found. Re-creating labels from scores...")
    df['label'] = df['quality_score'].apply(lambda x: "Good" if x >= 75 else "Average" if x >= 45 else "Poor")
    target_col = 'label'

print(f"Using '{target_col}' as the target column for AI training.")

# 3. TEXT TO NUMBERS (TF-IDF)
tfidf = TfidfVectorizer(max_features=1000, stop_words='english')
X = tfidf.fit_transform(df['Resume_str']) 
y = df[target_col] # Using the correct column name found above

# 4. SPLIT THE DATA (80% Train, 20% Test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. TRAIN THE MODEL (Random Forest)
model = RandomForestClassifier(n_estimators=100, random_state=42)
print("Training the AI model... this may take a moment.")
model.fit(X_train, y_train)

# 6. EVALUATE
y_pred = model.predict(X_test)

print("\n--- DAY 9 SUCCESS: MODEL PERFORMANCE ---")
print(f"Accuracy Score: {accuracy_score(y_test, y_pred) * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Using 'label' as the target column for AI training.
Training the AI model... this may take a moment.

--- DAY 9 SUCCESS: MODEL PERFORMANCE ---
Accuracy Score: 83.50%

Classification Report:
              precision    recall  f1-score   support

     Average       0.83      0.06      0.11        86
        Poor       0.84      1.00      0.91       411

    accuracy                           0.84       497
   macro avg       0.83      0.53      0.51       497
weighted avg       0.83      0.84      0.77       497



In [None]:
"""import pandas as pd  # Library to handle our data in table format
import joblib       # Library to save our AI model to a file for later use
from sklearn.model_selection import train_test_split  # Tool to split data into Train/Test sets
from sklearn.feature_extraction.text import TfidfVectorizer # Tool to turn words into numbers
from sklearn.ensemble import RandomForestClassifier # The AI algorithm (The "Brain")
from sklearn.metrics import classification_report, accuracy_score # Tools to measure performance

# 1. LOAD THE DATA
# We load the CSV we created on Day 8
df = pd.read_csv('labeled_resumes.csv') 

# 2. DATA CLEANING & SELECTION
# We identify which column is the "Target" (the answers the AI needs to learn)
if 'label' in df.columns:
    target_col = 'label'
else:
    target_col = 'Label'

# 3. TEXT VECTORIZATION (TF-IDF)
# Why: Computers don't read English; they read math. 
# TfidfVectorizer calculates which words are most unique and important in a resume.
tfidf = TfidfVectorizer(max_features=1000, stop_words='english') 

# We transform the 'Resume_str' text column into a matrix of numbers
X = tfidf.fit_transform(df['Resume_str']) 

# We set our 'y' as the labels (Good, Average, Poor)
y = df[target_col] 

# 4. SPLITTING THE DATA
# Why: We use 80% to train the AI and 20% to "exam" it later.
# random_state=42 ensures we get the same split every time we run the code.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. INITIALIZING THE MODEL
# RandomForest is a "Forest" of 100 Decision Trees that vote on the final result.
model = RandomForestClassifier(n_estimators=100, random_state=42)

# 6. TRAINING (THE "LEARNING" PHASE)
# This is where the AI looks at X (numbers) and y (labels) to find patterns.
model.fit(X_train, y_train) 

# 7. EVALUATION (THE "EXAM" PHASE)
# We ask the AI to guess the labels for the 20% of data it hasn't seen yet.
y_pred = model.predict(X_test) 

# 8. PRINTING RESULTS
# accuracy_score tells us the percentage of correct guesses (You got 83.5%!)
print(f"Accuracy Score: {accuracy_score(y_test, y_pred) * 100:.2f}%")
# classification_report shows where the AI is strong or weak (Good vs Poor)
print(classification_report(y_test, y_pred))

# 9. SAVING THE BRAIN
# We save the 'model' and the 'tfidf' converter as files so we don't have to re-train.
joblib.dump(model, 'resume_model.pkl') 
joblib.dump(tfidf, 'tfidf_vectorizer.pkl') 

print("Success! AI Model saved to your MacBook.")"""