In [1]:
# notebooks/3_model_training.ipynb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw HR dataset. The path is relative to the notebook's working directory.
df = pd.read_csv('../data/employee_attrition.csv')

print("=== BUILDING ATTRITION PREDICTION MODEL ===")
# Count leavers with a boolean mask: a dataset with no 'Yes' rows reports 0
# instead of raising KeyError on a value_counts() lookup.
n_leavers = int((df['Attrition'] == 'Yes').sum())
print(f"Target: Predict the {n_leavers} employees who will leave")

# Convert target to binary (Yes -> 1, No -> 0).
# Series.map() turns every label that is missing from the dict into NaN, so the
# result is validated before it overwrites the column; otherwise a stray label
# (different casing, whitespace, missing value) would silently become a NaN
# target and only fail later during the split or model fit.
attrition_binary = df['Attrition'].map({'Yes': 1, 'No': 0})
unmapped_labels = df.loc[attrition_binary.isna(), 'Attrition'].unique().tolist()
if unmapped_labels:
    raise ValueError(
        f"Unexpected Attrition labels (expected 'Yes' or 'No'): {unmapped_labels}"
    )
df['Attrition'] = attrition_binary

# Select features based on our HR analysis
features = ['MonthlyIncome', 'OverTime', 'YearsAtCompany', 'JobSatisfaction',
            'WorkLifeBalance', 'YearsSinceLastPromotion', 'BusinessTravel',
            'JobRole', 'Department', 'PerformanceRating']

# Fail here, next to the feature list, if the CSV schema does not contain every
# selected column, rather than in a later cell that indexes df[features].
missing_features = [name for name in features if name not in df.columns]
if missing_features:
    raise KeyError(f"Feature columns missing from dataset: {missing_features}")

print(f"Using {len(features)} key features identified from HR analysis")
print("Features:", features)

=== BUILDING ATTRITION PREDICTION MODEL ===
Target: Predict the 237 employees who will leave
Using 10 key features identified from HR analysis
Features: ['MonthlyIncome', 'OverTime', 'YearsAtCompany', 'JobSatisfaction', 'WorkLifeBalance', 'YearsSinceLastPromotion', 'BusinessTravel', 'JobRole', 'Department', 'PerformanceRating']


In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Candidate estimators, keyed by the display name used when reporting on them.
# Built from an ordered list of (name, estimator) pairs so the comparison order
# is explicit: Random Forest first, Logistic Regression second.
models = dict([
    ('Random Forest', RandomForestClassifier(random_state=42, n_estimators=100)),
    ('Logistic Regression', LogisticRegression(max_iter=1000)),
])

# Announce the comparison plan. One print call with a newline separator emits
# the same four lines as four separate prints would.
print(
    "We'll compare:",
    "1. Random Forest - For high accuracy",
    "2. Logistic Regression - For interpretable insights",
    "3. Feature Importance - To show key drivers (like we found: Income, Overtime, etc.)",
    sep="\n",
)


We'll compare:
1. Random Forest - For high accuracy
2. Logistic Regression - For interpretable insights
3. Feature Importance - To show key drivers (like we found: Income, Overtime, etc.)


In [3]:


# Feature Engineering
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Build the design matrix from the selected columns. The copy keeps the
# encoding below from writing back into df.
X = df.loc[:, features].copy()
y = df['Attrition']

# Text-valued columns that must become integers before model fitting.
categorical_cols = ['OverTime', 'BusinessTravel', 'JobRole', 'Department']

# One fitted encoder per column, retained so the integer codes can be mapped
# back to the original category labels later.
# NOTE(review): integer label codes give nominal columns such as JobRole and
# Department an arbitrary order, which a linear model like the Logistic
# Regression defined in this notebook will treat as meaningful. Consider
# one-hot encoding; confirm how later cells use X and label_encoders first.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder().fit(X[col])
    label_encoders[col] = le
    X[col] = le.transform(X[col])

print(
    "Feature engineering completed:",
    f"X shape: {X.shape}, y shape: {y.shape}",
    f"Categorical features encoded: {categorical_cols}",
    sep="\n",
)

# Hold out 20% of the rows for testing. Stratifying on y keeps the attrition
# rate the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(
    f"\nTraining set: {X_train.shape[0]} samples",
    f"Test set: {X_test.shape[0]} samples",
    f"Attrition in training: {y_train.value_counts().to_dict()}",
    sep="\n",
)

Feature engineering completed:
X shape: (1470, 10), y shape: (1470,)
Categorical features encoded: ['OverTime', 'BusinessTravel', 'JobRole', 'Department']

Training set: 1176 samples
Test set: 294 samples
Attrition in training: {0: 986, 1: 190}
