In [1]:
# notebooks/3_model_training.ipynb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Load data
df = pd.read_csv('../data/employee_attrition.csv')

print("=== BUILDING ATTRITION PREDICTION MODEL ===")
print(f"Target: Predict the {df['Attrition'].value_counts()['Yes']} employees who will leave")

# Convert target to binary
df['Attrition'] = df['Attrition'].map({'Yes': 1, 'No': 0})

# Select features based on our HR analysis
features = ['MonthlyIncome', 'OverTime', 'YearsAtCompany', 'JobSatisfaction', 
           'WorkLifeBalance', 'YearsSinceLastPromotion', 'BusinessTravel',
           'JobRole', 'Department', 'PerformanceRating']

print(f"Using {len(features)} key features identified from HR analysis")
print("Features:", features)

=== BUILDING ATTRITION PREDICTION MODEL ===
Target: Predict the 237 employees who will leave
Using 10 key features identified from HR analysis
Features: ['MonthlyIncome', 'OverTime', 'YearsAtCompany', 'JobSatisfaction', 'WorkLifeBalance', 'YearsSinceLastPromotion', 'BusinessTravel', 'JobRole', 'Department', 'PerformanceRating']


In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Two models for comparison
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000)
}

print("We'll compare:")
print("1. Random Forest - For high accuracy")
print("2. Logistic Regression - For interpretable insights")
print("3. Feature Importance - To show key drivers (like we found: Income, Overtime, etc.)")


We'll compare:
1. Random Forest - For high accuracy
2. Logistic Regression - For interpretable insights
3. Feature Importance - To show key drivers (like we found: Income, Overtime, etc.)


In [3]:


# Feature Engineering
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Prepare features - handle categorical variables
X = df[features].copy()
y = df['Attrition']

# Encode categorical variables
label_encoders = {}
categorical_cols = ['OverTime', 'BusinessTravel', 'JobRole', 'Department']

for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

print("Feature engineering completed:")
print(f"X shape: {X.shape}, y shape: {y.shape}")
print(f"Categorical features encoded: {categorical_cols}")

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"\nTraining set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"Attrition in training: {y_train.value_counts().to_dict()}")

Feature engineering completed:
X shape: (1470, 10), y shape: (1470,)
Categorical features encoded: ['OverTime', 'BusinessTravel', 'JobRole', 'Department']

Training set: 1176 samples
Test set: 294 samples
Attrition in training: {0: 986, 1: 190}


In [4]:
# ADD TO notebooks/3_model_training.ipynb

print("=== TRAINING MODELS ===")

# Train and evaluate models
results = {}

for name, model in models.items():
    print(f"\n--- Training {name} ---")
    model.fit(X_train, y_train)
    
    # Predictions
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]  # Probability of attrition
    
    # Metrics
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.3f}")
    
    # Store results
    results[name] = {
        'model': model,
        'accuracy': accuracy,
        'predictions': y_pred,
        'probabilities': y_pred_proba
    }

print(f"\n=== MODEL COMPARISON ===")
for name, result in results.items():
    print(f"{name}: {result['accuracy']:.3f} accuracy")

# Show detailed performance
best_model_name = max(results, key=lambda x: results[x]['accuracy'])
print(f"\n🎯 BEST MODEL: {best_model_name}")

=== TRAINING MODELS ===

--- Training Random Forest ---
Accuracy: 0.837

--- Training Logistic Regression ---
Accuracy: 0.854

=== MODEL COMPARISON ===
Random Forest: 0.837 accuracy
Logistic Regression: 0.854 accuracy

🎯 BEST MODEL: Logistic Regression


In [5]:
# ADD TO notebooks/3_model_training.ipynb

print("=== MODEL ANALYSIS & BUSINESS INSIGHTS ===")

# Get Logistic Regression coefficients for interpretability
lr_model = results['Logistic Regression']['model']
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'coefficient': lr_model.coef_[0],
    'abs_impact': np.abs(lr_model.coef_[0])
}).sort_values('abs_impact', ascending=False)

print("TOP ATTRITION DRIVERS (Logistic Regression Coefficients):")
print(feature_importance.head(10))

print("\n🔍 BUSINESS INTERPRETATION:")
print("POSITIVE coefficients INCREASE attrition risk")
print("NEGATIVE coefficients DECREASE attrition risk")

# Show what each feature means
feature_direction = []
for _, row in feature_importance.iterrows():
    direction = "INCREASES" if row['coefficient'] > 0 else "REDUCES"
    feature_direction.append(f"{row['feature']} {direction} attrition risk")

print("\n🎯 ACTIONABLE INSIGHTS:")
for insight in feature_direction[:5]:  # Top 5 drivers
    print(f"• {insight}")

# Save the best model
import pickle
with open('../models/trained_model.pkl', 'wb') as f:
    pickle.dump(lr_model, f)
print(f"\n💾 Model saved as 'trained_model.pkl'")

=== MODEL ANALYSIS & BUSINESS INSIGHTS ===
TOP ATTRITION DRIVERS (Logistic Regression Coefficients):
                   feature  coefficient  abs_impact
1                 OverTime     1.292062    1.292062
8               Department     0.503292    0.503292
4          WorkLifeBalance    -0.403695    0.403695
3          JobSatisfaction    -0.276380    0.276380
5  YearsSinceLastPromotion     0.117695    0.117695
2           YearsAtCompany    -0.115646    0.115646
9        PerformanceRating     0.111327    0.111327
6           BusinessTravel    -0.049722    0.049722
7                  JobRole    -0.031971    0.031971
0            MonthlyIncome    -0.000117    0.000117

🔍 BUSINESS INTERPRETATION:
POSITIVE coefficients INCREASE attrition risk
NEGATIVE coefficients DECREASE attrition risk

🎯 ACTIONABLE INSIGHTS:
• OverTime INCREASES attrition risk
• Department INCREASES attrition risk
• WorkLifeBalance REDUCES attrition risk
• JobSatisfaction REDUCES attrition risk
• YearsSinceLastPromotion I