In [3]:
# notebooks/3_model_training.ipynb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Load data
df = pd.read_csv('../data/employee_attrition.csv')

print("=== BUILDING ATTRITION PREDICTION MODEL ===")
print(f"Target: Predict the {df['Attrition'].value_counts()['Yes']} employees who will leave")

# Convert target to binary
df['Attrition'] = df['Attrition'].map({'Yes': 1, 'No': 0})

# Select features based on our HR analysis
features = ['MonthlyIncome', 'OverTime', 'YearsAtCompany', 'JobSatisfaction', 
           'WorkLifeBalance', 'YearsSinceLastPromotion', 'BusinessTravel',
           'JobRole', 'Department', 'PerformanceRating']

print(f"Using {len(features)} key features identified from HR analysis")
print("Features:", features)

=== BUILDING ATTRITION PREDICTION MODEL ===
Target: Predict the 237 employees who will leave
Using 10 key features identified from HR analysis
Features: ['MonthlyIncome', 'OverTime', 'YearsAtCompany', 'JobSatisfaction', 'WorkLifeBalance', 'YearsSinceLastPromotion', 'BusinessTravel', 'JobRole', 'Department', 'PerformanceRating']


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Two models for comparison
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000)
}

print("We'll compare:")
print("1. Random Forest - For high accuracy")
print("2. Logistic Regression - For interpretable insights")
print("3. Feature Importance - To show key drivers (like we found: Income, Overtime, etc.)")


We'll compare:
1. Random Forest - For high accuracy
2. Logistic Regression - For interpretable insights
3. Feature Importance - To show key drivers (like we found: Income, Overtime, etc.)


In [5]:


# Feature Engineering
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Prepare features - handle categorical variables
X = df[features].copy()
y = df['Attrition']

# Encode categorical variables
label_encoders = {}
categorical_cols = ['OverTime', 'BusinessTravel', 'JobRole', 'Department']

for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

print("Feature engineering completed:")
print(f"X shape: {X.shape}, y shape: {y.shape}")
print(f"Categorical features encoded: {categorical_cols}")

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"\nTraining set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"Attrition in training: {y_train.value_counts().to_dict()}")

Feature engineering completed:
X shape: (1470, 10), y shape: (1470,)
Categorical features encoded: ['OverTime', 'BusinessTravel', 'JobRole', 'Department']

Training set: 1176 samples
Test set: 294 samples
Attrition in training: {0: 986, 1: 190}


In [6]:
# ADD TO notebooks/3_model_training.ipynb

print("=== TRAINING MODELS ===")

# Train and evaluate models
results = {}

for name, model in models.items():
    print(f"\n--- Training {name} ---")
    model.fit(X_train, y_train)
    
    # Predictions
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]  # Probability of attrition
    
    # Metrics
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.3f}")
    
    # Store results
    results[name] = {
        'model': model,
        'accuracy': accuracy,
        'predictions': y_pred,
        'probabilities': y_pred_proba
    }

print(f"\n=== MODEL COMPARISON ===")
for name, result in results.items():
    print(f"{name}: {result['accuracy']:.3f} accuracy")

# Show detailed performance
best_model_name = max(results, key=lambda x: results[x]['accuracy'])
print(f"\n🎯 BEST MODEL: {best_model_name}")

=== TRAINING MODELS ===

--- Training Random Forest ---
Accuracy: 0.837

--- Training Logistic Regression ---
Accuracy: 0.854

=== MODEL COMPARISON ===
Random Forest: 0.837 accuracy
Logistic Regression: 0.854 accuracy

🎯 BEST MODEL: Logistic Regression


In [8]:
# ADD FINAL ANALYSIS - CORRECTED VERSION
import pandas as pd
import numpy as np

print("=== FINAL HR RECOMMENDATIONS ===")
print("🚨 PRIORITY 1: ADDRESS OVERTIME")
print(f"   • Overtime increases attrition risk by {np.exp(1.292):.1f}x")
print("   • Action: Hire more staff, redistribute workload")

print("\n🎯 PRIORITY 2: IMPROVE WORK-LIFE BALANCE")  
print("   • Every 1-point improvement in WorkLifeBalance reduces attrition risk")
print("   • Action: Flexible hours, remote work options")

print("\n💼 PRIORITY 3: DEPARTMENT-SPECIFIC SOLUTIONS")
print("   • Sales department needs immediate attention (20.6% attrition)")
print("   • Action: Sales-specific retention bonuses, better management")

print("\n📈 PRIORITY 4: CAREER DEVELOPMENT")
print("   • Employees waiting for promotions are at higher risk")
print("   • Action: Clear promotion paths, regular career conversations")

# FIRST define feature_importance, THEN use it
lr_model = results['Logistic Regression']['model']
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'coefficient': lr_model.coef_[0],
    'abs_impact': np.abs(lr_model.coef_[0])
}).sort_values('abs_impact', ascending=False)

# NOW calculate risk multipliers
risk_multipliers = pd.DataFrame({
    'feature': feature_importance['feature'],
    'risk_multiplier': np.exp(feature_importance['coefficient']),
    'impact': feature_importance['coefficient'].apply(
        lambda x: f"INCREASES risk by {np.exp(x):.1f}x" if x > 0 else f"REDUCES risk by {1/np.exp(x):.1f}x"
    )
})

print(f"\n📊 QUANTIFIED BUSINESS IMPACT:")
for _, row in risk_multipliers.head(3).iterrows():
    print(f"   • {row['feature']}: {row['impact']}")

=== FINAL HR RECOMMENDATIONS ===
🚨 PRIORITY 1: ADDRESS OVERTIME
   • Overtime increases attrition risk by 3.6x
   • Action: Hire more staff, redistribute workload

🎯 PRIORITY 2: IMPROVE WORK-LIFE BALANCE
   • Every 1-point improvement in WorkLifeBalance reduces attrition risk
   • Action: Flexible hours, remote work options

💼 PRIORITY 3: DEPARTMENT-SPECIFIC SOLUTIONS
   • Sales department needs immediate attention (20.6% attrition)
   • Action: Sales-specific retention bonuses, better management

📈 PRIORITY 4: CAREER DEVELOPMENT
   • Employees waiting for promotions are at higher risk
   • Action: Clear promotion paths, regular career conversations

📊 QUANTIFIED BUSINESS IMPACT:
   • OverTime: INCREASES risk by 3.6x
   • Department: INCREASES risk by 1.7x
   • WorkLifeBalance: REDUCES risk by 1.5x
