In [2]:
# notebooks/4_complete_ml_pipeline.ipynb - FIXED VERSION
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

# Load data
df = pd.read_csv('../data/employee_attrition.csv')
df['Attrition'] = df['Attrition'].map({'Yes': 1, 'No': 0})

print("=== COMPLETE ML PIPELINE ===")

# 1. SELECT ONLY NUMERICAL FEATURES FOR CORRELATION
numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()
print(f"Numerical features: {len(numerical_features)}")

# Calculate correlation with attrition using only numerical features
correlation_with_attrition = df[numerical_features].corr()['Attrition'].abs().sort_values(ascending=False)
top_numerical_features = correlation_with_attrition[1:11].index.tolist()  # Top 10 numerical features
print("Top numerical features by correlation:", top_numerical_features)

# 2. ADD IMPORTANT CATEGORICAL FEATURES (from our previous analysis)
important_categorical = ['OverTime', 'BusinessTravel', 'Department', 'JobRole']
print("Important categorical features:", important_categorical)

# 3. COMBINE FEATURES
top_features = top_numerical_features + important_categorical
print(f"Final feature set ({len(top_features)}): {top_features}")

# 4. PREPARE DATA
X = df[top_features].copy()
y = df['Attrition']

# Encode categorical features
label_encoders = {}
for col in important_categorical:
    if col in X.columns:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])
        label_encoders[col] = le

# 5. SCALE FEATURES
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print(f"Data shape: {X_scaled.shape}")
print(f"Attrition distribution: {pd.Series(y).value_counts().to_dict()}")

# IN notebooks/4_complete_ml_pipeline.ipynb - REPLACE SECTION 6-8:

# === CORRECTED VERSION ===

# 6. PROPER TRAIN-TEST SPLIT (FIRST!)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)
print(f"Before SMOTE - Train: {y_train.value_counts().to_dict()}, Test: {y_test.value_counts().to_dict()}")

# 7. HANDLE CLASS IMBALANCE ONLY ON TRAINING DATA
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)
print(f"After SMOTE (train only): {pd.Series(y_train_balanced).value_counts().to_dict()}")

# 8. MODEL TRAINING WITH CROSS-VALIDATION (ON BALANCED TRAIN DATA)
print("\n=== REALISTIC MODEL PERFORMANCE ===")
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42)
}

cv_results = {}
for name, model in models.items():
    # Cross-val on balanced training data
    cv_scores = cross_val_score(model, X_train_balanced, y_train_balanced, cv=5, scoring='f1')
    cv_results[name] = {
        'mean_f1': cv_scores.mean(),
        'std_f1': cv_scores.std()
    }
    print(f"{name}: F1 = {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")

# Train best model on balanced training data
best_model_name = max(cv_results, key=lambda x: cv_results[x]['mean_f1'])
print(f"\n🎯 BEST MODEL: {best_model_name}")

best_model = models[best_model_name]
best_model.fit(X_train_balanced, y_train_balanced)

# Evaluate on UNTOUCHED test data
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

print("\n=== REAL TEST SET PERFORMANCE ===")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print(f"ROC-AUC Score: {roc_auc_score(y_test, y_pred_proba):.3f}")

=== COMPLETE ML PIPELINE ===
Numerical features: 27
Top numerical features by correlation: ['TotalWorkingYears', 'JobLevel', 'YearsInCurrentRole', 'MonthlyIncome', 'Age', 'YearsWithCurrManager', 'StockOptionLevel', 'YearsAtCompany', 'JobInvolvement', 'JobSatisfaction']
Important categorical features: ['OverTime', 'BusinessTravel', 'Department', 'JobRole']
Final feature set (14): ['TotalWorkingYears', 'JobLevel', 'YearsInCurrentRole', 'MonthlyIncome', 'Age', 'YearsWithCurrManager', 'StockOptionLevel', 'YearsAtCompany', 'JobInvolvement', 'JobSatisfaction', 'OverTime', 'BusinessTravel', 'Department', 'JobRole']
Data shape: (1470, 14)
Attrition distribution: {0: 1233, 1: 237}
Before SMOTE - Train: {0: 986, 1: 190}, Test: {0: 247, 1: 47}
After SMOTE (train only): {0: 986, 1: 986}

=== REALISTIC MODEL PERFORMANCE ===
Logistic Regression: F1 = 0.740 (+/- 0.039)
Random Forest: F1 = 0.913 (+/- 0.185)

🎯 BEST MODEL: Random Forest

=== REAL TEST SET PERFORMANCE ===
Classification Report:
        

In [4]:
# ADD TO THE END OF notebooks/4_complete_ml_pipeline.ipynb:

# === ACTIONABLE HR OUTPUT ===

print("\n" + "="*50)
print("🎯 ACTIONABLE HR INSIGHTS")
print("="*50)

# Predict probabilities for all employees
final_scaler = StandardScaler()
X_final_scaled = final_scaler.fit_transform(X)

# Retrain model on all data for deployment
deployment_model = RandomForestClassifier(random_state=42)
deployment_model.fit(X_final_scaled, y)

# Get risk scores for all employees
attrition_probs = deployment_model.predict_proba(X_final_scaled)[:, 1]

# Create HR action list
hr_results = pd.DataFrame({
    'EmployeeNumber': df['EmployeeNumber'],
    'AttritionProbability': attrition_probs,
    'RiskLevel': pd.cut(attrition_probs, [0, 0.3, 0.7, 1], labels=['LOW', 'MEDIUM', 'HIGH']),
    'Department': df['Department'],
    'JobRole': df['JobRole'],
    'MonthlyIncome': df['MonthlyIncome'],
    'OverTime': df['OverTime']
})

# 1. TOP 30 AT-RISK EMPLOYEES
print("🚨 TOP 30 AT-RISK EMPLOYEES (IMMEDIATE ACTION NEEDED):")
top_30 = hr_results.nlargest(30, 'AttritionProbability')
print(top_30[['EmployeeNumber', 'AttritionProbability', 'Department', 'JobRole']].head(10).to_string(index=False))

# 2. HIGH PERFORMER ATTRITION RISK
high_performers = hr_results[hr_results['MonthlyIncome'] > hr_results['MonthlyIncome'].median()]
high_risk_high_performers = high_performers[high_performers['RiskLevel'] == 'HIGH']

print(f"\n💎 HIGH PERFORMERS AT RISK: {len(high_risk_high_performers)} employees")
print("These are your MOST VALUABLE at-risk employees:")
print(high_risk_high_performers[['EmployeeNumber', 'AttritionProbability', 'JobRole', 'MonthlyIncome']].head(10).to_string(index=False))

# 3. DEPARTMENT CRISIS LEVELS
print(f"\n📊 DEPARTMENT ATTRITION CRISIS LEVELS:")
dept_crisis = hr_results.groupby('Department').agg({
    'AttritionProbability': 'mean',
    'EmployeeNumber': 'count',
    'RiskLevel': lambda x: (x == 'HIGH').sum()
}).round(3)
dept_crisis.columns = ['AvgRisk', 'TotalEmployees', 'HighRiskCount']
dept_crisis['HighRisk%'] = (dept_crisis['HighRiskCount'] / dept_crisis['TotalEmployees'] * 100).round(1)
print(dept_crisis.sort_values('HighRisk%', ascending=False))

# 4. COST ANALYSIS
avg_replacement_cost = 50000
expected_losses = hr_results['AttritionProbability'].sum()
total_risk_cost = expected_losses * avg_replacement_cost

print(f"\n💰 FINANCIAL IMPACT ANALYSIS:")
print(f"Expected employees to leave: {expected_losses:.1f}")
print(f"Estimated replacement cost: ${total_risk_cost:,.0f}")
print(f"High-risk employees: {(hr_results['RiskLevel'] == 'HIGH').sum()}")



🎯 ACTIONABLE HR INSIGHTS
🚨 TOP 30 AT-RISK EMPLOYEES (IMMEDIATE ACTION NEEDED):
 EmployeeNumber  AttritionProbability             Department               JobRole
            614                  0.99                  Sales  Sales Representative
           1783                  0.98 Research & Development Laboratory Technician
            959                  0.97                  Sales  Sales Representative
           1273                  0.97                  Sales  Sales Representative
           1624                  0.97                  Sales  Sales Representative
            167                  0.95                  Sales  Sales Representative
            235                  0.95                  Sales  Sales Representative
           1331                  0.95                  Sales  Sales Representative
            622                  0.93 Research & Development Laboratory Technician
           1016                  0.93 Research & Development    Research Scientist

💎 HIGH

In [5]:
# IN notebooks/4_complete_ml_pipeline.ipynb - REPLACE THE SAVE LINE:

import os

# Create results directory if it doesn't exist
os.makedirs('../results', exist_ok=True)

# Save for HR
hr_results.to_csv('../results/hr_attrition_risk_list.csv', index=False)
print(f"\n💾 HR Action List saved: 'hr_attrition_risk_list.csv'")


💾 HR Action List saved: 'hr_attrition_risk_list.csv'
