In [1]:
# notebooks/4_complete_ml_pipeline.ipynb - FIXED VERSION
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

# Load data
df = pd.read_csv('../data/employee_attrition.csv')
df['Attrition'] = df['Attrition'].map({'Yes': 1, 'No': 0})

print("=== COMPLETE ML PIPELINE ===")

# 1. SELECT ONLY NUMERICAL FEATURES FOR CORRELATION
numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()
print(f"Numerical features: {len(numerical_features)}")

# Calculate correlation with attrition using only numerical features
correlation_with_attrition = df[numerical_features].corr()['Attrition'].abs().sort_values(ascending=False)
top_numerical_features = correlation_with_attrition[1:11].index.tolist()  # Top 10 numerical features
print("Top numerical features by correlation:", top_numerical_features)

# 2. ADD IMPORTANT CATEGORICAL FEATURES (from our previous analysis)
important_categorical = ['OverTime', 'BusinessTravel', 'Department', 'JobRole']
print("Important categorical features:", important_categorical)

# 3. COMBINE FEATURES
top_features = top_numerical_features + important_categorical
print(f"Final feature set ({len(top_features)}): {top_features}")

# 4. PREPARE DATA
X = df[top_features].copy()
y = df['Attrition']

# Encode categorical features
label_encoders = {}
for col in important_categorical:
    if col in X.columns:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])
        label_encoders[col] = le

# 5. SCALE FEATURES
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print(f"Data shape: {X_scaled.shape}")
print(f"Attrition distribution: {pd.Series(y).value_counts().to_dict()}")

# 6. HANDLE CLASS IMBALANCE WITH SMOTE
print("\n=== HANDLING CLASS IMBALANCE ===")
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X_scaled, y)
print(f"After SMOTE - Class distribution: {pd.Series(y_balanced).value_counts().to_dict()}")

# 7. TRAIN-TEST SPLIT
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42)

# 8. MODEL TRAINING WITH CROSS-VALIDATION
print("\n=== MODEL TRAINING & CROSS-VALIDATION ===")
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42)
}

cv_results = {}
for name, model in models.items():
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='f1')
    cv_results[name] = {
        'mean_f1': cv_scores.mean(),
        'std_f1': cv_scores.std()
    }
    print(f"{name}: F1 = {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")

# 9. TRAIN BEST MODEL ON FULL TRAINING SET
best_model_name = max(cv_results, key=lambda x: cv_results[x]['mean_f1'])
print(f"\n🎯 BEST MODEL: {best_model_name}")

best_model = models[best_model_name]
best_model.fit(X_train, y_train)

# 10. COMPREHENSIVE EVALUATION
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

print("\n=== COMPREHENSIVE EVALUATION ===")
print("Classification Report:")
print(classification_report(y_test, y_pred))

print(f"ROC-AUC Score: {roc_auc_score(y_test, y_pred_proba):.3f}")

# 11. FEATURE IMPORTANCE
print("\n🎯 FEATURE IMPORTANCE:")
if best_model_name == 'Logistic Regression':
    feature_importance = pd.DataFrame({
        'feature': top_features,
        'coefficient': best_model.coef_[0],
        'abs_impact': np.abs(best_model.coef_[0])
    }).sort_values('abs_impact', ascending=False)
else:
    feature_importance = pd.DataFrame({
        'feature': top_features,
        'coefficient': best_model.feature_importances_
    }).sort_values('coefficient', ascending=False)

print(feature_importance.head(10))

# 12. SAVE COMPLETE PIPELINE
import pickle
with open('../models/final_model.pkl', 'wb') as f:
    pickle.dump({
        'model': best_model,
        'scaler': scaler,
        'feature_names': top_features,
        'label_encoders': label_encoders,
        'performance': {
            'roc_auc': roc_auc_score(y_test, y_pred_proba),
            'cv_f1': cv_results[best_model_name]['mean_f1']
        }
    }, f)

print("💾 COMPLETE ML PIPELINE SAVED!")
print("✅ ALL ML PROCEDURES COMPLETED:")
print("   - Feature selection & engineering")
print("   - Feature scaling")
print("   - Class imbalance handling (SMOTE)")
print("   - Cross-validation")
print("   - Multiple evaluation metrics")
print("   - Model interpretation")

=== COMPLETE ML PIPELINE ===
Numerical features: 27
Top numerical features by correlation: ['TotalWorkingYears', 'JobLevel', 'YearsInCurrentRole', 'MonthlyIncome', 'Age', 'YearsWithCurrManager', 'StockOptionLevel', 'YearsAtCompany', 'JobInvolvement', 'JobSatisfaction']
Important categorical features: ['OverTime', 'BusinessTravel', 'Department', 'JobRole']
Final feature set (14): ['TotalWorkingYears', 'JobLevel', 'YearsInCurrentRole', 'MonthlyIncome', 'Age', 'YearsWithCurrManager', 'StockOptionLevel', 'YearsAtCompany', 'JobInvolvement', 'JobSatisfaction', 'OverTime', 'BusinessTravel', 'Department', 'JobRole']
Data shape: (1470, 14)
Attrition distribution: {0: 1233, 1: 237}

=== HANDLING CLASS IMBALANCE ===
After SMOTE - Class distribution: {1: 1233, 0: 1233}

=== MODEL TRAINING & CROSS-VALIDATION ===
Logistic Regression: F1 = 0.748 (+/- 0.037)
Random Forest: F1 = 0.918 (+/- 0.012)

🎯 BEST MODEL: Random Forest

=== COMPREHENSIVE EVALUATION ===
Classification Report:
              precisi