<a href="https://colab.research.google.com/github/sivashankariramanimohan/employee_turnover_analysis/blob/main/Employee_Turnover_Analytics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Employee Turnover Analytics - Complete Solution
# Portobello Tech - ML Course-End Project

In [None]:
# Import necessary libraries
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
warnings.filterwarnings('ignore')

# Mount Google Drive (Colab only)
# `google.colab` is only importable inside a Colab runtime. Guard the import so
# the notebook does not die with ModuleNotFoundError when run anywhere else.
try:
    from google.colab import drive
except ImportError:
    drive = None

if drive is not None:
    drive.mount('/content/drive')
    print("Google Drive mounted successfully!")
else:
    # Outside Colab there is nothing to mount; data_path in the loading cell
    # must then point at a locally available copy of the CSV.
    print("google.colab not available - skipping Google Drive mount.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive mounted successfully!


# STEP 1: DATA LOADING AND QUALITY CHECKS

In [None]:
# Load the dataset
data_path = '/content/drive/My Drive/Simplilearn_project/HR_comma_sep.csv'
df = pd.read_csv(data_path)

print("Dataset loaded successfully!")
print(f"Dataset shape: {df.shape}")
print("\nFirst 5 rows:")
print(df.head())

print("\nDataset Info:")
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() would append a stray "None" line to the output.
df.info()

print("\nDataset Description:")
print(df.describe())

# 1. Data Quality Checks - Missing Values
print("\n" + "="*50)
print("STEP 1: DATA QUALITY CHECKS")
print("="*50)

print("\nMissing Values Check:")
missing_values = df.isnull().sum()
print(missing_values)

if missing_values.sum() == 0:
    print("✓ No missing values found in the dataset!")
else:
    print("⚠ Missing values detected!")

# Check for duplicates (reported only; rows are not dropped here)
duplicates = df.duplicated().sum()
print(f"\nDuplicate rows: {duplicates}")

# Basic statistics
# 'left' is summed as a 0/1 flag, so its sum is the number of leavers and its
# mean is the turnover rate.
total_employees = len(df)
employees_left = df['left'].sum()
print("\nBasic Dataset Statistics:")
print(f"Total employees: {total_employees}")
print(f"Employees who left: {employees_left}")
print(f"Employees who stayed: {total_employees - employees_left}")
print(f"Turnover rate: {df['left'].mean():.2%}")

Dataset loaded successfully!
Dataset shape: (14999, 10)

First 5 rows:
   satisfaction_level  last_evaluation  number_project  average_montly_hours  \
0                0.38             0.53               2                   157   
1                0.80             0.86               5                   262   
2                0.11             0.88               7                   272   
3                0.72             0.87               5                   223   
4                0.37             0.52               2                   159   

   time_spend_company  Work_accident  left  promotion_last_5years  sales  \
0                   3              0     1                      0  sales   
1                   6              0     1                      0  sales   
2                   4              0     1                      0  sales   
3                   5              0     1                      0  sales   
4                   3              0     1                      0  s

# STEP 2: EXPLORATORY DATA ANALYSIS (EDA)

In [None]:
print("\n" + "="*50)
print("STEP 2: EXPLORATORY DATA ANALYSIS")
print("="*50)

# 2.1 Correlation Heatmap
plt.figure(figsize=(12, 8))
numerical_cols = df.select_dtypes(include=[np.number]).columns
correlation_matrix = df[numerical_cols].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
            square=True, linewidths=0.5)
plt.title('Correlation Matrix Heatmap - Numerical Features')
plt.tight_layout()
plt.show()

# 2.2 Distribution Plots
# One histogram panel per feature: (column, bar colour, title, x-axis label).
distribution_specs = [
    # Employee Satisfaction Distribution
    ('satisfaction_level', 'skyblue',
     'Distribution of Employee Satisfaction Level', 'Satisfaction Level'),
    # Employee Evaluation Distribution
    ('last_evaluation', 'lightgreen',
     'Distribution of Employee Last Evaluation', 'Last Evaluation Score'),
    # Average Monthly Hours Distribution
    ('average_montly_hours', 'salmon',
     'Distribution of Average Monthly Hours', 'Average Monthly Hours'),
]

fig, axes = plt.subplots(1, len(distribution_specs), figsize=(18, 5))
for ax, (column, color, title, xlabel) in zip(axes, distribution_specs):
    ax.hist(df[column], bins=30, alpha=0.7, color=color, edgecolor='black')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

# 2.3 Bar Plot - Project Count by Employee Status
# DataFrame.plot() opens its own figure unless it is handed an Axes, which would
# leave an empty figure behind; create the Axes once and draw into it.
fig, ax = plt.subplots(figsize=(10, 6))
# fill_value=0 keeps project counts that occur for only one status as 0, not NaN.
project_left = df.groupby(['number_project', 'left']).size().unstack(fill_value=0)
project_left.plot(kind='bar', color=['skyblue', 'salmon'], ax=ax, rot=0)
ax.set_title('Employee Project Count Distribution (Left vs Stayed)')
ax.set_xlabel('Number of Projects')
ax.set_ylabel('Number of Employees')
ax.legend(['Stayed', 'Left'])
plt.tight_layout()
plt.show()

# Insights from project count analysis
# Per project count: head-count, number of leavers, and share of leavers.
print("\nInsights from Project Count Analysis:")
project_analysis = df.groupby('number_project')['left'].agg(['count', 'sum', 'mean'])
project_analysis.columns = ['Total_Employees', 'Employees_Left', 'Turnover_Rate']
print(project_analysis)


SyntaxError: invalid syntax (<ipython-input-8-1926626817>, line 30)

# STEP 3: CLUSTERING ANALYSIS

In [None]:
print("\n" + "="*50)
print("STEP 3: CLUSTERING ANALYSIS")
print("="*50)

# 3.1 & 3.2 K-means Clustering for employees who left
cluster_features = ['satisfaction_level', 'last_evaluation']
left_employees = df.loc[df['left'] == 1, cluster_features].copy()
print(f"Number of employees who left: {len(left_employees)}")

# Perform K-means clustering with 3 clusters
# n_init is pinned so the result does not depend on the scikit-learn version's
# default (the accompanying FutureWarning is hidden by the global warnings filter).
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
# Fit on the two feature columns only, so the label column can never leak into
# the features.
left_employees['cluster'] = kmeans.fit_predict(left_employees[cluster_features])

# Visualize clusters
plt.figure(figsize=(12, 8))
colors = ['red', 'blue', 'green']
for i in range(3):
    cluster_data = left_employees[left_employees['cluster'] == i]
    plt.scatter(cluster_data['satisfaction_level'], cluster_data['last_evaluation'],
                c=colors[i], label=f'Cluster {i}', alpha=0.6)

# Centroid columns follow the order of cluster_features: satisfaction, evaluation.
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
            c='black', marker='x', s=200, linewidths=3, label='Centroids')
plt.xlabel('Satisfaction Level')
plt.ylabel('Last Evaluation')
plt.title('K-means Clustering of Employees Who Left (3 Clusters)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# 3.3 Cluster Analysis
print("\nCluster Analysis for Employees Who Left:")
cluster_stats = left_employees.groupby('cluster').agg({
    'satisfaction_level': ['mean', 'std'],
    'last_evaluation': ['mean', 'std']
}).round(3)
print(cluster_stats)

print("\nCluster Interpretations:")
for i in range(3):
    cluster_data = left_employees[left_employees['cluster'] == i]
    avg_satisfaction = cluster_data['satisfaction_level'].mean()
    avg_evaluation = cluster_data['last_evaluation'].mean()
    print(f"\nCluster {i} ({len(cluster_data)} employees):")
    print(f"  - Average Satisfaction: {avg_satisfaction:.3f}")
    print(f"  - Average Evaluation: {avg_evaluation:.3f}")

    # Label each cluster from its mean satisfaction / evaluation.
    if avg_satisfaction < 0.5 and avg_evaluation < 0.6:
        print("  - Profile: Low performers with low satisfaction")
    elif avg_satisfaction < 0.5 and avg_evaluation > 0.7:
        print("  - Profile: High performers with low satisfaction (burned out)")
    elif avg_satisfaction >= 0.5 and avg_evaluation > 0.7:
        # Satisfied high performers previously fell through to "Mixed".
        print("  - Profile: High performers with high satisfaction")
    else:
        print("  - Profile: Mixed performance and satisfaction levels")


# STEP 4: DATA PREPROCESSING AND CLASS IMBALANCE HANDLING

In [None]:
print("\n" + "="*50)
print("STEP 4: DATA PREPROCESSING AND SMOTE")
print("="*50)

# 4.1 Preprocessing - Convert categorical to numerical
df_processed = df.copy()

# First, let's check the actual column names
print("Actual column names in dataset:")
print(df_processed.columns.tolist())

# Identify categorical and numerical columns based on actual data
categorical_cols = df_processed.select_dtypes(include=['object']).columns.tolist()
numerical_cols = df_processed.select_dtypes(include=[np.number]).columns.tolist()

print(f"\nCategorical columns: {categorical_cols}")
print(f"Numerical columns: {numerical_cols}")

# Apply get_dummies to categorical variables
if len(categorical_cols) > 0:
    df_categorical = pd.get_dummies(df_processed[categorical_cols], prefix=categorical_cols)
    df_numerical = df_processed[numerical_cols]
else:
    # If no categorical columns detected, check for specific columns that should be categorical
    potential_categorical = [
        col for col in df_processed.columns
        if col in ['sales', 'department', 'Department', 'salary']
        or df_processed[col].dtype == 'object'
    ]

    if potential_categorical:
        print(f"Found potential categorical columns: {potential_categorical}")
        df_categorical = pd.get_dummies(df_processed[potential_categorical], prefix=potential_categorical)
        df_numerical = df_processed[[col for col in df_processed.columns if col not in potential_categorical]]
    else:
        print("No categorical columns found. Using all columns as numerical.")
        df_categorical = pd.DataFrame()  # Empty dataframe
        df_numerical = df_processed

# Combine categorical and numerical variables
if not df_categorical.empty:
    df_final = pd.concat([df_numerical, df_categorical], axis=1)
else:
    df_final = df_numerical.copy()

print(f"\nFinal dataset shape after preprocessing: {df_final.shape}")
print(f"Final columns: {df_final.columns.tolist()}")

# Prepare features and target
X = df_final.drop('left', axis=1)
y = df_final['left']

print("\nClass distribution before SMOTE:")
print(f"Class 0 (Stayed): {(y == 0).sum()}")
print(f"Class 1 (Left): {(y == 1).sum()}")

# 4.2 Stratified train-test split
# stratify=y keeps the stayed/left ratio the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123, stratify=y
)

print(f"\nTrain set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

# 4.3 Apply SMOTE to training data
# Only the training split is resampled; the test split keeps its real class balance.
# NOTE(review): plain SMOTE treats the one-hot dummy columns as continuous -
# confirm whether SMOTENC would be more appropriate here.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print("\nClass distribution after SMOTE:")
print(f"Class 0 (Stayed): {(y_train_smote == 0).sum()}")
print(f"Class 1 (Left): {(y_train_smote == 1).sum()}")


# STEP 5: MODEL TRAINING AND CROSS-VALIDATION

In [None]:
print("\n" + "="*50)
print("STEP 5: MODEL TRAINING AND CROSS-VALIDATION")
print("="*50)

# Initialize models
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42, n_estimators=100),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42, n_estimators=100)
}

# 5-fold cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")

    # Cross-validation scores
    # SMOTE runs INSIDE the pipeline, so each fold oversamples only its own
    # training part and is scored on untouched validation rows. Scoring on the
    # pre-resampled X_train_smote would let synthetic neighbours of validation
    # rows into the training folds and inflate the scores.
    # cross_val_score clones the pipeline, so `model` itself is not fitted here.
    cv_pipeline = ImbPipeline([
        ('smote', SMOTE(random_state=42)),
        ('model', model),
    ])
    cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=cv, scoring='accuracy')
    cv_results[name] = cv_scores

    print(f"{name} CV Scores: {cv_scores}")
    print(f"{name} Mean CV Score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

    # Fit the final model on the SMOTE-balanced training split
    model.fit(X_train_smote, y_train_smote)

    # Predictions on test set
    y_pred = model.predict(X_test)

    print(f"\n{name} Classification Report:")
    print(classification_report(y_test, y_pred))

# STEP 6: MODEL EVALUATION AND BEST MODEL SELECTION

In [None]:
print("\n" + "="*50)
print("STEP 6: MODEL EVALUATION")
print("="*50)

# Train models and get predictions
model_results = {}
trained_models = {}

for name, model in models.items():
    # Train model (re-fitted here so this cell does not rely on an earlier fit)
    model.fit(X_train_smote, y_train_smote)
    trained_models[name] = model

    # Predictions: hard labels plus the probability of class 1 (left)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # Calculate metrics
    auc_score = roc_auc_score(y_test, y_pred_proba)
    # labels=[0, 1] pins the matrix to 2x2 so the tn/fp/fn/tp unpacking below
    # cannot fail when a model predicts a single class.
    conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])

    model_results[name] = {
        'predictions': y_pred,
        'probabilities': y_pred_proba,
        'auc_score': auc_score,
        'confusion_matrix': conf_matrix
    }

# 6.1 ROC Curves
plt.figure(figsize=(12, 8))
for name, results in model_results.items():
    fpr, tpr, _ = roc_curve(y_test, results['probabilities'])
    plt.plot(fpr, tpr, label=f'{name} (AUC = {results["auc_score"]:.3f})')

plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for All Models')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# 6.2 Confusion Matrices
# One panel per evaluated model; squeeze=False keeps `axes` two-dimensional even
# for a single model, so the grid follows the size of the `models` dict.
n_models = len(model_results)
fig, axes = plt.subplots(1, n_models, figsize=(6 * n_models, 5), squeeze=False)
for ax, (name, results) in zip(axes[0], model_results.items()):
    sns.heatmap(results['confusion_matrix'], annot=True, fmt='d',
                cmap='Blues', ax=ax)
    ax.set_title(f'{name} - Confusion Matrix')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
plt.tight_layout()
plt.show()

# Print detailed metrics
print("\nDetailed Model Performance:")
print("-" * 60)
for name, results in model_results.items():
    # Row-major order of the 2x2 matrix: [[tn, fp], [fn, tp]].
    tn, fp, fn, tp = results['confusion_matrix'].ravel()
    # Each ratio falls back to 0 when its denominator is empty.
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    print(f"\n{name}:")
    print(f"  AUC Score: {results['auc_score']:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall: {recall:.4f}")
    print(f"  F1-Score: {f1:.4f}")

# 6.3 Best Model Selection
# NOTE(review): the winner is chosen by test-set AUC, while the justification
# printed below argues for recall - confirm which metric should drive selection.
best_model_name = max(model_results.keys(), key=lambda x: model_results[x]['auc_score'])
best_model = trained_models[best_model_name]
best_auc = model_results[best_model_name]['auc_score']

print(f"\n🏆 BEST MODEL: {best_model_name}")
print(f"   AUC Score: {best_auc:.4f}")

print("\n6.3 Evaluation Metric Justification:")
print("For employee turnover prediction, RECALL is more important than Precision because:")
print("- We want to identify as many employees likely to leave as possible (minimize false negatives)")
print("- Missing a potential departure (false negative) is costlier than false alarms (false positive)")
print("- HR can afford to implement retention strategies for some false positives")
print("- The cost of employee turnover is much higher than the cost of unnecessary retention efforts")


# STEP 7: RETENTION STRATEGIES

In [None]:
# Section banner: blank line, rule, title, rule - emitted in a single call.
print("\n" + "=" * 50, "STEP 7: RETENTION STRATEGIES", "=" * 50, sep="\n")

# 7.1 Reuse the winning model's stored test-set probabilities from model_results
test_probabilities = model_results[best_model_name]['probabilities']

# 7.2 Categorize employees into risk zones
def categorize_risk(prob):
    """Return the name of the risk zone that a turnover probability falls into.

    Zones are half-open intervals on the probability:
    below 0.2 -> Safe (Green), below 0.6 -> Low-Risk (Yellow),
    below 0.9 -> Medium-Risk (Orange), anything else -> High-Risk (Red).
    """
    # (exclusive upper bound, zone label), checked in ascending order.
    zone_upper_bounds = (
        (0.2, 'Safe Zone (Green)'),
        (0.6, 'Low-Risk Zone (Yellow)'),
        (0.9, 'Medium-Risk Zone (Orange)'),
    )
    for upper_bound, zone_label in zone_upper_bounds:
        if prob < upper_bound:
            return zone_label
    # No bound matched: the value is at or above the last threshold.
    return 'High-Risk Zone (Red)'

# Create results dataframe
# Employee_ID carries the row label of each test-set employee, so a risk zone
# can be traced back to the source row (a positional counter could not).
results_df = pd.DataFrame({
    'Employee_ID': y_test.index.to_numpy(),
    'Actual_Left': y_test.values,
    'Predicted_Probability': test_probabilities,
    'Risk_Zone': [categorize_risk(p) for p in test_probabilities]
})

# Count employees in each zone
zone_counts = results_df['Risk_Zone'].value_counts()
print("Employee Distribution by Risk Zone:")
print(zone_counts)

# Visualization of risk zones
plt.figure(figsize=(12, 8))
zone_colors = {'Safe Zone (Green)': 'green', 'Low-Risk Zone (Yellow)': 'yellow',
               'Medium-Risk Zone (Orange)': 'orange', 'High-Risk Zone (Red)': 'red'}

# Walk the zones in severity order (stable legend) and skip empty ones.
for zone, color in zone_colors.items():
    if zone not in zone_counts.index:
        continue
    zone_data = results_df[results_df['Risk_Zone'] == zone]
    # x is the employee's position in the test set; restarting at 0 for every
    # zone would stack the zones on top of each other.
    plt.scatter(zone_data.index, zone_data['Predicted_Probability'],
                c=color, label=zone, alpha=0.6)

# Dashed lines mark the zone boundaries used by categorize_risk.
plt.axhline(y=0.2, color='green', linestyle='--', alpha=0.5)
plt.axhline(y=0.6, color='yellow', linestyle='--', alpha=0.5)
plt.axhline(y=0.9, color='orange', linestyle='--', alpha=0.5)
plt.xlabel('Employee Index')
plt.ylabel('Turnover Probability')
plt.title('Employee Risk Zone Classification')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# Retention Strategies
print("\n" + "="*60)
print("RETENTION STRATEGIES BY RISK ZONE")
print("="*60)

strategies = {
    'Safe Zone (Green)': [
        "✅ Maintain current engagement levels",
        "✅ Regular check-ins to ensure continued satisfaction",
        "✅ Recognition and appreciation programs",
        "✅ Career development discussions during annual reviews"
    ],

    'Low-Risk Zone (Yellow)': [
        "⚠️ Quarterly one-on-one meetings with managers",
        "⚠️ Skills development and training opportunities",
        "⚠️ Flexible work arrangements consideration",
        "⚠️ Team building activities and social events",
        "⚠️ Monitor workload and project assignments"
    ],

    'Medium-Risk Zone (Orange)': [
        "🔶 Immediate manager intervention and support",
        "🔶 Comprehensive career planning sessions",
        "🔶 Salary and benefits review",
        "🔶 Mentorship program enrollment",
        "🔶 Workload redistribution if overworked",
        "🔶 Consider role changes or lateral moves",
        "🔶 Monthly satisfaction surveys"
    ],

    'High-Risk Zone (Red)': [
        "🚨 URGENT: Senior management involvement",
        "🚨 Exit interview preparation to understand concerns",
        "🚨 Immediate salary and benefits adjustment consideration",
        "🚨 Special project assignments to re-engage",
        "🚨 Fast-track promotion evaluation",
        "🚨 Personal development plan with clear milestones",
        "🚨 Weekly check-ins with HR and management",
        "🚨 Counter-offer preparation if applicable"
    ]
}

for zone, strategy_list in strategies.items():
    # A zone with no employees is still listed, with a count of 0.
    count = zone_counts.get(zone, 0)
    print(f"\n{zone} - {count} employees")
    print("-" * 40)
    for strategy in strategy_list:
        print(f"  {strategy}")

# Feature importance (only for models that expose feature_importances_)
if hasattr(best_model, 'feature_importances_'):
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)

    print("\n" + "="*50)
    print(f"FEATURE IMPORTANCE ({best_model_name})")
    print("="*50)
    print(feature_importance.head(10))

    # Plot feature importance
    plt.figure(figsize=(12, 8))
    top_features = feature_importance.head(10)
    plt.barh(range(len(top_features)), top_features['importance'])
    plt.yticks(range(len(top_features)), top_features['feature'])
    plt.xlabel('Feature Importance')
    plt.title(f'Top 10 Feature Importances - {best_model_name}')
    # Largest importance at the top of the chart.
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

print("\n" + "="*60)
print("PROJECT SUMMARY")
print("="*60)
print(f"✅ Dataset processed: {df.shape[0]} employees")
# Report what the data actually shows instead of asserting it unconditionally.
missing_total = int(df.isnull().sum().sum())
if missing_total == 0:
    print("✅ No missing values detected")
else:
    print(f"⚠ {missing_total} missing values detected")
print("✅ Class imbalance handled using SMOTE")
# Model and fold counts are read from the objects that were actually used.
print(f"✅ {len(models)} models trained and evaluated with {cv.get_n_splits()}-fold CV")
print(f"✅ Best model: {best_model_name} (AUC: {best_auc:.4f})")
print(f"✅ {len(results_df)} employees categorized into risk zones")
print("✅ Retention strategies provided for all risk categories")
print("\n🎯 Ready for deployment and employee retention implementation!")