In [2]:
# notebooks/4_honest_analytics.ipynb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import os

# Load data
df = pd.read_csv('../data/employee_attrition.csv')
print("=== HONEST HR ANALYTICS: DESCRIPTIVE + DIAGNOSTIC ===")

# 1. DESCRIPTIVE: ATTRITION PATTERNS
print("\n🔍 DESCRIPTIVE ANALYTICS: Where is attrition happening?")
print("="*50)

# Department attrition rates
dept_attrition = df.groupby('Department').agg({
    'Attrition': lambda x: (x == 'Yes').mean() * 100,
    'EmployeeNumber': 'count'
}).round(1)
dept_attrition.columns = ['AttritionRate%', 'EmployeeCount']
print("Department Attrition Rates:")
print(dept_attrition.sort_values('AttritionRate%', ascending=False))

# Job Role analysis
role_attrition = df.groupby('JobRole').agg({
    'Attrition': lambda x: (x == 'Yes').mean() * 100,
    'MonthlyIncome': 'median'
}).round(1)
role_attrition.columns = ['AttritionRate%', 'MedianIncome']
print("\nJob Role Analysis (Top 5 Highest Attrition):")
print(role_attrition.nlargest(5, 'AttritionRate%'))

# 2. DIAGNOSTIC: WHY are people leaving?
print("\n🔎 DIAGNOSTIC ANALYTICS: Why are people leaving?")
print("="*50)

# Compare leavers vs stayers - handle numerical and categorical separately
leavers = df[df['Attrition'] == 'Yes']
stayers = df[df['Attrition'] == 'No']

print("NUMERICAL DIFFERENCES (Leavers vs Stayers):")
numerical_comparison = pd.DataFrame({
    'Leavers': leavers.select_dtypes(include=[np.number]).mean(),
    'Stayers': stayers.select_dtypes(include=[np.number]).mean()
}).round(1)

key_numerical = ['MonthlyIncome', 'JobSatisfaction', 'WorkLifeBalance', 
                 'YearsAtCompany', 'YearsSinceLastPromotion', 'StockOptionLevel']
print(numerical_comparison.loc[key_numerical])

print("\nCATEGORICAL DIFFERENCES:")
print(f"Overtime - Leavers: {(leavers['OverTime'] == 'Yes').mean()*100:.1f}% vs Stayers: {(stayers['OverTime'] == 'Yes').mean()*100:.1f}%")
print(f"Business Travel - Leavers: {leavers['BusinessTravel'].value_counts(normalize=True).iloc[0]*100:.1f}% vs Stayers: {stayers['BusinessTravel'].value_counts(normalize=True).iloc[0]*100:.1f}%")

# 3. SEGMENTATION: Employee clusters
print("\n👥 EMPLOYEE SEGMENTATION ANALYSIS")
print("="*50)

# Prepare data for clustering
cluster_features = ['MonthlyIncome', 'JobSatisfaction', 'WorkLifeBalance', 
                    'YearsAtCompany', 'PerformanceRating']
X_cluster = df[cluster_features].copy()

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_cluster)

# Apply clustering
kmeans = KMeans(n_clusters=4, random_state=42)
df['Cluster'] = kmeans.fit_predict(X_scaled)

# Analyze clusters
cluster_analysis = df.groupby('Cluster').agg({
    'MonthlyIncome': 'median',
    'JobSatisfaction': 'median', 
    'WorkLifeBalance': 'median',
    'OverTime': lambda x: (x == 'Yes').mean() * 100,
    'Attrition': lambda x: (x == 'Yes').mean() * 100,
    'EmployeeNumber': 'count'
}).round(1)

cluster_analysis.columns = ['MedIncome', 'MedJobSat', 'MedWorkLife', 'Overtime%', 'Attrition%', 'Count']
print("Employee Segments:")
print(cluster_analysis.sort_values('Attrition%', ascending=False))

# 4. ACTIONABLE INSIGHTS
print("\n🎯 ACTIONABLE BUSINESS INSIGHTS")
print("="*50)

# Insight 1: Compensation gap
income_gap = leavers['MonthlyIncome'].median() - stayers['MonthlyIncome'].median()
print(f"🚨 COMPENSATION CRISIS: Leavers earn ${abs(income_gap):.0f} LESS than stayers")

# Insight 2: Overtime impact  
overtime_attrition = df.groupby('OverTime')['Attrition'].apply(
    lambda x: (x == 'Yes').mean() * 100
).round(1)
print(f"🔥 OVERTIME BURNOUT: {overtime_attrition['Yes']}% of overtime workers leave vs {overtime_attrition['No']}%")

# Insight 3: Promotion stagnation
promotion_gap = leavers['YearsSinceLastPromotion'].median() - stayers['YearsSinceLastPromotion'].median()
print(f"📈 CAREER STAGNATION: Leavers wait {promotion_gap:.1f} years longer for promotions")

# Insight 4: High-risk segments
high_risk_cluster = cluster_analysis['Attrition%'].idxmax()
high_risk_attrs = cluster_analysis.loc[high_risk_cluster]
print(f"👥 HIGH-RISK SEGMENT: Cluster {high_risk_cluster} has {high_risk_attrs['Attrition%']}% attrition")

# Create results directory and save
os.makedirs('../results', exist_ok=True)
insights_df = df[['EmployeeNumber', 'Department', 'JobRole', 'MonthlyIncome', 
                  'OverTime', 'JobSatisfaction', 'WorkLifeBalance', 'Cluster', 'Attrition']].copy()
insights_df.to_csv('../results/hr_analytics_insights.csv', index=False)


=== HONEST HR ANALYTICS: DESCRIPTIVE + DIAGNOSTIC ===

🔍 DESCRIPTIVE ANALYTICS: Where is attrition happening?
Department Attrition Rates:
                        AttritionRate%  EmployeeCount
Department                                           
Sales                             20.6            446
Human Resources                   19.0             63
Research & Development            13.8            961

Job Role Analysis (Top 5 Highest Attrition):
                       AttritionRate%  MedianIncome
JobRole                                            
Sales Representative             39.8        2579.0
Laboratory Technician            23.9        2886.0
Human Resources                  23.1        3093.0
Sales Executive                  17.5        6231.0
Research Scientist               16.1        2887.5

🔎 DIAGNOSTIC ANALYTICS: Why are people leaving?
NUMERICAL DIFFERENCES (Leavers vs Stayers):
                         Leavers  Stayers
MonthlyIncome             4787.1   6832.7
JobSa



Employee Segments:
         MedIncome  MedJobSat  MedWorkLife  Overtime%  Attrition%  Count
Cluster                                                                 
2           4385.0        1.0          3.0       25.6        20.3    403
3           4581.0        3.0          3.0       28.8        17.2    215
1           4382.0        4.0          3.0       29.9        15.7    645
0          16437.0        3.0          3.0       28.0         8.2    207

🎯 ACTIONABLE BUSINESS INSIGHTS
🚨 COMPENSATION CRISIS: Leavers earn $2002 LESS than stayers
🔥 OVERTIME BURNOUT: 30.5% of overtime workers leave vs 10.4%
📈 CAREER STAGNATION: Leavers wait 0.0 years longer for promotions
👥 HIGH-RISK SEGMENT: Cluster 2 has 20.3% attrition


In [5]:
# === FIXED: SAVE INSIGHTS FOR FLASK DASHBOARD ===
print("\n💾 SAVING INSIGHTS FOR FLASK DASHBOARD...")

import pickle
import os

# Create results directory if it doesn't exist
os.makedirs('../results', exist_ok=True)

# FIXED Department analysis - preserve department names
dept_attrition = df.groupby('Department').agg({
    'Attrition': lambda x: (x == 'Yes').mean() * 100,
    'EmployeeNumber': 'count'
}).round(1)
dept_attrition.columns = ['AttritionRate%', 'EmployeeCount']
dept_attrition = dept_attrition.reset_index()  # ✅ FIX: Keep Department as column

print("FIXED dept_attrition columns:", dept_attrition.columns.tolist())

# Prepare insights dictionary
hr_insights = {
    'key_metrics': {
        'total_employees': len(df),
        'attrition_rate': (df['Attrition'] == 'Yes').mean() * 100,
        'avg_income': df['MonthlyIncome'].mean(),
        'overtime_rate': (df['OverTime'] == 'Yes').mean() * 100,
        'salary_gap': abs(leavers['MonthlyIncome'].median() - stayers['MonthlyIncome'].median())
    },
    'department_analysis': dept_attrition.to_dict('records'),
    'high_risk_roles': role_attrition.nlargest(5, 'AttritionRate%').to_dict('records'),
    'employee_clusters': cluster_analysis.reset_index().to_dict('records'),
    'overtime_impact': {
        'overtime_yes': overtime_attrition['Yes'],
        'overtime_no': overtime_attrition['No']
    }
}

# Save insights
with open('../results/hr_insights.pkl', 'wb') as f:
    pickle.dump(hr_insights, f)

print("✅ FIXED Insights saved to 'results/hr_insights.pkl'")


💾 SAVING INSIGHTS FOR FLASK DASHBOARD...
FIXED dept_attrition columns: ['Department', 'AttritionRate%', 'EmployeeCount']
✅ FIXED Insights saved to 'results/hr_insights.pkl'
