<a href="https://colab.research.google.com/github/sathsara1/emplyee-retain-period-analyzer/blob/random-forests/randomForests.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import warnings
import mlcroissant as mlc


# Suppress every warning for the rest of the session (keeps notebook output
# compact, but also hides deprecation and numerical warnings).
warnings.filterwarnings('ignore')

# Global plot styling: matplotlib's 'seaborn-v0_8' style sheet and seaborn's
# "husl" colour palette are applied to all figures created afterwards.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

In [24]:
def load_and_explore_data():
    """Fetch the employee attrition dataset and print a short overview.

    The dataset is read through its Croissant description; only the first
    record set it declares is materialised into a DataFrame. After loading,
    the first rows, the columns that contain missing values, and the
    ``describe()`` summary are printed.

    Returns:
        The loaded ``pandas.DataFrame``, or ``None`` if anything raised while
        loading (the error message is printed in that case).
    """
    source_url = 'https://www.kaggle.com/datasets/stealthtechnologies/employee-attrition-dataset/croissant/download'

    try:
        dataset = mlc.Dataset(source_url)
        first_record_set = dataset.metadata.record_sets[0]
        frame = pd.DataFrame(dataset.records(record_set=first_record_set.uuid))
    except Exception as error:
        # Best-effort load: report the problem and let the caller decide.
        print(f"Error loading dataset from Kaggle: {error!s}")
        return None

    print("\n First 5 rows of the dataset:")
    print(frame.head())

    # Only columns that actually have gaps are listed.
    print("\n Missing values per column:")
    null_counts = frame.isnull().sum()
    print(null_counts.loc[null_counts > 0])

    print(frame.describe())

    return frame

In [None]:
def preprocess_data(df):
    """Return a cleaned, fully numeric copy of ``df`` plus the fitted encoders.

    Steps, in order:
      1. Drop the columns 'EmployeeNumber', 'Over18', 'StandardHours' and
         'EmployeeCount' when they are present.
      2. If any value is missing, fill numeric columns with their median and
         object columns with their most frequent value.
      3. Replace every object column with integer codes from a
         ``LabelEncoder`` fitted on that column.

    Args:
        df: Raw input frame; it is not modified.

    Returns:
        ``(processed_frame, label_encoders)`` where ``label_encoders`` maps
        each encoded column name to its fitted ``LabelEncoder``.
    """
    cleaned = df.copy()

    droppable = ('EmployeeNumber', 'Over18', 'StandardHours', 'EmployeeCount')
    present = [name for name in droppable if name in cleaned.columns]
    if present:
        cleaned = cleaned.drop(columns=present)

    if cleaned.isnull().any().any():
        numeric_names = cleaned.select_dtypes(include=[np.number]).columns
        medians = cleaned[numeric_names].median()
        cleaned[numeric_names] = cleaned[numeric_names].fillna(medians)

        for name in cleaned.select_dtypes(include=['object']).columns:
            # NOTE(review): mode() is empty for an all-NaN column, so [0]
            # would raise there — confirm such columns cannot occur.
            most_frequent = cleaned[name].mode()[0]
            cleaned[name] = cleaned[name].fillna(most_frequent)

    encoders = {}
    for name in cleaned.select_dtypes(include=['object']).columns:
        encoder = LabelEncoder()
        cleaned[name] = encoder.fit_transform(cleaned[name])
        encoders[name] = encoder

    return cleaned, encoders


In [26]:
def create_longevity_target(df):
    """Add a synthetic ``LongevityScore`` column (scaled to 0-10) to a copy of ``df``.

    The raw score is a weighted sum of whichever of these columns exist
    (the first matching name of each pair is used):

      * job satisfaction      x 0.3
      * work-life balance     x 0.2
      * performance rating    x 0.1
      * years at company      x 0.1
      * company tenure        x 0.1
      * number of promotions  x 0.1
      * 0.3 bonus for rows whose attrition flag is not 1

    The raw score is then min-max scaled to the range [0, 10]. When every
    row has the same raw score (for example none of the columns above are
    present) the scaled score is 0 for all rows instead of NaN, and an empty
    frame is returned with an empty score column.

    NOTE(review): the attrition bonus assumes the value 1 marks an employee
    who left. If the column was label-encoded from text, the codes depend on
    the alphabetical order of the labels — confirm against the encoder.

    Args:
        df: Numeric (already encoded) employee frame; it is not modified.

    Returns:
        A copy of ``df`` with the extra ``LongevityScore`` column.
    """
    df_with_target = df.copy()
    columns = df_with_target.columns

    # (candidate column names in priority order, weight in the raw score)
    weighted_features = (
        (('test.csv/Job+Satisfaction', 'JobSatisfaction'), 0.3),
        (('test.csv/Work-Life+Balance', 'WorkLifeBalance'), 0.2),
        (('test.csv/Performance+Rating', 'PerformanceRating'), 0.1),
        (('test.csv/Years+at+Company', 'YearsAtCompany'), 0.1),
        (('test.csv/Company+Tenure',), 0.1),
        (('test.csv/Number+of+Promotions',), 0.1),
    )

    longevity_score = np.zeros(len(df_with_target))
    for candidates, weight in weighted_features:
        column = next((name for name in candidates if name in columns), None)
        if column is not None:
            # Convert to a plain array so the accumulation is positional and
            # does not depend on the frame's index.
            longevity_score += np.asarray(df_with_target[column] * weight, dtype=float)

    if 'test.csv/Attrition' in columns:
        attrition_binary = (df_with_target['test.csv/Attrition'] == 1).astype(int)
        longevity_score += np.asarray((1 - attrition_binary) * 0.3, dtype=float)
    elif 'Attrition' in columns:
        longevity_score += np.asarray((1 - df_with_target['Attrition']) * 0.3, dtype=float)

    if longevity_score.size:
        lowest = longevity_score.min()
        spread = longevity_score.max() - lowest
        if spread == 0:
            # Constant raw score: min-max scaling would be 0/0 -> NaN.
            longevity_score = np.zeros_like(longevity_score)
        else:
            longevity_score = (longevity_score - lowest) / spread * 10

    df_with_target['LongevityScore'] = longevity_score

    print("Longevity Score:")
    if longevity_score.size:
        # Summary statistics are undefined for an empty frame, so skip them.
        print(f"   Mean: {longevity_score.mean():.2f}")
        print(f"   Std: {longevity_score.std():.2f}")
        print(f"   Min: {longevity_score.min():.2f}")
        print(f"   Max: {longevity_score.max():.2f}")

    return df_with_target


In [27]:
def visualize_data(df):
    """Show an overview figure and a correlation heatmap for ``df``.

    The first figure is a 2x2 grid:
      * top-left: histogram of ``LongevityScore``;
      * top-right: job satisfaction vs. ``LongevityScore`` scatter;
      * bottom-left: mean ``LongevityScore`` per attrition group;
      * bottom-right: years at company vs. ``LongevityScore`` scatter.

    Each of the last three panels is drawn only when its column exists under
    either the 'test.csv/...' name or the plain name (the former wins);
    otherwise the panel is left empty. The second figure is an annotated
    correlation heatmap of all numeric columns.

    Args:
        df: Frame that contains a ``LongevityScore`` column.

    Returns:
        None. Both figures are displayed with ``plt.show()``.
    """

    def _first_present(*candidates):
        # First candidate column name that exists in df, else None.
        return next((name for name in candidates if name in df.columns), None)

    def _scatter_against_score(ax, column, label, color):
        # Scatter one feature against the longevity score on the given axes.
        ax.scatter(df[column], df['LongevityScore'], alpha=0.6, color=color)
        ax.set_title(f'{label} vs Longevity Score')
        ax.set_xlabel(label)
        ax.set_ylabel('Longevity Score')
        ax.grid(True, alpha=0.3)

    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle('Employee Data Analysis', fontsize=16, fontweight='bold')

    axes[0, 0].hist(df['LongevityScore'], bins=20, alpha=0.7, color='skyblue', edgecolor='black')
    axes[0, 0].set_title('Distribution of Longevity Scores')
    axes[0, 0].set_xlabel('Longevity Score')
    axes[0, 0].set_ylabel('Frequency')
    axes[0, 0].grid(True, alpha=0.3)

    satisfaction_col = _first_present('test.csv/Job+Satisfaction', 'JobSatisfaction')
    if satisfaction_col is not None:
        _scatter_against_score(axes[0, 1], satisfaction_col, 'Job Satisfaction', 'green')

    attrition_col = _first_present('test.csv/Attrition', 'Attrition')
    if attrition_col is not None:
        attrition_data = df.groupby(attrition_col)['LongevityScore'].mean()
        # Label and colour each bar from its group key rather than by position,
        # so a frame with a single attrition group (or unexpected keys) still
        # plots and is not mislabelled.
        # NOTE(review): 0 -> 'Stayed' / 1 -> 'Left' is the mapping the original
        # positional labels implied — confirm it matches the encoder.
        known_groups = {0: ('Stayed', 'lightgreen'), 1: ('Left', 'lightcoral')}
        bar_styles = [known_groups.get(key, (str(key), 'lightgray')) for key in attrition_data.index]
        bar_labels = [label for label, _ in bar_styles]
        bar_colors = [color for _, color in bar_styles]
        axes[1, 0].bar(bar_labels, attrition_data.values, color=bar_colors)
        axes[1, 0].set_title('Average Longevity Score by Attrition Status')
        axes[1, 0].set_ylabel('Average Longevity Score')
        axes[1, 0].grid(True, alpha=0.3)

    tenure_col = _first_present('test.csv/Years+at+Company', 'YearsAtCompany')
    if tenure_col is not None:
        _scatter_against_score(axes[1, 1], tenure_col, 'Years at Company', 'purple')

    plt.tight_layout()
    plt.show()

    plt.figure(figsize=(12, 8))
    correlation_matrix = df.select_dtypes(include=[np.number]).corr()
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
                square=True, fmt='.2f', cbar_kws={'shrink': 0.8})
    plt.title('Correlation Matrix of Numerical Features', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()

In [28]:
def train_random_forest_model(df):
    """Fit a random-forest regressor that predicts ``LongevityScore``.

    Every column except ``LongevityScore`` is used as a feature. The data is
    split 80/20 (``random_state=42``), a ``StandardScaler`` is fitted on the
    training split only and applied to both splits, and a
    ``RandomForestRegressor`` is trained on the scaled training data. MSE,
    R² and MAE for both splits, followed by 5-fold cross-validated R² on the
    training split, are printed.

    Args:
        df: Numeric frame that contains a ``LongevityScore`` column.

    Returns:
        ``(model, scaler, X_train, X_test, y_train, y_test, y_pred_test)``;
        the returned X splits are the unscaled frames.
    """
    target = df['LongevityScore']
    features = df.drop(columns=['LongevityScore'])

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    scaler = StandardScaler()
    train_matrix = scaler.fit_transform(X_train)
    test_matrix = scaler.transform(X_test)

    forest_settings = {
        'n_estimators': 100,
        'max_depth': 10,
        'min_samples_split': 5,
        'min_samples_leaf': 2,
        'random_state': 42,
        'n_jobs': -1,
    }
    rf_model = RandomForestRegressor(**forest_settings)
    rf_model.fit(train_matrix, y_train)

    train_predictions = rf_model.predict(train_matrix)
    y_pred_test = rf_model.predict(test_matrix)

    # Evaluate everything first, then report, so output is all-or-nothing.
    metric_table = (
        ("MSE", mean_squared_error),
        ("R²", r2_score),
        ("MAE", mean_absolute_error),
    )
    results = [
        (label, metric(y_train, train_predictions), metric(y_test, y_pred_test))
        for label, metric in metric_table
    ]

    print("\nMODEL PERFORMANCE:")
    for label, train_value, test_value in results:
        print(f"   Training {label}: {train_value:.4f}")
        print(f"   Test {label}: {test_value:.4f}")

    cv_scores = cross_val_score(rf_model, train_matrix, y_train, cv=5, scoring='r2')
    print(f"   Cross-validation R² scores: {cv_scores}")
    cv_mean = cv_scores.mean()
    cv_band = cv_scores.std() * 2
    print(f"   Mean CV R²: {cv_mean:.4f} (+/- {cv_band:.4f})")

    return rf_model, scaler, X_train, X_test, y_train, y_test, y_pred_test

In [16]:
def main():
    """Entry point of the notebook pipeline.

    Currently only loads the dataset via ``load_and_explore_data()`` and
    returns ``None``; if loading fails a hint is printed first. All later
    pipeline stages (preprocessing, target creation, visualisation, training,
    feature importance, evaluation, example prediction) are commented out
    below and therefore do not run.
    """

    df = load_and_explore_data()
    if df is None:
        # Loading failed (the loader already printed the error).
        print("Upload the dataset to continue")
        return

    # --- Disabled pipeline steps -------------------------------------------
    # NOTE(review): analyze_feature_importance, evaluate_model_performance and
    # predict_employee_longevity are not defined in the visible part of this
    # file — confirm they exist before re-enabling the steps below.

    # df_processed, label_encoders = preprocess_data(df)

    # df_with_target = create_longevity_target(df_processed)

    # visualize_data(df_with_target)

    # # Step 5: Train Random Forest model
    # model, scaler, X_train, X_test, y_train, y_test, y_pred = train_random_forest_model(df_with_target)

    # # Step 6: Analyze feature importance
    # feature_importance_df = analyze_feature_importance(model, X_train.columns)

    # # Step 7: Evaluate model performance
    # evaluate_model_performance(y_test, y_pred)

    # # Step 8: Example prediction
    # print("\n" + "="*60)
    # print("🔮 EXAMPLE PREDICTION")
    # print("="*60)

    # # Create a sample employee for prediction
    # sample_employee = X_test.iloc[0].to_dict()
    # print("📋 Sample employee features:")
    # for key, value in sample_employee.items():
    #     print(f"   {key}: {value}")

    # predicted_longevity = predict_employee_longevity(
    #     model, scaler, X_train.columns, sample_employee
    # )

    # print(f"\n✅ Actual longevity score: {y_test.iloc[0]:.2f}")
    # print(f"✅ Predicted longevity score: {predicted_longevity:.2f}")

    # print("\n🎉 ANALYSIS COMPLETE!")
    # print("="*80)

    # return model, scaler, feature_importance_df