In [267]:
# Import additional libraries
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PolynomialFeatures, RobustScaler
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc, mean_absolute_error
import seaborn as sns
import json
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, StackingRegressor
from sklearn.linear_model import LogisticRegression, LassoCV, RidgeCV
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score, r2_score, mean_squared_error, precision_score, recall_score
from sklearn.feature_selection import SelectFromModel
import matplotlib.pyplot as plt
import joblib
from datetime import datetime
import logging
import os
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from xgboost import XGBRegressor

In [268]:
def tune_xgboost_params(X_train, y_train):
    """Tune XGBoost hyperparameters with reduced parameter grid"""
    param_grid = {
        'max_depth': [4, 5],           # Reduced options
        'learning_rate': [0.05, 0.1],  # Removed slowest option
        'n_estimators': [200],         # Fixed value
        'min_child_weight': [3],       # Fixed value
        'subsample': [0.9],            # Fixed value
        'colsample_bytree': [0.9],     # Fixed value
        'gamma': [0.1],                # Fixed value
    }
    
    xgb = XGBRegressor(
        random_state=42,
        n_jobs=-1,  # Parallel processing
        tree_method='hist'  # Faster histogram-based algorithm
    )
    
    grid_search = GridSearchCV(
        estimator=xgb,
        param_grid=param_grid,
        cv=3,  # Reduced from 5 to 3
        scoring='r2',
        n_jobs=-1,
        verbose=1
    )
    
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_

In [269]:
# Load the dataset
def load_data(filepath):
    logging.info("Loading dataset from %s", filepath)
    if not os.path.exists(filepath):
        logging.error("Dataset file not found: %s", filepath)
        raise FileNotFoundError(f"Dataset file not found: {filepath}")

    df = pd.read_csv(filepath)

    required_columns = ["PerformanceRating", "MonthlyIncome", "EmployeeNumber", "Attrition", "JobSatisfaction", "YearsAtCompany", "YearsSinceLastPromotion"]
    for col in required_columns:
        if col not in df.columns:
            logging.error("Missing required column: %s", col)
            raise KeyError(f"Missing required column: {col}")

    if (df["MonthlyIncome"] <= 0).any():
        logging.error("MonthlyIncome contains non-positive values")
        raise ValueError("MonthlyIncome contains non-positive values")
    if (df["YearsAtCompany"] < 0).any():
        logging.error("YearsAtCompany contains negative values")
        raise ValueError("YearsAtCompany contains negative values")
    if (df["YearsSinceLastPromotion"] < 0).any():
        logging.error("YearsSinceLastPromotion contains negative values")
        raise ValueError("YearsSinceLastPromotion contains negative values")
    if not df["Attrition"].isin(["Yes", "No"]).all():
        logging.error("Attrition contains invalid values")
        raise ValueError("Attrition contains values other than 'Yes' or 'No'")

    logging.info("Dataset loaded successfully with %d rows and %d columns", len(df), len(df.columns))
    return df

# Update the file path to the absolute path of your dataset
filepath = r"c:\Users\Srimanth L\OneDrive\Desktop\C Programs\WA_Fn-UseC_-HR-Employee-Attrition.csv"

# Load the dataset
df = load_data(filepath)
df.head()  # Display the first few rows of the dataset

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [270]:
# Preprocess the data
def preprocess_data(df):
    logging.info("Preprocessing data")
    label_encoders = {}
    
    for column in df.select_dtypes(include=["object"]).columns:
        if column != "Attrition":
            le = LabelEncoder()
            df[column] = le.fit_transform(df[column])
            label_encoders[column] = le
            logging.info("Encoded categorical column: %s", column)

    df["Attrition"] = df["Attrition"].map({"Yes": 1, "No": 0})

    imputer = KNNImputer(n_neighbors=5)
    numeric_cols = df.select_dtypes(include=np.number).columns
    df[numeric_cols] = imputer.fit_transform(df[numeric_cols])
    logging.info("Imputed missing values using KNNImputer")
    
    return df, label_encoders

# Preprocess the data
df, encoders = preprocess_data(df)
df.head()  # Display the first few rows of the preprocessed dataset

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41.0,1.0,2.0,1102.0,2.0,1.0,2.0,1.0,1.0,1.0,...,1.0,80.0,0.0,8.0,0.0,1.0,6.0,4.0,0.0,5.0
1,49.0,0.0,1.0,279.0,1.0,8.0,1.0,1.0,1.0,2.0,...,4.0,80.0,1.0,10.0,3.0,3.0,10.0,7.0,1.0,7.0
2,37.0,1.0,2.0,1373.0,1.0,2.0,2.0,4.0,1.0,4.0,...,2.0,80.0,0.0,7.0,3.0,3.0,0.0,0.0,0.0,0.0
3,33.0,0.0,1.0,1392.0,1.0,3.0,4.0,1.0,1.0,5.0,...,3.0,80.0,0.0,8.0,3.0,3.0,8.0,7.0,3.0,0.0
4,27.0,0.0,2.0,591.0,1.0,2.0,1.0,3.0,1.0,7.0,...,4.0,80.0,1.0,6.0,3.0,3.0,2.0,2.0,2.0,2.0


In [271]:
# Perform feature engineering
def feature_engineering(df):
    logging.info("Performing feature engineering")
    df["Satisfaction_Weighted_Salary"] = df["MonthlyIncome"] * (df["JobSatisfaction"] / 4)
    df["YearsSincePromotion"] = np.maximum(df["YearsAtCompany"] - df["YearsSinceLastPromotion"], 0)
    logging.info("Created features: Satisfaction_Weighted_Salary, YearsSincePromotion")
    return df

# Perform feature engineering
df = feature_engineering(df)
df.head()  # Display the first few rows of the dataset after feature engineering

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Satisfaction_Weighted_Salary,YearsSincePromotion
0,41.0,1.0,2.0,1102.0,2.0,1.0,2.0,1.0,1.0,1.0,...,0.0,8.0,0.0,1.0,6.0,4.0,0.0,5.0,5993.0,6.0
1,49.0,0.0,1.0,279.0,1.0,8.0,1.0,1.0,1.0,2.0,...,1.0,10.0,3.0,3.0,10.0,7.0,1.0,7.0,2565.0,9.0
2,37.0,1.0,2.0,1373.0,1.0,2.0,2.0,4.0,1.0,4.0,...,0.0,7.0,3.0,3.0,0.0,0.0,0.0,0.0,1567.5,0.0
3,33.0,0.0,1.0,1392.0,1.0,3.0,4.0,1.0,1.0,5.0,...,0.0,8.0,3.0,3.0,8.0,7.0,3.0,0.0,2181.75,5.0
4,27.0,0.0,2.0,591.0,1.0,2.0,1.0,3.0,1.0,7.0,...,1.0,6.0,3.0,3.0,2.0,2.0,2.0,2.0,1734.0,0.0


In [272]:
def advanced_feature_engineering(df):
    logging.info("Performing advanced feature engineering")
    
    # Create more informative features
    df["TenurePerf"] = df["YearsAtCompany"] * df["PerformanceRating"]
    df["IncomeAgeRatio"] = df["MonthlyIncome"] / df["Age"]
    df["ExperienceLevel"] = df["TotalWorkingYears"] / df["Age"]
    df["CareerProgress"] = df["JobLevel"] / df["TotalWorkingYears"].replace(0, 1)
    df["SalaryPerLevel"] = df["MonthlyIncome"] / (df["JobLevel"].replace(0, 1))
    df["PromotionRate"] = df["YearsSinceLastPromotion"] / df["YearsAtCompany"].replace(0, 1)
    
    # Select important features
    features = [
        "JobLevel", "PerformanceRating", "YearsAtCompany", "TotalWorkingYears",
        "TenurePerf", "IncomeAgeRatio", "ExperienceLevel", "CareerProgress",
        "SalaryPerLevel", "PromotionRate", "Education", "Age", "MonthlyIncome",
        "JobSatisfaction", "WorkLifeBalance", "EnvironmentSatisfaction"
    ]
    
    # Ensure features exist
    existing_features = [f for f in features if f in df.columns]
    logging.info(f"Using features: {existing_features}")
    
    return df, existing_features

In [273]:
# Simulate future salaries
def simulate_future_salaries(df):
    logging.info("Simulating future salaries")
    df["Increment"] = df["PerformanceRating"].apply(lambda x: 1.10 if x == 4 else (1.05 if x > 0 else 1.0))
    df["FutureSalary"] = df["MonthlyIncome"] * df["Increment"]
    logging.info("Calculated future salaries with increments")
    return df

# Simulate future salaries
df = simulate_future_salaries(df)
df.head()  # Display the first few rows of the dataset with future salaries

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Satisfaction_Weighted_Salary,YearsSincePromotion,Increment,FutureSalary
0,41.0,1.0,2.0,1102.0,2.0,1.0,2.0,1.0,1.0,1.0,...,0.0,1.0,6.0,4.0,0.0,5.0,5993.0,6.0,1.05,6292.65
1,49.0,0.0,1.0,279.0,1.0,8.0,1.0,1.0,1.0,2.0,...,3.0,3.0,10.0,7.0,1.0,7.0,2565.0,9.0,1.1,5643.0
2,37.0,1.0,2.0,1373.0,1.0,2.0,2.0,4.0,1.0,4.0,...,3.0,3.0,0.0,0.0,0.0,0.0,1567.5,0.0,1.05,2194.5
3,33.0,0.0,1.0,1392.0,1.0,3.0,4.0,1.0,1.0,5.0,...,3.0,3.0,8.0,7.0,3.0,0.0,2181.75,5.0,1.05,3054.45
4,27.0,0.0,2.0,591.0,1.0,2.0,1.0,3.0,1.0,7.0,...,3.0,3.0,2.0,2.0,2.0,2.0,1734.0,0.0,1.05,3641.4


In [274]:
# Handle class imbalance and improve model performance
def enhance_models(df):
    logging.info("Enhancing models with advanced techniques")
    
    # Add safety check for required columns
    required_columns = ["Attrition", "EmployeeNumber", "FutureSalary"]
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise ValueError(f"Missing required columns: {missing_columns}")
    
    # Create interaction features
    logging.info("Creating interaction features")
    df["Performance_Experience"] = df["PerformanceRating"] * df["YearsAtCompany"]
    df["Satisfaction_Experience"] = df["JobSatisfaction"] * df["YearsAtCompany"]
    df["Performance_Satisfaction"] = df["PerformanceRating"] * df["JobSatisfaction"]
    
    # Rest of enhance_models function remains the same
    # ...existing code...

In [275]:
# Perform classification using Logistic Regression
def classify_attrition(df):
    logging.info("Starting attrition classification")
    
    # Prepare features (X) and target (y)
    X = df.drop(["Attrition", "EmployeeNumber"], axis=1)
    y = df["Attrition"]

    # Scale the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
    logging.info("Split data: %d train, %d test samples", len(X_train), len(X_test))

    # Define hyperparameter grid for Logistic Regression
    param_grid = {
        'C': [0.01, 0.1, 1, 10],  # Regularization strength
        'penalty': ['l1', 'l2'],  # Regularization type
        'solver': ['liblinear']   # Solver for small datasets
    }

    # Initialize Logistic Regression model
    model = LogisticRegression(random_state=42, max_iter=1000)

    # Perform Grid Search with cross-validation
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Get the best model from Grid Search
    model = grid_search.best_estimator_
    logging.info("Best hyperparameters: %s", grid_search.best_params_)

    # Perform cross-validation to evaluate the model
    cv_scores = cross_val_score(model, X_scaled, y, cv=5, scoring='f1')
    logging.info("Cross-validation F1 scores: %s (mean: %.3f)", cv_scores, cv_scores.mean())

    # Evaluate the model on the test set
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_scaled)[:, 1]  # Get probabilities for the positive class

    # Log classification metrics
    logging.info("Classification Metrics:")
    logging.info("F1 Score: %.3f", f1_score(y_test, y_pred))
    logging.info("Precision: %.3f", precision_score(y_test, y_pred))
    logging.info("Recall: %.3f", recall_score(y_test, y_pred))
    logging.info("AUC-ROC: %.3f", roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))

    # Extract feature importance (coefficients) from Logistic Regression
    importances = np.abs(model.coef_[0])  # Absolute value of coefficients
    feature_importance = pd.DataFrame({'feature': X.columns, 'importance': importances})
    top_features = feature_importance.sort_values('importance', ascending=False).head(10)['feature'].tolist()
    logging.info("Top 10 features by importance: %s", top_features)

    # Return the trained model, scaler, and other outputs
    return model, scaler, X_scaled, y_proba, top_features

In [276]:
def regress_future_salary(df, X_cls, y_proba_cls, top_features):
    """Optimized future salary regression"""
    logging.info("Starting future salary regression")
    
    # Create only essential polynomial features
    numeric_cols = ["MonthlyIncome", "YearsAtCompany"]  # Reduced feature set
    poly = PolynomialFeatures(degree=2, include_bias=False)
    poly_features = poly.fit_transform(df[numeric_cols])
    
    # Simplified feature names
    feature_names = [f"poly_{i}" for i in range(poly_features.shape[1])]
    df_poly = pd.DataFrame(
        poly_features,
        columns=feature_names,
        index=df.index
    )
    
    # More efficient feature creation
    df = pd.concat([df, df_poly], axis=1)
    df["LogMonthlyIncome"] = np.log1p(df["MonthlyIncome"])
    df["LogFutureSalary"] = np.log1p(df["FutureSalary"])
    df["Performance_Tenure"] = df["PerformanceRating"] * df["YearsAtCompany"]
    df["Income_Level"] = df["MonthlyIncome"] / np.maximum(df["JobLevel"], 1)
    
    # Filter data more efficiently
    df["P_Stay"] = 1 - y_proba_cls
    likely_to_stay = df[df["P_Stay"] > 0.7]
    
    # Reduced feature set
    selected_features = [
        "LogMonthlyIncome", "JobLevel", "YearsAtCompany",
        "PerformanceRating", "Income_Level", "Performance_Tenure"
    ] + [col for col in feature_names if col in df.columns]
    
    # Prepare features and target
    X = likely_to_stay[selected_features]
    y = likely_to_stay["LogFutureSalary"]
    
    # Faster outlier removal using percentiles
    q1, q3 = np.percentile(y, [25, 75])
    iqr = q3 - q1
    mask = (y >= q1 - 1.5 * iqr) & (y <= q3 + 1.5 * iqr)
    X = X[mask]
    y = y[mask]
    
    # Scale features
    scaler = RobustScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Split with smaller test size
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=0.1, random_state=42
    )
    
    # Get best XGBoost model
    best_xgb = tune_xgboost_params(X_train, y_train)
    
    # Simplified stacking with fewer estimators
    estimators = [
        ('xgb', best_xgb),
        ('gbr', GradientBoostingRegressor(
            n_estimators=100,
            random_state=42,
            learning_rate=0.1
        ))
    ]
    
    # Create and train stacking regressor
    reg = StackingRegressor(
        estimators=estimators,
        final_estimator=RidgeCV(),
        cv=3  # Reduced from 5
    )
    
    reg.fit(X_train, y_train)
    
    # Quick evaluation
    y_pred = reg.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
    logging.info(f"Regression Metrics - R² Score: {r2:.3f}, RMSE: {rmse:.2f}")
    
    # Simplified feature importance
    feature_importance = pd.DataFrame({
        'feature': selected_features,
        'importance': abs(best_xgb.feature_importances_)
    })
    
    return df, reg, scaler, selected_features, feature_importance

In [277]:
# Visualize feature importance for Logistic Regression
def visualize_feature_importance(model, feature_names):
    logging.info("Visualizing feature importance")
    
    # Use coefficients for Logistic Regression
    importances = np.abs(model.coef_[0])  # Absolute value of coefficients
    if len(importances) != len(feature_names):
        logging.warning("Mismatch in feature importance and feature names: %d vs %d", len(importances), len(feature_names))
        min_len = min(len(importances), len(feature_names))
        importances = importances[:min_len]
        feature_names = feature_names[:min_len]

    # Sort feature importances
    indices = np.argsort(importances)[::-1]
    plt.figure(figsize=(10, 6))
    plt.title("Feature Importance (Logistic Regression)")
    plt.bar(range(len(feature_names)), importances[indices], align="center")
    plt.xticks(range(len(feature_names)), np.array(feature_names)[indices], rotation=45, ha="right")
    plt.tight_layout()
    plt.savefig("feature_importance_plot.png")
    plt.close()
    logging.info("Feature importance plot saved to feature_importance_plot.png")

# Ensure the classification model is defined
clf, scaler_cls, X_cls, y_proba_cls, top_features = classify_attrition(df)

# Call the function to visualize feature importance
visualize_feature_importance(clf, df.drop(["Attrition", "EmployeeNumber"], axis=1).columns)

In [278]:
# Create regression model and get results
df, best_reg, scaler_reg, features_reg, feature_importance = regress_future_salary(df, X_cls, y_proba_cls, top_features)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


In [279]:
# Error Analysis for Classification and Regression Models
def perform_error_analysis(clf, best_reg, X_cls, y_cls, X_reg, y_reg, scaler_cls, scaler_reg):
    logging.info("Performing detailed error analysis")
    
    # Classification Error Analysis
    logging.info("\nClassification Model Analysis:")
    X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(X_cls, y_cls, test_size=0.2, random_state=42)
    
    # Training metrics
    y_train_pred = clf.predict(X_train_cls)
    train_accuracy = accuracy_score(y_train_cls, y_train_pred)
    train_f1 = f1_score(y_train_cls, y_train_pred)
    
    # Testing metrics
    y_test_pred = clf.predict(X_test_cls)
    test_accuracy = accuracy_score(y_test_cls, y_test_pred)
    test_f1 = f1_score(y_test_cls, y_test_pred)
    
    print("\nClassification Metrics:")
    print(f"Training - Accuracy: {train_accuracy:.3f}, F1 Score: {train_f1:.3f}")
    print(f"Testing  - Accuracy: {test_accuracy:.3f}, F1 Score: {test_f1:.3f}")
    
    # Confusion Matrix
    cm = confusion_matrix(y_test_cls, y_test_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix - Classification')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig(f'confusion_matrix_{datetime.now().strftime("%Y%m%d_%H%M%S")}.png')
    plt.close()
    
    # ROC Curve
    y_test_proba = clf.predict_proba(X_test_cls)[:, 1]
    fpr, tpr, _ = roc_curve(y_test_cls, y_test_proba)
    roc_auc = auc(fpr, tpr)
    
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve - Classification')
    plt.legend(loc="lower right")
    plt.savefig(f'roc_curve_{datetime.now().strftime("%Y%m%d_%H%M%S")}.png')
    plt.close()
    
     
    # Regression Error Analysis
    logging.info("\nRegression Model Analysis:")
    X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.15, random_state=42)
    
    # Training metrics on log scale
    y_train_pred_reg = best_reg.predict(X_train_reg)
    train_r2 = r2_score(y_train_reg, y_train_pred_reg)
    train_rmse = np.sqrt(mean_squared_error(y_train_reg, y_train_pred_reg))
    train_mae = mean_absolute_error(y_train_reg, y_train_pred_reg)
    
    # Testing metrics on log scale
    y_test_pred_reg = best_reg.predict(X_test_reg)
    test_r2 = r2_score(y_test_reg, y_test_pred_reg)
    test_rmse = np.sqrt(mean_squared_error(y_test_reg, y_test_pred_reg))
    test_mae = mean_absolute_error(y_test_reg, y_test_pred_reg)
    
    # Print metrics
    print("\nRegression Metrics (log scale):")
    print(f"Training - R² Score: {train_r2:.3f}, RMSE: {train_rmse:.3f}, MAE: {train_mae:.3f}")
    print(f"Testing  - R² Score: {test_r2:.3f}, RMSE: {test_rmse:.3f}, MAE: {test_mae:.3f}")
    
    # Residual Plot
    plt.figure(figsize=(10, 6))
    residuals = y_test_reg - y_test_pred_reg
    plt.scatter(y_test_pred_reg, residuals)
    plt.axhline(y=0, color='r', linestyle='--')
    plt.xlabel('Predicted Values')
    plt.ylabel('Residuals')
    plt.title('Residual Plot - Regression')
    plt.savefig(f'residual_plot_{datetime.now().strftime("%Y%m%d_%H%M%S")}.png')
    plt.close()
    
    # Actual vs Predicted Plot
    plt.figure(figsize=(10, 6))
    plt.scatter(y_test_reg, y_test_pred_reg)
    plt.plot([y_test_reg.min(), y_test_reg.max()], [y_test_reg.min(), y_test_reg.max()], 'r--', lw=2)
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.title('Actual vs Predicted - Regression')
    plt.savefig(f'actual_vs_predicted_{datetime.now().strftime("%Y%m%d_%H%M%S")}.png')
    plt.close()
    
    return {
        'classification': {
            'train': {'accuracy': train_accuracy, 'f1': train_f1},
            'test': {'accuracy': test_accuracy, 'f1': test_f1}
        },
        'regression': {
            'train': {'r2': train_r2, 'rmse': train_rmse, 'mae': train_mae},
            'test': {'r2': test_r2, 'rmse': test_rmse, 'mae': test_mae}
        }
    }

# Prepare data for error analysis
X = df.drop(["Attrition", "EmployeeNumber"], axis=1)
y_cls = df["Attrition"]
y_reg = df["FutureSalary"]

# Perform error analysis
error_metrics = perform_error_analysis(
    clf, best_reg, 
    X_cls, y_cls,
    X_reg=df[features_reg], y_reg=y_reg,
    scaler_cls=scaler_cls, scaler_reg=scaler_reg
)

# Save error metrics
with open(f'error_metrics_{datetime.now().strftime("%Y%m%d_%H%M%S")}.json', 'w') as f:
    json.dump(error_metrics, f, indent=4)


Classification Metrics:
Training - Accuracy: 0.871, F1 Score: 0.510
Testing  - Accuracy: 0.898, F1 Score: 0.500

Regression Metrics (log scale):
Training - R² Score: -1.841, RMSE: 8433.661, MAE: 6788.957
Testing  - R² Score: -2.315, RMSE: 8742.403, MAE: 7305.623




In [280]:
# Save outputs
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_file = f"attrition_analysis_results_{timestamp}.csv"

# Save the processed dataset to a CSV file
df.to_csv(output_file, index=False)
logging.info("Results saved to %s", output_file)

# Save the trained models and scalers
joblib.dump(clf, f"attrition_classifier_{timestamp}.pkl")
joblib.dump(best_reg, f"salary_regressor_{timestamp}.pkl")
joblib.dump(scaler_cls, f"scaler_cls_{timestamp}.pkl")
joblib.dump(scaler_reg, f"scaler_reg_{timestamp}.pkl")

# Save feature importance results
feature_importance.to_csv(f"feature_importance_{timestamp}.csv", index=False)
logging.info("Models, scalers, and feature importance saved with timestamp %s", timestamp)

In [281]:
# Check generated files
import os
files = os.listdir()
print("Generated files:")
for f in files:
    if f.startswith(('attrition', 'feature', 'scaler')):
        print(f)

Generated files:
attrition_analysis_20250504_130932.log
attrition_analysis_results_20250504_221247.csv
attrition_classifier_20250504_221247.pkl
feature_importance_20250504_221247.csv
feature_importance_plot.png
scaler_cls_20250504_221247.pkl
scaler_reg_20250504_221247.pkl


In [282]:
import os
print(os.getcwd())

c:\Users\Srimanth L\OneDrive\Desktop\C Programs\venv\bin


In [283]:
def main():
    filepath = r"c:\Users\Srimanth L\OneDrive\Desktop\C Programs\WA_Fn-UseC_-HR-Employee-Attrition.csv"
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    logging.info("Starting analysis pipeline")
    
    try:
        # 1. Load and preprocess
        df = load_data(filepath)
        df, encoders = preprocess_data(df)
        
        # 2. Basic feature engineering
        df = feature_engineering(df)
        
        # 3. Initial feature creation (without FutureSalary dependency)
        df, basic_features = advanced_feature_engineering(df)
        
        # 4. Create future salary
        df = simulate_future_salaries(df)
        
        # 5. Add log transformation for future salary
        df["LogFutureSalary"] = np.log1p(df["FutureSalary"])
        logging.info("Added LogFutureSalary transformation")
        
        # 6. Model enhancement
        enhanced_models = enhance_models(df)
        
        # 7. Classification - Store feature names before scaling
        X = df.drop(["Attrition", "EmployeeNumber"], axis=1)
        feature_names = X.columns
        clf, scaler_cls, X_cls, y_proba_cls, top_features = classify_attrition(df)
        
        # 8. Regression with model tuning
        df, best_reg, scaler_reg, features_reg, feature_importance = regress_future_salary(
            df, X_cls, y_proba_cls, top_features
        )
        
        # 9. Feature importance visualization
        visualize_feature_importance(clf, feature_names)
        
        # 10. Error analysis with improved filtering
        likely_to_stay_mask = df["P_Stay"] > 0.7  # Increased threshold
        filtered_features = df[likely_to_stay_mask][features_reg]
        
        # Scale features properly
        X_reg_scaled = scaler_reg.transform(filtered_features)
        y_reg_log = np.log1p(df[likely_to_stay_mask]["FutureSalary"])
        
        error_metrics = perform_error_analysis(
            clf=clf,
            best_reg=best_reg,
            X_cls=X_cls,
            y_cls=df["Attrition"],
            X_reg=X_reg_scaled,
            y_reg=y_reg_log,  # Use log-transformed target
            scaler_cls=scaler_cls,
            scaler_reg=scaler_reg
        )
        
        # 11. Save results
        output_file = f"attrition_analysis_results_{timestamp}.csv"
        df.to_csv(output_file, index=False)
        logging.info("Results saved to %s", output_file)
        
        # Save models and scalers
        joblib.dump(clf, f"attrition_classifier_{timestamp}.pkl")
        joblib.dump(best_reg, f"salary_regressor_{timestamp}.pkl")
        joblib.dump(scaler_cls, f"scaler_cls_{timestamp}.pkl")
        joblib.dump(scaler_reg, f"scaler_reg_{timestamp}.pkl")
        
        # Save analysis results
        feature_importance.to_csv(f"feature_importance_{timestamp}.csv", index=False)
        with open(f'error_metrics_{timestamp}.json', 'w') as f:
            json.dump(error_metrics, f, indent=4)
        
        logging.info("Analysis pipeline completed successfully")
        
    except Exception as e:
        logging.error("Error in analysis pipeline: %s", str(e))
        raise
    
    return df, error_metrics, enhanced_models, basic_features

# Execute if running as main script
if __name__ == "__main__":
    df, metrics, enhanced_models, features = main()

Fitting 3 folds for each of 4 candidates, totalling 12 fits

Classification Metrics:
Training - Accuracy: 0.884, F1 Score: 0.573
Testing  - Accuracy: 0.884, F1 Score: 0.393

Regression Metrics (log scale):
Training - R² Score: 1.000, RMSE: 0.005, MAE: 0.004
Testing  - R² Score: 1.000, RMSE: 0.008, MAE: 0.006
