In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import boxcox

# Load the HR attrition dataset (path is relative to the notebook's working directory)
file_path = './WA_Fn-UseC_-HR-Employee-Attrition.csv'
data = pd.read_csv(file_path)

# Histograms (with KDE overlay) for three columns, one panel each.
# Each entry pairs the DataFrame column name with the axis label shown to the reader.
columns_to_plot = [
    ("MonthlyIncome", "Monthly Income"),
    ("TotalWorkingYears", "Total Working Years"),
    ("YearsInCurrentRole", "Years in Current Role"),
]
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 6))
for idx, (panel, (col, label)) in enumerate(zip(axes, columns_to_plot)):
    sns.histplot(data[col], kde=True, bins=30, ax=panel)
    panel.set_xlabel(label, fontsize=14)
    panel.set_ylabel("Frequency", fontsize=14)
    # Panel tag "(a)", "(b)", "(c)" positioned in axes coordinates near the top-right corner
    panel.text(
        0.95, 0.95, f"({chr(97 + idx)})",
        transform=panel.transAxes,
        ha="center", va="center",
        fontsize=14, fontweight='bold',
    )
plt.tight_layout()
plt.show()



In [None]:
# Box plots: (a) monthly income grouped by job role, (b) years at company grouped by age
fig, (ax_role, ax_age) = plt.subplots(nrows=1, ncols=2, figsize=(16, 8))
tag_style = {"fontsize": 14, "fontweight": 'bold'}

# Panel (a)
sns.boxplot(x="JobRole", y="MonthlyIncome", data=data, palette="viridis", ax=ax_role)
ax_role.set_xlabel("Job Role", fontsize=12)
ax_role.set_ylabel("Monthly Income", fontsize=12)
# Rotate the category tick labels to vertical
ax_role.tick_params(axis="x", rotation=90, labelsize=10)
# Tag sits just outside the top-left corner of the axes (axes coordinates)
ax_role.text(-0.1, 1.05, '(a)', transform=ax_role.transAxes, **tag_style)

# Panel (b)
sns.boxplot(x="Age", y="YearsAtCompany", data=data, palette="viridis", ax=ax_age)
ax_age.set_xlabel("Age", fontsize=12)
ax_age.set_ylabel("Years at Company", fontsize=12)
ax_age.text(-0.1, 1.05, '(b)', transform=ax_age.transAxes, **tag_style)

plt.tight_layout()
plt.show()

# Scatter plots: one panel per (x, y) column pair, laid out on a 2x3 grid
scatter_pairs = [
    ("MonthlyIncome", "TotalWorkingYears"),
    ("MonthlyIncome", "YearsAtCompany"),
    ("MonthlyIncome", "YearsInCurrentRole"),
    ("TotalWorkingYears", "YearsAtCompany"),
    ("TotalWorkingYears", "YearsInCurrentRole"),
    ("YearsAtCompany", "YearsInCurrentRole")
]
# Human-readable axis labels; a column missing from this map falls back to its raw name
axis_labels = {
    "MonthlyIncome": "Monthly Income",
    "TotalWorkingYears": "Total Working Years",
    "YearsAtCompany": "Years at Company",
    "YearsInCurrentRole": "Years in Current Role",
}
fig, axes = plt.subplots(2, 3, figsize=(18, 11))
# plt.subplots(2, 3) returns a 2-D (2x3) array of Axes, so indexing it with a single
# integer yields a whole row (and fails past index 1). Iterate the flattened view so
# each column pair gets exactly one Axes, in row-major order.
for i, (ax, (x_col, y_col)) in enumerate(zip(axes.flat, scatter_pairs)):
    sns.scatterplot(data=data, x=x_col, y=y_col, ax=ax, color='blue')
    ax.set_xlabel(axis_labels.get(x_col, x_col), fontsize=14)
    ax.set_ylabel(axis_labels.get(y_col, y_col), fontsize=14)
    # Panel tag "(a)" .. "(f)" anchored at the top-left corner in axes coordinates
    ax.text(0.02, 0.98, f"({chr(97 + i)})", transform=ax.transAxes, fontsize=16,
            verticalalignment='top', horizontalalignment='left')
plt.tight_layout()
plt.show()


In [None]:

# Correlation heatmap: pairwise correlations between the selected columns,
# annotated with the coefficient (two decimals) in every cell
relevant_columns = [
    'Age', 'MonthlyIncome', 'YearsAtCompany', 'TotalWorkingYears',
    'NumCompaniesWorked', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
    'YearsWithCurrManager', 'PercentSalaryHike', 'JobSatisfaction',
    'WorkLifeBalance', 'JobInvolvement',
]
relevant_data = data[relevant_columns]
corr_matrix_relevant = relevant_data.corr()

heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    corr_matrix_relevant,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=0.5,
    square=True,
    ax=heatmap_ax,
)
plt.show()

# Linear regression: OLS fit of the target column on the predictor columns,
# trained on 80% of the rows and evaluated on the remaining 20%
predictors = [
    "TotalWorkingYears",
    "YearsSinceLastPromotion",
    "YearsAtCompany",
    "YearsInCurrentRole",
    "TrainingTimesLastYear",
    "NumCompaniesWorked",
]
target = "MonthlyIncome"
y = data[target]
# add_constant adds the constant (intercept) column to the design matrix used by OLS
X = sm.add_constant(data[predictors])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
model = sm.OLS(endog=y_train, exog=X_train).fit()
# Held-out predictions and their errors (actual minus predicted)
y_pred = model.predict(X_test)
residuals = y_test - y_pred

# Residual diagnostics on the held-out rows, three panels:
# residuals vs predictions, residual distribution, and actual vs predicted
fig, (ax_fit, ax_dist, ax_match) = plt.subplots(nrows=1, ncols=3, figsize=(18, 6))

# Residuals against predicted values, with a dashed zero-error reference line
sns.scatterplot(x=y_pred, y=residuals, color="blue", alpha=0.6, ax=ax_fit)
ax_fit.axhline(y=0, color="red", linestyle="--")
ax_fit.set_xlabel("Predicted Values", fontsize=14)
ax_fit.set_ylabel("Residuals", fontsize=14)

# Histogram of the residuals with a KDE overlay
sns.histplot(residuals, kde=True, color="green", ax=ax_dist)
ax_dist.set_xlabel("Residuals", fontsize=14)
ax_dist.set_ylabel("Frequency", fontsize=14)

# Actual vs predicted, with the dashed y = x line spanning the observed range
sns.scatterplot(x=y_test, y=y_pred, alpha=0.6, color="blue", ax=ax_match)
income_range = [y_test.min(), y_test.max()]
ax_match.plot(income_range, income_range, color="red", linestyle="--", linewidth=2)
ax_match.set_xlabel("Actual Monthly Income", fontsize=14)
ax_match.set_ylabel("Predicted Monthly Income", fontsize=14)

plt.tight_layout()
plt.show()

# Box-Cox transformation
# Refit the OLS model after Box-Cox transforming the target and every predictor
# that is strictly positive on all rows (Box-Cox is undefined for values <= 0;
# predictors that fail the check are passed through unchanged).
#
# The lambdas are estimated on the training rows only and then applied to the
# whole column, so the held-out rows do not influence the transform.
# train_test_split's partition depends only on the number of rows and the seed,
# so splitting the row positions with the same test_size/random_state selects
# the same training rows as the split of the transformed data further down.
train_rows, _ = train_test_split(np.arange(len(X)), test_size=0.2, random_state=42)

boxcox_lambdas = {}  # predictor name -> lambda estimated on the training rows
X_transformed = X.copy()
for col in predictors:
    if (X[col] > 0).all():
        # The 1e-6 offset is kept from the original code; the check above
        # already guarantees positivity, so it does not change which columns qualify.
        shifted = X[col] + 1e-6
        _, boxcox_lambdas[col] = boxcox(shifted.iloc[train_rows])
        X_transformed[col] = boxcox(shifted, lmbda=boxcox_lambdas[col])

y_shifted = y + 1e-6
_, lambda_target = boxcox(y_shifted.iloc[train_rows])
y_transformed = boxcox(y_shifted, lmbda=lambda_target)

X_train, X_test, y_train, y_test = train_test_split(X_transformed, y_transformed, test_size=0.2, random_state=42)
# X may already carry a constant column from the earlier fit; add_constant's
# default has_constant='skip' then returns the frame unchanged.
X_train_const = sm.add_constant(X_train)
X_test_const = sm.add_constant(X_test)
model_transformed = sm.OLS(y_train, X_train_const).fit()
y_pred_transformed = model_transformed.predict(X_test_const)
# Both metrics are computed on the Box-Cox scale of the target, so the MSE is
# not in the original units of the target column.
mse_transformed = mean_squared_error(y_test, y_pred_transformed)
r2_transformed = r2_score(y_test, y_pred_transformed)

print("Regression Results After Box-Cox Transformation:")
print(model_transformed.summary())
print(f"Transformed Model - MSE: {mse_transformed:.2f}, R-squared: {r2_transformed:.2f}")

# Plot actual vs predicted for transformed model
# Axis labels state the scale explicitly: both axes show Box-Cox transformed values.
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred_transformed, color='blue', alpha=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', lw=2)
plt.title("Actual vs Predicted Values (Box-Cox Transformed)")
plt.xlabel("Actual Monthly Income (Box-Cox scale)")
plt.ylabel("Predicted Monthly Income (Box-Cox scale)")
plt.grid(True)
plt.show()

# Print Box-Cox lambdas
# Reuse the lambdas stored above instead of re-running the optimisation per column.
for col, lambda_col in boxcox_lambdas.items():
    print(f"Optimal lambda for {col}: {lambda_col}")
print(f"Optimal lambda for {target}: {lambda_target}")
