In [None]:
%pip install pandas matplotlib seaborn scikit-learn imblearn

In [None]:
import pandas as pd
import numpy as np

## Step 1: Perform data quality checks

In [None]:
# Load the raw HR dataset; the path is relative to the kernel's working directory
df= pd.read_csv("HR_comma_sep.csv")

# Preview the first rows to sanity-check the load
df.head()

In [None]:
# Gather basic information about the data
df.info()

In [None]:
# Gather descriptive statistics about the data

df.describe()

In [None]:
df.columns

In [None]:
# Clean up column names: fix the "montly" typo, lower-case "Work_accident",
# and give the remaining columns more descriptive names
column_renames = {
    "average_montly_hours": "average_monthly_hours",
    "Work_accident": "work_accident",
    "sales": "department",
    "number_project": "number_of_projects",
    "time_spend_company": "years_with_company",
    "promotion_last_5years": "promotions_in_last_5_years",
}
df = df.rename(columns=column_renames)

In [None]:
# new columns
df.columns


In [None]:
# check missing values
df.isna().sum()

In [None]:
# Count fully duplicated rows (with the default keep='first', the first occurrence is not counted)
df.duplicated().sum()

In [None]:
# Collect every row that belongs to a duplicate group;
# keep=False flags all occurrences, including the first one
duplicated_rows = df[df.duplicated(keep=False)]

In [None]:
duplicated_rows

## Step 2: Exploratory Data Analysis (EDA)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt 
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Keep only the numeric columns; the EDA, clustering and correlation cells work from this frame
df_numeric = df.select_dtypes(include='number')

print("Numerical Features:", df_numeric.columns, sep="\n")



In [None]:
df_numeric

In [None]:
# Identify outliers using IQR for numerical columns
def detect_outliers(data, col, factor=1.5):
    """Return the rows of ``data`` whose ``col`` value is an IQR outlier.

    A value is an outlier when it lies strictly below ``Q1 - factor * IQR``
    or strictly above ``Q3 + factor * IQR``, where Q1/Q3 are the 25th/75th
    percentiles of ``data[col]`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the column to screen.
    col : str
        Name of the numeric column to screen.
    factor : float, default 1.5
        Width of the fences in IQR units. The default reproduces the
        conventional 1.5 * IQR (boxplot whisker) rule.

    Returns
    -------
    pandas.DataFrame
        The subset of ``data`` (all columns, original index) flagged as outliers.
    """
    q1, q3 = data[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr
    is_outlier = (data[col] < lower_bound) | (data[col] > upper_bound)
    return data[is_outlier]


# Columns screened for outliers
outlier_cols = [
    "satisfaction_level",
    "average_monthly_hours",
    "years_with_company",
    "number_of_projects",
]
df_outliers = df_numeric[outlier_cols]

# Map each screened column to the rows flagged as outliers, reporting the count as we go
outliers = {}
for col in df_outliers.columns:
    flagged_rows = detect_outliers(df_numeric, col)
    outliers[col] = flagged_rows
    print(f'No of outliers for {col}', len(flagged_rows))

# Draw one boxplot per screened column
for col in df_outliers.columns:
    plt.figure(figsize=(10, 5))
    sns.boxplot(x=df[col])
    plt.title(f'Boxplot for {col}')
    plt.show()

In [None]:
outliers.keys()

In [None]:
# Among the tenure outliers, keep only the employees who left (left == 1)
tenure_outliers = outliers['years_with_company']
outliers_left_years_with_company = tenure_outliers[tenure_outliers['left'] == 1]

len(outliers_left_years_with_company)

#### Interpretation

Most employees tend to leave or have shorter tenures (~4 years), while a few employees have been retained for significantly longer periods (>6 years).

# correlation heatmap

In [None]:
# Plot a correlation heatmap of all numeric features.
# The colour scale is pinned to [-1, 1] so that zero correlation sits at the centre of the diverging palette.
plt.figure(figsize=(12, 8))
corr_matrix = df_numeric.corr()
heatmap = sns.heatmap(
    corr_matrix,
    vmin=-1,
    vmax=1,
    annot=True,
    cmap=sns.color_palette("vlag", as_cmap=True),
)
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':14}, pad=12)
# End with plt.show(), like the other plotting cells, so no Text(...) repr is echoed as cell output
plt.show()

### Interpretation: 

The correlation heatmap suggests that the number of projects, monthly hours, and evaluation scores all have some positive correlation with each other, and whether an employee leaves is negatively correlated with their satisfaction level.

1) Employees with a low satisfaction level are the most likely to leave (negative correlation, -0.39).
2) Average monthly hours increase with the number of projects an employee works on (positive correlation, 0.42).
3) Employees with higher evaluation scores are assigned more projects (positive correlation, 0.35).
4) The number of projects has almost no linear correlation with employee turnover (0.024).
5) Employees who have been with the company longer are slightly more likely to leave (weak positive correlation, 0.14).

### Distribution Plots

In [None]:
# Satisfaction-level distribution, split by whether the employee left.
# The per-group series below are not used by the plot, which works from df_numeric directly.
satisfaction_level_stay = df_numeric.loc[df_numeric['left'] == 0, 'satisfaction_level']
satisfaction_level_left = df_numeric.loc[df_numeric['left'] == 1, 'satisfaction_level']

sns.histplot(
    data=df_numeric,
    x='satisfaction_level',
    hue='left',
    multiple='dodge',
    kde=True,
    bins=10,
)
plt.title('Distribution of Employee Satisfaction')
plt.show()

In [None]:
# Last-evaluation score distribution, split by whether the employee left.
# The per-group series below are not used by the plot, which works from df_numeric directly.
last_evaluation_stay = df_numeric.loc[df_numeric['left'] == 0, 'last_evaluation']
last_evaluation_left = df_numeric.loc[df_numeric['left'] == 1, 'last_evaluation']

sns.histplot(
    data=df_numeric,
    x='last_evaluation',
    hue='left',
    multiple='dodge',
    kde=True,
    bins=10,
)
plt.title('Distribution of Employee Last Evaluation')
plt.show()

In [None]:
# Average-monthly-hours distribution, split by whether the employee left.
# The per-group series below are not used by the plot, which works from df_numeric directly.
average_monthly_hours_stay = df_numeric.loc[df_numeric['left'] == 0, 'average_monthly_hours']
average_monthly_hours_left = df_numeric.loc[df_numeric['left'] == 1, 'average_monthly_hours']

sns.histplot(
    data=df_numeric,
    x='average_monthly_hours',
    hue='left',
    multiple='dodge',
    kde=False,
    bins=20,
)
plt.title('Distribution of Employee average_monthly_hours')
plt.show()

In [None]:
# Histogram comparing the number of projects of employees who left and those who stayed.
# The per-group series below are not used by the plot, which works from df_numeric directly.
no_of_projects_stay = df_numeric.loc[df_numeric['left'] == 0, 'number_of_projects']
no_of_projects_left = df_numeric.loc[df_numeric['left'] == 1, 'number_of_projects']

sns.histplot(
    data=df_numeric,
    x='number_of_projects',
    hue='left',
    multiple='dodge',
    kde=False,
    bins=10,
)
plt.title('Employee Project Count: Left vs Stayed')
plt.xlabel('Number of Projects')
plt.show()


# Interpretation from distribution plots

1) Again, employees with low satisfaction scores tend to leave.
2) The large number of leavers with low evaluation scores and fewer average monthly hours suggests that they might have been asked to leave the company.
3) The prominent number of leavers with high last-evaluation scores may be explained by their being assigned a larger number of projects, causing burnout from longer average monthly working hours.
4) The optimal workload appears to be 3 to 4 projects.


## 3. K-Means Clustering

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [None]:
# Features used to cluster the employees who left
clustered_columns= ["satisfaction_level", "last_evaluation"]

# Take an explicit copy: a 'cluster' column is assigned into df_left in a later cell,
# and writing into a filtered slice of df_numeric is ambiguous chained assignment
# (the resulting warning is hidden by the notebook's blanket warnings filter).
df_left = df_numeric[df_numeric['left'] == 1].copy()

clustering_data = df_left[clustered_columns]

clustering_data

In [None]:
# Apply KMeans clustering: partition the leavers into 3 clusters on the
# selected features, with a fixed random_state for repeatable runs.
# NOTE(review): n_init is left at the library default, which has changed between
# scikit-learn releases — consider pinning it if cluster labels must be stable across versions.
kmeans = KMeans(n_clusters=3, random_state=123)
clusters = kmeans.fit_predict(clustering_data)

# Attach the cluster labels to df_left (the leavers-only frame, not the full dataset)
df_left['cluster'] = clusters


In [None]:
df_left

In [None]:
df_left["cluster"].value_counts()

In [None]:
# Visualize the clustering results
sns.scatterplot(x='satisfaction_level', y='last_evaluation', hue='cluster', data=df_left, palette='viridis', s=100)
plt.title('K-means Clustering of Employees Who Left')
plt.xlabel('Satisfaction Level')
plt.ylabel('Last Evaluation')
plt.legend(title='Cluster')
plt.show()

### K-Means Clustering Analysis

Cluster 0 - Purple (low satisfaction (< 0.3) and high last evaluation (> 0.75))
1) These employees are dissatisfied even though they are the highest performers, possibly because of burnout or not feeling valued.

Cluster 1 - Blue (moderate satisfaction (between 0.3 and 0.6) and low last evaluation (< 0.6))
1) These employees are moderately satisfied but under-performing; introducing performance improvement plans could make them more productive.


Cluster 2 - Yellow (high satisfaction (> 0.7) and high last evaluation (> 0.8))
1) These are the most productive employees and they are also satisfied; they may have left because they found better opportunities.

## SMOTE Technique

#### Pre-process the data

In [None]:
df_numeric.columns

In [None]:
# Collect the names of the object/category-typed columns; these are one-hot encoded in the next cell
categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()
categorical_columns


In [None]:
# One-hot encode the categorical columns. drop_first=True drops the first level of each
# encoded column (k categories -> k-1 indicators), avoiding perfectly collinear dummies.
df_dummies=  pd.get_dummies(df, columns= categorical_columns, drop_first=True)
df_dummies

### split the dataset

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target ("left") from the predictors
y = df_dummies['left']
X = df_dummies.drop(columns='left')

# Hold out 20% of the rows for testing; stratify on the target so both
# splits keep the same proportion of leavers
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
    stratify=y,
)

#### Upsampling using SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample with SMOTE on the training split only; X_test / y_test are not passed
# to the sampler, so the held-out data keeps its original class distribution.
sm = SMOTE(random_state=123)
X_train_res, y_train_res = sm.fit_resample(X_train,y_train)

In [None]:
# Training-set shape before vs. after SMOTE, to confirm that up-sampling added rows
X_train.shape, X_train_res.shape

### Apply k-fold cross-validation

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline


# Define the models.
# random_state is fixed so that results are reproducible from run to run.
# max_iter is raised for logistic regression because the features are unscaled and the
# default iteration budget may not be enough for the solver to converge (any such
# warning would be hidden by the notebook's blanket warnings filter).
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=123),
    'Random Forest': RandomForestClassifier(random_state=123),
    'Gradient Boosting': GradientBoostingClassifier(random_state=123)
}

# Train the models and make predictions
predictions_train = {}
predictions_test={}
probs_train = {}
probs_test={}

for model_name, model in models.items():
    # 5-fold cross-validation on the TRAINING split only, so the held-out test rows
    # never take part in model selection. SMOTE is placed inside the pipeline so it is
    # re-fitted on the training part of every fold and each validation fold keeps its
    # original, un-resampled class distribution. cross_val_score clones the pipeline,
    # so `model` itself is still unfitted after this call.
    cv_pipeline = ImbPipeline([
        ('smote', SMOTE(random_state=123)),
        ('model', model),
    ])
    cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5, scoring='accuracy')

    # Display the cross-validation scores
    mean_cv_score = np.mean(cv_scores)
    print(f'{model_name} -', cv_scores, mean_cv_score)

    # Final fit on the full SMOTE-resampled training set
    model.fit(X_train_res, y_train_res)

    # Predictions and positive-class ("left") probabilities on the original,
    # un-resampled train and test splits
    predictions_train[model_name] = model.predict(X_train)
    predictions_test[model_name]=model.predict(X_test)
    probs_train[model_name] = model.predict_proba(X_train)[:, 1]
    probs_test[model_name] = model.predict_proba(X_test)[:, 1]


    

### Identify Best Model with metrics

In [None]:
# For every model, print the classification report and ROC AUC on the
# training split first and on the held-out test split second
for model_name in models:
    train_labels, train_scores = predictions_train[model_name], probs_train[model_name]
    test_labels, test_scores = predictions_test[model_name], probs_test[model_name]

    # --- training split ---
    print(f'Classification Report for {model_name} -  Train:\n')
    print(classification_report(y_train, train_labels))
    roc_auc = roc_auc_score(y_train, train_scores)
    print(f'ROC AUC for {model_name} - Train: {roc_auc:.2f}\n')

    # --- test split ---
    print(f'Classification Report for {model_name} -  Test:\n')
    print(classification_report(y_test, test_labels))
    roc_auc = roc_auc_score(y_test, test_scores)
    print(f'ROC AUC for {model_name}- Test: {roc_auc:.2f}\n')



In [None]:
fig, ax = plt.subplots(1, 2, figsize = (22,8))

# Plot one ROC curve per model: training split on the left panel, test split on the right
for model_name in models:
    fpr_train, tpr_train, _ = roc_curve(y_train, probs_train[model_name])
    auc = roc_auc_score(y_train, probs_train[model_name])
    ax[0].plot(fpr_train, tpr_train, label=f'{model_name} (AUC = {auc:.2f})')

    fpr_test, tpr_test, _ = roc_curve(y_test, probs_test[model_name])
    auc = roc_auc_score(y_test, probs_test[model_name])
    ax[1].plot(fpr_test, tpr_test, label=f'{model_name} (AUC = {auc:.2f})')

# Decoration shared by both panels
for axis, panel_title in zip(ax, ('ROC Curve (Train Set)', 'ROC Curve (Test Set)')):
    axis.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal for reference
    axis.set_title(panel_title)
    axis.set_xlabel('False Positive Rate')
    axis.set_ylabel('True Positive Rate')
    axis.legend(loc='lower right')

# End with plt.show(), like the other plotting cells, so no Legend repr is echoed as cell output
plt.show()


In [None]:
# Confusion matrices

# Tick labels for the two classes of the target (0 = stayed, 1 = left)
class_labels = ['Stayed', 'Left']

for model_name in models:
    fig, ax = plt.subplots(1, 2, figsize = (22,8))

    cm_train = confusion_matrix(y_train, predictions_train[model_name])
    cm_test=confusion_matrix(y_test,predictions_test[model_name])

    # Draw both panels with the same colour map so the train and test matrices
    # are visually comparable (the test panel previously used the default palette)
    for axis, matrix, split_name in zip(ax, (cm_train, cm_test), ('Train', 'Test')):
        sns.heatmap(
            matrix,
            annot=True,
            fmt='d',
            cmap='Blues',
            xticklabels=class_labels,
            yticklabels=class_labels,
            ax=axis,
        )
        axis.set_title(f'Confusion Matrix for {model_name} - {split_name}')
        axis.set_xlabel('Predicted')
        axis.set_ylabel('Actual')

    plt.show()

### Recall or precision

The goal is to identify employees who are likely to leave so that the company can take proactive retention measures.
Recall is therefore the metric to prioritise: it ensures that the model correctly identifies most of the employees who are likely to leave (left = 1), minimizing false negatives (employees who leave but were predicted to stay).

### Apply best model on test data

In [None]:
# Random Forest is assumed to be the best model on recall, based on the
# classification reports above
model = models['Random Forest']

# Probability of the positive class (left == 1) for every test row
y_test_prob = model.predict_proba(X_test)[:, 1]

# New frame with the test features plus the predicted probability; X_test itself is left unchanged
df_test = X_test.assign(left_prob=y_test_prob)

In [None]:
df_test

In [None]:
def retention_strategy(prob):
    """Map a predicted probability of leaving to a named retention zone.

    Zones are half-open intervals on the probability:
    below 0.20 is Safe (Green), 0.20 up to but excluding 0.60 is Low-Risk (Yellow),
    0.60 up to but excluding 0.90 is Medium-Risk (Orange), and anything
    else is High-Risk (Red).
    """
    # (exclusive upper bound, label), checked in ascending order
    zone_upper_bounds = (
        (0.20, 'Safe Zone (Green)'),
        (0.60, 'Low-Risk Zone (Yellow)'),
        (0.90, 'Medium-Risk Zone (Orange)'),
    )
    for upper_bound, zone_label in zone_upper_bounds:
        if prob < upper_bound:
            return zone_label
    # No bound matched: the value is 0.90 or above (or not comparable, e.g. NaN)
    return 'High-Risk Zone (Red)'

# Label every test row with its retention zone, derived from the predicted probability of leaving
df_test['retention_zone'] = df_test['left_prob'].apply(retention_strategy)

In [None]:
df_test