In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found anywhere under the Kaggle input directory.
for path in (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
):
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Import necessary libraries

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats
import numpy as np

### Read Input file

In [None]:
# Load the attrition CSV from the Kaggle input directory and preview the first rows.
csv_path = "/kaggle/input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv"
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Dimensions of the freshly loaded frame as (rows, columns).
data.shape

### Identify NA Values (if any)

In [None]:
# Count the missing (null/NaN) values in each column; all zeros means nothing needs imputing.
data.isnull().sum()

### Identify Columns which can be removed

In [None]:
# Summary statistics of the raw data, used below to spot constant columns and identifiers.
data.describe()

## Observations - 


* Constant data Columns (Can be removed) - EmployeeCount, StandardHours
* Category-type variables - EnvironmentSatisfaction, JobInvolvement, JobLevel, RelationshipSatisfaction, WorkLifeBalance
* Unique Identifiers - EmployeeNumber

Now let's check which unique values each categorical variable takes:

In [None]:
# Print the distinct values of each text-valued (categorical) column.
# Maps the column name to the label used in the printed message.
categorical_labels = {
    'Attrition': 'Attrition',
    'BusinessTravel': 'Business Travel',
    'Department': 'Dept',
    'EducationField': 'Education Field',
    'Gender': 'Gender',
    'JobRole': 'Job Role',
    'MaritalStatus': 'Marital Status',
    'Over18': 'Over18',
    'OverTime': 'OverTime',
}

for column, label in categorical_labels.items():
    print('Unique ' + label + ' Values: ' + str(data[column].unique()) + '\n')

## Observations - 

* Constant data Columns (Can be removed) - Over18
* Can be converted from categorical to numerical - Attrition, BusinessTravel, Department, EducationField, Gender, JobRole, MaritalStatus, OverTime



### Remove unnecessary columns & identify the total no. of remaining columns

In [None]:
# Remove the columns identified above as constant (EmployeeCount, StandardHours, Over18)
# or as a unique identifier (EmployeeNumber).
# errors="ignore" keeps this cell idempotent: re-running it after the columns are
# already gone no longer raises a KeyError.
data = data.drop(
    columns=["EmployeeCount", "StandardHours", "Over18", "EmployeeNumber"],
    errors="ignore",
)
data.shape  # Now we have 31 columns only

### Convert Categorical columns to Numerical codes (wherever needed)

In [None]:
# Convert the text-valued columns to integer codes:
# Attrition, BusinessTravel, Department, Gender, MaritalStatus, OverTime, EducationField, JobRole
category_codes = {
    "Attrition": {'Yes': 1, 'No': 0},
    "BusinessTravel": {'Non-Travel': 0, 'Travel_Rarely': 1, 'Travel_Frequently': 2},
    "Department": {'Sales': 0, 'Research & Development': 1, 'Human Resources': 2},
    "Gender": {'Female': 0, 'Male': 1},
    "MaritalStatus": {'Single': 0, 'Married': 1, 'Divorced': 2},
    "OverTime": {'No': 0, 'Yes': 1},
    "EducationField": {
        'Life Sciences': 0,
        'Medical': 1,
        'Marketing': 2,
        'Technical Degree': 3,
        'Human Resources': 4,
        'Other': 5,
    },
    "JobRole": {
        'Sales Executive': 0,
        'Research Scientist': 1,
        'Laboratory Technician': 2,
        'Manufacturing Director': 3,
        'Healthcare Representative': 4,
        'Manager': 5,
        'Sales Representative': 6,
        'Research Director': 7,
        'Human Resources': 8,
    },
}

for column, codes in category_codes.items():
    # Assign the encoded column back to the frame. Calling
    # `data.<column>.replace(..., inplace=True)` is chained assignment, which
    # leaves `data` unchanged when pandas Copy-on-Write is active.
    # Values that are not keys of `codes` pass through untouched, so re-running
    # this cell on already-encoded data changes nothing.
    data[column] = data[column].map(lambda value, codes=codes: codes.get(value, value))

Describe our data and confirm whether changes are reflected

In [None]:
# Summary statistics again: the encoded categorical columns should now appear as numeric columns.
data.describe()

## Exploratory Data Analysis

Identify Difference between mean and median values.
[This helps to understand whether columns are skewed or not]

In [None]:
# Per-column location, spread and skewness, ordered from most negatively
# to most positively skewed.
column_stats = {
    "Median Values": data.median(),
    "Mean Values": data.mean(),
    "Standard Deviation": data.std(),
    "Skewness": data.skew(),  # Ideal range: -1 to +1
}
temp = pd.DataFrame(column_stats).sort_values(by='Skewness')

temp.head(50)

In [None]:
# Same summary as shown earlier (the frame has not changed since the encoding step).
data.describe()

### Let us identify whether our target variable is biased or not. If yes, then by how much

In [None]:
# Share of each Attrition class (encoded above as 0 = No, 1 = Yes).
attrition_counts = data.Attrition.value_counts()
total_rows = data.shape[0]
no_val = attrition_counts[0]
yes_val = attrition_counts[1]

no_pct = (no_val / total_rows) * 100
yes_pct = (yes_val / total_rows) * 100
print('Percentage of NO Values: ' + str(no_pct))
print('Percentage of YES Values: ' + str(yes_pct))

# Pie chart of the two class counts.
fig, ax = plt.subplots(figsize=(7, 7))
ax.pie(
    [yes_val, no_val],
    labels=['YES', 'NO'],
    autopct='%1.0f%%',
    colors=['lightgreen', '#66b3ff'],
)
ax.set_title("Observed Class Imbalance in Attrition values", fontsize=20)
plt.show()

### Let us identify whether our dataset is gender biased or not. If yes, then by how much

In [None]:
# Row counts per Gender code (encoded above as 0 = Female, 1 = Male).
gender_counts = data.Gender.value_counts()
print(gender_counts)

# Convert the two counts to percentages of their combined total.
total = gender_counts[0] + gender_counts[1]
per_men_data = (gender_counts[1] / total) * 100
per_women_data = (gender_counts[0] / total) * 100

fig, ax = plt.subplots(figsize=(7, 7))
ax.pie([per_men_data, per_women_data], labels=['Men', 'Women'], autopct='%1.0f%%')
ax.set_title('Gender wise Data Biasness', fontsize=20)
plt.show()

### Women are less willing to switch jobs than men. Also, their total work-life balance rating is much lower than men's. (Note: the bars show summed ratings, so the size of each group also affects their height.)

In [None]:
# Summed WorkLifeBalance rating per gender, split by Attrition (0 = No, 1 = Yes).
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(
    data=data,
    x='Gender',
    y='WorkLifeBalance',
    hue='Attrition',
    estimator=np.sum,
    ax=ax,
)
# Gender codes: 0 = Female, 1 = Male.
ax.set_xticklabels(('Women', 'Men'))
ax.set_title('How Worklife Balance affects Attrition Rate')
plt.show()

### Married people have the highest total Work-Life Balance rating of all marital-status groups.

In [None]:
# Summed WorkLifeBalance rating per gender, split by MaritalStatus
# (0 = Single, 1 = Married, 2 = Divorced).
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(
    data=data,
    x='Gender',
    y='WorkLifeBalance',
    hue='MaritalStatus',
    estimator=np.sum,
    ax=ax,
)
# Gender codes: 0 = Female, 1 = Male.
ax.set_xticklabels(('Women', 'Men'))
plt.show()

### Men travel for business more than women. Also, as seen above, their total Work-Life Balance rating is higher than women's.

In [None]:
# Summed BusinessTravel code (0 = Non-Travel, 1 = Rarely, 2 = Frequently) per gender,
# split by Attrition.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(
    data=data,
    x='Gender',
    y='BusinessTravel',
    hue='Attrition',
    estimator=np.sum,
    ax=ax,
)
# Gender codes: 0 = Female, 1 = Male.
ax.set_xticklabels(('Women', 'Men'))
ax.set_title('How Business Travel affects Attrition Rate')
plt.show()

### Let's see how Age affects Attrition Rate:
* Women aged 25-35 have a higher attrition rate, whereas men show higher attrition rates until their late 40s
* Later, it is only once women reach their 50s that they again show higher attrition rates and start switching jobs

In [None]:
# Age distribution per gender, with separate boxes for Attrition = 0 (No) and 1 (Yes).
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=data, x='Gender', y='Age', hue='Attrition', linewidth=2.5, ax=ax)
# Gender codes: 0 = Female, 1 = Male.
ax.set_xticklabels(('Women', 'Men'))
ax.set_title('Age-wise Attrition Rate')
plt.show()

### Attrition Rate is higher in people who are Non-Business Travelers  and who stay far from office

In [None]:
# Median DistanceFromHome for each BusinessTravel category, split by Attrition.
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title('How Distance from Home affects Attrition Rate')
sns.barplot(
    data=data,
    x='BusinessTravel',
    y='DistanceFromHome',
    hue='Attrition',
    estimator=np.median,
    palette='Set1',
    ax=ax,
)
# Tick order follows the BusinessTravel codes 0, 1, 2 assigned during encoding.
ax.set_xticklabels(('Non-Travel', 'Travel_Rarely', 'Travel_Frequently'))
plt.show()

* Sales Dept has lowest employee retention rate, since most attrition rate is observed in this dept.
* R&D Dept has the highest employee retention rate, since least attrition rate is seen in this dept.

In [None]:
# Attrition rate per department.
# Attrition is encoded as 0/1, so the bar height (the default mean estimator)
# is the fraction of employees in that department who left. The earlier
# version plotted the mean of the JobRole integer code, which is an arbitrary
# label and says nothing about attrition.
plt.figure(figsize=(10, 6))
plt.title('Department wise Attrition Rate')
ax = sns.barplot(x=data.Department, y=data.Attrition, orient='v')
# Tick order follows the Department codes 0, 1, 2 assigned during encoding.
ax.set_xticklabels(('Sales', 'Research & Development', 'Human Resources'))
ax.set_ylabel('Attrition Rate')
plt.show()

* Sales Employees have the highest paid Jobs
* R&D Dept Employees have mid-pay jobs
* The lower the salary, the higher the attrition rate
* The number of people in the higher-salary segment is largest in the Sales Dept and smallest in the R&D Dept

In [None]:
# MonthlyIncome distribution per department, with separate boxes for Attrition = 0 and 1.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(
    data=data,
    x='Department',
    y='MonthlyIncome',
    hue='Attrition',
    linewidth=2.5,
    palette='Set2',
    ax=ax,
)
# Tick order follows the Department codes 0, 1, 2 assigned during encoding.
ax.set_xticklabels(('Sales', 'Research & Development', 'Human Resources'))
ax.set_title('Department wise Monthly Income')
plt.show()

### Identifying Correlation furthermore using Correlation plot

### Observations - 

* Job-Level, Monthly Income increases as Age and the total number of working years (i.e. Work Experience) increases.
* The more overtime employees work, the higher their chances of switching jobs
* Salary Hike is observed the most among those who have the highest performance ratings

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
correlations = data.corr()
fig, ax = plt.subplots(figsize=(30, 20))
ax.set_title('Correlation between variables')
sns.heatmap(correlations, annot=True, cmap='Blues', ax=ax)

## Model Development

In [None]:
# Column names available for modelling (the target column is Attrition).
data.columns

In [None]:
from sklearn.model_selection import train_test_split

# Features: every column except the target. Target: the encoded Attrition column.
# Columns are selected by name rather than by position so that no feature is
# left out (a positional slice `0:30` omits the last of the 31 columns) and so
# that `data_x` is an independent frame, not a slice that is then mutated.
data_x = data.drop(columns=["Attrition"])
data_y = data["Attrition"]

### Resolving Target Class Imbalance using SMOTE

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
data_x_train, data_x_test, data_y_train, data_y_test = train_test_split(
    data_x, data_y, test_size=.2, random_state=20
)

# Print the shapes of the train and test parts, framed by separator lines.
separator = "-----------------------"
print(separator)
for features, target in ((data_x_train, data_y_train), (data_x_test, data_y_test)):
    print(features.shape)
    print(target.shape)
    print(separator)

In [None]:
# SMOTE is provided by the imbalanced-learn package.
from imblearn.over_sampling import SMOTE

# Class counts in the training split before resampling.
print("Before OverSampling, counts of label '1': {}".format((data_y_train == 1).sum()))
print("Before OverSampling, counts of label '0': {} \n".format((data_y_train == 0).sum()))

# Resample the training split only; the test split is left as it is.
sm = SMOTE(random_state=0)
oversampled_x_train, oversampled_y_train = sm.fit_resample(data_x_train, data_y_train)

print("After OverSampling, the shape of oversampled_x_train: {}".format(oversampled_x_train.shape))
print("After OverSampling, the shape of oversampled_y_train: {} \n".format(oversampled_y_train.shape))

# Class counts after resampling, reused for the pie chart below.
oversampled_y_train = pd.Series(oversampled_y_train)
resampled_yes = (oversampled_y_train == 1).sum()
resampled_no = (oversampled_y_train == 0).sum()
print("After OverSampling, counts of label '1': {}".format(resampled_yes))
print("After OverSampling, counts of label '0': {}".format(resampled_no))

total_rows = data.shape[0]

# Plot on a Pie chart
fig, ax = plt.subplots(figsize=(7, 7))
ax.pie(
    [resampled_yes, resampled_no],
    labels=['YES', 'NO'],
    autopct='%1.0f%%',
    colors=['lightgreen', '#66b3ff'],
)
ax.set_title("Resolved Class Imbalance in Attrition values", fontsize=20)
plt.show()

In [None]:
# Preview the first rows of the resampled training features.
oversampled_x_train.head()

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_auc_score

# Train a logistic-regression classifier on the resampled training set and
# predict the held-out test split.
# NOTE(review): the features are not scaled and max_iter is left at its default;
# check this cell's output for a ConvergenceWarning.
log_reg = LogisticRegression(class_weight='balanced')
log_reg.fit(oversampled_x_train, oversampled_y_train)
log_pred = log_reg.predict(data_x_test)

# Confusion matrix: rows are actual classes, columns are predicted classes.
log_conf = confusion_matrix(data_y_test, log_pred)

# Visualize the confusion matrix using seaborn
plt.figure(figsize=(8, 5))
sns.heatmap(log_conf, annot=True, cmap='Blues', linewidths=.5)

# Individual cells, with class 1 (attrition) as the positive class.
log_tn, log_fp, log_fn, log_tp = log_conf.ravel()

log_accuracy = (log_tn + log_tp) / log_conf.sum()
log_prec = log_tp / (log_fp + log_tp)
log_sens = log_tp / (log_fn + log_tp)
log_spec = log_tn / (log_tn + log_fp)
# Computed from the hard 0/1 predictions, not from predicted probabilities.
log_roc_auc_score = roc_auc_score(data_y_test, log_pred)
log_F1 = 2 * (log_prec * log_sens) / (log_prec + log_sens)

for label, value in (
    ("Accuracy", log_accuracy),
    ("Precision", log_prec),
    ("Sensitivity", log_sens),
    ("Specificity", log_spec),
    ("ROC AUC Score", log_roc_auc_score),
    ("F1 Score", log_F1),
):
    print(label + ": " + str(value))

# Side-by-side view of actual and predicted labels for the last test rows.
results_normalized = pd.DataFrame({"Actual Values": data_y_test, "Predicted Values": log_pred})
results_normalized.tail()

### XGBoost Classifier (a separate model, trained to improve on the accuracy of Logistic Regression)

In [None]:
# Train an XGBoost classifier (a separate model from the logistic regression
# above) on the same resampled training set and evaluate it on the test split.

from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score


# Fit the model on the training data
xg_model = XGBClassifier()
xg_model.fit(oversampled_x_train, oversampled_y_train)


# Prediction using XGBoost
y_pred = xg_model.predict(data_x_test)
predictions = [round(value) for value in y_pred]


# Confusion matrix: rows are actual classes, columns are predicted classes.
xg_log_conf = confusion_matrix(data_y_test, predictions)

# Visualize the confusion matrix using seaborn
plt.figure(figsize = (8,5))
sns.heatmap(xg_log_conf, annot=True, cmap='Blues', linewidths=.5)

# Every metric below is derived from XGBoost's own confusion matrix and
# predictions (previously some used the logistic-regression results).
xg_log_accuracy = xg_log_conf.diagonal().sum() / xg_log_conf.sum()
print("Accuracy: " + str(xg_log_accuracy))

xg_log_prec = xg_log_conf[1,1] / (xg_log_conf[0,1] + (xg_log_conf[1,1]))
print("Precision: " + str(xg_log_prec))

xg_log_sens = xg_log_conf[1,1] / (xg_log_conf[1,0] + (xg_log_conf[1,1]))
print("Sensitivity: " + str(xg_log_sens))

xg_log_spec = xg_log_conf[0,0] / (xg_log_conf[0,0] + (xg_log_conf[0,1]))
print("Specificity: " + str(xg_log_spec))

# Computed from hard 0/1 predictions, consistent with the other models in this notebook.
xg_log_roc_auc_score = roc_auc_score(data_y_test, predictions)
print("ROC AUC Score: " + str(xg_log_roc_auc_score))

xg_log_F1 = 2 * (xg_log_prec * xg_log_sens) / (xg_log_prec + xg_log_sens)
print('F1 Score: ' + str(xg_log_F1))

# Side-by-side view of actual and XGBoost-predicted labels for the last test rows.
results_normalized = pd.DataFrame({"Actual Values":data_y_test,"Predicted Values":predictions})
results_normalized.tail()

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Train a decision tree on the resampled training set and evaluate it on the test split.
# A fixed random_state makes the fitted tree (and every metric below) repeatable
# across runs, in line with the seeded train/test split and SMOTE steps.
dt = DecisionTreeClassifier(random_state=0)
dt.fit(oversampled_x_train, oversampled_y_train)

dt_pred = dt.predict(data_x_test)
# Confusion matrix: rows are actual classes, columns are predicted classes.
dt_conf = confusion_matrix(data_y_test, dt_pred)

# Visualize the confusion matrix using seaborn
plt.figure(figsize = (8, 5))
sns.heatmap(dt_conf, annot=True, cmap='Blues', linewidths=.5)

dt_accuracy = dt_conf.diagonal().sum() / dt_conf.sum()
print("Accuracy: " + str(dt_accuracy))

# Class 1 (attrition) is treated as the positive class.
dt_prec = dt_conf[1,1] / (dt_conf[0,1] + (dt_conf[1,1]))
print("Precision: " + str(dt_prec))

dt_sens = dt_conf[1,1] / (dt_conf[1,0] + (dt_conf[1,1]))
print("Sensitivity: " + str(dt_sens))

dt_spec = dt_conf[0,0] / (dt_conf[0,0] + (dt_conf[0,1]))
print("Specificity: " + str(dt_spec))

# Computed from hard 0/1 predictions, not from predicted probabilities.
dt_roc_auc_score = roc_auc_score(data_y_test, dt_pred)
print("ROC AUC Score: " + str(dt_roc_auc_score))

dt_F1 = 2 * (dt_prec * dt_sens) / (dt_prec + dt_sens)
print('F1 Score: ' + str(dt_F1))

# Side-by-side view of actual and predicted labels for the last test rows.
results_normalized = pd.DataFrame({"Actual Values":data_y_test,"Predicted Values":dt_pred})
results_normalized.tail()

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest on the resampled training set and evaluate it on the test split.
# A fixed random_state makes the fitted forest (and every metric below) repeatable
# across runs, in line with the seeded train/test split and SMOTE steps.
rf = RandomForestClassifier(random_state=0)
rf.fit(oversampled_x_train, oversampled_y_train)

rf_pred = rf.predict(data_x_test)
# Confusion matrix: rows are actual classes, columns are predicted classes.
rf_conf = confusion_matrix(data_y_test, rf_pred)

# Visualize the confusion matrix using seaborn
plt.figure(figsize = (8,5))
sns.heatmap(rf_conf, annot=True, cmap='Blues', linewidths=.5)

rf_accuracy = rf_conf.diagonal().sum() / rf_conf.sum()
print("Accuracy: " + str(rf_accuracy))

# Class 1 (attrition) is treated as the positive class.
rf_prec = rf_conf[1,1] / (rf_conf[0,1] + (rf_conf[1,1]))
print("Precision: " + str(rf_prec))

rf_sens = rf_conf[1,1] / (rf_conf[1,0] + (rf_conf[1,1]))
print("Sensitivity: " + str(rf_sens))

rf_spec = rf_conf[0,0] / (rf_conf[0,0] + (rf_conf[0,1]))
print("Specificity: " + str(rf_spec))

# Computed from hard 0/1 predictions, not from predicted probabilities.
rf_roc_auc_score = roc_auc_score(data_y_test, rf_pred)
print("ROC AUC Score: " + str(rf_roc_auc_score))

rf_F1 = 2 * (rf_prec * rf_sens) / (rf_prec + rf_sens)
print('F1 Score: ' + str(rf_F1))

# Side-by-side view of actual and predicted labels for the last test rows.
results_normalized = pd.DataFrame({"Actual Values":data_y_test,"Predicted Values":rf_pred})
results_normalized.tail()

### Model output parameters comparison

In [None]:
# One row per model, one column per test-set metric computed in the cells above.
model_names = ["Logistic Regression", "XGBoost", "Decision Tree", "Random Forest"]
accuracies = [log_accuracy, xg_log_accuracy, dt_accuracy, rf_accuracy]
specificities = [log_spec, xg_log_spec, dt_spec, rf_spec]
precisions = [log_prec, xg_log_prec, dt_prec, rf_prec]
sensitivities = [log_sens, xg_log_sens, dt_sens, rf_sens]
auc_scores = [log_roc_auc_score, xg_log_roc_auc_score, dt_roc_auc_score, rf_roc_auc_score]
f1_scores = [log_F1, xg_log_F1, dt_F1, rf_F1]

results = [
    list(row)
    for row in zip(
        model_names, accuracies, specificities, precisions, sensitivities, auc_scores, f1_scores
    )
]

metric_columns = ["Name", "Accuracy", "Specificity", "Precision", "Sensitivity", "ROC AUC Score", "F1 Score"]
results_df = pd.DataFrame(results, columns=metric_columns)
results_df

### Visualizing AUROC Curve

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve

# ROC curves are built from each model's predicted probability of class 1
# (attrition) on the test split.

# Calculate FPR, TPR, THRESHOLD for Logistic Regression Model
log_pred_prob = log_reg.predict_proba(data_x_test)
log_fpr, log_tpr, log_thre = roc_curve(data_y_test, log_pred_prob[:, 1])

# Calculate FPR, TPR, THRESHOLD for XGBoost Model
xg_pred_prob = xg_model.predict_proba(data_x_test)
xg_fpr, xg_tpr, xg_thre = roc_curve(data_y_test, xg_pred_prob[:, 1])

# Calculate FPR, TPR, THRESHOLD for Decision Tree Model
dt_pred_prob = dt.predict_proba(data_x_test)
dt_fpr, dt_tpr, dt_thre = roc_curve(data_y_test, dt_pred_prob[:, 1])

# Calculate FPR, TPR, THRESHOLD for Random Forest Model
rf_pred_prob = rf.predict_proba(data_x_test)
rf_fpr, rf_tpr, rf_thre = roc_curve(data_y_test, rf_pred_prob[:, 1])

roc_plot_df = pd.DataFrame({
    "FPR": [log_fpr, dt_fpr, rf_fpr],
    "TPR": [log_tpr, dt_tpr, rf_tpr],
})


fig, ax = plt.subplots(figsize=(12, 8))
# Each curve carries its own label so the legend cannot drift out of sync
# with the plotting order (the first curve is Logistic Regression, not XGBoost).
ax.plot(log_fpr, log_tpr, label='Logistic Regression')
ax.plot(xg_fpr, xg_tpr, label='XGBoost')
ax.plot(dt_fpr, dt_tpr, label='Decision Tree')
ax.plot(rf_fpr, rf_tpr, label='Random Forest')

ax.grid(True)
ax.set_title('Fpr vs Tpr on the Attrition Dataset')
ax.legend()
ax.xaxis.set_label_text('Fpr Value')
ax.yaxis.set_label_text('Tpr Value')

plt.show()


# These scores are computed from hard 0/1 predictions, as in the model cells
# above; they are not the areas under the probability-based curves just plotted.
# The XGBoost score is recomputed here from the model's own predictions.
xg_hard_label_auc = roc_auc_score(data_y_test, xg_model.predict(data_x_test))
print("AUCROC Score for Logistic Regression: " + str(log_roc_auc_score))
print("AUCROC Score for XGBoost: " + str(xg_hard_label_auc))
print("AUCROC Score for Decision Tree: " + str(dt_roc_auc_score))
print("AUCROC Score for Random Forest: " + str(rf_roc_auc_score))

### Further comparison using Cross-Validation of scores

In [None]:
from sklearn.model_selection import cross_val_score
temp_df = []


def _cv_accuracy(model):
    """Return the 5-fold cross-validated accuracy scores of `model` on data_x / data_y."""
    return cross_val_score(model, data_x, data_y, scoring='accuracy', cv=5)


# Cross-validation runs on the full (not resampled) feature matrix and target.
score_log_reg = _cv_accuracy(log_reg)   # Logistic Regression
score_xg_reg = _cv_accuracy(xg_model)   # XGBoost
score_dt = _cv_accuracy(dt)             # Decision Tree
score_rf = _cv_accuracy(rf)             # Random Forest


# One row per model: mean, worst and best fold accuracy.
results = [
    [name, scores.mean(), scores.min(), scores.max()]
    for name, scores in (
        ("Logistic Regression", score_log_reg),
        ("XGBoost", score_xg_reg),
        ("Decision Tree", score_dt),
        ("Random Forest", score_rf),
    )
]

results_df = pd.DataFrame(results, columns=["Name", "Mean Accuracy", "Minimum Accuracy", "Maximum Accuracy"])
results_df

### Conclusion: The XGBoost classifier seems to be the best fit