In [None]:
# Setup

# common:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import matplotlib.patches as mpatches
from scipy.stats import norm
from scipy import stats
import time
import folium
import collections
import eli5 # Feature importance evaluation
import urllib
from PIL import Image

# for ML:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, average_precision_score, roc_curve, precision_recall_curve, classification_report, confusion_matrix, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, ShuffleSplit, cross_validate, cross_val_score, cross_val_predict, RandomizedSearchCV, GridSearchCV, learning_curve
from sklearn.pipeline import make_pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
from xgboost import XGBClassifier

# Imported Libraries
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.metrics import classification_report_imbalanced
from imblearn.pipeline import Pipeline
from collections import Counter

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides any deprecation or convergence warnings the
# libraries below may emit — confirm that is intended.
warnings.filterwarnings("ignore")

# set some display options:
# seaborn "whitegrid" style for the figures drawn later in the notebook
sns.set(style="whitegrid")
# show up to 36 columns when a DataFrame is displayed
pd.set_option("display.max_columns", 36)

In [None]:
import os

# Print the full path of every file found below the Kaggle input directory.
for root_dir, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

In [None]:
# load data:
# Location of the attrition CSV inside the Kaggle input directory.
file_path = '/kaggle/input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv'
# Read the CSV into `df`, the working DataFrame used by the cells below.
df = pd.read_csv(file_path)

In [None]:
# Preview the first five rows of the raw data.
df.head(n=5)

In [None]:
# Print the column names, non-null counts and dtypes of the frame.
df.info()

In [None]:
# From here on, render floats with two decimals in every DataFrame display.
pd.options.display.float_format = "{:.2f}".format
# Summary statistics of the frame's columns.
df.describe()

In [None]:
# Count the missing entries in each column.
df.isna().sum()

### To Drop:
+ EmployeeCount: Every row has the same value.
+ EmployeeNumber: Irrelevant variable; it is only an employee identifier.
+ Over18: Every row has the same value.
+ StandardHours: Every row has the same value.

In [None]:
# Drop the columns listed above: the three constant-valued ones and the
# EmployeeNumber identifier.
# Rebinding `df` (instead of inplace=True) together with errors="ignore" keeps
# this cell safe to re-run: a second execution no longer raises KeyError
# because the columns are already gone.
columns_to_drop = ['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours']
df = df.drop(columns=columns_to_drop, errors="ignore")

In [None]:
# Replace the target labels by their integer category codes.
# Categories are ordered by sorted label, so (presumably 'No'/'Yes' — confirm
# against the CSV) the cells below can treat 0 as "no attrition" and 1 as "attrition".
df['Attrition'] = df['Attrition'].astype('category').cat.codes

In [None]:
# Collect every column stored with object dtype; these are treated as the
# categorical features by the preprocessing cells below.
categorical_features = [name for name in df.columns if df[name].dtype == object]

# Echo the selected column names, each followed by a separator line.
for name in categorical_features:
    print(f"{name}")
    print("====================================")

In [None]:
# Collect every column that is NOT stored with object dtype; at this point the
# list still contains the encoded target column.
numerical_features = [name for name in df.columns if df[name].dtype != object]

# Echo the selected column names, each followed by a separator line.
for name in numerical_features:
    print(f"{name}")
    print("====================================")

In [None]:
# The target must not be used as an input feature.
# The membership check makes the cell idempotent: running it a second time no
# longer raises ValueError because 'Attrition' was already removed.
if 'Attrition' in numerical_features:
    numerical_features.remove('Attrition')

# EDA

### Attrition rate

In [None]:
# Share of each target class in percent. The classes are skewed; the
# resampling sections further down deal with that.
attrition_counts = df['Attrition'].value_counts()
total_rows = len(df)
for class_label, class_code in (('No Attrition', 0), ('Attrition', 1)):
    print(class_label, round(attrition_counts[class_code]/total_rows * 100,2), '% of the dataset')

In [None]:
# Bar chart with the number of rows per target class.
ax = sns.countplot(data=df, x='Attrition')
ax.set_title('Attrition Distributions \n (0: No Attrition || 1: Attrition)', fontsize=14)

In [None]:
# One panel per numerical feature (8 x 3 grid): overlaid histograms of the
# rows without attrition (blue) and with attrition (red).
fig = plt.figure(figsize=(20, 40))

stayed = df[df["Attrition"] == 0]
left = df[df["Attrition"] == 1]

for position, column_name in enumerate(numerical_features, start=1):
    ax = fig.add_subplot(8, 3, position)
    stayed[column_name].hist(bins=35, color='blue', label='Not Attrition', alpha=0.6, ax=ax)
    left[column_name].hist(bins=35, color='red', label='Attrition', alpha=0.6, ax=ax)
    ax.legend()
    ax.set_xlabel(column_name)
    ax.set_ylabel('count')

In [None]:
# One panel per categorical feature (3 x 3 grid): overlaid histograms of the
# category values, split by attrition outcome.
plt.figure(figsize=(20, 15))

# (target code, bar colour, legend entry) for the two outcomes, drawn in this order
outcome_styles = (
    (0, 'blue', 'Not Attrition'),
    (1, 'red', 'Attrition'),
)

for slot, cat_col in enumerate(categorical_features, start=1):
    plt.subplot(3, 3, slot)
    for code, colour, legend_label in outcome_styles:
        df.loc[df["Attrition"] == code, cat_col].hist(bins=35, color=colour, label=legend_label, alpha=0.6)
    plt.legend()
    plt.xlabel(cat_col)
    plt.ylabel('count')

**Conclusions:**

***
- `BusinessTravel` : Workers who travel frequently are more likely to quit than other employees.

- `Department` : Workers in `Research & Development` are more likely to stay than workers in other departments.

- `EducationField` : Workers with a `Human Resources` or `Technical Degree` background are more likely to quit than employees from other fields of education.

- `Gender` : `Male` employees are more likely to quit.

- `JobRole` : Workers in the `Laboratory Technician`, `Sales Representative`, and `Human Resources` roles are more likely to quit than workers in other positions.

- `MaritalStatus` : Workers whose marital status is `Single` are more likely to quit than those who are `Married` or `Divorced`.

- `OverTime` : Workers who work overtime are more likely to quit than others.

*** 

### Correlation Matrix

In [None]:
# Pearson correlation heatmap of the numeric columns (including the encoded target).
plt.figure(figsize=(30, 24))
palette = sns.diverging_palette(20, 220, n=256)
# The frame still contains object (string) columns. Older pandas silently
# skipped them in .corr(), but pandas >= 2.0 raises on them, so restrict the
# frame to numeric/bool columns explicitly; this works on every pandas version.
numeric_df = df.select_dtypes(include=['number', 'bool'])
corr = numeric_df.corr(method='pearson')
# vmax=.3 saturates the colour scale at |r| = 0.3 so that weak correlations stay visible.
sns.heatmap(corr, annot=True, cmap=palette, vmax=.3, center=0, square=True, linewidths=.5, annot_kws={"size":15}, cbar_kws={'shrink': .5})
plt.title('Correlation Matrix', size=15, weight='bold')

In [None]:
# Correlation of every numeric feature with the target, as a horizontal bar chart.
# Object columns are excluded up front: older pandas skipped them silently in
# corrwith(), while pandas >= 2.0 raises on non-numeric data.
numeric_df = df.select_dtypes(include=['number', 'bool'])
numeric_df.drop('Attrition', axis=1).corrwith(numeric_df['Attrition']).plot(kind='barh', figsize=(10, 7))

# ML

## Normal

In [None]:
# Feature matrix (numerical columns first, then categorical) and target vector.
features = numerical_features + categorical_features
Y = df['Attrition']
X = df.drop(columns='Attrition')[features]

# Hold out 30% of the rows as the test set.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y,
    test_size=0.3,
    random_state=42,
)

# Numerical branch: constant-value imputation (scikit-learn's default fill
# value for numeric data is 0), then standardisation.
num_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="constant")),
    ('scaler', StandardScaler()),
])

# Categorical branch: missing values become the label "Unknown", then one-hot
# encoding; categories unseen during fit are ignored at transform time.
cat_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="Unknown")),
    ("onehot", OneHotEncoder(handle_unknown='ignore')),
])

# Apply each branch to its own column group; output order is num, then cat.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, numerical_features),
        ("cat", cat_transformer, categorical_features),
    ]
)

In [None]:
# Candidate classifiers with library-default hyper-parameters.
# Every estimator that draws random numbers gets a fixed seed so that the
# cross-validation scores are reproducible across kernel restarts (the fold
# splitter below is already seeded).
base_models = {
    'LOR_model': LogisticRegression(),
    'KNC_model': KNeighborsClassifier(),
    'SVM_model': SVC(),
    'DTR_model': DecisionTreeClassifier(random_state=42),
    'RFC_model': RandomForestClassifier(random_state=42),
    'ETC_model': ExtraTreesClassifier(random_state=42),
    'BAG_model': BaggingClassifier(random_state=42),
    'MLP_model': MLPClassifier(random_state=42),
    'XGB_model': XGBClassifier(random_state=42),
}

# model name -> array with one accuracy value per fold
normal_model_score = {}

# split data into 'kfolds' parts for cross validation,
# use shuffle to ensure random distribution of data:
kfolds = 4 # 4 = 75% train, 25% validation
split = KFold(n_splits=kfolds, shuffle=True, random_state=42)

# Preprocessing, fitting, making predictions and scoring for every model:
for name, model in base_models.items():
    # Preprocessing lives inside the pipeline, so it is re-fitted on the
    # training part of every fold and never sees the validation part.
    model_steps = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)])

    # get cross validation score for each model:
    cv_results = cross_val_score(model_steps,
                                 X_train, Y_train,
                                 cv=split,
                                 scoring="accuracy",
                                 n_jobs=-1)
    normal_model_score[name] = cv_results

    # output:
    min_score = round(min(cv_results), 4)
    max_score = round(max(cv_results), 4)
    mean_score = round(np.mean(cv_results), 4)
    std_dev = round(np.std(cv_results), 4)
    print(f"{name} model cross validation accuracy score: {mean_score} +/- {std_dev} (std) min: {min_score}, max: {max_score}")

## Under Sampling

In [None]:
# The classes are highly skewed, so build a balanced subsample: every
# attrition row plus an equally sized slice of non-attrition rows
# (random under-sampling of the majority class).

# Shuffle first so that the non-attrition slice below is a random draw.
# NOTE(review): this rebinds `df` to the shuffled frame, so later cells that
# split `df` see the shuffled row order, and re-running this cell shuffles
# the already shuffled frame again.
df = df.sample(frac=1, random_state=42)

# Minority class: all rows with attrition.
fraud_df = df.loc[df['Attrition'] == 1]
# Majority class: take exactly as many rows as the minority class has,
# derived from the data instead of a hard-coded row count.
non_fraud_df = df.loc[df['Attrition'] == 0].iloc[:len(fraud_df)]

normal_distributed_df = pd.concat([fraud_df, non_fraud_df])

# Shuffle dataframe rows
new_df = normal_distributed_df.sample(frac=1, random_state=42)

new_df.head()

In [None]:
# Feature matrix and target vector taken from the balanced (under-sampled) frame.
features = numerical_features + categorical_features
Y = normal_distributed_df['Attrition']
X = normal_distributed_df.drop(columns='Attrition')[features]

# Hold out 30% of the balanced rows as the test set.
under_X_train, under_X_test, under_Y_train, under_Y_test = train_test_split(
    X, Y,
    test_size=0.3,
    random_state=42,
)

# Numerical branch: constant-value imputation (scikit-learn's default fill
# value for numeric data is 0), then standardisation.
num_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="constant")),
    ('scaler', StandardScaler()),
])

# Categorical branch: missing values become the label "Unknown", then one-hot
# encoding; categories unseen during fit are ignored at transform time.
cat_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="Unknown")),
    ("onehot", OneHotEncoder(handle_unknown='ignore')),
])

# Apply each branch to its own column group; output order is num, then cat.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, numerical_features),
        ("cat", cat_transformer, categorical_features),
    ]
)

In [None]:
# Candidate classifiers with library-default hyper-parameters.
# Every estimator that draws random numbers gets a fixed seed so that the
# cross-validation scores are reproducible across kernel restarts (the fold
# splitter below is already seeded).
base_models = {
    'LOR_model': LogisticRegression(),
    'KNC_model': KNeighborsClassifier(),
    'SVM_model': SVC(),
    'DTR_model': DecisionTreeClassifier(random_state=42),
    'RFC_model': RandomForestClassifier(random_state=42),
    'ETC_model': ExtraTreesClassifier(random_state=42),
    'BAG_model': BaggingClassifier(random_state=42),
    'MLP_model': MLPClassifier(random_state=42),
    'XGB_model': XGBClassifier(random_state=42),
}

# model name -> array with one accuracy value per fold (under-sampled data)
under_sampling_model_score = {}

# split data into 'kfolds' parts for cross validation,
# use shuffle to ensure random distribution of data:
kfolds = 4 # 4 = 75% train, 25% validation
split = KFold(n_splits=kfolds, shuffle=True, random_state=42)

# Preprocessing, fitting, making predictions and scoring for every model:
for name, model in base_models.items():
    # Preprocessing lives inside the pipeline, so it is re-fitted on the
    # training part of every fold and never sees the validation part.
    model_steps = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)])

    # get cross validation score for each model:
    cv_results = cross_val_score(model_steps,
                                 under_X_train, under_Y_train,
                                 cv=split,
                                 scoring="accuracy",
                                 n_jobs=-1)
    under_sampling_model_score[name] = cv_results

    # output:
    min_score = round(min(cv_results), 4)
    max_score = round(max(cv_results), 4)
    mean_score = round(np.mean(cv_results), 4)
    std_dev = round(np.std(cv_results), 4)
    print(f"{name} model cross validation accuracy score: {mean_score} +/- {std_dev} (std) min: {min_score}, max: {max_score}")

## Over sampling

In [None]:
# Feature matrix and target vector from the full frame.
# NOTE(review): if the under-sampling cell ran before this one, `df` is the
# shuffled frame, so this split differs from the one in the "Normal" section.
features = numerical_features + categorical_features
Y = df['Attrition']
X = df.drop(columns='Attrition')[features]

# Hold out 30% of the rows as the test set.
over_X_train, over_X_test, over_Y_train, over_Y_test = train_test_split(
    X, Y,
    test_size=0.3,
    random_state=42,
)

# Numerical branch: constant-value imputation (scikit-learn's default fill
# value for numeric data is 0), then standardisation.
num_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="constant")),
    ('scaler', StandardScaler()),
])

# Categorical branch: missing values become the label "Unknown", then one-hot
# encoding; categories unseen during fit are ignored at transform time.
cat_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="Unknown")),
    ("onehot", OneHotEncoder(handle_unknown='ignore')),
])

# Apply each branch to its own column group; output order is num, then cat.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, numerical_features),
        ("cat", cat_transformer, categorical_features),
    ]
)

# SMOTE over-sampler for the minority class. It is used as a pipeline step so
# that synthetic rows are generated from the training folds only.
sm = SMOTE(sampling_strategy='minority', random_state=42)

In [None]:
# Candidate classifiers with library-default hyper-parameters.
# Every estimator that draws random numbers gets a fixed seed so that the
# cross-validation scores are reproducible across kernel restarts (the fold
# splitter and SMOTE are already seeded).
base_models = {
    'LOR_model': LogisticRegression(),
    'KNC_model': KNeighborsClassifier(),
    'SVM_model': SVC(),
    'DTR_model': DecisionTreeClassifier(random_state=42),
    'RFC_model': RandomForestClassifier(random_state=42),
    'ETC_model': ExtraTreesClassifier(random_state=42),
    'BAG_model': BaggingClassifier(random_state=42),
    'MLP_model': MLPClassifier(random_state=42),
    'XGB_model': XGBClassifier(random_state=42),
}

# model name -> array with one accuracy value per fold (SMOTE over-sampling)
over_sampling_model_score = {}

# split data into 'kfolds' parts for cross validation,
# use shuffle to ensure random distribution of data:
kfolds = 4 # 4 = 75% train, 25% validation
split = KFold(n_splits=kfolds, shuffle=True, random_state=42)

# Preprocessing, fitting, making predictions and scoring for every model:
for name, model in base_models.items():
    # The imblearn Pipeline runs the SMOTE step during fit only, so every
    # validation fold is scored on real, un-resampled rows.
    model_steps = Pipeline(steps=[('preprocessor', preprocessor),
                              ('smote', sm),
                              ('model', model)])

    # get cross validation score for each model:
    cv_results = cross_val_score(model_steps,
                                 over_X_train, over_Y_train,
                                 cv=split,
                                 scoring="accuracy",
                                 n_jobs=-1)
    over_sampling_model_score[name] = cv_results

    # output:
    min_score = round(min(cv_results), 4)
    max_score = round(max(cv_results), 4)
    mean_score = round(np.mean(cv_results), 4)
    std_dev = round(np.std(cv_results), 4)
    print(f"{name} model cross validation accuracy score: {mean_score} +/- {std_dev} (std) min: {min_score}, max: {max_score}")
    

In [None]:
# Cross-validation results of the three sampling strategies, keyed by the
# label shown in the legend.
model_score = {
    "normal_model_score": normal_model_score,
    "under_sampling_model_score": under_sampling_model_score,
    "over_sampling_model_score": over_sampling_model_score
}

# One line per strategy: mean CV accuracy per model, with a shaded band of
# +/- one standard deviation around it.
figure = plt.figure(figsize=(15,12))
ax = plt.gca()
for strategy_name, scores_by_model in model_score.items():
    model_name = list(scores_by_model)
    centre = [round(np.mean(fold_scores), 4) for fold_scores in scores_by_model.values()]
    spread = [round(np.std(fold_scores), 4) for fold_scores in scores_by_model.values()]
    lower_band = [mid - dev for mid, dev in zip(centre, spread)]
    upper_band = [mid + dev for mid, dev in zip(centre, spread)]
    ax.plot(model_name, centre, 'o-', label=f"{strategy_name}")
    ax.fill_between(model_name, lower_band, upper_band, alpha=0.1)
ax.set_title("Sampling Score Curve", fontsize=14)
ax.set_xlabel('model name')
ax.set_ylabel('Score')
ax.grid(True)
ax.legend(loc="best")

Based on the comparison above, we select the over-sampling method.

In [None]:
print("Normal Model\n")

# Baseline: random forest on the original (un-resampled) training split.
model = RandomForestClassifier(n_jobs=-1, random_state=42)
model_steps = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

# Fit the whole pipeline, then predict the held-out rows.
Y_pred = model_steps.fit(X_train, Y_train).predict(X_test)

# Actual vs. predicted label for every test row.
ActVPred = pd.DataFrame({'Actual': Y_test, 'Predicted': Y_pred})
print(ActVPred)

# Per-class precision / recall / F1 on the test set.
labels = ['No Attrition', 'Attrition']
print(classification_report(Y_test, Y_pred, target_names=labels))

In [None]:
print("Over Sampling Model\n")
model = RandomForestClassifier(random_state=42, n_jobs=-1,)

# The SMOTE step was missing here, so this cell trained a plain random forest
# despite its "Over Sampling" label. With the imblearn Pipeline the sampler
# runs during fit only; predictions on the test set use real rows.
model_steps = Pipeline(steps=[
                            ('preprocessor', preprocessor),
                            ('smote', sm),
                            ('model', model)])

# fit model(pipeline) so values can be accessed:
model_steps.fit(over_X_train, over_Y_train)

# Actual vs. predicted label for every held-out row.
over_Y_pred = model_steps.predict(over_X_test)
ActVPred = pd.DataFrame({'Actual': over_Y_test, 'Predicted': over_Y_pred})
print(ActVPred)

# Per-class precision / recall / F1 on the test set.
labels = ['No Attrition', 'Attrition']
print(classification_report(over_Y_test, over_Y_pred, target_names=labels))

In [None]:
# Names of all (encoded) features are needed.
# Fetch the fitted one-hot encoder from inside the fitted pipeline.
onehot_encoder = (model_steps.named_steps['preprocessor']
                  .named_transformers_['cat']
                  .named_steps['onehot'])

# get_feature_names() was removed in scikit-learn 1.2; its replacement
# get_feature_names_out() exists since 1.0. Prefer the new method and fall
# back to the old one so the cell runs on both old and new installations.
if hasattr(onehot_encoder, 'get_feature_names_out'):
    onehot_columns = list(onehot_encoder.get_feature_names_out(categorical_features))
else:
    onehot_columns = list(onehot_encoder.get_feature_names(input_features=categorical_features))

# Add num_features for full list.
# Order must match the ColumnTransformer output, where the numerical block
# comes first and the one-hot block second:
feat_imp_list = numerical_features + onehot_columns

# show 10 most important features, provide names of features:
feat_imp_df = eli5.formatters.as_dataframe.explain_weights_df(
    model_steps.named_steps['model'],
    feature_names=feat_imp_list)
feat_imp_df.head(10)