In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, cohen_kappa_score
from sklearn.model_selection import cross_validate

# Show every row and every column when a DataFrame is displayed (no truncation).
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)
import warnings
# Silence all warnings for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by pandas / scikit-learn / seaborn — consider narrowing the filter.
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns

## Read csv

In [None]:
# Load the HR attrition CSV into `at` and show its (rows, columns) size.
csv_path = r"../input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv"
at = pd.read_csv(csv_path)
at.shape

In [None]:
# Preview the first rows of the raw frame.
at.head()

In [None]:
# Dimensions of the raw frame as (rows, columns).
at.shape

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
at.describe()

## Data Preprocessing

In [None]:
# Column dtypes and non-null counts; the author's reading of this output is that there are no missing values.
at.info()

In [None]:
# Total number of missing cells across the whole frame (sum over columns, then over the column totals).
at.isnull().sum().sum()

In [None]:
# Class counts of the target column; the author notes the classes are highly imbalanced.
at.Attrition.value_counts()

In [None]:
# Checking for outliers in numeric columns: one box plot per column
lst = ['Age','DailyRate','DistanceFromHome','HourlyRate','MonthlyIncome','MonthlyRate']
for col in lst:
    # Pass the series explicitly as `x=`: a bare positional argument is deprecated in
    # seaborn 0.11 and is interpreted as `data` (not `x`) from seaborn 0.12 on.
    sns.boxplot(x=at[col])
    plt.title(col)
    plt.show()

In [None]:
# Upper outlier fence for MonthlyIncome using the 1.5 * IQR rule.
Q1 = np.quantile(at.MonthlyIncome, [0.25])
Q3 = np.quantile(at.MonthlyIncome, [0.75])
IQR = Q3 - Q1
# Derive the cutoff from the quartiles computed above rather than a hard-coded literal
# (the literal 16581 was used here before — presumably this same fence; confirm).
upper_fence = float(Q3[0] + 1.5 * IQR[0])
# Shape of the sub-frame holding the rows above the fence
at[at.MonthlyIncome > upper_fence].shape

In [None]:
# Monthly income column contains outliers
# Applying log to monthly income to compress the long right tail.
# NOTE: this overwrites the column, so running the cell twice applies the log twice.
at['MonthlyIncome'] = np.log(at['MonthlyIncome'])
# Plot the transformed column as stored; wrapping it in np.log again would show log(log(income)).
sns.boxplot(x=at['MonthlyIncome'])

### Dropping Zero Variance Predictors

In [None]:
# Distinct values of EmployeeCount and how often each occurs (zero-variance check).
at.EmployeeCount.value_counts()

In [None]:
# Distinct values of Over18 and how often each occurs (zero-variance check).
at.Over18.value_counts()

In [None]:
# Distinct values of StandardHours and how often each occurs (zero-variance check).
at.StandardHours.value_counts()

In [None]:
# Drop the three columns inspected above, plus EmployeeNumber.
# NOTE(review): EmployeeNumber looks like a row identifier rather than a zero-variance column — confirm intent.
# Reassign (no inplace=True) and ignore columns that are already gone, so re-running the cell does not raise KeyError.
at = at.drop(columns = ['EmployeeCount','EmployeeNumber','Over18','StandardHours'], errors = 'ignore')

## Feature Engineering

**1. Binning**

In [None]:
# Quartile (equal-frequency) buckets for age and commute distance.
at['Age_bins'] = pd.qcut(at.Age, [0,0.25,0.50,0.75,1.0], labels= ['Young','Adults','Middle_Aged','Old']).astype(object)
at['Distance_bins'] = pd.qcut(at.DistanceFromHome, [0,0.25,0.50,0.75,1.0], labels= ['VeryClose','Medium','Far','VeryFar']).astype(object)
# Fixed-edge experience bands. include_lowest=True keeps TotalWorkingYears == 0 in the first band;
# with the default right-closed bins the first interval is (0, 1], so zeros match no bin and become NaN
# (and would then get all-zero indicator columns after one-hot encoding).
# NOTE(review): values above 40 would still become NaN — confirm 40 covers the data.
at['Experiance_bins'] = pd.cut(at.TotalWorkingYears, [0,1,5,10,15,40], labels = ['Freshers','Associate','SnAssociate','Lead','SnLead'], include_lowest = True).astype(object)
# Promotion recency split at the median and the 75th percentile.
at['Promotion_bins'] = pd.qcut(at.YearsSinceLastPromotion, [0,0.50,0.75,1.0], labels = ['0_1','2_3','3_15']).astype(object)

**2. Feature Encoding**

In [None]:
# Separate the predictors (every column except the target) from the target column.
at_x = at.drop(columns=['Attrition'])
at_y = at.loc[:, 'Attrition']

In [None]:
# Column names grouped by dtype.
# "number" instead of "int64" so float columns (e.g. MonthlyIncome after the log transform) are not
# silently left out of the numeric group.
numcols = at_x.select_dtypes(include = "number").columns
factcols = at_x.select_dtypes(include = "object").columns

In [None]:
# Label Encoding

# le = LabelEncoder()
# at_x[factcols] = at_x[factcols].apply(le.fit_transform)

In [None]:
# One hot encoding: expand every object-dtype column into indicator columns

categorical_columns = at_x.select_dtypes(include=['object']).columns
at_x = pd.get_dummies(at_x, columns=categorical_columns)
at_x.head()

In [None]:
# Keep an independent copy of the fully encoded feature frame.
at1 = at_x.copy()

## Feature Selection

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif

In [None]:
# feat_IG = pd.DataFrame({'Features':at_x.columns,'Information_Gain':mutual_info_classif(at_x,at_y,random_state = 100)}).sort_values('Information_Gain',ascending = False)
# list(feat_IG.iloc[0:25,:]['Features'])

In [None]:
# SelectKBest with the chi-squared score; k='all' keeps every feature, so this only ranks them

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

test = SelectKBest(score_func=chi2, k='all')
fited = test.fit(at_x, at_y)

# One row per feature, highest chi-squared score first, indexed by feature name
chi2_scores = pd.DataFrame({'Features': at_x.columns, 'Feature_Imp': fited.scores_})
chi2_scores = chi2_scores.sort_values(by='Feature_Imp', ascending=False)
fi1 = chi2_scores.set_index('Features')
fi1

In [None]:
# Recursive Feature Elimination (RFE) with AdaBoost as the ranking estimator

from sklearn.feature_selection import RFE

from sklearn.ensemble import AdaBoostClassifier
# Fixed seed so the feature ranking is reproducible between runs
ada = AdaBoostClassifier(random_state = 100)

# n_features_to_select must be given by keyword: scikit-learn >= 1.0 rejects it as a positional argument
rfe = RFE(ada, n_features_to_select = 20)
rfe.fit(at_x, at_y)

# ranking_ == 1 marks the 20 selected features; larger ranks were eliminated earlier.
# Sorted descending, so the least relevant features are listed first.
feat = pd.DataFrame({'Features':list(at_x.columns),'Feature_relevance_ranking':list(rfe.ranking_)}).sort_values('Feature_relevance_ranking',ascending = False).set_index('Features')
feat

In [None]:
# Final: SelectKBest & RFE
# Features judged irrelevant by both methods: chi-squared score < 2 AND RFE rank > 1 (i.e. not selected by RFE)
low_chi2 = fi1[fi1.Feature_Imp < 2].index
not_selected = feat[feat.Feature_relevance_ranking > 1].index
# Index.intersection is the explicit set operation; `index & index` no longer means intersection in pandas 2
set(low_chi2.intersection(not_selected))

In [None]:
# Only SelectKBest and RFE
# Features to discard (hard-coded; presumably copied from the output of the comparison cell — confirm)
irrelevant_feat = ['Age_bins_Adults','BusinessTravel_Travel_Rarely','Department_Human Resources','Distance_bins_Far',
                   'Distance_bins_Medium','Education','EducationField_Human Resources','EducationField_Life Sciences',
                   'EducationField_Other','Experiance_bins_Lead','Experiance_bins_SnAssociate','Gender_Female','Gender_Male',
                   'JobRole_Human Resources','JobRole_Research Scientist','JobRole_Sales Executive','PercentSalaryHike',
                   'PerformanceRating','Promotion_bins_0_1','Promotion_bins_2_3','RelationshipSatisfaction']

# Filter rather than list.remove(): remove() raises ValueError for a name that is no longer in at_x,
# which made this cell fail when re-run after at_x had been narrowed to final_feat.
final_feat = [name for name in at_x.columns if name not in irrelevant_feat]
print(final_feat)

### Train Test Split

In [None]:
# Encode the target as 0/1 ('Yes' -> 1, everything else -> 0; already-encoded 1s stay 1, so re-running is safe).
# Assign the result instead of calling replace(inplace=True) on a column selection, which may act on a temporary copy.
at['Attrition'] = at['Attrition'].isin(['Yes', 1]).astype(int)
# at_y was extracted before this encoding; rebind it so the split and the metrics below
# (which use the default positive label 1) see integer labels.
at_y = at['Attrition']

In [None]:
# Keep only the columns listed in final_feat.
at_x = at_x[final_feat]

In [None]:
# 70/30 hold-out split, stratified on the target, with a fixed seed
at_train_x, at_test_x, at_train_y, at_test_y = train_test_split(
    at_x,
    at_y,
    test_size=0.3,
    stratify=at_y,
    random_state=100,
)

### SmoteTomek

In [None]:
from collections import Counter
# Class distribution of the full target vector (before any resampling).
Counter(at_y)

In [None]:
from imblearn.combine import SMOTETomek

# sampling_strategy must be passed by keyword (current imbalanced-learn rejects it as a positional argument);
# 0.40 is the minority/majority ratio targeted by the over-sampling step. Seeded for reproducibility.
# Named `resampler` rather than `os` to avoid shadowing the standard-library module name.
resampler = SMOTETomek(sampling_strategy=0.40, random_state=100)
# Resample the training partition only; the hold-out set keeps its original distribution.
X_train_ns,y_train_ns=resampler.fit_resample(at_train_x,at_train_y)
print("The number of classes before fit {}".format(Counter(at_train_y)))
print("The number of classes after fit {}".format(Counter(y_train_ns)))

# Model Building

In [None]:
# Per-model metric accumulators: the model cells below append their test-set scores,
# in the same order as the names in `models`.
acc, prec, recall, f1 = [], [], [], []
models = [
    'Logistic',
    'DecisionTree',
    'RandomForest',
    'AdaBoost',
    'Xgboost',
    'NaiveBayes',
    'SVM',
    'VotingClassifier',
]

In [None]:
# Preview the first rows of the training features.
at_train_x.head()

In [None]:
# Logistic Regression

from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression(solver='liblinear')
logmodel.fit(at_train_x, at_train_y)
valid_pred = logmodel.predict(at_test_x) ; train_pred = logmodel.predict(at_train_x)

# scikit-learn metrics take (y_true, y_pred). With the arguments swapped, precision and recall
# trade places and the confusion matrix is transposed, so the true labels always go first.
train_score = accuracy_score(at_train_y, train_pred)
acc.append(round(accuracy_score(at_test_y, valid_pred),2))
f1.append(round(f1_score(at_test_y, valid_pred),2))
prec.append(round(precision_score(at_test_y, valid_pred),2))
recall.append(round(recall_score(at_test_y, valid_pred),2))

print("*******For test data********")
print(confusion_matrix(at_test_y, valid_pred))
print(classification_report(at_test_y, valid_pred))
print("******For train data*******")
print(classification_report(at_train_y, train_pred))

In [None]:
# Decision Tree

from sklearn.tree import DecisionTreeClassifier
# Seeded so the fitted tree (and the scores below) are reproducible
dtree = DecisionTreeClassifier(max_depth = 12, random_state = 100)
dtree.fit(at_train_x, at_train_y)
valid_pred = dtree.predict(at_test_x) ; train_pred = dtree.predict(at_train_x)

# scikit-learn metrics take (y_true, y_pred). With the arguments swapped, precision and recall
# trade places and the confusion matrix is transposed, so the true labels always go first.
train_score = accuracy_score(at_train_y, train_pred)
acc.append(round(accuracy_score(at_test_y, valid_pred),2))
f1.append(round(f1_score(at_test_y, valid_pred),2))
prec.append(round(precision_score(at_test_y, valid_pred),2))
recall.append(round(recall_score(at_test_y, valid_pred),2))

print("*******For test data********")
print(confusion_matrix(at_test_y, valid_pred))
print(classification_report(at_test_y, valid_pred))
print("******For train data*******")
print(classification_report(at_train_y, train_pred))

In [None]:
# Random Forest

from sklearn.ensemble import RandomForestClassifier
# Seeded so the fitted forest (and the scores below) are reproducible
rf = RandomForestClassifier(random_state = 100)
rf.fit(at_train_x, at_train_y)
valid_pred = rf.predict(at_test_x) ; train_pred = rf.predict(at_train_x)

# scikit-learn metrics take (y_true, y_pred). With the arguments swapped, precision and recall
# trade places and the confusion matrix is transposed, so the true labels always go first.
train_score = accuracy_score(at_train_y, train_pred)
acc.append(round(accuracy_score(at_test_y, valid_pred),2))
f1.append(round(f1_score(at_test_y, valid_pred),2))
prec.append(round(precision_score(at_test_y, valid_pred),2))
recall.append(round(recall_score(at_test_y, valid_pred),2))

print("*******For test data********")
print(confusion_matrix(at_test_y, valid_pred))
print(classification_report(at_test_y, valid_pred))
print("******For train data*******")
print(classification_report(at_train_y, train_pred))

In [None]:
# Adaboost Classifier
from sklearn.ensemble import AdaBoostClassifier
# Seeded so the fitted ensemble (and the scores below) are reproducible
ad = AdaBoostClassifier(random_state = 100)
ad.fit(at_train_x, at_train_y)
valid_pred = ad.predict(at_test_x) ; train_pred = ad.predict(at_train_x)

# scikit-learn metrics take (y_true, y_pred). With the arguments swapped, precision and recall
# trade places and the confusion matrix is transposed, so the true labels always go first.
train_score = accuracy_score(at_train_y, train_pred)
acc.append(round(accuracy_score(at_test_y, valid_pred),2))
f1.append(round(f1_score(at_test_y, valid_pred),2))
prec.append(round(precision_score(at_test_y, valid_pred),2))
recall.append(round(recall_score(at_test_y, valid_pred),2))

print("*******For test data********")
print(confusion_matrix(at_test_y, valid_pred))
print(classification_report(at_test_y, valid_pred))
print("******For train data*******")
print(classification_report(at_train_y, train_pred))

In [None]:
# Adaboost Classifier after SmoteTomek
# Trained on the resampled training set, evaluated on the untouched hold-out set.
# Scores are only printed here; nothing is appended to the comparison lists.
from sklearn.ensemble import AdaBoostClassifier
# Seeded so the fitted ensemble (and the scores below) are reproducible
ad = AdaBoostClassifier(random_state = 100)
ad.fit(X_train_ns,y_train_ns)
valid_pred = ad.predict(at_test_x) ; train_pred = ad.predict(X_train_ns)

# scikit-learn metrics take (y_true, y_pred). With the arguments swapped, precision and recall
# trade places and the confusion matrix is transposed, so the true labels always go first.
train_score = accuracy_score(y_train_ns, train_pred)

print("*******For test data********")
print(confusion_matrix(at_test_y, valid_pred))
print(classification_report(at_test_y, valid_pred))
print("******For train data*******")
print(classification_report(y_train_ns, train_pred))

**After oversampling with SMOTETomek, the F1 score dropped to 0.51, compared with 0.62 without oversampling.**

In [None]:
# Xgboost
from xgboost import XGBClassifier
xgc = XGBClassifier()
xgc.fit(at_train_x, at_train_y)
valid_pred = xgc.predict(at_test_x) ; train_pred = xgc.predict(at_train_x)

# scikit-learn metrics take (y_true, y_pred). With the arguments swapped, precision and recall
# trade places and the confusion matrix is transposed, so the true labels always go first.
train_score = accuracy_score(at_train_y, train_pred)
acc.append(round(accuracy_score(at_test_y, valid_pred),2))
f1.append(round(f1_score(at_test_y, valid_pred),2))
prec.append(round(precision_score(at_test_y, valid_pred),2))
recall.append(round(recall_score(at_test_y, valid_pred),2))

print("*******For test data********")
print(confusion_matrix(at_test_y, valid_pred))
print(classification_report(at_test_y, valid_pred))
print("******For train data*******")
print(classification_report(at_train_y, train_pred))

In [None]:
# Naive Bayes

from sklearn.naive_bayes import MultinomialNB

# NOTE(review): MultinomialNB requires non-negative feature values — confirm this holds for every column in at_x.
naive_bay = MultinomialNB()
naive_bay.fit(at_train_x, at_train_y)
valid_pred = naive_bay.predict(at_test_x) ; train_pred = naive_bay.predict(at_train_x)

# scikit-learn metrics take (y_true, y_pred). With the arguments swapped, precision and recall
# trade places and the confusion matrix is transposed, so the true labels always go first.
train_score = accuracy_score(at_train_y, train_pred)
acc.append(round(accuracy_score(at_test_y, valid_pred),2))
f1.append(round(f1_score(at_test_y, valid_pred),2))
prec.append(round(precision_score(at_test_y, valid_pred),2))
recall.append(round(recall_score(at_test_y, valid_pred),2))

print("*******For test data********")
print(confusion_matrix(at_test_y, valid_pred))
print(classification_report(at_test_y, valid_pred))
print("******For train data*******")
print(classification_report(at_train_y, train_pred))

In [None]:
# SVM

from sklearn.svm import SVC

# NOTE(review): the features are not scaled before fitting the SVM — consider whether that is intended.
svc_model = SVC()
svc_model.fit(at_train_x, at_train_y)
valid_pred = svc_model.predict(at_test_x) ; train_pred = svc_model.predict(at_train_x)

# scikit-learn metrics take (y_true, y_pred). With the arguments swapped, precision and recall
# trade places and the confusion matrix is transposed, so the true labels always go first.
train_score = accuracy_score(at_train_y, train_pred)
acc.append(round(accuracy_score(at_test_y, valid_pred),2))
f1.append(round(f1_score(at_test_y, valid_pred),2))
prec.append(round(precision_score(at_test_y, valid_pred),2))
recall.append(round(recall_score(at_test_y, valid_pred),2))

print("*******For test data********")
print(confusion_matrix(at_test_y, valid_pred))
print(classification_report(at_test_y, valid_pred))
print("******For train data*******")
print(classification_report(at_train_y, train_pred))

In [None]:
# Voting Classifier

from sklearn.ensemble import VotingClassifier

# Only the two estimators that take part in the vote are created here; instantiating the
# others would just overwrite the fitted models of the earlier cells with unfitted ones.
ada = AdaBoostClassifier(random_state = 100)
xgc = XGBClassifier()

# Default (hard) voting over the two estimators.
# NOTE(review): with an even number of voters, ties are possible — consider an odd number or voting='soft'.
vc = VotingClassifier(estimators=[("xgc",xgc),("ada",ada)])

vc.fit(at_train_x, at_train_y)
valid_pred = vc.predict(at_test_x) ; train_pred = vc.predict(at_train_x)

# scikit-learn metrics take (y_true, y_pred). With the arguments swapped, precision and recall
# trade places and the confusion matrix is transposed, so the true labels always go first.
train_score = accuracy_score(at_train_y, train_pred)
acc.append(round(accuracy_score(at_test_y, valid_pred),2))
f1.append(round(f1_score(at_test_y, valid_pred),2))
prec.append(round(precision_score(at_test_y, valid_pred),2))
recall.append(round(recall_score(at_test_y, valid_pred),2))

print("*******For test data********")
print(confusion_matrix(at_test_y, valid_pred))
print(classification_report(at_test_y, valid_pred))
print("******For train data*******")
print(classification_report(at_train_y, train_pred))

In [None]:
# Side-by-side table of the test-set metrics collected for each baseline model.
metric_series = [
    pd.Series(models),
    pd.Series(acc),
    pd.Series(prec),
    pd.Series(recall),
    pd.Series(f1),
]
compare = pd.concat(metric_series, axis=1)
compare.columns = ['Models', 'Accuracy', 'Precision', 'Recall', 'f1']
compare

## Comparing Baseline Models

In [None]:
# Line chart of every metric across the baseline models, each point labelled with its value.
plt.subplots(figsize=(12,8))
metric_names = ('Accuracy','Precision','Recall','f1')

# One line per metric, drawn in legend order
for metric in metric_names:
    plt.plot(compare.Models, compare[metric], marker = '.')
plt.legend(metric_names)

# Write the two-decimal value next to every point (offset 2 points upward)
for metric in metric_names:
    for x, y in zip(compare.Models, compare[metric]):
        plt.annotate("{:.2f}".format(y),
                     (x, y),
                     textcoords="offset points",
                     xytext=(0,2))

plt.xlabel('Models')
plt.ylabel('Metrics')
plt.show()

**We can infer that, among all baseline models, AdaBoost performs well in terms of precision, recall and F1.**

## Hyperparameter tuning using GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

In [None]:
## HyperParameter Optimization

# Grid Search CV over AdaBoost's number of estimators and learning rate,
# scored by F1 on 5 stratified, shuffled folds of the training partition.

grid = {'n_estimators':[50,100,150,200,300],'learning_rate':[0.1, 1, 1.1, 1.2, 1.3, 1.4]}

# Seeded estimator: without it the candidate scores (and therefore best_params_) can change between runs
ada = AdaBoostClassifier(random_state = 100)
cv = StratifiedKFold(n_splits=5,shuffle = True,random_state = 100)
scorer = make_scorer(f1_score)
from sklearn.model_selection import GridSearchCV

clf=GridSearchCV(estimator = ada, param_grid = grid, cv=cv, n_jobs=-1,scoring = scorer)
grid_result = clf.fit(at_train_x, at_train_y)

In [None]:
# Parameter combination with the best mean cross-validated score.
best_grid=clf.best_params_
best_grid

In [None]:
# Mean cross-validated score (F1, per the scorer passed to the search) of the best combination.
clf.best_score_

In [None]:
# Print mean and standard deviation of the CV score for every parameter combination tried
cv_summary = grid_result.cv_results_
means = cv_summary['mean_test_score']
stds = cv_summary['std_test_score']
params = cv_summary['params']
for idx, param in enumerate(params):
    print("%f (%f) with: %r" % (means[idx], stds[idx], param))

## Stratified KFold Cross Validation

In [None]:
# Stratified KFold Cross Validation
# For each of 5 folds: fit AdaBoost on the fold's training part, report validation metrics,
# and also score the fold model on the hold-out test set.
from sklearn.model_selection import StratifiedKFold
Kfolds = StratifiedKFold(n_splits=5,shuffle = True , random_state = 100)
accuracy = []
f1_score_class1 = []
test_acc = []
test_f1 = []

# Fold only the training partition. Splitting the full at_x/at_y would put rows of the
# hold-out set (at_test_x) into the training folds and inflate the "test" scores below.
for i, (train_index, val_index) in enumerate(Kfolds.split(at_train_x, at_train_y), start=1):
    train_x , val_x = at_train_x.iloc[train_index], at_train_x.iloc[val_index]
    train_y , val_y = at_train_y.iloc[train_index], at_train_y.iloc[val_index]

    # Hyperparameters are hard-coded (presumably taken from the grid search — confirm); seeded for reproducibility
    ad = AdaBoostClassifier(learning_rate= 1.4, n_estimators = 50, random_state = 100)
    ad.fit(train_x, train_y)
    valid_pred = ad.predict(val_x) ; train_pred = ad.predict(train_x)
    test_pred = ad.predict(at_test_x)

    # scikit-learn metrics take (y_true, y_pred); the true labels go first so the
    # confusion matrix and the per-class precision/recall are not transposed.
    train_score = accuracy_score(train_y, train_pred)
    valid_score = accuracy_score(val_y, valid_pred)
    valid_f1 = f1_score(val_y, valid_pred)

    print("For CV = ",i)
    print(confusion_matrix(val_y, valid_pred))
    print("\nF1_Score:",valid_f1)
    print("\nAccuracy for Validation:",valid_score,'||',"\tAccuracy for Train:",train_score)
    print("\nClassification Report\n",classification_report(val_y, valid_pred))
    print("\n***************")
    accuracy.append(valid_score) ; f1_score_class1.append(valid_f1)
    test_acc.append(accuracy_score(at_test_y, test_pred)); test_f1.append(f1_score(at_test_y, test_pred))

print("Mean Accuracy",np.mean(accuracy),"\nMean F1_Score:",np.mean(f1_score_class1),"\nMean Test Accuracy",np.mean(test_acc)
     ,"\nMean Test F1score",np.mean(test_f1))

### Conclusion:

- After tuning the hyperparameters of the AdaBoost classifier, the F1 score on the test set increased from 0.62 (baseline model) to 0.67.
- The final accuracy and F1 score after Stratified K-Fold CV are 0.86 and 0.513, respectively.