In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import zscore
from sklearn.model_selection import train_test_split, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('emp_attrition.csv')
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,EducationField,EnvironmentSatisfaction,Gender,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MonthlyIncome,NumCompaniesWorked,OverTime,PercentSalaryHike,PerformanceRating,YearsAtCompany
0,33,Yes,Travel_Frequently,Research & Development,3,Life Sciences,1,Male,3,1,Research Scientist,1,3348,1,Yes,11,3,10
1,32,Yes,Travel_Rarely,Sales,4,Medical,4,Male,1,3,Sales Executive,4,10400,1,No,11,3,14
2,40,Yes,Travel_Rarely,Research & Development,9,Life Sciences,4,Male,3,1,Laboratory Technician,1,2018,3,No,14,3,5
3,42,No,Travel_Rarely,Research & Development,7,Medical,2,Female,4,2,Research Scientist,2,2372,6,Yes,16,3,1
4,43,No,Travel_Frequently,Research & Development,27,Life Sciences,3,Female,3,3,Manufacturing Director,1,10820,8,No,11,3,8


In [3]:
for i in df.columns:
    if df[i].dtype == 'object':
        df[i] = pd.Categorical(df[i]).codes

In [4]:
def detect_outlier(col):
    Q1, Q3 = np.percentile(col, [25, 75])
    IQR = Q3 - Q1
    lr, ur = Q1 - 1.5*(IQR), Q3 + 1.5*(IQR)
    return lr, ur

In [5]:
for i in df.columns:
    for j in range(len(df)):
        if (df[i].loc[j] < detect_outlier(df[i])[0]) or (df[i].loc[j] > detect_outlier(df[i])[1]):
            df.drop(j, axis=0, inplace=True)
    df.reset_index(drop=True, inplace=True)

In [6]:
df['Age_Cat'] = ''

condlist = [
    (df['Age'] > 18) & (df['Age'] <= 30),
    (df['Age'] > 30) & (df['Age'] <= 40),
    (df['Age'] > 40) & (df['Age'] <= 50),
    (df['Age'] > 50) & (df['Age'] <= 60)
]
choicelist = ['18-30', '31-40', '41-50', '51-60']

df['Age_Cat'] = np.select(condlist, choicelist)

In [7]:
df['Age_Cat'] = pd.Categorical(df['Age_Cat']).codes
df.drop('Age', axis=1, inplace=True)

In [8]:
X = df.drop('Attrition', axis=1)
y = df[['Attrition']]

X_train_final, X_test_final, y_train_final, y_test_final = train_test_split(X, y, test_size=0.3, random_state=42)
X_train_final.reset_index(drop=True, inplace=True)
X_test_final.reset_index(drop=True, inplace=True)
y_train_final.reset_index(drop=True, inplace=True)
y_test_final.reset_index(drop=True, inplace=True)

In [9]:
estimators = [DecisionTreeClassifier(), RandomForestClassifier()]

all_predictions = pd.DataFrame()
trained_models = []
kf = KFold(10, shuffle=True)

for i in range(len(estimators)):
    for train_index, test_index in kf.split(X_train_final):
        X_train, X_test = X_train_final.loc[train_index], X_train_final.loc[test_index]
        y_train, y_test = y_train_final.loc[train_index], y_train_final.loc[test_index]

        model = estimators[i]
        m = model.fit(X_train, y_train)
        trained_models.append(m)

        model_pred_train = pd.DataFrame(model.predict(X_train), index=train_index)
        model_pred_test = pd.DataFrame(model.predict(X_test), index=test_index)

        a = pd.concat([model_pred_train, model_pred_test], axis=0)
        all_predictions = pd.concat([all_predictions, a.sort_index()], axis=1)

all_predictions.columns = list(range(20))

In [19]:
all_predictions

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
4,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1001,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1002,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1003,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


In [10]:
trained_models[0]

In [11]:
#X_train, X_test, y_train, y_test = train_test_split(all_predictions, y, test_size=0.3, random_state=42)

model19 = AdaBoostClassifier()
model19.fit(all_predictions, y_train_final)

#y_pred19 = pd.DataFrame(model19.predict(X_test), index=y_test.index, columns=['Attrition'])
#print('AUC =',roc_auc_score(y_test, y_pred19))

In [12]:
X_test_final.head()

Unnamed: 0,BusinessTravel,Department,DistanceFromHome,EducationField,EnvironmentSatisfaction,Gender,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MonthlyIncome,NumCompaniesWorked,OverTime,PercentSalaryHike,PerformanceRating,YearsAtCompany,Age_Cat
0,2,1,18,4,4,0,3,1,6,4,2693,1,0,19,3,1,1
1,0,2,29,3,2,1,1,2,7,1,4969,8,0,18,3,2,1
2,1,1,3,1,3,1,2,1,6,4,2853,0,1,11,3,0,1
3,1,2,9,3,4,1,1,2,7,4,4668,0,0,17,3,8,2
4,2,1,17,3,3,1,3,2,2,1,4558,1,0,12,3,10,1


In [13]:
test_predictions = pd.DataFrame(columns=list(range(20)))

dummy = []
for i in range(len(X_test_final)):
    for j in range(len(trained_models)):
        dummy.append(trained_models[j].predict(X_test_final[X_test_final.index == i])[0])
    test_predictions.loc[len(test_predictions.index)] = dummy
    dummy = []

In [14]:
test_predictions.head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
9,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


In [15]:
y_pred19 = pd.DataFrame(model19.predict(test_predictions), columns=['Attrition'], index=y_test_final.index)

In [16]:
y_test_final

Unnamed: 0,Attrition
0,1
1,1
2,1
3,0
4,0
...,...
427,0
428,1
429,0
430,0


In [17]:
roc_auc_score(y_test_final, y_pred19)

0.9526741387907299

In [18]:
model19.score(test_predictions, y_test_final)

0.9537037037037037