In [None]:
## importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the IBM HR attrition CSV and preview the first rows
csv_path = r'../input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
df.info()

In [None]:
df.shape

In [None]:
df.columns

In [None]:
# Columns stored with object dtype are treated as categorical
categorical_features = list(df.dtypes[df.dtypes == 'O'].index)
len(categorical_features)

In [None]:
# Everything that is not categorical is treated as numerical
numerical_features = [col for col in df.columns if not (col in categorical_features)]
len(numerical_features)

In [None]:
# Print the frequency of every level for each categorical column
for feature in categorical_features:
    level_counts = df[feature].value_counts()
    print(level_counts)

In [None]:
# Over18 is 'Yes' for every row, so it carries no information
df.drop(columns=['Over18'], inplace=True)

In [None]:
# Number of distinct values per numerical column
for feature, n_distinct in df[numerical_features].nunique().items():
    print(feature, n_distinct)

In [None]:
# EmployeeCount and StandardHours each hold a single value across the whole column
df.drop(columns=['EmployeeCount', 'StandardHours'], inplace=True)

In [None]:
# Recompute the categorical column list after the drops above
categorical_features = list(df.dtypes[df.dtypes == 'O'].index)
len(categorical_features)

In [None]:
# Recompute the numerical column list after the drops above
numerical_features = [col for col in df.columns if not (col in categorical_features)]
len(numerical_features)

In [None]:
# Numerical columns with at most 10 distinct values are treated as discrete
discrete_numerical_features = [
    feature for feature in numerical_features
    if df[feature].nunique() <= 10
]

In [None]:
len(discrete_numerical_features)

In [None]:
# Keep only the continuous numerical columns (drop the discrete ones found above)
numerical_features = [col for col in numerical_features if not (col in discrete_numerical_features)]
len(numerical_features)

In [None]:
df.describe()

In [None]:
df.describe(include = ['O'])

In [None]:
# Encode the target as 1 ("Yes") / 0 ("No").
# The guard keeps the cell re-runnable: mapping the already-encoded integers
# through the dict a second time would silently turn every value into NaN.
Attrition_mapping = {"Yes": 1, "No": 0}
if not pd.api.types.is_numeric_dtype(df['Attrition']):
    df['Attrition'] = df['Attrition'].map(Attrition_mapping)

In [None]:
# Pass the vector by keyword: recent seaborn releases treat the first
# positional argument as `data`, not as `x`.
sns.countplot(x=df['Attrition'])

In [None]:
# Split the frame by target value and report the share of employees who left
attrition = df[(df['Attrition'] != 0)]
no_attrition = df[(df['Attrition'] == 0)]
# Format as a real percentage (the raw ratio was previously printed under a "Percentage" label)
print('Percentage of Attrition: {:.2%}'.format(len(attrition)/len(df)))

In [None]:
df[['Gender', 'Attrition']].groupby(['Gender'], as_index=False).mean().sort_values(by='Attrition', ascending=False)

Males have a higher attrition rate than females.

In [None]:
df[['BusinessTravel', 'Attrition']].groupby(['BusinessTravel'], as_index=False).mean().sort_values(by='Attrition', ascending=False)

People who travel frequently have a much higher attrition rate than others.

In [None]:
df[['Department', 'Attrition']].groupby(['Department'], as_index=False).mean().sort_values(by='Attrition', ascending=False)

Sales and HR dept have higher attrition rate than R&D

In [None]:
df[['EducationField', 'Attrition']].groupby(['EducationField'], as_index=False).mean().sort_values(by='Attrition', ascending=False)

In [None]:
df[['JobRole', 'Attrition']].groupby(['JobRole'], as_index=False).mean().sort_values(by='Attrition', ascending=False)

Sales Representatives have higher attrition rate

In [None]:
df[['MaritalStatus', 'Attrition']].groupby(['MaritalStatus'], as_index=False).mean().sort_values(by='Attrition', ascending=False)

People whose marital status is Single have a much higher chance of attrition.

In [None]:
df[['OverTime', 'Attrition']].groupby(['OverTime'], as_index=False).mean().sort_values(by='Attrition', ascending=False)

People working over time have higher attrition rate

In [None]:
sns.set_style('whitegrid')
sns.distplot(df['Age'], bins = 10)

In [None]:
g = sns.FacetGrid(df, col='Attrition')
g.map(plt.hist, 'Age', bins=15)

People with age 28-34 have higher attrition rate

In [None]:
sns.distplot(df['MonthlyIncome'], bins = 15)

In [None]:
sns.set_style('whitegrid')
g = sns.FacetGrid(df, col='Attrition')
g.map(plt.hist, 'MonthlyIncome', bins=15)

People with a monthly income greater than 10000 very rarely leave the company; the few who do may be considered outliers.

In [None]:
sns.distplot(df['DistanceFromHome'], bins = 15)

In [None]:
sns.set_style('whitegrid')
g = sns.FacetGrid(df, col='Attrition')
g.map(plt.hist, 'DistanceFromHome', bins=15)

**TODO:** Discuss the DistanceFromHome distributions with Nitin Sir.

In [None]:
sns.distplot(df['DailyRate'])

In [None]:
sns.set_style('whitegrid')
g = sns.FacetGrid(df, col='Attrition')
g.map(plt.hist, 'DailyRate', bins=15)

In [None]:
# x/y must be keywords: recent seaborn releases reject two positional vectors
sns.boxplot(x=df['Attrition'], y=df['DailyRate'])

In [None]:
sns.distplot(df['MonthlyRate'])

In [None]:
sns.set_style('whitegrid')
g = sns.FacetGrid(df, col='Attrition')
g.map(plt.hist, 'MonthlyRate', bins=15)

In [None]:
# x/y must be keywords: recent seaborn releases reject two positional vectors
sns.boxplot(x=df['Attrition'], y=df['MonthlyRate'])

In [None]:
sns.distplot(df['HourlyRate'])

In [None]:
sns.set_style('whitegrid')
g = sns.FacetGrid(df, col='Attrition')
g.map(plt.hist, 'HourlyRate', bins=15)

In [None]:
# x/y must be keywords: recent seaborn releases reject two positional vectors
sns.boxplot(x=df['Attrition'], y=df['HourlyRate'])

The daily, monthly and hourly rates do not have much significance: attrition is almost uniform across their ranges, and these are just features extracted from the salary itself.

In [None]:
sns.distplot(df['PercentSalaryHike'])

In [None]:
# x/y must be keywords: recent seaborn releases reject two positional vectors
sns.boxplot(x=df['Attrition'], y=df['PercentSalaryHike'])

This column has a similar distribution for both attrition classes, so it can be eliminated.

In [None]:
numerical_features

In [None]:
sns.distplot(df['YearsAtCompany'])

In [None]:
sns.set_style('whitegrid')
g = sns.FacetGrid(df, col='Attrition')
g.map(plt.hist, 'YearsAtCompany', bins=15)

Employees who are new to the company (few years at the company) have a higher attrition rate.

In [None]:
# x/y must be keywords: recent seaborn releases reject two positional vectors
sns.boxplot(x=df['Attrition'], y=df['YearsAtCompany'])

In [None]:
sns.set_style('whitegrid')
g = sns.FacetGrid(df, col='Attrition')
g.map(plt.hist, 'YearsInCurrentRole', bins=15)

In [None]:
sns.set_style('whitegrid')
g = sns.FacetGrid(df, col='Attrition')
g.map(plt.hist, 'YearsSinceLastPromotion', bins=15)

In [None]:
sns.set_style('whitegrid')
g = sns.FacetGrid(df, col='Attrition')
g.map(plt.hist, 'YearsWithCurrManager', bins=15)

In [None]:
sns.set_style('whitegrid')
g = sns.FacetGrid(df, col='Attrition')
g.map(plt.hist, 'TotalWorkingYears', bins=15)

In [None]:
# Age histograms faceted by attrition (columns) and marital status (rows).
# `height` is the current name of the facet-size parameter (formerly `size`).
grid = sns.FacetGrid(df, col='Attrition', row='MaritalStatus', height=2.2, aspect=1.6)
grid.map(plt.hist, 'Age', alpha=.8, bins=15)
grid.add_legend();

People whose marital status is 'Single' and who are also within the 28-34 age group have a higher attrition rate.

In [None]:
df[['Education', 'Attrition']].groupby(['Education'], as_index=False).mean().sort_values(by='Attrition', ascending=False)

Bachelors and below bachelors have higher attrition rate

In [None]:
df[['EnvironmentSatisfaction', 'Attrition']].groupby(['EnvironmentSatisfaction'], as_index=False).mean().sort_values(by='Attrition', ascending=False)

In [None]:
df[['JobInvolvement', 'Attrition']].groupby(['JobInvolvement'], as_index=False).mean().sort_values(by='Attrition', ascending=False)

In [None]:
df[['JobLevel', 'Attrition']].groupby(['JobLevel'], as_index=False).mean().sort_values(by='Attrition', ascending=False)

In [None]:
df[['JobSatisfaction', 'Attrition']].groupby(['JobSatisfaction'], as_index=False).mean().sort_values(by='Attrition', ascending=False)

Environment satisfaction, job satisfaction and job level play a huge role in the attrition rate of the employees.

In [None]:
df[['NumCompaniesWorked', 'Attrition']].groupby(['NumCompaniesWorked'], as_index=False).mean().sort_values(by='Attrition', ascending=False)

In [None]:
# Pass the vector by keyword: recent seaborn releases treat the first
# positional argument as `data`, not as `x`.
sns.countplot(x=df['NumCompaniesWorked'], hue=df['Attrition'])

**TODO:** Discuss the NumCompaniesWorked results with Nitin Sir.

In [None]:
df[['PerformanceRating', 'Attrition']].groupby(['PerformanceRating'], as_index=False).mean().sort_values(by='Attrition', ascending=False)

This feature can be eliminated, as it has only 2 distinct ratings and both show an equal attrition rate.

In [None]:
df[['RelationshipSatisfaction', 'Attrition']].groupby(['RelationshipSatisfaction'], as_index=False).mean().sort_values(by='Attrition', ascending=False)

In [None]:
df[['StockOptionLevel', 'Attrition']].groupby(['StockOptionLevel'], as_index=False).mean().sort_values(by='Attrition', ascending=False)

In [None]:
df[['TrainingTimesLastYear', 'Attrition']].groupby(['TrainingTimesLastYear'], as_index=False).mean().sort_values(by='Attrition', ascending=False)

In [None]:
df[['WorkLifeBalance', 'Attrition']].groupby(['WorkLifeBalance'], as_index=False).mean().sort_values(by='Attrition', ascending=False)

In [None]:
numerical_features

In [None]:
# Wide figure because TotalWorkingYears has many distinct values on the x axis
plt.figure(figsize = (12,6))
# Pass the vector by keyword: recent seaborn releases treat the first
# positional argument as `data`, not as `x`.
sns.countplot(x=df['TotalWorkingYears'], hue=df['Attrition'])

In [None]:
# Drop the three rate columns and PerformanceRating: the plots and group means
# above were judged to show no useful relationship between them and attrition
df.drop(['HourlyRate', 'MonthlyRate','DailyRate','PerformanceRating'], axis = 1, inplace = True)

In [None]:
from scipy.stats import norm, skew
# Exclude EmployeeNumber (used as the row id further down) and the rate columns
# that were dropped from df. The list is rebuilt instead of calling .remove()
# four times, so re-running this cell no longer raises ValueError.
excluded_features = {'EmployeeNumber', 'HourlyRate', 'MonthlyRate', 'DailyRate'}
numerical_features = [feature for feature in numerical_features if feature not in excluded_features]
# Sample skewness of each remaining continuous feature, most right-skewed first
skewed_feat = df[numerical_features].apply(lambda x: skew(x.dropna())).sort_values(ascending = False)
skewness = pd.DataFrame({'Skew' :skewed_feat})
skewness.head(10)

In [None]:
from scipy.special import boxcox1p
# Keep only the features whose absolute skew exceeds 0.75.
# Filtering on the 'Skew' column actually drops the other rows; masking the
# DataFrame with a boolean DataFrame only replaced their values with NaN and
# kept every feature in the index, so ALL features were being transformed.
skewness = skewness[skewness['Skew'].abs() > 0.75]
skewed_features = skewness.index
lamda = 0.15
# Box-Cox transform of 1 + x with a fixed lambda, applied in place.
# NOTE: this overwrites the columns, so run the cell only once per fresh df.
for feat in skewed_features:
    df[feat] = boxcox1p(df[feat],lamda)

In [None]:
df['New_feature'] = (df['Gender'].astype(str) + '_' + df['MaritalStatus'].astype(str))

In [None]:
df.drop(['Gender', 'MaritalStatus'], axis = 1, inplace = True)

In [None]:
df.info()

In [None]:
df.shape

In [None]:
df.head()

In [None]:
#Checking correlations between variables
from matplotlib import rcParams
rcParams.update({'figure.autolayout': True})
# Restrict to numeric columns: DataFrame.corr() raises on object columns in
# pandas >= 2.0, and older versions silently ignored them anyway.
k_corr_matrix1 = df.select_dtypes(include='number').corr()
plt.figure(figsize=(20,14))
sns.heatmap(k_corr_matrix1, annot=True, cmap=plt.cm.RdBu_r)
plt.title('Heatmap for Correlation between Features')

In [None]:
def correlation(dataset, threshold):
    """Return the names of columns that are highly correlated with an earlier column.

    For every pair of numeric columns whose absolute pairwise correlation is
    strictly greater than ``threshold``, the column that appears LATER in
    ``dataset`` is flagged; the earlier one of the pair is kept.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Input frame. Non-numeric columns are ignored, because
        ``DataFrame.corr()`` raises on them in pandas >= 2.0.
    threshold : float
        Absolute-correlation cut-off.

    Returns
    -------
    set
        Names of the flagged columns.
    """
    col_corr = set()  # names of the flagged columns
    corr_matrix = dataset.select_dtypes(include='number').corr().abs()
    columns = corr_matrix.columns
    for i in range(len(columns)):
        # only look below the diagonal so each pair is inspected once
        for j in range(i):
            if corr_matrix.iloc[i, j] > threshold:
                col_corr.add(columns[i])
    return col_corr

In [None]:
corr_features = correlation(df, 0.7)
len(set(corr_features))

In [None]:
corr_features

JobLevel is removed instead of MonthlyIncome: the two are highly correlated, so either one of them can be dropped.

In [None]:
# Keep MonthlyIncome and TotalWorkingYears in the data by taking them out of
# the drop set. discard() is a no-op when the name is absent, so the cell can
# be re-run (set.remove would raise KeyError the second time).
corr_features.discard('MonthlyIncome')
corr_features.discard('TotalWorkingYears')

In [None]:
corr_features.update(['JobLevel'])

In [None]:
corr_features

In [None]:
df.drop(corr_features, axis=1, inplace = True)

In [None]:
df.shape

In [None]:
categorical_features = [feature for feature in df.columns if df[feature].dtype == 'O']
len(categorical_features)

In [None]:
#Label-encode the remaining object-dtype (categorical) columns as integer codes
from sklearn.preprocessing import LabelEncoder
for c in categorical_features:
    # a fresh encoder per column; fit and transform on the same values in one step
    df[c] = LabelEncoder().fit_transform(list(df[c].values))

# shape
print('Shape all_data: {}'.format(df.shape))

In [None]:
from sklearn.model_selection import train_test_split
# Keep the employee numbers aside; they are excluded from the feature matrix
Id_train = df['EmployeeNumber']
# Features: everything except the target and the id column
X = df.drop(columns=['Attrition', 'EmployeeNumber'])
y = df['Attrition']
# 80/20 split, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
# 5-fold stratified CV without shuffling (the defaults), shared by the searches below
cv = StratifiedKFold(n_splits=5)

In [None]:
# Baseline random forest WITHOUT class weighting, tuned by randomized search.
# min_samples_split must be at least 2 when given as an integer, so the
# invalid value 1 is left out of the search space (those candidates could
# never be fitted and only wasted search iterations).
# Seeds are fixed on both the estimator and the search for reproducibility.
rf = RandomForestClassifier(random_state = 42)
param_grid=dict(n_estimators= [120, 300, 500, 800, 1200],max_depth=range(1,20), min_samples_split = [2, 5, 10, 15, 100],
               min_samples_leaf = [1,2,5,10], max_features = ['log2', 'sqrt', None])
grid_rf = RandomizedSearchCV(rf, param_grid, cv=cv, scoring = 'f1_macro', random_state = 42)
grid_rf.fit(X_train,y_train)

In [None]:
# Check out best parameters and best score
print(grid_rf.best_score_)
print(grid_rf.best_params_)

In [None]:
# Random forest with the positive (attrition) class weighted 5x to counter
# the class imbalance, tuned by randomized search.
# min_samples_split must be at least 2 when given as an integer, so the
# invalid value 1 is left out of the search space.
rf = RandomForestClassifier(class_weight={0:1,1:5}, random_state = 42)
param_grid=dict(n_estimators= [120, 300, 500, 800, 1200],max_depth=range(1,20), min_samples_split = [2, 5, 10, 15, 100],
               min_samples_leaf = [1,2,5,10], max_features = ['log2', 'sqrt', None])
# Seed the search too, so the sampled candidates are reproducible
grid_rf = RandomizedSearchCV(rf, param_grid, cv=cv, scoring = 'f1_macro', random_state = 42)
grid_rf.fit(X_train,y_train)

In [None]:
# Check out best parameters and best score
print(grid_rf.best_score_)
print(grid_rf.best_params_)

In [None]:
import scikitplot as skplt
y_test_pred = grid_rf.predict(X_test)
skplt.metrics.plot_confusion_matrix(y_test, y_test_pred)

In [None]:
from sklearn.metrics import classification_report

In [None]:
print(classification_report(y_test, y_test_pred))

In [None]:
from sklearn.metrics import roc_auc_score
# ROC AUC needs a continuous score: use the predicted probability of the
# positive class (column 1) rather than the hard 0/1 predictions.
roc_auc_score(y_test, grid_rf.predict_proba(X_test)[:, 1])

In [None]:
from sklearn import metrics
# Build the ROC curve from positive-class probabilities; hard 0/1 labels
# would give a curve with a single operating point.
y_test_score = grid_rf.predict_proba(X_test)[:, 1]
fpr,tpr,threshold=metrics.roc_curve(y_test, y_test_score)
rou_auc=metrics.auc(fpr,tpr)
plt.title("Reciever Operating Characteristic")
# The label needs a real format specifier: "AUC-0.4f" % value raises TypeError
plt.plot(fpr,tpr,"orange",label="AUC = %0.4f" % rou_auc)
plt.legend(loc="lower right")
# Diagonal = performance of a random classifier
plt.plot([0,1],[0,1],color="darkblue",linestyle="--")
plt.ylabel("tpr")
plt.xlabel("fpr")
plt.show()

In [None]:
import xgboost
# XGBoost with the positive class up-weighted, tuned by randomized search
xg = xgboost.XGBClassifier(scale_pos_weight = 5, random_state = 2) #scale_pos_weight for balancing dataset internally
# Hyper-parameters to be tuned.
# `eta` is an alias of `learning_rate`, so searching both at once set the same
# knob twice with conflicting values; the two value lists are merged here.
# `reg_alpha` is the scikit-learn wrapper's name for the L1 term (`alpha`).
param_grid = dict(learning_rate = [0.01, 0.015, 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3],
                  max_depth = [3,5,7,9,12,15,17,25], min_child_weight = [1,3,5,7], gamma = [0.05,0.1,0.3,0.5,0.7,0.9,1.0], 
                  colsample_bytree = [0.6, 0.7, 0.8, 0.9, 1.0], subsample = [0.6, 0.7, 0.8, 0.9, 1.0],
                  reg_alpha = [0, 0.1, 0.5, 1.0])
# Seed the search so the sampled candidates are reproducible
grid_xg = RandomizedSearchCV(xg, param_grid, cv=cv, scoring = 'f1_macro', random_state = 42)
grid_xg.fit(X_train,y_train)

In [None]:
# Check out best parameters and best score
print(grid_xg.best_score_)
print(grid_xg.best_params_)

In [None]:
y_test_pred = grid_xg.predict(X_test)
skplt.metrics.plot_confusion_matrix(y_test, y_test_pred)

In [None]:
print(classification_report(y_test, y_test_pred))

In [None]:
# ROC AUC from positive-class probabilities rather than hard 0/1 predictions
roc_auc_score(y_test, grid_xg.predict_proba(X_test)[:, 1])

In [None]:
from sklearn import metrics
# Build the ROC curve from positive-class probabilities; hard 0/1 labels
# would give a curve with a single operating point.
y_test_score = grid_xg.predict_proba(X_test)[:, 1]
fpr,tpr,threshold=metrics.roc_curve(y_test, y_test_score)
rou_auc=metrics.auc(fpr,tpr)
plt.title("Reciever Operating Characteristic")
# The label needs a real format specifier: "AUC-0.4f" % value raises TypeError
plt.plot(fpr,tpr,"orange",label="AUC = %0.4f" % rou_auc)
plt.legend(loc="lower right")
# Diagonal = performance of a random classifier
plt.plot([0,1],[0,1],color="darkblue",linestyle="--")
plt.ylabel("tpr")
plt.xlabel("fpr")
plt.show()

In [None]:
from imblearn.ensemble import EasyEnsembleClassifier 
# Ensemble of default XGBoost learners fitted by EasyEnsembleClassifier.
# NOTE(review): newer imbalanced-learn releases appear to name this parameter
# `estimator` instead of `base_estimator` - confirm against the installed version.
xgb_learner = xgboost.XGBClassifier()
eec = EasyEnsembleClassifier(base_estimator = xgb_learner, random_state=42)
eec.fit(X_train, y_train)

In [None]:
y_pred = eec.predict(X_test)

In [None]:
skplt.metrics.plot_confusion_matrix(y_test, y_pred)

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
# ROC AUC from positive-class probabilities rather than hard 0/1 predictions
roc_auc_score(y_test, eec.predict_proba(X_test)[:, 1])

In [None]:
from sklearn import metrics
# Build the ROC curve from positive-class probabilities; hard 0/1 labels
# would give a curve with a single operating point.
y_score = eec.predict_proba(X_test)[:, 1]
fpr,tpr,threshold=metrics.roc_curve(y_test, y_score)
rou_auc=metrics.auc(fpr,tpr)
plt.title("Reciever Operating Characteristic")
# The label needs a real format specifier: "AUC-0.4f" % value raises TypeError
plt.plot(fpr,tpr,"orange",label="AUC = %0.4f" % rou_auc)
plt.legend(loc="lower right")
# Diagonal = performance of a random classifier
plt.plot([0,1],[0,1],color="darkblue",linestyle="--")
plt.ylabel("tpr")
plt.xlabel("fpr")
plt.show()

In [None]:
from sklearn.linear_model import LogisticRegression
# liblinear supports both the 'l1' and 'l2' penalties searched below; the
# default lbfgs solver rejects 'l1', so half of the candidates failed to fit.
lr = LogisticRegression(solver = 'liblinear')
param_grid = dict(C = [0.001, 0.01,1, 10, 100], penalty = ['l1', 'l2'])
grid_lr = RandomizedSearchCV(lr, param_grid, cv=cv, scoring = 'f1_macro')
grid_lr.fit(X_train,y_train)

In [None]:
# Check out best parameters and best score
print(grid_lr.best_score_)
print(grid_lr.best_params_)

In [None]:
skplt.metrics.plot_confusion_matrix(y_test, y_pred)

In [None]:
y_pred = grid_lr.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# ROC AUC from positive-class probabilities rather than hard 0/1 predictions
roc_auc_score(y_test, grid_lr.predict_proba(X_test)[:, 1])

In [None]:
from sklearn import metrics
# Build the ROC curve from positive-class probabilities; hard 0/1 labels
# would give a curve with a single operating point.
y_score = grid_lr.predict_proba(X_test)[:, 1]
fpr,tpr,threshold=metrics.roc_curve(y_test, y_score)
rou_auc=metrics.auc(fpr,tpr)
plt.title("Reciever Operating Characteristic")
# The label needs a real format specifier: "AUC-0.4f" % value raises TypeError
plt.plot(fpr,tpr,"orange",label="AUC = %0.4f" % rou_auc)
plt.legend(loc="lower right")
# Diagonal = performance of a random classifier
plt.plot([0,1],[0,1],color="darkblue",linestyle="--")
plt.ylabel("tpr")
plt.xlabel("fpr")
plt.show()

from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours tuned over k and the Minkowski power p.
# NOTE(review): no feature scaling is visible before this point; KNN is
# distance-based, so confirm whether the features should be standardised.
knn = KNeighborsClassifier()
param_grid = dict(n_neighbors = range(1,25), p = [2, 3])
# Seed the search so the sampled candidates are reproducible
grid_knn = RandomizedSearchCV(knn, param_grid, cv=cv, scoring = 'f1_macro', random_state = 42)
grid_knn.fit(X_train,y_train)

# Check out best parameters and best score
print(grid_knn.best_score_)
print(grid_knn.best_params_)

y_pred = grid_knn.predict(X_test)
print(classification_report(y_test, y_pred))

from sklearn.svm import SVC
# Class-balanced SVM tuned over the regularisation strength C
svm = SVC()
param_grid = dict(C = [0.001,0.01, 1, 10], gamma = ['auto'], class_weight = ['balanced'])
grid_svm = RandomizedSearchCV(svm, param_grid, cv=cv, scoring = 'f1_macro')
grid_svm.fit(X_train,y_train)

# Check out best parameters and best score
print(grid_svm.best_score_)
print(grid_svm.best_params_)

y_pred = grid_svm.predict(X_test)
print(classification_report(y_test, y_pred))

# ROC AUC needs a continuous score; SVC exposes one through decision_function
# (predict_proba is unavailable unless probability=True), so use it instead
# of the hard 0/1 predictions.
roc_auc_score(y_test, grid_svm.decision_function(X_test))