In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the employee-attrition CSV into the working DataFrame `df`.
df = pd.read_csv(
    filepath_or_buffer="../input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv"
)

In [None]:
# Quick look at the first five rows of the raw data.
df.head(n=5)

In [None]:
# Display the frame indexed by EmployeeNumber.
# The result is not assigned, so `df` itself keeps its default index and
# EmployeeNumber remains an ordinary column.
df.set_index(keys='EmployeeNumber')

In [None]:
# Summary statistics for every column (numeric and non-numeric alike).
df.describe(include='all')

In [None]:
# Encode the target as integers: "Yes" -> 1, "No" -> 0.
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on `df['Attrition']`: that chained in-place form
# operates on a temporary Series and leaves `df` untouched under pandas
# copy-on-write.
# Values missing from the mapping (e.g. already-encoded 0/1 when the cell is
# re-run) pass through unchanged, so the cell is idempotent; any leftover
# non-numeric value makes `astype(int)` raise instead of being silently kept.
attrition_codes = {"Yes": 1, "No": 0}
df['Attrition'] = (
    df['Attrition']
    .map(lambda value: attrition_codes.get(value, value))
    .astype(int)
)

In [None]:
# Check the encoded Attrition column on a small preview rather than
# rendering the whole DataFrame.
df.head()

In [None]:
# The mean of the 0/1 Attrition column within each department is that
# department's attrition rate.
# The column is selected *before* aggregating: averaging every column of the
# grouped frame raises a TypeError on the string columns in pandas >= 2.0
# (and needlessly averages everything on older versions).
attrition_by_department = df.groupby('Department')['Attrition'].mean()
ax = attrition_by_department.plot(kind='bar', color=['Green', 'Blue', 'Pink'])
ax.set_ylabel("Attrition rate")
ax.set_title("Attrition Rate by Department")
plt.show()


In [None]:
# Two panels side by side: DistanceFromHome split by attrition (left) and its
# overall distribution (right).
# A single 1x2 grid is created and each plot is drawn on an explicit axis.
# The previous `plt.subplots(12, ...)` built a 12-row grid that the
# `plt.subplot(121/122)` calls then drew on top of, and the extra
# `plt.figure(...)` call produced an empty figure.
fig, (ax_box, ax_dist) = plt.subplots(1, 2, figsize=(12, 6))

sns.boxplot(x='Attrition', y='DistanceFromHome', data=df, ax=ax_box)
ax_box.set_title("Attrition Rate by Distance From Home")

# histplot(kde=True, stat='density') is the documented replacement for the
# deprecated sns.distplot.
sns.histplot(df['DistanceFromHome'], kde=True, stat='density', ax=ax_dist)
ax_dist.set_xlim(0, 30)

sns.despine(fig=fig)
plt.show()

In [None]:

# Two panels side by side: DailyRate split by attrition (left) and its overall
# distribution (right).
# A single 1x2 grid is created and each plot is drawn on an explicit axis.
# The previous `plt.subplots(12, ...)` built a 12-row grid that the
# `plt.subplot(121/122)` calls then drew on top of.
fig, (ax_box, ax_dist) = plt.subplots(1, 2, figsize=(12, 6))

sns.boxplot(x='Attrition', y='DailyRate', data=df, ax=ax_box)
ax_box.set_title("Attrition Rate by Daily Rate")

# histplot(kde=True, stat='density') is the documented replacement for the
# deprecated sns.distplot.
sns.histplot(df['DailyRate'], kde=True, stat='density', color='pink', ax=ax_dist)
ax_dist.set_xlim(0, 1750)

sns.despine(fig=fig)
plt.show()

In [None]:
# Two panels side by side: TotalWorkingYears split by attrition (left) and its
# overall distribution (right).
# A single 1x2 grid is created and each plot is drawn on an explicit axis.
# The previous `plt.subplots(12, ...)` built a 12-row grid that the
# `plt.subplot(121/122)` calls then drew on top of.
fig, (ax_box, ax_dist) = plt.subplots(1, 2, figsize=(12, 6))

sns.boxplot(x='Attrition', y='TotalWorkingYears', data=df, ax=ax_box)
ax_box.set_title("Attrition Rate by Working Years")

# histplot(kde=True, stat='density') is the documented replacement for the
# deprecated sns.distplot.
sns.histplot(df['TotalWorkingYears'], kde=True, stat='density', color='pink', ax=ax_dist)

sns.despine(fig=fig)
plt.show()

In [None]:
# Pairwise correlation heatmap.
# Only numeric columns are passed to `corr()`: on pandas >= 2.0 calling
# `df.corr()` on a frame that still contains string columns raises instead of
# silently skipping them. `select_dtypes` works on old and new pandas alike.
plt.figure(figsize=(20, 20))
numeric_df = df.select_dtypes(include='number')
sns.heatmap(numeric_df.corr(), annot=True)
plt.show()

From the heatmap above we can see that attrition has a lot to do with monthly income, years at the company, and total working years.

In [None]:
# Remove the EmployeeNumber and EmployeeCount columns before modelling.
# `errors='ignore'` plus re-assignment (instead of `inplace=True`) keeps the
# cell idempotent: re-running it after the columns are already gone no longer
# raises a KeyError.
df = df.drop(columns=['EmployeeNumber', 'EmployeeCount'], errors='ignore')

In [None]:
# Categorical columns to be one-hot encoded.
cat_var = [
    'BusinessTravel',
    'Department',
    'EducationField',
    'Gender',
    'JobRole',
    'MaritalStatus',
    'Over18',
    'OverTime',
]

In [None]:
# One-hot encode the categorical columns; `drop_first=True` omits the first
# level of each so the remaining indicator columns are not redundant.
X = pd.get_dummies(data=df, columns=cat_var, drop_first=True)

In [None]:
# Preview the encoded frame rather than rendering the whole DataFrame.
X.head()

In [None]:
# Separate the target from the feature matrix.
# `errors='ignore'` keeps the cell idempotent: `X` is overwritten here, so a
# second run would otherwise fail with a KeyError because 'Attrition' has
# already been removed.
X = X.drop(columns='Attrition', errors='ignore')
y = df['Attrition']
X.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_score,recall_score,classification_report
from sklearn.preprocessing import StandardScaler

In [None]:
# Treat every feature with more than three distinct values as numeric;
# these are the columns that get standard-scaled later on.
num_var = [column for column in X.columns if X[column].nunique() > 3]

In [None]:
# Display the columns selected for scaling (those with more than 3 distinct values).
num_var

In [None]:
# Split first, then scale.
# Fitting the scaler on the full data set before splitting lets test-set
# statistics (mean / std) leak into training, so the scaler is fitted on the
# training rows only and merely applied to the test rows.
# `stratify=y` keeps the Yes/No attrition ratio the same in both splits, and
# `random_state` makes the split (and every score below) reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

# Work on copies so the assignments below do not write into views of `X`.
X_train = X_train.copy()
X_test = X_test.copy()

scaler = StandardScaler()
X_train[num_var] = scaler.fit_transform(X_train[num_var])
X_test[num_var] = scaler.transform(X_test[num_var])


In [None]:
# Baseline logistic regression. C is the inverse regularisation strength, so
# C=100 means weak regularisation; max_iter is raised to give the solver room
# to converge.
lr = LogisticRegression(C=100, max_iter=10000)
lr.fit(X_train, y_train)

# Evaluate on the held-out split: overall accuracy plus per-class metrics.
y_pred = lr.predict(X_test)
baseline_accuracy = accuracy_score(y_test, y_pred)
baseline_report = classification_report(y_test, y_pred)
print(baseline_accuracy)
print(baseline_report)

This is an interesting phenomenon. The accuracy score at this point is 0.9. However, accuracy is not the only metric that needs to be looked at: what we really need is high recall. Employees who are likely to leave the organisation need to be identified so that action can be taken to prevent them from leaving. For this case we need high recall rather than high precision. Furthermore, the biggest issue with problems like this is that the classes are unbalanced. In the next code cell, we will see how the overall results change when we set the class_weight parameter to 'balanced'.

# Same logistic regression, but with class weights balanced so the two
# attrition classes are weighted inversely to their frequency during fitting.
lr = LogisticRegression(C=100, max_iter=1000, class_weight='balanced')
lr.fit(X_train, y_train)

# Evaluate on the held-out split: overall accuracy plus per-class metrics.
y_pred = lr.predict(X_test)
balanced_accuracy = accuracy_score(y_test, y_pred)
balanced_report = classification_report(y_test, y_pred)
print(balanced_accuracy)
print(balanced_report)

This is a really good solution: although precision is lower, recall in this case is high, which is what we want to see.

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with default hyper-parameters.
svc = SVC()
svc.fit(X_train, y_train)

# Store this model's predictions before reporting. Previously the result of
# `predict` was discarded, so the report below was computed from the stale
# `y_pred` left over from the logistic-regression cell.
y_pred = svc.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree; `random_state` fixes tie-breaking between equally good
# splits so the result is reproducible.
dc = DecisionTreeClassifier(random_state=42)
dc.fit(X_train, y_train)

# Store this model's predictions before reporting. Previously the result of
# `predict` was discarded, so the report below was computed from a stale
# `y_pred` produced by an earlier model.
y_pred = dc.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Small, shallow random forest; `random_state` makes the bootstrap samples and
# feature sub-sampling (and therefore the feature importances) reproducible.
rf = RandomForestClassifier(n_estimators=10, max_depth=4, random_state=42)
rf.fit(X_train, y_train)

# Store this model's predictions before reporting. Previously the result of
# `predict` was discarded, so the report below was computed from a stale
# `y_pred` produced by an earlier model.
y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Rank the features by the random forest's importance scores and plot the
# eight highest-ranked ones as a horizontal bar chart.
feat = rf.feature_importances_
features = pd.DataFrame({"Feature Importance": feat}, index=X.columns)
features = features.sort_values(by='Feature Importance', ascending=False)

top_features = features.head(8)
top_features.plot(kind='barh')
plt.title("Feature Importance for Attrition")

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with 100 boosting rounds.
# The model must be fitted on the training split: fitting on (X_test, y_test)
# and then scoring on the same rows reports memorisation, not generalisation.
clf = AdaBoostClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with 100 boosting stages.
# Two fixes relative to the earlier version of this cell:
#   * the imported GradientBoostingClassifier is now actually instantiated
#     (an AdaBoostClassifier was being created instead);
#   * the model is fitted on the training split rather than on the test split
#     it is evaluated on.
gbc = GradientBoostingClassifier(n_estimators=100, random_state=42)
gbc.fit(X_train, y_train)

y_pred = gbc.predict(X_test)
print(classification_report(y_test, y_pred))

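This is a pretty decent solution. We have been able to increase performance using the AdaBoost classifier. Boosting has allowed us to increase not only the accuracy but also the recall. Note that these scores are only meaningful when the model is fitted on the training split and evaluated on the held-out test split.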