In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns ; sns.set()
import warnings 
warnings.filterwarnings("ignore")

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the HR attrition CSV into `df` and preview its first rows.
DATA_PATH = "/kaggle/input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv"
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Drop StandardHours, EmployeeCount and EmployeeNumber. Re-assigning instead of
# using inplace=True, together with errors="ignore", keeps this cell safe to
# re-run after the columns are already gone.
df = df.drop(columns=["StandardHours", "EmployeeCount", "EmployeeNumber"], errors="ignore")

# Encode the target as 0/1. The result is assigned back because the chained
# `df[col].replace(..., inplace=True)` form acts on a temporary Series under
# pandas Copy-on-Write and leaves `df` unchanged. The explicit cast keeps the
# column integer-typed however replace() infers the resulting dtype.
df["Attrition"] = df["Attrition"].replace({"Yes": 1, "No": 0}).astype(int)

In [None]:
# Correlation heatmap restricted to the numeric columns: `df` still contains
# string columns at this point, and DataFrame.corr() raises on them in
# pandas >= 2.0 (older versions silently skipped them).
plt.figure(figsize=(20,18))
sns.heatmap(df.select_dtypes("number").corr(),annot=True);

In [None]:
# Count of each Attrition class, with the 0/1 ticks relabelled as No/Yes.
ax = sns.countplot(x=df["Attrition"])
ax.set_xticks([0, 1])
ax.set_xticklabels(["No", "Yes"]);

In [None]:
# Ten columns with the largest absolute correlation to Attrition.
# Only numeric columns are correlated, because DataFrame.corr() raises on the
# string columns still present in `df` under pandas >= 2.0.
(
    df.select_dtypes("number")
    .corr()["Attrition"]
    .abs()
    .sort_values(ascending=False)
    .iloc[1:]  # skip the first entry: Attrition's correlation with itself
    .head(10)
)

In [None]:
# Histogram of DistanceFromHome with its median drawn as a red vertical line.
median_dis = df["DistanceFromHome"].median()
ax = sns.distplot(df["DistanceFromHome"], kde=False)
ax.axvline(median_dis, c="r", label="Median")
ax.set_title("Distribution of Distance")
ax.legend()
plt.show()

In [None]:
# DistanceFromHome plotted per Attrition class (tick 0 -> "No", tick 1 -> "Yes").
sns.catplot(data=df, x="Attrition", y="DistanceFromHome")
axis = plt.gca()
axis.set_xticks([0, 1])
axis.set_xticklabels(["No", "Yes"])
plt.show()

In [None]:
# JobLevel counts, one panel per Attrition value.
sns.catplot(data=df, x="JobLevel", col="Attrition", kind="count");

In [None]:
# Histogram of TotalWorkingYears with its mean drawn as a red vertical line.
mean_wr = df["TotalWorkingYears"].mean()
ax = sns.distplot(df["TotalWorkingYears"], kde=False)
ax.axvline(mean_wr, c="r", label="Mean")
ax.legend()
plt.show()

In [None]:
# TotalWorkingYears per JobLevel, coloured by Attrition.
sns.catplot(data=df, x="JobLevel", y="TotalWorkingYears", hue="Attrition", palette="Set2")
plt.gca().set_title("How job level and total working years effect the attrition");

In [None]:
# Histogram of MonthlyIncome with its mean drawn as a red vertical line.
mean_inc = df["MonthlyIncome"].mean()
ax = sns.distplot(df["MonthlyIncome"], kde=False)
ax.axvline(mean_inc, c="r", label="Mean Income")
ax.legend()
plt.show()

In [None]:
# MonthlyIncome plotted for each JobLevel.
sns.catplot(data=df, x="JobLevel", y="MonthlyIncome")
plt.gca().set_title("Job level on monthly income")
plt.show()

In [None]:
# Correlate the two columns directly instead of building the full correlation
# matrix: this avoids DataFrame.corr() raising on the string columns still in
# `df` (pandas >= 2.0) and skips the unused pairwise computations.
corr = df["MonthlyIncome"].corr(df["TotalWorkingYears"])

# Scatter plot of income against total working years; the legend carries the
# correlation coefficient.
plt.scatter(df["MonthlyIncome"],df["TotalWorkingYears"],label=f"Corr : {corr}")
plt.title("Is there any correlation between total working years and income? ")
plt.xlabel("Income")
plt.ylabel("Total Working Years")
plt.legend()
plt.show()

In [None]:
# Correlate the two columns directly instead of building the full correlation
# matrix: this avoids DataFrame.corr() raising on the string columns still in
# `df` (pandas >= 2.0) and skips the unused pairwise computations.
corr2 = df["MonthlyIncome"].corr(df["Age"])

# Scatter plot of income against age; the legend carries the correlation
# coefficient.
plt.scatter(df["MonthlyIncome"],df["Age"],c="g",label=f"Corr : {corr2}")
plt.title("Is there any correlation between age and income? ")
plt.xlabel("Income")
plt.ylabel("Age")
plt.legend()
plt.show()

### Job Satisfaction
 * Low 
 * Medium
 * High
 * Very High

In [None]:
# JobSatisfaction counts split by Attrition; the four tick positions are
# relabelled with the satisfaction level names.
ax = sns.countplot(data=df, x="JobSatisfaction", hue="Attrition")
ax.set_xticks([0, 1, 2, 3])
ax.set_xticklabels(['Low', 'Medium', 'High', 'Very High'])
ax.set_title("Job Satisfaction on Attrition")
plt.show()

### Job Involvement 
 * Low 
 * Medium
 * High
 * Very High

In [None]:
# JobInvolvement counts split by Attrition; the four tick positions are
# relabelled with the involvement level names.
ax = sns.countplot(data=df, x="JobInvolvement", hue="Attrition")
ax.set_xticks([0, 1, 2, 3])
ax.set_xticklabels(['Low', 'Medium', 'High', 'Very High'])
ax.set_title("Job Involvement on Attrition")
plt.show()

### Education 
 * Below College
 * College
 * Bachelor
 * Master
 * Doctor

In [None]:
# Education counts split by Attrition; the five tick positions are relabelled
# with the education level names.
ax = sns.countplot(data=df, x="Education", hue="Attrition")
ax.set_xticks([0, 1, 2, 3, 4])
ax.set_xticklabels(["Below College", "College", "Bachelor", "Master", "Doctor"])
ax.set_title("Education Level on Attrition")
plt.show()

In [None]:
# Show only the object-typed (string) columns of the frame.
df.select_dtypes(include=object)

In [None]:
# Attrition rate (%) per department. The target column is selected before
# averaging: calling .mean() on the whole grouped frame raises TypeError on the
# string columns in pandas >= 2.0 and needlessly averages every other column.
df.groupby("Department")["Attrition"].mean().multiply(100).plot.barh()
plt.title("Attrition rate per department")
plt.legend()
plt.show()

In [None]:
# Attrition counts, one panel per Gender value.
sns.catplot(data=df, x="Attrition", col="Gender", kind="count");

In [None]:
# Attrition rate (%) per job role. The target column is selected before
# averaging: calling .mean() on the whole grouped frame raises TypeError on the
# string columns in pandas >= 2.0 and needlessly averages every other column.
df.groupby("JobRole")["Attrition"].mean().multiply(100).plot.barh();

In [None]:
# One-hot encode every object-typed column, dropping the first level of each,
# then preview the widened frame.
ohe_cols = df.select_dtypes(object).columns.tolist()
df = pd.get_dummies(df, columns=ohe_cols, drop_first=True)
df.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

In [None]:
# Feature matrix: every column except the target, cast to float first.
# Depending on the pandas version, get_dummies() emits bool columns; mixing
# those with the integer columns would make `.values` an object-dtype array
# rather than a numeric one.
X = df.drop("Attrition",axis=1).astype(float).values

# Target vector, forced to an integer dtype so it is read as class labels.
y = df["Attrition"].astype(int).values

In [None]:
# Hold out 20% of the rows as a test set, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Learn the min-max scaling from the training split only, then apply that same
# transform to both splits.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Candidate classifiers, seeded where the constructor call takes a seed.
rf = RandomForestClassifier(random_state=0)
knn = KNeighborsClassifier()
log = LogisticRegression()
xgb = XGBClassifier(random_state=0)

# Mean 5-fold cross-validation score of each candidate on the training split.
for model in (rf, knn, log, xgb):
    fold_scores = cross_val_score(model, X_train, y_train, cv=5)
    print(f"CV score of {model!s} : {fold_scores.mean()}")

In [None]:
# Hyperparameter grid for logistic regression.
# Each penalty is paired with a solver that supports it. With a single flat
# grid the default lbfgs solver rejects "l1" and "elasticnet", so those
# candidates fail to fit and are scored NaN, which goes unnoticed because
# warnings are silenced at the top of the notebook.
C_VALUES = [0.1, 1, 10]
log_params = [
    # lbfgs handles the l2 penalty.
    {"solver": ["lbfgs"], "penalty": ["l2"], "C": C_VALUES},
    # liblinear handles l1; seeded because the solver shuffles the data.
    {"solver": ["liblinear"], "penalty": ["l1"], "C": C_VALUES, "random_state": [0]},
    # elasticnet needs saga plus an l1_ratio; saga converges slowly, hence max_iter.
    {"solver": ["saga"], "penalty": ["elasticnet"], "l1_ratio": [0.5],
     "C": C_VALUES, "max_iter": [5000], "random_state": [0]},
    # Unregularised model; C is not varied because it is ignored without a
    # penalty. Current scikit-learn spells this None (the string "none" was
    # removed); releases before 1.2 only accept "none".
    {"solver": ["lbfgs"], "penalty": [None]},
]
gs1 = GridSearchCV(log,param_grid=log_params,cv=5)
gs1.fit(X_train,y_train)
print("Best score : ",gs1.best_score_)
print("Best parameters : ",gs1.best_params_)

In [None]:
# Hyperparameter grid for the random forest.
# "auto" is left out of max_features: for classifiers it was an alias of
# "sqrt" (so it only duplicated a third of the fits) and newer scikit-learn
# releases no longer accept it.
rf_params = {"n_estimators" : [100,150,200,250,300],
             "criterion" : ["gini","entropy"],
             "max_features" : ["sqrt", "log2"],
             "max_depth": list(range(1,11))}

# n_jobs=-1 spreads the fits over all cores; results are unchanged because the
# forest itself is seeded with random_state=0.
gs2 = GridSearchCV(rf,param_grid=rf_params,cv=5,n_jobs=-1)
gs2.fit(X_train,y_train)
print("Best score : ",gs2.best_score_)
print("Best parameters : ",gs2.best_params_)

As we can see, Logistic Regression performs better with its optimal hyperparameters.

In [None]:
# Reuse the model selected by the logistic-regression grid search instead of
# re-typing its hyperparameters by hand. GridSearchCV refits the best candidate
# on the full training split by default, so `best_estimator_` is already fitted
# and always matches the reported best parameters.
clf = gs1.best_estimator_
prediction = clf.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [None]:
# The two classes are not equally frequent, so accuracy alone is not a
# sufficient metric; precision and recall are examined as well.

test_accuracy = accuracy_score(y_test, prediction)
print("Accuracy of test : ", test_accuracy)

In [None]:
# Confusion matrix of the test predictions.
# fmt="d" prints the cell counts as plain integers; the default ".2g" format
# renders any count of 100 or more in scientific notation (e.g. 2.4e+02).
plt.figure(figsize=(4,3))
sns.heatmap(confusion_matrix(y_test,prediction),annot=True,fmt="d",cmap="Blues")
plt.title("Confusion Matrix");

In [None]:
# Per-class precision/recall/F1 summary for the test predictions.
report = classification_report(y_test, prediction)
print(report)

### Let's take a look at the importance of the features:

This will help us understand this HR attrition analysis.

In [None]:
# Coefficients of the fitted classifier, one per feature column.
feature_imp = list(clf.coef_[0])
features = list(df.drop("Attrition",axis=1).columns)

# Bar positions are derived from the actual number of features rather than a
# hard-coded 44, so the plot keeps working if the encoded column set changes.
positions = list(range(len(features)))

plt.figure(figsize=(16,10))
plt.barh(positions,feature_imp,label="Features strength")
plt.yticks(positions,features)
plt.legend()
plt.show()