In [None]:
import numpy as np
import pandas as pd

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

import os
# Collect every file found under the Kaggle input directory, then echo the
# paths so the available datasets are visible in the cell output.
input_files = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_files:
    print(path)

# Load the attrition dataset into the frame used by the rest of the notebook.
df = pd.read_csv(
    '../input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv'
)

# Preprocessing & Cleaning
It is my understanding that a row marked "Yes" for attrition is an employee that will not be replaced once they're gone from the company.
There don't appear to be any empty cells.
There were a few outliers, which were removed using the interquartile range (IQR).

In [None]:
# Report the percentage of missing values in every column.
null_percentages = df.isna().mean() * 100
for col, pct in null_percentages.items():
    print( col, '{nulls:.2f}%'.format(nulls=pct))

# EmployeeCount is 1 and StandardHours is 80 for everyone, so they carry no
# information; EmployeeNumber is dropped with them (presumably it is only a
# row identifier -- TODO confirm).
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer
# accepts, and errors='ignore' keeps this cell re-runnable once the columns
# have already been removed from `df`.
df = df.drop(columns=['EmployeeCount','StandardHours','EmployeeNumber'], errors='ignore')

In [None]:
# Share of employees (in percent) for each Attrition value.
attr_split = (df['Attrition'].value_counts(normalize=True) * 100)

# Build an explicit two-column frame for plotting. The name pandas gives to a
# value_counts() result (and to its index) differs between pandas versions, so
# referring to x='Attrition' on the raw Series is ambiguous; naming both
# columns here makes the chart the same everywhere.
attr_split_df = attr_split.rename_axis('Attrition').reset_index(name='Percent')

fig = px.bar(attr_split_df,
             x = 'Attrition',
             y = 'Percent',
            )
fig.update_layout(
    template="plotly_dark",
)
fig.show()

In [None]:
# Remove outlier rows with the 1.5 * IQR rule.
# Quantiles and the range comparisons are only meaningful for numeric columns;
# running them on the full frame (which also holds string columns such as
# Attrition and Gender) raises in pandas >= 2.0, so restrict to numerics.
numeric_df = df.select_dtypes(include='number')
Q1 = numeric_df.quantile(0.25)
Q3 = numeric_df.quantile(0.75)
IQR = Q3 - Q1

# A row is an outlier if ANY numeric column falls outside the whiskers.
is_outlier = (numeric_df < (Q1 - 1.5 * IQR)) | (numeric_df > (Q3 + 1.5 * IQR))
iqr_df = df[~is_outlier.any(axis=1)]

# Monthly income distribution by attrition, split by gender, on the filtered rows.
fig = px.box(iqr_df,
             y="MonthlyIncome",
             x='Attrition',
             color="Gender")
fig.update_layout(
    template="plotly_dark",
)
fig.show()

From the boxplot it appears that employees who have a higher income are less likely to leave.

# Correlation Matrix

In [None]:
# Pairwise correlation of the numeric columns.
# Selecting the numeric columns explicitly avoids the error pandas >= 2.0
# raises when corr() is called on a frame that still contains string columns.
corr = df.select_dtypes(include='number').corr()

# Heat map of the correlation matrix.
fig = px.imshow(corr,
                color_continuous_scale='Reds')

fig.update_layout(
    template="plotly_dark",
)

fig.show()

There appears to be high correlation between:
* Job Level and Monthly Income
* Monthly Income and Years Worked at the company
* Performance Rating and Percent Salary Hike

Let's look into the different job levels, monthly income, and attrition rates.

## Job Level & Monthly Income

In [None]:
# Mean MonthlyIncome for every (JobLevel, Attrition) combination.
df_one = df.groupby(by=['JobLevel','Attrition'])['MonthlyIncome'].mean().reset_index()

# One bar series per attrition value: (value to filter on, legend label, bar colour).
income_series = (
    ('Yes', 'Attrition = Yes', 'rgb(255, 153, 153)'),
    ('No', 'Attrition = No', 'rgb(153, 255, 153)'),
)

fig = go.Figure()
for attrition_value, legend_label, bar_color in income_series:
    level_rows = df_one[df_one['Attrition'] == attrition_value]
    fig.add_trace(
        go.Bar(
            x=level_rows['JobLevel'],
            y=level_rows['MonthlyIncome'],
            name=legend_label,
            marker_color=bar_color,
        )
    )

# Grouped bars: leavers and stayers side by side for each job level.
layout_options = dict(
    title='Attrition by Monthly Income, by Job Level',
    barmode='group',
    bargap=0.2,
    bargroupgap=0.05,
    template='plotly_dark',
)
fig.update_layout(**layout_options)

fig.show()

Since the average monthly income of employees who left and of those who stayed is fairly close within each job level, it doesn't appear that the salaries for each level are inappropriate. If there were higher attrition rates for a certain job level, then maybe monthly income is affecting their stay.

In [None]:
# Number of employees per (JobLevel, Attrition) combination.
attrition_counts = df.groupby(by=['JobLevel'])['Attrition'].value_counts()
df_two = attrition_counts.rename('Count').reset_index()

# One bar series per attrition value: (value to filter on, legend label, bar colour).
count_series = (
    ('Yes', 'Attrition = Yes', 'rgb(255, 153, 153)'),
    ('No', 'Attrition = No', 'rgb(153, 255, 153)'),
)

fig = go.Figure()
for attrition_value, legend_label, bar_color in count_series:
    level_rows = df_two[df_two['Attrition'] == attrition_value]
    fig.add_trace(
        go.Bar(
            x=level_rows['JobLevel'],
            y=level_rows['Count'],
            name=legend_label,
            marker_color=bar_color,
        )
    )

# Grouped bars: leavers and stayers side by side for each job level.
layout_options = dict(
    title='Attrition by Job Level',
    barmode='group',
    bargap=0.2,
    bargroupgap=0.05,
    template='plotly_dark',
)
fig.update_layout(**layout_options)

fig.show()

To me, this shows that there is more turnover in lower positions but that rate drops as you move up the levels.

# Predicting Your Own Attrition


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [None]:
# Split the dataset into features and target variable.
# Only columns with one of these numeric dtypes are kept as candidate features.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

# `test`, `feature_cols` and `X` all refer to the same numeric-only frame.
X = feature_cols = test = df.select_dtypes(include=numerics)

# Target variable
y = df['Attrition']

In [None]:
# Hold out 30% of the rows for evaluation. Without this split the names
# X_train / X_test / y_train / y_test do not exist yet at this point of the
# notebook, so the cell fails with NameError on a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Fit one decision tree per (criterion, splitter) combination and report its
# accuracy on the held-out rows. `clf` is left holding the last tree fitted.
crit = ['entropy','gini']
splitter = ['best','random']
for c in crit:
    for s in splitter:
        clf = DecisionTreeClassifier(splitter=s,random_state=1,criterion=c)
        clf = clf.fit(X_train,y_train)
        y_pred = clf.predict(X_test)
        print(c,s," Accuracy: {score:.2f}%".format(score= metrics.accuracy_score(y_test, y_pred)*100 ))

In [None]:
# Check the importance of each feature to see which ones provide less
# information in the decision process. The scores come from `clf`, i.e. the
# most recently fitted tree.
from matplotlib import pyplot

importance = clf.feature_importances_
for position, score in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (position, score))

# Plot the same scores, one bar per feature position.
positions = list(range(len(importance)))
pyplot.bar(positions, importance)
pyplot.show()

In [None]:
# Positions (within test.columns) of the features to discard.
# NOTE(review): presumably picked from the importance scores printed above;
# they depend on the column order of `test` -- confirm before reuse.
cols = [0,1,2,4,6,7,8,12,13,14,17,18,19,20,21,22]

# Translate the positions to column labels and drop those columns.
discarded_labels = test.columns[cols]
test2 = test.drop(columns=discarded_labels)
print( test2 )

In [None]:
# Repeat the experiment using only the reduced feature set.
feature_cols = test2
X = feature_cols
y = df['Attrition'] # Target variable

# Re-split after changing X: otherwise the trees below are fitted on whatever
# X_train / X_test already exist (or fail with NameError on a fresh kernel)
# and the reduced feature set is never actually evaluated.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Fit one decision tree per (criterion, splitter) combination and report its
# accuracy on the held-out rows. `clf` is left holding the last tree fitted.
crit = ['entropy','gini']
splitter = ['best','random']
for c in crit:
    for s in splitter:
        clf = DecisionTreeClassifier(splitter=s,random_state=1,criterion=c)
        clf = clf.fit(X_train,y_train)
        y_pred = clf.predict(X_test)
        print(c,s," Accuracy: {score:.2f}%".format(score= metrics.accuracy_score(y_test, y_pred)*100 ))

In [None]:
#!pip install info_gain
from info_gain import info_gain

# Print the information gain of Attrition with respect to every column of df
# (the Attrition column itself is part of the loop as well).
each = df.columns
for column_name in each:
    print(column_name, info_gain.info_gain(df['Attrition'], df[column_name]))

In [None]:
# Hand-picked feature set for the final model.
feature_cols = df[['DailyRate','Age','DistanceFromHome','HourlyRate','JobLevel','JobRole','MaritalStatus','MonthlyIncome','OverTime','PerformanceRating','YearsWithCurrManager', 'StockOptionLevel', 'YearsAtCompany', 'YearsInCurrentRole', 'TotalWorkingYears']]

# One-hot encode the three categorical columns in a single call.
# Fixes over the previous hand-rolled version:
#   * `dummies[:-1]` sliced off the last ROW rather than one dummy column,
#     which left NaNs in the last row after concatenation; drop_first=True
#     drops one redundant dummy COLUMN per category instead.
#   * the OverTime dummies were built but never concatenated, so OverTime was
#     silently lost when the raw column was dropped; it is now encoded too.
#   * `drop([...], 1)` relied on a positional axis argument that pandas 2.0
#     no longer accepts; get_dummies(columns=...) removes the raw columns itself.
# The resulting columns are prefixed with the source column name
# (e.g. 'OverTime_...'); dtype=int keeps them numeric across pandas versions.
feature_cols = pd.get_dummies(
    feature_cols,
    columns=['JobRole','MaritalStatus','OverTime'],
    drop_first=True,
    dtype=int,
)

In [None]:
 # Replace any missing values left in the feature matrix with 0 before modelling.
 feature_cols = feature_cols.fillna(value=0)

In [None]:
# Final experiment: hand-picked features, trees limited to a depth of 5.
X, y = feature_cols, df['Attrition']

# 70% training and 30% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

crit = ['entropy','gini']
splitter = ['best','random']

# Every (criterion, splitter) pairing, in the same order as nested loops.
settings = [(criterion, split_mode) for criterion in crit for split_mode in splitter]
for c, s in settings:
    clf = DecisionTreeClassifier(splitter=s,random_state=1,criterion=c,max_depth=5).fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    accuracy_pct = metrics.accuracy_score(y_test, y_pred) * 100
    print(c,s," Accuracy: {score:.2f}%".format(score=accuracy_pct))

In [None]:
#!pip install pydotplus
# StringIO now comes from the standard library: `sklearn.externals.six` was
# removed from scikit-learn, so pinning an old scikit-learn release is no
# longer needed for this cell.
from io import StringIO

from sklearn.tree import export_graphviz
from IPython.display import Image
import pydotplus

# Render `clf` (the most recently fitted tree) as a PNG via Graphviz.
dot_data = StringIO()
export_graphviz(clf,
                out_file=dot_data,
                filled=True,
                rounded=True,
                special_characters=True,
                # Label nodes with real column / class names instead of X[i] / y[i].
                feature_names=[str(name) for name in X_train.columns],
                class_names=[str(label) for label in clf.classes_],
               )
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())