In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## For plotting
import plotly.express as px
import plotly.graph_objects as go
import plotly.tools

In [None]:
# Lift pandas' display truncation: None means "no limit" for rows and columns.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [None]:
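# Google Colab only: uncomment the two lines below to upload the data file
# (presumably HR.csv, which the next cell reads) into the session.
#from google.colab import files
#uploaded = files.upload()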

In [None]:
# Load the HR dataset; the path is relative to the notebook's working directory.
hrdata = pd.read_csv('HR.csv')

In [None]:
# (rows, columns) of the loaded frame.
hrdata.shape

In [None]:
# Preview the first rows to eyeball column names and value formats.
hrdata.head()

In [None]:
# Per-column dtype and non-null count summary.
hrdata.info()

In [None]:
# Age distribution of all employees, split into 30 bins.
hrdata['Age'].hist(bins=30)

In [None]:
# Summary statistics of Age over the whole dataset.
hrdata['Age'].describe()

In [None]:
# Same Age summary, restricted to rows where Attrition is 'Yes'.
hrdata.loc[hrdata['Attrition'] == 'Yes', 'Age'].describe()

In [None]:
# Overlay the Age histogram of rows with Attrition == 'Yes' on the histogram of
# all rows. Both use the same explicit range, so their 30 bins share identical
# edges; with independent binning the subset would get narrower bins and the
# bar heights could not be compared.
age_range = (hrdata['Age'].min(), hrdata['Age'].max())
attrition_yes = hrdata['Attrition'] == 'Yes'

ax = hrdata['Age'].hist(figsize=(6, 8), bins=30, range=age_range,
                        alpha=0.6, label='All employees')
hrdata.loc[attrition_yes, 'Age'].hist(ax=ax, bins=30, range=age_range,
                                      alpha=0.8, label='Attrition = Yes')

ax.set(title='Age distribution: all employees vs. attrition',
       xlabel='Age', ylabel='Number of employees')
ax.legend();

In [None]:
# Absolute number of rows per Attrition class.
hrdata['Attrition'].value_counts()

In [None]:
# Same counts as fractions of the total, to see the class balance.
hrdata['Attrition'].value_counts(normalize=True)

In [None]:
# Overlay YearsWithCurrManager for rows with Attrition == 'Yes' on the histogram
# of all rows. A shared explicit range gives both histograms the same 15 bin
# edges, so the overlaid bars are directly comparable.
tenure_range = (hrdata['YearsWithCurrManager'].min(),
                hrdata['YearsWithCurrManager'].max())
attrition_yes = hrdata['Attrition'] == 'Yes'

ax = hrdata['YearsWithCurrManager'].hist(figsize=(6, 8), bins=15, range=tenure_range,
                                         alpha=0.6, label='All employees')
hrdata.loc[attrition_yes, 'YearsWithCurrManager'].hist(ax=ax, bins=15, range=tenure_range,
                                                       alpha=0.8, label='Attrition = Yes')

ax.set(title='Years with current manager: all employees vs. attrition',
       xlabel='YearsWithCurrManager', ylabel='Number of employees')
ax.legend();

In [None]:
# Pairwise correlations between the numeric columns. Non-numeric columns are
# excluded explicitly: pandas >= 2.0 no longer drops them silently inside
# corr() and raises on string columns instead.
corrMatrix = hrdata.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Annotated heatmap of the correlation matrix. sns.set() changes the default
# figure size to 25x10, which also applies to matplotlib figures drawn later.
heatmap_rc = {'figure.figsize': (25, 10)}
sns.set(rc=heatmap_rc)
sns.heatmap(data=corrMatrix, annot=True, cmap="Blues")
plt.show()

In [None]:
# Show the correlation table computed above rather than calling hrdata.corr()
# again: the second call repeats the work and raises on pandas >= 2.0 while
# string columns are present.
# Note: Attrition has no row here until it is encoded numerically further down.
corrMatrix

In [None]:
# Names of the numeric columns, taken from the dtypes directly. Going through
# hrdata.corr() built a full correlation matrix only to read its column labels
# and fails on pandas >= 2.0 when string columns are present.
num_cols = hrdata.select_dtypes(include=['number', 'bool']).columns

In [None]:
# Optional (disabled): pairplot of a few selected features coloured by Attrition.
#sns.pairplot(hrdata[["JobInvolvement","StockOptionLevel",
#                 "JobLevel","TotalWorkingYears",
#                 "YearsInCurrentRole","Attrition"]],
#             hue="Attrition")

In [None]:
# Bar chart of Attrition class counts; each bar is labelled with its share (%)
# of all rows.
attrition_counts = hrdata['Attrition'].value_counts()
attrition_pct = attrition_counts / len(hrdata['Attrition']) * 100

fig = px.bar(x=attrition_counts.index,
             y=attrition_counts,
             title='Attrition Distribution',
             text=attrition_pct)

bar_outline = dict(color="black", width=3)
fig.update_traces(textposition='outside',
                  texttemplate='%{text:.4s}%',
                  marker=dict(color=['silver', 'gainsboro'], line=bar_outline))

fig.update_layout(height=500, width=600)
fig.show()

In [None]:
# Grouped bars: number of rows per BusinessTravel category, split by Attrition.
travel_style = dict(
    barmode='group',
    color_discrete_sequence=['lemonchiffon', 'darkkhaki'],
    height=500,
    width=600,
    title='Business Travel VS Attrition',
)
fig = px.histogram(x=hrdata['BusinessTravel'], color=hrdata['Attrition'], **travel_style)
fig.show()

In [None]:
# Pie chart of how the rows are distributed across departments.
department_counts = hrdata['Department'].value_counts()

fig = px.pie(values=department_counts,
             names=department_counts.index,
             title='Department')

slice_outline = dict(color="mediumpurple", width=2)
fig.update_traces(marker=dict(colors=['violet', 'plum', 'thistle'], line=slice_outline))
fig.show()

In [None]:
# Grouped bars: number of rows per Department, split by Attrition.
department_style = dict(
    barmode='group',
    color_discrete_sequence=['plum', 'purple'],
    height=500,
    width=600,
    title='Department VS Attrition',
)
fig = px.histogram(x=hrdata['Department'], color=hrdata['Attrition'], **department_style)
fig.show()

In [None]:
# Bar chart of EducationField counts; each bar is labelled with its share (%)
# of all rows.
field_counts = hrdata['EducationField'].value_counts()
field_pct = field_counts / len(hrdata['EducationField']) * 100

fig = px.bar(x=field_counts.index,
             y=field_counts,
             height=500, width=700,
             text=field_pct,
             title='Education Fields Count')

bar_colors = ['dodgerblue', 'deepskyblue', 'skyblue',
              'lightskyblue', 'lightblue', 'powderblue']
fig.update_traces(textposition='outside',
                  texttemplate='%{text:.4s}%',
                  marker=dict(color=bar_colors, line=dict(color='navy', width=2)))
fig.show()

In [None]:
# Grouped bars: number of rows per EducationField, split by Attrition.
# Titled like the other "<feature> VS Attrition" figures so the chart is
# self-explanatory when the notebook is skimmed.
fig = px.histogram(x=hrdata['EducationField'],
                   color=hrdata['Attrition'],
                   barmode='group',
                   height=500,
                   width=700,
                   color_discrete_sequence=['cornflowerblue', 'steelblue'],
                   title='Education Field VS Attrition')
fig.show()

In [None]:
# Encode the target as 1 ('Yes') / 0 ('No'). The mapping also sends 1 and 0 to
# themselves, so re-running this cell is a no-op; with only the string keys a
# second run would map the already-encoded values to NaN.
attrition_codes = {'Yes': 1, 'No': 0, 1: 1, 0: 0}
hrdata['Attrition'] = hrdata['Attrition'].map(attrition_codes)

In [None]:
# All column labels of the frame (numeric and non-numeric).
hrdata.columns

In [None]:
# Numeric column labels captured earlier (compare with hrdata.columns above).
num_cols

In [None]:
# Keep only the numeric columns for modelling.
hrdata2 = hrdata.loc[:, num_cols]

In [None]:
# Features are the numeric columns; the target is the encoded Attrition column.
# Attrition is dropped from the features defensively: once it is encoded as
# 0/1 it counts as numeric, so if num_cols is recomputed after the encoding
# cell the target would leak into x. errors='ignore' keeps this a no-op when
# the column is not present.
x = hrdata2.drop(columns='Attrition', errors='ignore')
y = hrdata['Attrition']

print(x.shape)
print(y.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; stratify on y so both parts keep the
# same class proportions, and fix the seed so the split is repeatable.
split_options = dict(test_size=0.2, stratify=y, random_state=2021)
x_train, x_test, y_train, y_test = train_test_split(x, y, **split_options)

In [None]:
# Per-model score registries, keyed by model name and filled in by the cells below.
accuracies = {}
recall = {}

In [None]:
# statistical learning
from sklearn.linear_model import LogisticRegression

# tree based methods
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# To measure performance
from sklearn.metrics import accuracy_score
from sklearn import metrics

In [None]:
# Decision tree baseline. The seed (same value as the train/test split) makes
# the fitted tree, and therefore the reported scores, repeatable across runs;
# without it tie-breaking between equally good splits is random.
dt = DecisionTreeClassifier(random_state=2021)
dt.fit(x_train, y_train)

y_pred = dt.predict(x_test)

In [None]:
# Score the decision tree once and reuse the values for both the registry and
# the printout. Arguments follow scikit-learn's (y_true, y_pred) order, like
# every other metric call in this notebook.
dt_accuracy = accuracy_score(y_test, y_pred)
dt_recall = metrics.recall_score(y_test, y_pred)

accuracies['Decision Tree Classifier'] = dt_accuracy
recall['Decision Tree Classifier'] = dt_recall

print('Accuracy of Decision Tree Classifier is: ', dt_accuracy)
print('Recall Score of Decision Tree Classifier is: ', dt_recall)

In [None]:
from sklearn.metrics import confusion_matrix as cfm
from sklearn.metrics import classification_report

In [None]:
# Confusion matrix for whatever model produced the current y_pred
# (the decision tree when the cells are run in order).
cfm(y_true=y_test, y_pred=y_pred)

In [None]:
# Per-class precision / recall / F1 for the current y_pred.
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)

In [None]:
# Logistic regression with the iteration cap raised to 100000, scored on the
# held-out set.
lgr = LogisticRegression(max_iter=100000).fit(x_train, y_train)
y_pred = lgr.predict(x_test)

lgr_accuracy = accuracy_score(y_test, y_pred)
lgr_recall = metrics.recall_score(y_test, y_pred)
accuracies['Logistic Regression'] = lgr_accuracy
recall['Logistic Regression'] = lgr_recall

print('Accuracy Score of Logistic Regression is: ', lgr_accuracy)
print('Recall Score of Logistic Regression Model is: ', lgr_recall)

In [None]:
# Random forest, scored on the held-out set. The seed (same value as the
# train/test split) fixes the bootstrap samples and feature subsets, so the
# reported scores are repeatable across runs.
rf = RandomForestClassifier(random_state=2021)

rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)

# Compute each metric once and reuse it for the registry and the printout.
rf_accuracy = accuracy_score(y_test, y_pred)
rf_recall = metrics.recall_score(y_test, y_pred)

accuracies['Random Forest Classifier'] = rf_accuracy
recall['Random Forest Classifier'] = rf_recall

print('Accuracy Score of Random Forest Classifier is: ', rf_accuracy)
print('Recall Score of Random Forest Classifier Model is: ', rf_recall)

In [None]:
# Convert the score registries into two-column tables, one row per model.
# NOTE: this rebinds `recall` and `accuracies` from dicts to DataFrames, so the
# cell is meant to be run exactly once per kernel session.
recall = pd.DataFrame(
    [(model, score) for model, score in recall.items()],
    columns=['Model', 'Recall Score'],
)
accuracies = pd.DataFrame(
    [(model, score) for model, score in accuracies.items()],
    columns=['Model', 'Accuracy Score'],
)

In [None]:
# Side-by-side comparison table: accuracy and recall joined on the model name.
accuracies_df = accuracies.merge(recall, on='Model')
accuracies_df

In [None]:
from sklearn.ensemble import VotingClassifier

# Hard (majority-vote) ensemble of the three models above. VotingClassifier
# fits its own clones of the estimators, so dt / rf / lgr are left untouched.
ensemble_clf = VotingClassifier(
    estimators=[('dt', dt), ('rf', rf), ('lgr', lgr)],
    voting='hard',
)

ensemble_clf.fit(x_train, y_train)
y_pred = ensemble_clf.predict(x_test)

ensemble_name = 'Democratic Classifier'
ensemble_accuracy = accuracy_score(y_test, y_pred)
ensemble_recall = metrics.recall_score(y_test, y_pred)

# `accuracies` and `recall` were rebound to DataFrames in an earlier cell, so
# `accuracies[ensemble_name] = score` would add a constant-valued *column*
# rather than a model row. Append the ensemble to the comparison table as a
# row instead; any previous ensemble row is removed first so re-running this
# cell does not duplicate it.
ensemble_row = pd.DataFrame([{
    'Model': ensemble_name,
    'Accuracy Score': ensemble_accuracy,
    'Recall Score': ensemble_recall,
}])
accuracies_df = pd.concat(
    [accuracies_df[accuracies_df['Model'] != ensemble_name], ensemble_row],
    ignore_index=True,
)

print('Accuracy Score of Democratic Classifier is: ', ensemble_accuracy)
print('Recall Score of Democratic Classifier Model is: ', ensemble_recall)

accuracies_df

In [None]:
# IPython help lookup (`?name` shows the object's docstring). Exploratory
# leftover; can be removed once the notebook is final.
?VotingClassifier