In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.model_selection import train_test_split,GridSearchCV

## Read In Data

In [2]:
hr = pd.read_csv("HR_comma_sep.csv")
hr.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


## Create Dummy Variables

In [3]:
hr = pd.concat([hr,pd.get_dummies(hr['sales'])], axis = 1)
hr = pd.concat([hr,pd.get_dummies(hr['salary'])], axis = 1)
hr = hr.drop(['sales','salary'], axis = 1)

## Modeling

In [4]:
from sklearn.model_selection import train_test_split,GridSearchCV
y = hr['left']
X = hr.drop(['left'],axis = 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42, stratify = y)

### KNN with grid search

In [6]:
param_grid = {'n_neighbors':[3,5,7]}
knn = KNeighborsClassifier()
gs = GridSearchCV(knn,param_grid)
gs.fit(X_train,y_train)
confusion_matrix(y_test,gs.predict(X_test))
gs.score(X_test,y_test)

0.9437777777777778

### Random Forest with Tuning (Takes long time)

In [None]:
param_grid = {'n_estimators':[10,100,1000], 'max_depth': [5,10,20,30,None], 'criterion': ['gini','entropy'], 'min_samples_leaf': [1,5,10],'min_samples_split': [2,5,10], 'max_features': ['sqrt','log2']}
rf = RandomForestClassifier()
gs_2 = GridSearchCV(rf,param_grid)
gs_2.fit(X_train,y_train)
gs_2.score(X_test,y_test)

### Random Forest No Tuning

In [11]:
rf = RandomForestClassifier()
rf.fit(X_train,y_train)
print(rf.score(X_test,y_test))
print(confusion_matrix(y_test,rf.predict(X_test)))
print(precision_score(y_test,rf.predict(X_test)))
print(recall_score(y_test,rf.predict(X_test)))

0.9906666666666667
[[3422    7]
 [  35 1036]]
0.9932885906040269
0.9673202614379085


### XGBoost

In [9]:
xgb = XGBClassifier()
xgb.fit(X_train,y_train)
print(xgb.score(X_test,y_test))
confusion_matrix(y_test,xgb.predict(X_test))



0.9862222222222222


array([[3410,   19],
       [  43, 1028]])