In [None]:
# import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import seaborn as sns

# use seaborn's "darkgrid" style for the plots drawn in this notebook
sns.set(style="darkgrid")

In [None]:
# read the HR dataset from the input directory
df = pd.read_csv("../input/HR_comma_sep.csv")

# keep the column labels as a plain Python list
col_names = list(df.columns)

print("Column names:", col_names, sep="\n")

print("\nSample data:")
df.head()

In [None]:
# dataset dimensions as (rows, columns) -- expected: 14999 rows and 10 columns
n_rows, n_cols = df.shape
(n_rows, n_cols)

## 1 Feature Engineering




In [None]:
# relabel the 'sales' column as 'department'
df = df.rename({'sales': 'department'}, axis='columns')

df.head()

In [None]:
# flag, per column, whether it contains any missing value
df.isna().any()

In [None]:
# list the distinct departments
pd.unique(df['department'])

In [None]:
# np.where(condition, x, y) returns x where the condition holds and y elsewhere

# fold both the 'support' and the 'IT' departments into the 'technical' category
is_support_or_it = df['department'].isin(['support', 'IT'])
df['department'] = np.where(is_support_or_it, 'technical', df['department'])

df['department'].unique()

## 2 Data Exploration

Let's find out how many people left the company.



In [None]:
# head-count per outcome: 0 = stayed, 1 = left
df.loc[:, 'left'].value_counts()

In [None]:
# ratio of employees who left (1) to employees who stayed (0),
# derived from the data instead of hard-coded counts (3571 / 11428)
left_counts = df['left'].value_counts()
left_counts.loc[1] / left_counts.loc[0]

In [None]:
# compare the average of every numeric feature between employees who
# stayed (left == 0) and employees who left (left == 1)

# numeric_only=True skips the text columns, which cannot be averaged
# and make .mean() raise a TypeError on pandas >= 2.0
df.groupby('left').mean(numeric_only=True)

Observations:

- The average satisfaction level of employees who stayed with the company is higher than that of the employees who left.
- The average monthly work hours of employees who left the company is more than that of the employees who stayed.
- Employees who had a workplace accident are less likely to leave than employees who did not have one.
- The employees who were promoted in the last five years are less likely to leave than those who did not get a promotion in the last five years.

Now we also want to get a sort of average for categorical variables; **department, salary and number_of_projects**

In [None]:
# per-department average of every numeric feature; numeric_only=True skips
# the remaining text column(s), which raise a TypeError on pandas >= 2.0
df.groupby('department').mean(numeric_only=True)

In [None]:
# per-salary-band average of every numeric feature; numeric_only=True skips
# the remaining text column(s), which raise a TypeError on pandas >= 2.0
df.groupby('salary').mean(numeric_only=True)

## 3 Visualisation 

Get better insight into the data, a clearer picture. Recognise the significant features.


In [None]:
# cross-tabulate department against the `left` flag and plot the head-count
# per department, with one bar for stayed (0) and one for left (1)
ax = pd.crosstab(df.department, df.left).plot(kind='bar')
ax.set_title('Turnover Frequency per Department')
ax.set_xlabel('Department')
# the bar height is a head-count; the stayed/left split belongs in the legend
ax.set_ylabel('Number of employees')
# crosstab orders its columns 0, 1 -> stayed, left
ax.legend(['Stayed', 'Left'], title='Outcome');

In [None]:
# head-count of stayed (0) / left (1) employees per salary band
table = pd.crosstab(df.salary, df.left)
# divide each row by its total so every bar shows proportions that sum to 1
salary_share = table.div(table.sum(axis=1), axis=0)

ax = salary_share.plot(kind='bar', stacked=True)
ax.set_title('Turnover Frequency and Salary')
ax.set_xlabel('Salary')
# the bar height is a proportion; the stayed/left split belongs in the legend
ax.set_ylabel('Proportion of employees')
# crosstab orders its columns 0, 1 -> stayed, left
ax.legend(['Stayed', 'Left'], title='Outcome');

In order to use all the data for modelling, we need to convert the categorical variables to dummy variables.

**Dummy variables**

Dummy variables are used when you want to work with categorical variables whose categories have no quantifiable relationship with each other. 
Each category becomes its own column that holds 1 for the rows belonging to that category and 0 for all other rows. In effect, we convert the variable to a binary (one-hot) encoding.

This is the process:

1- convert categorical variables to dummy variables

2- delete the old categorical variables



In [None]:
# convert the categorical columns to dummy (0/1 indicator) columns
cat_vars = ['department', 'salary']

for var in cat_vars:
    # one indicator column per category, named '<var>_<category>'
    dummies = pd.get_dummies(df[var], prefix=var)
    # append the indicator columns; the original text column is still kept here
    df = df.join(dummies)

In [None]:
# remove the original text columns now that their dummy encodings exist.
# Dropping by name (instead of by position [8, 9]) stays correct if the column
# order changes, and fails loudly instead of silently deleting other columns
# when this cell is run a second time.
df = df.drop(columns=['department', 'salary'])
df.columns.values

In [None]:
# 'left' is the outcome (y); every remaining column is a predictor (X)

df_vars = list(df.columns)
y = ['left']
X = [name for name in df_vars if name not in y]

## 4 Feature Selection

- We only want to pick the features that are truly relevant for predicting y ( whether someone left or not )
- How do we select the right features / predictors?

We can use scikit-learn's ```sklearn.feature_selection.RFE ```

Given an external estimator that assigns weights to features (e.g., the coefficients of a linear model), the goal of recursive feature elimination (RFE) is to **select features by recursively considering smaller and smaller sets of features**. First, the estimator is trained on the initial set of features and the importance of each feature is obtained either through a ```coef_``` attribute or through a ```feature_importances_``` attribute. Then, the least important features are pruned from current set of features. That procedure is recursively repeated on the pruned set until the desired number of features to select is eventually reached.


To do:
check other methods for feature selection

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

# n_features_to_select has to be passed by keyword: scikit-learn >= 1.0
# rejects it as a positional argument
rfe = RFE(model, n_features_to_select=10)
# hand the target over as a 1-D array; a single-column DataFrame triggers a
# DataConversionWarning in scikit-learn
rfe = rfe.fit(df[X], df[y].values.ravel())
print(rfe.support_)
print('the selected features are ranked with 1')
print(rfe.ranking_)

In [None]:
# columns kept as predictors after the feature-selection step
# NOTE(review): RFE was asked for 10 features but only 9 are listed here --
# confirm against the RFE output whether one column is missing
cols = [
    'satisfaction_level',
    'last_evaluation',
    'time_spend_company',
    'Work_accident',
    'promotion_last_5years',
    'department_hr',
    'department_management',
    'salary_high',
    'salary_low',
]

# predictors (note: X is rebound from a list of names to a DataFrame)
X = df.loc[:, cols]

# outcome
Y = df.loc[:, 'left']

In [None]:
# create a train and a test set
# sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection
from sklearn.model_selection import train_test_split

# the lowercase names hold the split used by the random forest (30% test rows)
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# fix the seed so the forest, and every metric computed from it, is
# reproducible across kernel restarts
rf = RandomForestClassifier(random_state=0)
rf.fit(x_train, y_train)

print('Random Forest Accuracy: {:.3f}'.format(accuracy_score(y_test, rf.predict(x_test))))

# XGBoost

#### Hyperparameters

- learning_rate: step size shrinkage used to prevent overfitting. Range is [0,1]
- max_depth: determines how deeply each tree is allowed to grow during any boosting round.
- subsample: percentage of samples used per tree. Low value can lead to underfitting.
- colsample_bytree: percentage of features used per tree. High value can lead to overfitting.
- n_estimators: number of trees you want to build.
- objective: determines the loss function to be used, e.g. reg:squarederror (called reg:linear in older XGBoost releases) for regression problems, reg:logistic for classification problems that only need a decision, and binary:logistic for classification problems that need a probability.

XGBoost also supports regularization parameters to penalize models as they become more complex and reduce them to simple (parsimonious) models.

- gamma: controls whether a given node will split based on the expected reduction in loss after the split. A higher value leads to fewer splits. Supported only for tree-based learners.
- alpha: L1 regularization on leaf weights. A large value leads to more regularization.
- lambda: L2 regularization on leaf weights and is smoother than L1 regularization.

In [None]:
import xgboost as xgb

# wrap the predictors and the outcome in DMatrix, the optimised data
# structure that XGBoost supports
data_dmatrix = xgb.DMatrix(X, label=Y)

In [None]:
from sklearn.model_selection import train_test_split

# hold out 20% of the rows as the test set; the capitalised names keep this
# split apart from the lowercase x_train / x_test variables
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=123,
)

In [None]:
# The target (`left`) is binary, so fit an XGBoost *classifier* with the
# binary:logistic objective. A regressor with the deprecated reg:linear
# objective returns continuous scores, which accuracy_score cannot evaluate.
# The variable keeps its name `xg_reg` because other cells refer to it.
xg_reg = xgb.XGBClassifier(objective='binary:logistic', colsample_bytree=0.3, learning_rate=0.1,
                           max_depth=5, reg_alpha=10, n_estimators=10)

xg_reg.fit(X_train, Y_train)

In [None]:
from sklearn.metrics import accuracy_score

# accuracy_score needs discrete class labels and raises a ValueError on
# continuous scores, so threshold the predictions at 0.5; predictions that
# are already hard 0/1 labels pass through unchanged
xgb_pred = (xg_reg.predict(X_test) > 0.5).astype(int)
print('XGBoost Accuracy: {:.3f}'.format(accuracy_score(Y_test, xgb_pred)))

## 5 Precision and Recall

- Knowing the accuracy of a model is not enough, we need to know the precision and recall.
- In many tasks such as imbalanced classification problems the accuracy is certainly not the whole story.

Now, we visualise predictions and evaluate the accuracy of a classification.

Then we can compare the true accuracy of the two models - Random Forest & XGBoost

- **Precision**: the fraction of relevant instances among the retrieved instances. ( i.e. the ability of a classification model to identify only the relevant data points)

> $\text{precision} = \dfrac{\text{true positives}}{\text{true positives} + \text{false positives}}$


- **Recall**: the fraction of relevant instances that have been retrieved over the total amount of relevant instances. ( i.e. the ability of a model to find all the relevant cases within a dataset )

> $\text{recall} = \dfrac{\text{true positives}}{\text{true positives} + \text{false negatives}}$

![](https://upload.wikimedia.org/wikipedia/commons/2/26/Precisionrecall.svg)


e.g. When a search engine returns 30 pages only 20 of which were relevant while failing to return 40 additional relevant pages, its precision is 20/30 = 2/3 while its recall is 20/60 = 1/3. So, in this case, precision is "how useful the search results are", and recall is "how complete the results are".


- In cases where we want to find an optimal blend of precision and recall we can combine the two of them in what is called an **F1 score**. The F1 score is the harmonic mean of precision and recall taking both metrics into account in the following equation:

> $F_1 = 2 \times \dfrac{\text{precision} \times \text{recall}}{\text{precision} + \text{recall}}$

It's better to use the F1 score instead of a simple arithmetic mean because the harmonic mean punishes extreme values. 

In [None]:
# precision, recall and F1 of the random forest on the held-out test set
from sklearn.metrics import classification_report

# build the text report with sklearn, then print it
rf_report = classification_report(y_test, rf.predict(x_test))
print(rf_report)

In [None]:
# confusion matrix for the Random Forest
from sklearn.metrics import confusion_matrix
from sklearn import metrics

y_pred = rf.predict(x_test)

# confusion_matrix expects (y_true, y_pred): rows are then the true classes and
# columns the predicted classes, which is what the axis labels below claim.
# `labels` has to be passed by keyword in recent scikit-learn releases;
# [1, 0] puts "Left" first on both axes.
forest_cm = confusion_matrix(y_test, y_pred, labels=[1, 0])

# the cells hold integer counts, so format them as integers
sns.heatmap(forest_cm, annot=True, fmt='d',
            xticklabels=["Left", "Stayed"], yticklabels=["Left", "Stayed"])

plt.ylabel('True class')
plt.xlabel('Predicted class')
plt.title('Random Forest')
plt.savefig('random_forest')


For this dataset, **recall**  measures: when an employee left, how often is that predicted correctly?
Out of all the turnover cases, random forest correctly got 987 out of 1038. This means we have a turnover “recall” of about 95% (987/1038)

**Precision** measures in this case: when the model predicts an employee will leave, how often do they actually leave? 
The Random Forest has about 94% precision (987 out of 1045).