## Gain experience applying scikit learn to machine learning problems

### Importing required library

In [66]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#machine learning
from sklearn import preprocessing
from sklearn.ensemble import AdaBoostClassifier
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import chi2
from sklearn.metrics import classification_report

##  1. Loading the HR Attrition Data

In [67]:
import os
path = os.getcwd()
path = os.getcwd() + '/data/IBM-HR-Data-Employee-Attrition.csv'
# importing data for car sales
dataset = pd.read_csv(path,index_col=1)
dataset.head()


Unnamed: 0_level_0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
Attrition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Yes,41,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,2,...,1,80,0,8,0,1,6,4,0,5
No,49,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,3,...,4,80,1,10,3,3,10,7,1,7
Yes,37,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,4,...,2,80,0,7,3,3,0,0,0,0
No,33,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,4,...,3,80,0,8,3,3,8,7,3,0
No,27,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,1,...,4,80,1,6,3,3,2,2,2,2


### 1.1  Dropping irrevalent  columns

In [68]:
#Get list of columns in the dataset
dataset.columns

Index(['Age', 'BusinessTravel', 'DailyRate', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EmployeeCount', 'EmployeeNumber',
       'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement',
       'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus',
       'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'Over18',
       'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [69]:
#Dropping columns (intution)
columns = ['DailyRate', 'EducationField', 'EmployeeCount', 'EmployeeNumber', 'HourlyRate', 'MonthlyRate',
        'Over18', 'RelationshipSatisfaction', 'StandardHours']
dataset.drop(columns, inplace=True, axis=1)

In [70]:
#To get description of all columns
dataset.describe(include = 'all')

Unnamed: 0,Age,BusinessTravel,Department,DistanceFromHome,Education,EnvironmentSatisfaction,Gender,JobInvolvement,JobLevel,JobRole,...,PercentSalaryHike,PerformanceRating,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,1470.0,1470,1470,1470.0,1470.0,1470.0,1470,1470.0,1470.0,1470,...,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
unique,,3,3,,,,2,,,9,...,,,,,,,,,,
top,,Travel_Rarely,Research & Development,,,,Male,,,Sales Executive,...,,,,,,,,,,
freq,,1043,961,,,,882,,,326,...,,,,,,,,,,
mean,36.92381,,,9.192517,2.912925,2.721769,,2.729932,2.063946,,...,15.209524,3.153741,0.793878,11.279592,2.79932,2.761224,7.008163,4.229252,2.187755,4.123129
std,9.135373,,,8.106864,1.024165,1.093082,,0.711561,1.10694,,...,3.659938,0.360824,0.852077,7.780782,1.289271,0.706476,6.126525,3.623137,3.22243,3.568136
min,18.0,,,1.0,1.0,1.0,,1.0,1.0,,...,11.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,30.0,,,2.0,2.0,2.0,,2.0,1.0,,...,12.0,3.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0
50%,36.0,,,7.0,3.0,3.0,,3.0,2.0,,...,14.0,3.0,1.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0
75%,43.0,,,14.0,4.0,4.0,,3.0,3.0,,...,18.0,3.0,1.0,15.0,3.0,3.0,9.0,7.0,3.0,7.0


##  2. Data Cleaning 

 ### This phase is the most time consuming yet the most importat one. Here, we filter and   extract only the information that is needed for problem solving. Quality of the model is highly dependant on the quality of the data that is given as an input.

   * Understand meaning of every feature and identify errors.
   * Look for any missing values and find a way to fill the missing values.
   * Remove duplicate or corrupted records.
   * Scaling and normalization of data.
   * Character encoding (string to numerical representation).
   * Handle inconsistent entry.
   * Use tools like pandas(python), dplyr(R), numpy

### 2.1 Handling Missing Values 

In [61]:
#Find number of missing values in every feature
dataset.isnull().sum()
## Seems like Best Data set 

Age                        0
BusinessTravel             0
Department                 0
DistanceFromHome           0
Education                  0
EnvironmentSatisfaction    0
Gender                     0
JobInvolvement             0
JobLevel                   0
JobRole                    0
JobSatisfaction            0
MaritalStatus              0
MonthlyIncome              0
NumCompaniesWorked         0
OverTime                   0
PercentSalaryHike          0
PerformanceRating          0
StockOptionLevel           0
TotalWorkingYears          0
TrainingTimesLastYear      0
WorkLifeBalance            0
YearsAtCompany             0
YearsInCurrentRole         0
YearsSinceLastPromotion    0
YearsWithCurrManager       0
dtype: int64

### 2.2 Encode categorical features(in string) as most of the tools works with numbers

In [71]:
#Columns with string values
categorical_column = ['Attrition', 'BusinessTravel', 'Department',
                      'Gender', 'JobRole', 'MaritalStatus', 'OverTime']

#Deep copy the original data
data_encoded = dataset.copy(deep=True)
#Use Scikit-learn label encoding to encode character data
lab_enc = preprocessing.LabelEncoder()
for col in categorical_column:
        data_encoded[col] = lab_enc.fit_transform(df_data[col])
        le_name_mapping = dict(zip(lab_enc.classes_, lab_enc.transform(lab_enc.classes_)))
        print('Feature', col)
        print('mapping', le_name_mapping)




Feature Attrition
mapping {'No': 0, 'Yes': 1}
Feature BusinessTravel
mapping {'Non-Travel': 0, 'Travel_Frequently': 1, 'Travel_Rarely': 2}
Feature Department
mapping {'Human Resources': 0, 'Research & Development': 1, 'Sales': 2}
Feature Gender
mapping {'Female': 0, 'Male': 1}
Feature JobRole
mapping {'Healthcare Representative': 0, 'Human Resources': 1, 'Laboratory Technician': 2, 'Manager': 3, 'Manufacturing Director': 4, 'Research Director': 5, 'Research Scientist': 6, 'Sales Executive': 7, 'Sales Representative': 8}
Feature MaritalStatus
mapping {'Divorced': 0, 'Married': 1, 'Single': 2}
Feature OverTime
mapping {'No': 0, 'Yes': 1}


In [72]:
data_encoded.head()

Unnamed: 0_level_0,Age,BusinessTravel,Department,DistanceFromHome,Education,EnvironmentSatisfaction,Gender,JobInvolvement,JobLevel,JobRole,...,PerformanceRating,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
Attrition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Yes,41,2,2,1,2,2,0,3,2,7,...,3,0,8,0,1,6,4,0,5,1
No,49,1,1,8,1,3,1,2,2,6,...,4,1,10,3,3,10,7,1,7,0
Yes,37,2,1,2,2,4,1,2,1,2,...,3,0,7,3,3,0,0,0,0,1
No,33,1,1,3,4,4,0,3,1,6,...,3,0,8,3,3,8,7,3,0,0
No,27,2,1,2,1,1,1,3,1,2,...,3,1,6,3,3,2,2,2,2,0


In [73]:
data_encoded.isnull().sum()

Age                        0
BusinessTravel             0
Department                 0
DistanceFromHome           0
Education                  0
EnvironmentSatisfaction    0
Gender                     0
JobInvolvement             0
JobLevel                   0
JobRole                    0
JobSatisfaction            0
MaritalStatus              0
MonthlyIncome              0
NumCompaniesWorked         0
OverTime                   0
PercentSalaryHike          0
PerformanceRating          0
StockOptionLevel           0
TotalWorkingYears          0
TrainingTimesLastYear      0
WorkLifeBalance            0
YearsAtCompany             0
YearsInCurrentRole         0
YearsSinceLastPromotion    0
YearsWithCurrManager       0
Attrition                  0
dtype: int64

## 3. Spliting the Data into Traiining and Test Data

In [74]:
np.random.seed(42)
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
X_train, X_test,y_train,y_test = train_test_split(data_encoded,
                                                   y,
                                                  test_size=0.2
                                                  )

In [75]:
# X Train
print('On X train: ')
print('X train dimensions: ', X_train.shape)
display(X_train.head())

# X test
print('\nOn X test: ')
print('X test dimensions: ', X_test.shape)
display(X_test.head())

On X train: 
X train dimensions:  (1176, 26)


Unnamed: 0_level_0,Age,BusinessTravel,Department,DistanceFromHome,Education,EnvironmentSatisfaction,Gender,JobInvolvement,JobLevel,JobRole,...,PerformanceRating,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
Attrition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
No,24,2,1,21,2,3,1,2,1,2,...,3,3,2,3,3,1,1,0,0,0
No,18,0,1,5,2,2,1,3,1,6,...,3,0,0,2,3,0,0,0,0,0
No,29,2,2,20,2,4,1,3,2,7,...,3,1,10,2,3,3,2,0,2,0
No,39,2,1,12,3,4,1,3,2,4,...,4,0,7,3,3,5,4,1,0,0
No,31,2,1,20,3,2,1,3,2,2,...,3,1,10,2,3,10,8,0,2,0



On X test: 
X test dimensions:  (294, 26)


Unnamed: 0_level_0,Age,BusinessTravel,Department,DistanceFromHome,Education,EnvironmentSatisfaction,Gender,JobInvolvement,JobLevel,JobRole,...,PerformanceRating,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
Attrition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
No,28,2,2,5,3,4,1,3,2,7,...,3,0,6,4,3,5,4,1,3,0
No,53,2,1,13,2,4,0,4,2,4,...,3,2,5,3,3,4,2,1,3,0
Yes,24,2,0,22,1,4,1,1,1,1,...,3,1,1,2,3,1,0,0,0,1
No,45,2,1,7,3,2,1,3,3,6,...,3,1,25,2,3,1,0,0,0,0
No,36,2,1,5,2,4,1,3,2,2,...,3,0,16,3,4,13,11,3,7,0


In [76]:
# X Train
print('On y train: ')
print('y train dimensions: ', y_train.shape)
display(y_train.head())

# X test
print('\nOn y test: ')
print('y test dimensions: ', y_test.shape)
display(y_test.head())

On y train: 
y train dimensions:  (1176,)


1097    No
727     No
254     No
1175    No
1341    No
Name: Attrition, dtype: object


On y test: 
y test dimensions:  (294,)


1041     No
184      No
1222    Yes
67       No
220      No
Name: Attrition, dtype: object

## 4. Fit into Model

In [77]:
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=10,
              criterion='entropy')
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

def print_score(clf, X_train, y_train,
        X_test, y_test,
        train=True):

  if train:
    print("Train Result:")
    print("------------")
    print("Classification Report: \n {}\n".format(classification_report(
      y_train, clf.predict(X_train))))

    print("Confusion Matrix: \n {}\n".format(confusion_matrix(
      y_train, clf.predict(X_train))))

    res = cross_val_score(clf, X_train, y_train,
              cv=10, scoring='accuracy')

    print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
    print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))
    print("----------------------------------------------------------")

  elif train == False:

    print("Test Result:")
    print("-----------")
    print("Classification Report: \n {}\n".format(
      classification_report(y_test, clf.predict(X_test))))

    print("Confusion Matrix: \n {}\n".format(
      confusion_matrix(y_test, clf.predict(X_test))))

    print("accuracy score: {0:.4f}\n".format(
      accuracy_score(y_test, clf.predict(X_test))))

    print("-----------------------------------------------------------")

print_score(rf, X_train, y_train,
      X_test, y_test,
      train=True)

print_score(rf, X_train, y_train,
      X_test, y_test,
      train=False)


Train Result:
------------
Classification Report: 
               precision    recall  f1-score   support

          No       1.00      1.00      1.00       978
         Yes       1.00      1.00      1.00       198

    accuracy                           1.00      1176
   macro avg       1.00      1.00      1.00      1176
weighted avg       1.00      1.00      1.00      1176


Confusion Matrix: 
 [[978   0]
 [  0 198]]

Average Accuracy: 	 0.9983
Accuracy SD: 		 0.0034
----------------------------------------------------------
Test Result:
-----------
Classification Report: 
               precision    recall  f1-score   support

          No       1.00      1.00      1.00       255
         Yes       1.00      1.00      1.00        39

    accuracy                           1.00       294
   macro avg       1.00      1.00      1.00       294
weighted avg       1.00      1.00      1.00       294


Confusion Matrix: 
 [[255   0]
 [  0  39]]

accuracy score: 1.0000

---------------------