## Python ML Model Prediction with Random Forest

In [1]:
import pandas as pd
import numpy as np

data = pd.read_csv("EmployeeChurn.csv")
data.head()

Unnamed: 0,EmployeeAge,Is_Attrite,Travel,Rate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [2]:
my_tab = pd.crosstab(index=data["Is_Attrite"], columns="count")
data.Is_Attrite.value_counts()/len(data)*100

No     83.877551
Yes    16.122449
Name: Is_Attrite, dtype: float64

##### Create categorial responce to numeric type and do some feature engineering

In [3]:
target_map = {"Yes":1,"No":0}
data.Is_Attrite = data.Is_Attrite.map(target_map)

data['bandMonthlyIncome'] = pd.cut(data['MonthlyIncome'], 5)
data[['bandMonthlyIncome', 'Is_Attrite']].groupby(['bandMonthlyIncome'], as_index=False).mean().sort_values(by='bandMonthlyIncome', ascending=True)
data.loc[data['MonthlyIncome'] <= 990.01, 'MonthlyIncome'] = 0
data.loc[(data['MonthlyIncome'] > 990.01) & (data['MonthlyIncome'] <= 4807.0), 'MonthlyIncome'] = 1
data.loc[(data['MonthlyIncome'] > 4807.0) & (data['MonthlyIncome'] <= 8605.0), 'MonthlyIncome'] = 2
data.loc[(data['MonthlyIncome'] > 8605.0) & (data['MonthlyIncome'] <= 12403.0), 'MonthlyIncome'] = 3
data.loc[(data['MonthlyIncome'] > 12403.0) & (data['MonthlyIncome'] <= 16201.0), 'MonthlyIncome'] = 4
data.loc[(data['MonthlyIncome'] > 16201.0) & (data['MonthlyIncome'] <= 19999.0), 'MonthlyIncome'] = 5
data.loc[data['MonthlyIncome'] > 19999, 'MonthlyIncome'] =6
data.drop(['bandMonthlyIncome'],axis = 1,inplace = True)
data.head()

Unnamed: 0,EmployeeAge,Is_Attrite,Travel,Rate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,0,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,1,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,0,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,0,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


##### As EmployeeNumber is unique so better remove this from dependent variables

In [4]:
data = data.drop(["EmployeeNumber"], axis = 1)
data_final = pd.get_dummies(data)
data_final.head()

Unnamed: 0,EmployeeAge,Is_Attrite,Rate,DistanceFromHome,Education,EmployeeCount,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,Over18_Y,OverTime_No,OverTime_Yes
0,41,1,1102,1,2,1,2,94,3,2,...,0,0,1,0,0,0,1,1,0,1
1,49,0,279,8,1,1,3,61,2,2,...,0,1,0,0,0,1,0,1,1,0
2,37,1,1373,2,2,1,4,92,2,1,...,0,0,0,0,0,0,1,1,0,1
3,33,0,1392,3,4,1,4,56,3,1,...,0,1,0,0,0,1,0,1,0,1
4,27,0,591,2,1,1,1,40,3,1,...,0,0,0,0,0,1,0,1,1,0


#### Create response and prediction set as below

In [5]:
train = data_final.drop(['Is_Attrite'], axis = 1)
test = data_final['Is_Attrite']

In [6]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(train, test)
print (x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(1102, 54) (1102,) (368, 54) (368,)


In [7]:
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression(random_state = 42)
lr_model.fit(x_train,y_train)

lr_prediction = lr_model.predict(x_test)

from sklearn.metrics import confusion_matrix, classification_report
labels = ["Negative","Positive"]
print(classification_report(y_test,lr_prediction))
pd.DataFrame(confusion_matrix(y_test,lr_prediction), index = labels, columns = labels)

              precision    recall  f1-score   support

           0       0.84      0.99      0.91       307
           1       0.60      0.05      0.09        61

    accuracy                           0.84       368
   macro avg       0.72      0.52      0.50       368
weighted avg       0.80      0.84      0.77       368



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Unnamed: 0,Negative,Positive
Negative,305,2
Positive,58,3


In [8]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators = 100, n_jobs = -1, random_state = 42)
rf_model.fit(x_train,y_train)

rf_prediction = rf_model.predict(x_test) 

from sklearn.metrics import confusion_matrix, classification_report
labels = ["Negative","Positive"]
print(classification_report(y_test,rf_prediction))
pd.DataFrame(confusion_matrix(y_test,rf_prediction), index = labels, columns = labels)

              precision    recall  f1-score   support

           0       0.86      1.00      0.92       307
           1       0.92      0.18      0.30        61

    accuracy                           0.86       368
   macro avg       0.89      0.59      0.61       368
weighted avg       0.87      0.86      0.82       368



Unnamed: 0,Negative,Positive
Negative,306,1
Positive,50,11


#### Using SMOTE over_sampling as we have imbalanced dataset

In [9]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state = 42)
o_x_train, o_y_train = smote.fit_resample(x_train,y_train)

In [10]:
from sklearn.ensemble import RandomForestClassifier

o_rf_model = RandomForestClassifier(n_estimators = 100, n_jobs = -1, random_state = 42)
o_rf_model.fit(o_x_train,o_y_train)

o_rf_prediction = o_rf_model.predict(x_test) 

from sklearn.metrics import confusion_matrix, classification_report
labels = ["Negative","Positive"]
print(classification_report(y_test,o_rf_prediction))
pd.DataFrame(confusion_matrix(y_test,o_rf_prediction), index = labels, columns = labels)

              precision    recall  f1-score   support

           0       0.88      0.97      0.92       307
           1       0.66      0.31      0.42        61

    accuracy                           0.86       368
   macro avg       0.77      0.64      0.67       368
weighted avg       0.84      0.86      0.84       368



Unnamed: 0,Negative,Positive
Negative,297,10
Positive,42,19


#### Lets try by building model by using only important features

In [11]:
rf_model.feature_importances_

from sklearn.feature_selection import SelectFromModel

sl_model = SelectFromModel(o_rf_model, prefit=True)
n_x_train = sl_model.transform(o_x_train)
n_x_test = sl_model.transform(x_test)

In [12]:
from sklearn.ensemble import RandomForestClassifier

n_rf_model = RandomForestClassifier(n_estimators = 100, n_jobs = -1, random_state = 42)
n_rf_model.fit(n_x_train,o_y_train)
n_rf_prediction = n_rf_model.predict(n_x_test)
from sklearn.metrics import confusion_matrix, classification_report
labels = ["Negative","Positive"]
print(classification_report(y_test,n_rf_prediction))
pd.DataFrame(confusion_matrix(y_test,n_rf_prediction), index = labels, columns = labels)

              precision    recall  f1-score   support

           0       0.88      0.94      0.91       307
           1       0.54      0.36      0.43        61

    accuracy                           0.84       368
   macro avg       0.71      0.65      0.67       368
weighted avg       0.82      0.84      0.83       368



Unnamed: 0,Negative,Positive
Negative,288,19
Positive,39,22


In [13]:
parameters = {
    'n_estimators' : [100,200,300,400,500,600],
    'max_depth' : [8, 9, 10, 11, 12, 13],
    'min_samples_split' : [2, 5, 10],
}

from sklearn.model_selection import GridSearchCV
gs_model = GridSearchCV(RandomForestClassifier(),parameters, cv = 5, verbose=1, n_jobs = -1)
gs_model.fit(n_x_train,o_y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 12.6min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed: 15.9min finished


GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'max_depth': [8, 9, 10, 11, 12, 13],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [100, 200, 300, 400, 500, 600]},
             verbose=1)

In [15]:
print(gs_model.best_params_)

rf_prediction = gs_model.predict(n_x_test)
from sklearn.metrics import confusion_matrix, classification_report
labels = ["Negative","Positive"]
print(classification_report(y_test,rf_prediction))
pd.DataFrame(confusion_matrix(y_test,rf_prediction), index = labels, columns = labels)

{'max_depth': 13, 'min_samples_split': 2, 'n_estimators': 400}
              precision    recall  f1-score   support

           0       0.88      0.94      0.91       307
           1       0.56      0.38      0.45        61

    accuracy                           0.85       368
   macro avg       0.72      0.66      0.68       368
weighted avg       0.83      0.85      0.84       368



Unnamed: 0,Negative,Positive
Negative,289,18
Positive,38,23
