# Data Preparation

### Loading Dataset

In [1]:
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

import warnings
warnings.simplefilter("ignore")

In [2]:
import sklearn
sklearn.__version__

'1.0.1'

In [3]:
import pandas as pd
import numpy as np
import pandas_profiling
from pandas_profiling import ProfileReport

In [4]:
#pip install wandb

In [5]:
from category_encoders import OneHotEncoder, TargetEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, f1_score
from xgboost import XGBClassifier
import wandb
import os

In [6]:
# Loading the dataset
hr_df = pd.read_csv('Employee_Attrition.csv')

In [7]:
hr_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [8]:
hr_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

In [9]:
hr_df['Attrition']=hr_df['Attrition'].map(dict(Yes=1,No=0))

In [10]:
#Splitting dataset into development and production datasets
hr_df_dev, hr_df_prod = train_test_split(hr_df,
                                             train_size = 0.8,
                                             random_state = 42)

### Data Profiling

In [None]:
profile = ProfileReport(hr_df, title = 'Employee Attrition Profiling Report')

In [None]:
profile.to_notebook_iframe()

In [None]:
profile.to_file("emloyee_attrition.html")

### Preliminary data preparation

In [11]:
# removing redundant features
redunt_features = ['EmployeeCount','EmployeeNumber','StandardHours','Over18']
hr_df_dev.drop(redunt_features, inplace=True, axis=1)

In [12]:
# splitting numeric and categorical features
cat_features = ['BusinessTravel', 'Department','Education','EnvironmentSatisfaction',
                'EducationField','Gender','JobInvolvement','JobLevel','JobRole','JobSatisfaction',
                'MaritalStatus','PerformanceRating','RelationshipSatisfaction',
                'StockOptionLevel','WorkLifeBalance','OverTime']
num_features = list(hr_df_dev.columns)
num_features.remove('Attrition')
num_features = list(set(num_features) - set(cat_features))

In [13]:
# separating into predictors and labels
X_dev = hr_df_dev[num_features + cat_features]
y_dev = hr_df_dev['Attrition']

### Splitting dataset into train and test set

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X_dev,
                                                    y_dev,
                                                    train_size = 0.8,
                                                    random_state = 100)

In [15]:
X_train.shape

(940, 30)

### Handling imbalanced dataset

In [16]:
from imblearn.over_sampling import SMOTENC

In [17]:
sm=SMOTENC(random_state=42, categorical_features=list(range(14,30)))
X_train, y_train = sm.fit_resample(X_train, y_train)

In [18]:
X_train.head()

Unnamed: 0,TotalWorkingYears,YearsSinceLastPromotion,DailyRate,TrainingTimesLastYear,YearsWithCurrManager,YearsAtCompany,MonthlyRate,YearsInCurrentRole,PercentSalaryHike,NumCompaniesWorked,...,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,WorkLifeBalance,OverTime
0,1,1,459,3,0,1,14753,0,24,1,...,2,1,Research Scientist,4,Single,4,2,0,2,Yes
1,10,1,1085,1,0,7,17725,7,13,4,...,1,1,Laboratory Technician,4,Divorced,3,3,1,2,Yes
2,6,1,408,2,1,2,18300,2,16,5,...,2,1,Research Scientist,2,Married,3,2,0,4,No
3,4,2,1092,3,2,2,26124,2,14,9,...,3,1,Research Scientist,4,Married,3,2,3,4,Yes
4,8,0,289,2,0,0,5355,0,11,7,...,2,1,Laboratory Technician,1,Single,3,3,0,2,No


### Scaling and encoding

In [19]:
#Scaling training data
numeric_transformer = StandardScaler()
numeric_transformer.fit(X_train[num_features])
scaled_num_array = numeric_transformer.transform(X_train[num_features])
scaled_num_train_df = pd.DataFrame(scaled_num_array, columns = num_features)
scaled_num_train_df

Unnamed: 0,TotalWorkingYears,YearsSinceLastPromotion,DailyRate,TrainingTimesLastYear,YearsWithCurrManager,YearsAtCompany,MonthlyRate,YearsInCurrentRole,PercentSalaryHike,NumCompaniesWorked,MonthlyIncome,DistanceFromHome,HourlyRate,Age
0,-1.172950,-0.280540,-0.854367,0.362280,-1.027971,-0.826428,0.045734,-1.027824,2.592974,-0.725103,-0.743459,1.902720,0.314641,-0.706106
1,0.020680,-0.280540,0.804075,-1.291421,-1.027971,0.193275,0.457445,1.099492,-0.599529,0.515191,-0.664507,-0.978276,-0.530117,0.626267
2,-0.509822,-0.280540,-0.989479,-0.464571,-0.716671,-0.656477,0.537100,-0.420020,0.271153,0.928622,-0.718539,2.033674,0.209046,-0.706106
3,-0.775073,0.074597,0.822620,0.362280,-0.405372,-0.656477,1.620960,-0.420020,-0.309302,2.582348,-0.196619,-1.109231,-1.638862,-0.706106
4,-0.244571,-0.635677,-1.304742,-0.464571,-1.027971,-0.996378,-1.256173,-1.027824,-1.179984,1.755485,-0.715046,-0.978276,-1.533267,-0.817137
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1547,-1.040324,-0.635677,1.132584,-0.464571,-1.027971,-0.826428,0.746281,-1.027824,-0.889757,-0.311672,-0.652397,1.509857,-0.530117,-0.484044
1548,0.949059,-0.635677,0.520603,2.015981,-1.027971,-0.826428,-1.536004,-1.027824,-0.309302,0.101760,-0.822178,-0.585413,-0.424522,0.404205
1549,-1.172950,-0.635677,-0.790784,0.362280,-1.027971,-0.826428,1.219915,-1.027824,0.851608,-0.725103,-0.769077,1.771766,-0.107738,-1.039199
1550,0.683808,0.074597,-0.313916,2.015981,0.528528,1.212977,-1.224172,0.491687,-1.179984,-0.311672,1.098982,-0.323505,-0.688509,0.404205


In [20]:
#Scaling test data
numeric_transformer = StandardScaler()
numeric_transformer.fit(X_test[num_features])
scaled_num_array = numeric_transformer.transform(X_test[num_features])
scaled_num_test_df = pd.DataFrame(scaled_num_array, columns = num_features)
scaled_num_test_df

Unnamed: 0,TotalWorkingYears,YearsSinceLastPromotion,DailyRate,TrainingTimesLastYear,YearsWithCurrManager,YearsAtCompany,MonthlyRate,YearsInCurrentRole,PercentSalaryHike,NumCompaniesWorked,MonthlyIncome,DistanceFromHome,HourlyRate,Age
0,0.403429,0.187672,-1.438186,0.039443,0.735331,0.965528,1.564801,1.163044,-1.127934,-1.063611,1.097869,-0.299532,-0.851770,-0.414105
1,1.596858,-0.109580,-1.058537,0.039443,-0.609926,-0.874788,-1.214257,-0.691540,2.507864,-0.249957,1.832995,-0.672892,1.086684,1.216519
2,-1.320414,-0.704084,1.738884,0.755493,-1.148028,-1.042090,1.150524,-0.956480,-1.127934,-0.656784,-0.823383,0.322735,0.602070,-0.957646
3,-0.392191,0.187672,0.539989,-0.676606,0.466280,-0.038281,-0.360903,0.633163,-0.608534,-1.063611,-0.707597,-0.921799,0.602070,-0.522813
4,1.464255,-0.109580,-1.535597,0.039443,-0.340874,0.296322,-0.358858,0.898103,0.430265,1.377351,1.397070,-0.175079,-0.512540,1.325227
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231,0.668635,0.187672,0.372644,2.187592,1.273433,0.630925,-0.303258,0.898103,1.469065,0.563697,-0.822546,-0.921799,0.602070,-0.087980
232,-0.126984,1.376680,0.602432,-0.676606,0.735331,0.129021,0.956877,0.633163,0.949665,2.597832,0.418437,-0.548439,-0.124850,1.651352
233,-0.657398,-0.406832,-1.335781,0.039443,-0.340874,-0.205582,1.128584,0.103282,-0.608534,-0.656784,-0.591602,-1.046253,0.795916,-0.305396
234,0.933842,-0.704084,1.763861,0.755493,-1.148028,-1.042090,1.044093,-1.221421,0.949665,0.156870,0.706960,0.073828,0.456686,0.129436


In [21]:
def OneHotEncoding(categorical_features,df):
    ohe_encoder = OneHotEncoder(handle_unknown='ignore')
    ohe_encoder.fit(df[categorical_features])
    encoded_cat_names = list(ohe_encoder.get_feature_names())
    encoded_cols_df = ohe_encoder.transform(df[categorical_features])
    return encoded_cols_df

In [22]:
ohe_encoded_cat_train_df = OneHotEncoding(cat_features,X_train)
ohe_encoded_cat_train_df

Unnamed: 0,BusinessTravel_1,BusinessTravel_2,BusinessTravel_3,Department_1,Department_2,Department_3,Education,EnvironmentSatisfaction,EducationField_1,EducationField_2,...,JobSatisfaction,MaritalStatus_1,MaritalStatus_2,MaritalStatus_3,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,WorkLifeBalance,OverTime_1,OverTime_2
0,1,0,0,1,0,0,2,4,1,0,...,4,1,0,0,4,2,0,2,1,0
1,0,1,0,1,0,0,4,2,1,0,...,4,0,1,0,3,3,1,2,1,0
2,0,1,0,1,0,0,5,3,0,1,...,2,0,0,1,3,2,0,4,0,1
3,0,1,0,1,0,0,4,1,0,0,...,4,0,0,1,3,2,3,4,1,0
4,1,0,0,1,0,0,2,3,0,0,...,1,1,0,0,3,3,0,2,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1547,1,0,0,1,0,0,3,3,1,0,...,3,0,1,0,3,4,0,2,1,0
1548,0,1,0,1,0,0,3,4,0,1,...,2,0,0,1,3,3,0,3,0,1
1549,0,1,0,1,0,0,3,3,1,0,...,4,1,0,0,3,3,0,3,1,0
1550,0,1,0,0,1,0,3,2,0,0,...,1,0,0,1,3,4,1,3,0,1


In [23]:
X_train = scaled_num_train_df.join(ohe_encoded_cat_train_df)
X_train

Unnamed: 0,TotalWorkingYears,YearsSinceLastPromotion,DailyRate,TrainingTimesLastYear,YearsWithCurrManager,YearsAtCompany,MonthlyRate,YearsInCurrentRole,PercentSalaryHike,NumCompaniesWorked,...,JobSatisfaction,MaritalStatus_1,MaritalStatus_2,MaritalStatus_3,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,WorkLifeBalance,OverTime_1,OverTime_2
0,-1.172950,-0.280540,-0.854367,0.362280,-1.027971,-0.826428,0.045734,-1.027824,2.592974,-0.725103,...,4,1,0,0,4,2,0,2,1,0
1,0.020680,-0.280540,0.804075,-1.291421,-1.027971,0.193275,0.457445,1.099492,-0.599529,0.515191,...,4,0,1,0,3,3,1,2,1,0
2,-0.509822,-0.280540,-0.989479,-0.464571,-0.716671,-0.656477,0.537100,-0.420020,0.271153,0.928622,...,2,0,0,1,3,2,0,4,0,1
3,-0.775073,0.074597,0.822620,0.362280,-0.405372,-0.656477,1.620960,-0.420020,-0.309302,2.582348,...,4,0,0,1,3,2,3,4,1,0
4,-0.244571,-0.635677,-1.304742,-0.464571,-1.027971,-0.996378,-1.256173,-1.027824,-1.179984,1.755485,...,1,1,0,0,3,3,0,2,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1547,-1.040324,-0.635677,1.132584,-0.464571,-1.027971,-0.826428,0.746281,-1.027824,-0.889757,-0.311672,...,3,0,1,0,3,4,0,2,1,0
1548,0.949059,-0.635677,0.520603,2.015981,-1.027971,-0.826428,-1.536004,-1.027824,-0.309302,0.101760,...,2,0,0,1,3,3,0,3,0,1
1549,-1.172950,-0.635677,-0.790784,0.362280,-1.027971,-0.826428,1.219915,-1.027824,0.851608,-0.725103,...,4,1,0,0,3,3,0,3,1,0
1550,0.683808,0.074597,-0.313916,2.015981,0.528528,1.212977,-1.224172,0.491687,-1.179984,-0.311672,...,1,0,0,1,3,4,1,3,0,1


In [24]:
ohe_encoded_cat_test_df = OneHotEncoding(cat_features,X_test)
ohe_encoded_cat_test_df = ohe_encoded_cat_test_df.reset_index(drop=True)

In [25]:
X_test =scaled_num_test_df .join(ohe_encoded_cat_test_df)
X_test

Unnamed: 0,TotalWorkingYears,YearsSinceLastPromotion,DailyRate,TrainingTimesLastYear,YearsWithCurrManager,YearsAtCompany,MonthlyRate,YearsInCurrentRole,PercentSalaryHike,NumCompaniesWorked,...,JobSatisfaction,MaritalStatus_1,MaritalStatus_2,MaritalStatus_3,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,WorkLifeBalance,OverTime_1,OverTime_2
0,0.403429,0.187672,-1.438186,0.039443,0.735331,0.965528,1.564801,1.163044,-1.127934,-1.063611,...,3,1,0,0,3,4,0,4,1,0
1,1.596858,-0.109580,-1.058537,0.039443,-0.609926,-0.874788,-1.214257,-0.691540,2.507864,-0.249957,...,4,1,0,0,4,2,1,3,1,0
2,-1.320414,-0.704084,1.738884,0.755493,-1.148028,-1.042090,1.150524,-0.956480,-1.127934,-0.656784,...,4,1,0,0,3,4,0,2,0,1
3,-0.392191,0.187672,0.539989,-0.676606,0.466280,-0.038281,-0.360903,0.633163,-0.608534,-1.063611,...,4,0,1,0,3,3,0,2,1,0
4,1.464255,-0.109580,-1.535597,0.039443,-0.340874,0.296322,-0.358858,0.898103,0.430265,1.377351,...,2,1,0,0,3,2,1,3,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231,0.668635,0.187672,0.372644,2.187592,1.273433,0.630925,-0.303258,0.898103,1.469065,0.563697,...,3,0,1,0,4,3,0,3,1,0
232,-0.126984,1.376680,0.602432,-0.676606,0.735331,0.129021,0.956877,0.633163,0.949665,2.597832,...,2,0,1,0,3,3,0,2,0,1
233,-0.657398,-0.406832,-1.335781,0.039443,-0.340874,-0.205582,1.128584,0.103282,-0.608534,-0.656784,...,4,1,0,0,3,4,1,3,0,1
234,0.933842,-0.704084,1.763861,0.755493,-1.148028,-1.042090,1.044093,-1.221421,0.949665,0.156870,...,3,1,0,0,3,3,1,3,1,0


In [None]:
def targetEncoder(cat_features, X_train,y_train):
    target_encoder = TargetEncoder(cols=cat_features, min_samples_leaf=100, smoothing=1000.0)
    target_encoder.fit(X_train, y_train)
    df_encoded = target_encoder.transform(X_train)
    return df_encoded

In [None]:
targetEncoder(cat_features,X_train,y_train)

### Feature Selection

In [26]:
# L1 based
logreg = LogisticRegression( penalty = 'l1', C = 1.0, solver = 'liblinear' )
logreg.fit(X_train, y_train)
print(classification_report( y_test, 
                            logreg.predict(X_test)))
l1_selection_df = pd.DataFrame( {"features": list(X_train.columns),
                                 "coef": np.round(logreg.coef_[0], 2)} )
l1_selection_df[l1_selection_df.coef == 0.0]


              precision    recall  f1-score   support

           0       0.86      0.61      0.71       202
           1       0.15      0.41      0.22        34

    accuracy                           0.58       236
   macro avg       0.51      0.51      0.47       236
weighted avg       0.76      0.58      0.64       236



Unnamed: 0,features,coef
0,TotalWorkingYears,0.0
6,MonthlyRate,-0.0
18,Department_2,0.0
19,Department_3,0.0
22,EducationField_1,0.0
29,Gender_2,0.0
43,MaritalStatus_2,0.0
50,OverTime_2,0.0


In [27]:
#### Sequential Feature Selection

from sklearn.feature_selection import SequentialFeatureSelector

tree = DecisionTreeClassifier( max_depth = 10 )

sfs = SequentialFeatureSelector(tree, n_features_to_select=20)

sfs.fit(X_train, y_train)

sfs_features = [feature for feature, selected in zip(list(X_train.columns), sfs.support_) if selected == True]

sfs_features

['TotalWorkingYears',
 'DailyRate',
 'TrainingTimesLastYear',
 'YearsWithCurrManager',
 'YearsAtCompany',
 'MonthlyRate',
 'PercentSalaryHike',
 'BusinessTravel_1',
 'BusinessTravel_3',
 'Department_3',
 'Education',
 'EducationField_6',
 'JobRole_1',
 'JobRole_5',
 'JobRole_7',
 'JobRole_8',
 'JobRole_9',
 'MaritalStatus_2',
 'StockOptionLevel',
 'OverTime_2']

In [28]:
#### Recursive Feature Elimination (RFE)

from sklearn.feature_selection import RFE

rfe_selector = RFE(tree, 
                   n_features_to_select=20, 
                   step=1, 
                   verbose=1)
rfe_selector.fit(X_train, y_train)

features_rfe = pd.DataFrame({"features": list(X_train.columns),
                             "rank": rfe_selector.ranking_})
pd.set_option('display.max_rows', 1000)
features_rfe.sort_values("rank", ascending=True)

Fitting estimator with 51 features.
Fitting estimator with 50 features.
Fitting estimator with 49 features.
Fitting estimator with 48 features.
Fitting estimator with 47 features.
Fitting estimator with 46 features.
Fitting estimator with 45 features.
Fitting estimator with 44 features.
Fitting estimator with 43 features.
Fitting estimator with 42 features.
Fitting estimator with 41 features.
Fitting estimator with 40 features.
Fitting estimator with 39 features.
Fitting estimator with 38 features.
Fitting estimator with 37 features.
Fitting estimator with 36 features.
Fitting estimator with 35 features.
Fitting estimator with 34 features.
Fitting estimator with 33 features.
Fitting estimator with 32 features.
Fitting estimator with 31 features.
Fitting estimator with 30 features.
Fitting estimator with 29 features.
Fitting estimator with 28 features.
Fitting estimator with 27 features.
Fitting estimator with 26 features.
Fitting estimator with 25 features.
Fitting estimator with 24 fe

Unnamed: 0,features,rank
0,TotalWorkingYears,1
28,Gender_1,1
31,JobLevel,1
34,JobRole_3,1
41,JobSatisfaction,1
13,Age,1
12,HourlyRate,1
11,DistanceFromHome,1
20,Education,1
10,MonthlyIncome,1


In [29]:
#### Embedded Methods

rf_reg = RandomForestClassifier(n_estimators=100, max_depth = 5, max_features = 0.4)
rf_reg.fit(X_train, y_train)

features_rf_imp = pd.DataFrame({"features": list(X_train.columns),
                                "importance": rf_reg.feature_importances_})
features_rf_imp = features_rf_imp.sort_values("importance", ascending=False).reset_index()
features_rf_imp

features_rf_imp['cumsum'] = features_rf_imp.importance.cumsum()
features_rf_imp

Unnamed: 0,index,features,importance,cumsum
0,47,StockOptionLevel,0.222749,0.222749
1,50,OverTime_2,0.117897,0.340646
2,49,OverTime_1,0.085493,0.426139
3,10,MonthlyIncome,0.074432,0.500571
4,6,MonthlyRate,0.054304,0.554875
5,31,JobLevel,0.047461,0.602336
6,34,JobRole_3,0.040762,0.643098
7,41,JobSatisfaction,0.038965,0.682063
8,5,YearsAtCompany,0.028621,0.710685
9,0,TotalWorkingYears,0.026746,0.73743


### Experiment tracking

In [None]:
os.environ["WANDB_API_KEY"] = "f18186cbea88a0af7b8845f2b0f9033603152679"

In [None]:
def run_model_experiment(model, X_train, y_train, X_test, y_test,name, config = None,tags = None):
    
    model.fit(X_train, y_train)
    wandb.init(project='employee_attrition', config=config, tags = tags)
    wandb.run.name = name
    
    roc_auc = roc_auc_score(y_test, model.predict(X_test))
    wandb.log( {"roc_auc" : roc_auc} )
    
    wandb.sklearn.plot_regressor(model, 
                               X_train, 
                               X_test, 
                               y_train, 
                               y_test,  
                               model_name=name)
    
    wandb.Artifact(name, 
                 type = 'model',
                 description = config)
    wandb.save()
    wandb.finish()
    
    return model

#### Baseline model: Logistic Regression with OHE

In [None]:
logistic_model= LogisticRegression(random_state=42)

In [None]:
logistic_model=run_model_experiment(logistic_model,
                                   X_train,
                                   y_train,
                                   X_test,
                                   y_test,
                                   name = 'LogisticRegressionWithOHE',
                                   tags = ['Logistic Regression', 'baseline', 'OheEncoder'])

#### Random Forest with One Hot Encoding

In [None]:
params = { "n_estimators": 300,
           "max_depth": 10,
           "max_features": .2,
           "max_samples": 0.4 }

In [None]:
rf_model = RandomForestClassifier(**params)
rf_model

In [None]:
rf_model = run_model_experiment(rf_model,
                                X_train,
                                y_train,
                                X_test,
                                y_test,
                                config = params,
                                name = 'RFwithOHE',
                                tags = ['RF', 'OheEncoding'])

#### Gradient Boosted Trees with OHE

In [None]:
params = { "n_estimators": 300,
           "max_depth": 6 }

In [None]:
gbm_model = GradientBoostingClassifier(**params)  

In [None]:
gbm_model = run_model_experiment(gbm_model,
                                X_train,
                                 y_train,
                                 X_test,
                                 y_test,
                                 config = params,
                                 name = 'GBMwithOHEEncoding',
                                 tags = ['GBM', 'OheEncoding'])

#### XGBoost model

In [None]:
params = { "n_estimators": 200,
           "max_depth": 6 }

In [None]:
xgb_classifier = XGBClassifier(**params)

In [None]:
xgb_model = run_model_experiment(xgb_classifier,
                                X_train,
                                y_train,
                                X_test,
                                y_test,
                                config = params,
                                name = 'XGBwithOHE',
                                tags = ['XGB', 'OheEncoding'])

### Grid Search and Best Model Tracking

#### Gradient Boosted Trees with OHE

In [None]:
params = { "n_estimators": [100, 200, 300],
           "max_depth": [4, 6] }

gbm_cv = GridSearchCV(GradientBoostingClassifier(),
                      param_grid = params,
                      cv = 5,
                      scoring = 'roc_auc')  

gbm_cv.fit(X_train, y_train)         

In [None]:
gbm_cv.best_params_

#### Random Forest with One Hot Encoding

In [None]:
params = { "n_estimators": [100, 200, 300],
           "max_depth": [4, 6,10],
          "max_features": [.2,.4],
          "max_samples": [.2,.4]}

rf_cv = GridSearchCV(RandomForestClassifier(),
                      param_grid = params,
                      cv = 5,
                      scoring = 'roc_auc')  

rf_cv.fit(X_train, y_train)  

In [None]:
rf_cv.best_params_