In [1]:
import pandas as pd
import numpy as np

In [3]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

In [4]:
# Load the raw churn dataset and show its (rows, columns) shape.
csv_path = 'data/churn modelling.csv'
df = pd.read_csv(csv_path)
df.shape

(10000, 14)

In [5]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [6]:
# Count missing values per column.
df.isna().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [7]:
# Remove the row-number, customer-id and surname columns before modelling.
# Reassigning (instead of inplace=True) with errors='ignore' keeps this cell
# idempotent: re-running it after the columns are already gone no longer
# raises a KeyError.
df = df.drop(columns=['RowNumber', 'CustomerId', 'Surname'], errors='ignore')

In [8]:
# Categorical features: every object-dtype column plus 'HasCrCard' and
# 'IsActiveMember', which are not object dtype and so must be added by hand.
object_cols = df.select_dtypes('O').columns
cat_cols = np.append(object_cols, ['HasCrCard', 'IsActiveMember'])
cat_cols

array(['Geography', 'Gender', 'HasCrCard', 'IsActiveMember'], dtype=object)

In [9]:
# List the non-object columns; the numeric feature list is chosen from these.
df.select_dtypes(exclude='O').columns

Index(['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

In [10]:
# Numeric features to be standardised: the non-object columns without
# 'HasCrCard', 'IsActiveMember' and the 'Exited' target.
num_cols = [
    'CreditScore',
    'Age',
    'Tenure',
    'Balance',
    'NumOfProducts',
    'EstimatedSalary',
]

In [11]:
# Separate the 'Exited' target from the feature matrix.
y = df['Exited']
X = df.drop(columns=['Exited'])

In [12]:
# Preview the feature matrix.
X.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,France,Female,42,2,0.0,1,1,1,101348.88
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58
2,502,France,Female,42,8,159660.8,3,1,0,113931.57
3,699,France,Female,39,1,0.0,2,0,0,93826.63
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1


In [13]:
# Preview the target series.
y.head()

0    1
1    0
2    1
3    0
4    0
Name: Exited, dtype: int64

In [14]:
# Class balance of the target, expressed as fractions of all rows.
y.value_counts(normalize=True)

0    0.7963
1    0.2037
Name: Exited, dtype: float64

In [15]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

## 1. Without Pipeline

In [16]:
# One-hot encode the two string columns, fitting on the training split only.
# handle_unknown='ignore' mirrors the encoder used inside the ColumnTransformer
# in section 2, so a category unseen during fit is encoded as all zeros at
# transform time instead of raising.
# NOTE: scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the
# old name; switch the keyword when running on a newer release.
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
X_train_cat = ohe.fit_transform(X_train[['Geography', 'Gender']])
X_train_cat[:5]

array([[1., 0., 0., 1., 0.],
       [0., 1., 0., 0., 1.],
       [1., 0., 0., 1., 0.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 0., 1.]])

In [17]:
# Reminder of the numeric columns that are scaled next.
num_cols

['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']

In [18]:
# Learn the scaling statistics on the training split only, then apply them
# to that same split.
train_numeric = X_train[num_cols]
ss = StandardScaler().fit(train_numeric)
X_train_num = ss.transform(train_numeric)
X_train_num[:5]

array([[-1.24021723,  0.77986083,  0.35390313, -1.23451386, -0.90298067,
         1.64099027],
       [ 0.75974873, -0.27382717,  0.35390313,  0.2854211 ,  0.81371262,
        -1.55587522],
       [-1.72725557, -0.9443559 , -0.3390904 ,  0.85569573, -0.90298067,
         1.1038111 ],
       [ 0.04473499, -0.17803735,  0.35390313,  0.5180063 ,  0.81371262,
        -1.70935729],
       [-1.92414341, -0.56119662,  0.00740637, -1.23451386,  0.81371262,
        -0.37557412]])

In [19]:
# Assemble the training matrix column-wise: one-hot block, scaled numeric
# block, then 'HasCrCard' and 'IsActiveMember' passed through untouched.
# The resulting frame gets default integer row and column labels.
train_parts = [
    X_train_cat,
    X_train_num,
    X_train[['HasCrCard', 'IsActiveMember']],
]
X_train_out = pd.DataFrame(np.hstack(train_parts))

In [20]:
# Inspect the first two rows of the assembled training matrix.
X_train_out.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,1.0,0.0,0.0,1.0,0.0,-1.240217,0.779861,0.353903,-1.234514,-0.902981,1.64099,0.0,0.0
1,0.0,1.0,0.0,0.0,1.0,0.759749,-0.273827,0.353903,0.285421,0.813713,-1.555875,1.0,1.0


In [21]:
# Encode the test split with the encoder already fitted on the training
# split (transform only, no re-fit).
X_test_cat = ohe.transform(X_test[['Geography', 'Gender']])

In [22]:
# Scale the test split with the scaler already fitted on the training
# split (transform only, no re-fit).
X_test_num = ss.transform(X_test[num_cols])

In [23]:
# Assemble the test matrix in the same column order as the training matrix:
# one-hot block, scaled numeric block, then the two pass-through columns.
test_parts = [
    X_test_cat,
    X_test_num,
    X_test[['HasCrCard', 'IsActiveMember']],
]
X_test_out = pd.DataFrame(np.hstack(test_parts))

In [24]:
# Inspect the first two rows of the assembled test matrix.
X_test_out.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,1.0,0.0,-1.975956,0.109332,0.353903,0.685304,0.813713,-0.152547,1.0,0.0
1,0.0,1.0,0.0,1.0,0.0,0.645761,-1.423305,0.7004,1.137993,-0.902981,0.368185,1.0,1.0


In [25]:
# Base estimator for the grid search; the seed is fixed so runs are repeatable.
rfc = RandomForestClassifier(random_state=0)

In [26]:
# Hyper-parameter grid for the bare random forest (2 x 2 combinations).
params = dict(
    n_estimators=[10, 20],
    max_depth=[2, 3],
)

In [28]:
# 5-fold grid search over `params` using all available cores. No `scoring`
# is given, so the estimator's own `score` method (accuracy for classifiers)
# ranks the candidates.
gs = GridSearchCV(estimator=rfc, param_grid=params, cv=5, n_jobs=-1)

In [29]:
# Run the search on the manually pre-processed training matrix.
gs.fit(X_train_out, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=0), n_jobs=-1,
             param_grid={'max_depth': [2, 3], 'n_estimators': [10, 20]})

In [31]:
# Tabulate per-candidate timings and per-fold scores from the search.
pd.DataFrame(gs.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.167304,0.00595,0.014594,0.001357,2,10,"{'max_depth': 2, 'n_estimators': 10}",0.810625,0.80625,0.80875,0.808125,0.8075,0.80825,0.001447,4
1,0.268643,0.029059,0.028185,0.008513,2,20,"{'max_depth': 2, 'n_estimators': 20}",0.81,0.809375,0.809375,0.808125,0.806875,0.80875,0.001118,3
2,0.161312,0.006885,0.018988,0.007373,3,10,"{'max_depth': 3, 'n_estimators': 10}",0.831875,0.82625,0.828125,0.82625,0.81625,0.82575,0.005175,1
3,0.280837,0.036895,0.022987,0.005096,3,20,"{'max_depth': 3, 'n_estimators': 20}",0.826875,0.825625,0.824375,0.820625,0.813125,0.822125,0.004962,2


In [32]:
# Parameter combination with the best mean cross-validated score.
gs.best_params_

{'max_depth': 3, 'n_estimators': 10}

In [33]:
# Build a fresh forest configured with the winning grid-search parameters.
best_params = gs.best_params_
rf = RandomForestClassifier(random_state=0, **best_params)

In [34]:
# Train the final forest on the full pre-processed training matrix.
rf.fit(X_train_out, y_train)

RandomForestClassifier(max_depth=3, n_estimators=10, random_state=0)

In [35]:
# Accuracy on the held-out test split. For context, always predicting the
# majority class would already score about 0.80 given the class balance
# printed by y.value_counts(normalize=True).
rf.score(X_test_out, y_test)

0.831

## 2. With Pipeline

In [36]:
# Same preprocessing as section 1, expressed as a single transformer:
# one-hot encode the string columns, standardise the numeric columns, and
# pass every remaining column through unchanged.
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
scaler = StandardScaler()
ct = ColumnTransformer(
    transformers=[
        ('ohe', encoder, ['Geography', 'Gender']),
        ('ss', scaler, num_cols),
    ],
    remainder='passthrough',
)

In [38]:
# Fit the transformer on the raw training features so its fitted state can
# be inspected in the next cell.
ct.fit(X_train)

ColumnTransformer(remainder='passthrough',
                  transformers=[('ohe',
                                 OneHotEncoder(handle_unknown='ignore',
                                               sparse=False),
                                 ['Geography', 'Gender']),
                                ('ss', StandardScaler(),
                                 ['CreditScore', 'Age', 'Tenure', 'Balance',
                                  'NumOfProducts', 'EstimatedSalary'])])

In [39]:
# Show the fitted transformers, including which columns fell into the
# pass-through remainder.
ct.transformers_

[('ohe',
  OneHotEncoder(handle_unknown='ignore', sparse=False),
  ['Geography', 'Gender']),
 ('ss',
  StandardScaler(),
  ['CreditScore',
   'Age',
   'Tenure',
   'Balance',
   'NumOfProducts',
   'EstimatedSalary']),
 ('remainder', 'passthrough', [7, 8])]

In [40]:
# Chain preprocessing and the classifier so both are fitted together.
forest = RandomForestClassifier(random_state=0, n_jobs=-1)
pipe = Pipeline(steps=[('ct', ct), ('model', forest)])

In [45]:
# Grid for the pipeline. Keys use the `<step>__<param>` naming, so
# `ct__ss__with_mean` reaches the scaler nested inside the 'ct' step.
params_new = dict(
    model__n_estimators=[10, 20],
    model__max_depth=[2, 3],
    ct__ss__with_mean=[True, False],
)

In [46]:
# Grid search over the whole pipeline, so preprocessing is re-fitted on the
# training part of every CV fold. cv=5 is stated explicitly to match the
# search in section 1 instead of relying on the library default, which has
# differed between scikit-learn releases.
gs_new = GridSearchCV(estimator=pipe, param_grid=params_new, cv=5, n_jobs=-1)

In [47]:
# The pipeline consumes the raw, un-encoded training features shown here.
X_train.head(1)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
493,531,France,Female,47,6,0.0,1,0,0,194998.34


In [48]:
# Run the search directly on the raw training features.
gs_new.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('ct',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('ohe',
                                                                         OneHotEncoder(handle_unknown='ignore',
                                                                                       sparse=False),
                                                                         ['Geography',
                                                                          'Gender']),
                                                                        ('ss',
                                                                         StandardScaler(),
                                                                         ['CreditScore',
                                                                          'Age',
                                                                

In [49]:
# Best combination of pipeline parameters found by the search.
gs_new.best_params_

{'ct__ss__with_mean': True, 'model__max_depth': 3, 'model__n_estimators': 10}

In [50]:
# Mean cross-validated score of the best parameter combination.
gs_new.best_score_

0.82575