# Model tuning

## Loading data

> **Important:** Run the `make data` target before evaluating this notebook, so that the processed data is available.

In [1]:
import pandas as pd

inputs = pd.read_csv("../data/processed/inputs.csv")
outputs = pd.read_csv("../data/processed/outputs.csv")

In [2]:
inputs.sample(10)

Unnamed: 0,Age,Debt,YearsEmployed,Income
56,24.33,2.5,4.5,456
517,39.17,1.71,0.125,0
82,28.58,1.665,2.415,0
349,44.17,6.665,7.375,0
33,57.58,2.0,6.5,10
270,36.17,5.5,5.0,687
536,23.33,1.5,1.415,200
241,20.42,1.085,1.5,7
570,46.08,3.0,2.375,4159
529,41.92,0.42,0.21,948


In [3]:
outputs.sample(10)

Unnamed: 0,Approved
53,0
4,0
171,0
519,1
102,0
366,1
23,0
530,1
262,0
395,1


## Training with defaults

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
        inputs, outputs, test_size=0.4, random_state=23
    )

In [6]:
from sklearn.ensemble import RandomForestClassifier

default_model = RandomForestClassifier(verbose=True)

In [7]:
default_model.fit(X_train, y_train)

  default_model.fit(X_train, y_train)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.1s finished


RandomForestClassifier(verbose=True)

In [8]:
MSE = default_model.score(X_test, y_test)*100
print(f"MSE: {MSE}%")

MSE: 74.13793103448276%


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


## Hyper-parameter search

In [9]:
parameters = {
    "n_estimators":[5,10,50,100,250,500,1000],
    "max_depth":[2,4,8,16,32,None],
    'max_leaf_nodes': [2**i for i in range(1, 8)],
    'max_samples': [i/10.0 for i in range(5, 10)]
}


In [10]:
model = RandomForestClassifier(verbose=True)

In [13]:
from sklearn.model_selection import GridSearchCV

cv = GridSearchCV(model, parameters,cv=5,n_jobs=-1,verbose=1)

In [14]:
result = cv.fit(X_train, y_train)

Fitting 5 folds for each of 1470 candidates, totalling 7350 fits


  self.best_estimator_.fit(X, y, **fit_params)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


In [16]:
print(f'Best parameters are: {result.best_params_}')

Best parameters are: {'max_depth': 8, 'max_leaf_nodes': 64, 'max_samples': 0.5, 'n_estimators': 10}


## Using found parameters

In [18]:
best_model = RandomForestClassifier(max_depth=result.best_params_['max_depth'],
                                    max_leaf_nodes=result.best_params_['max_leaf_nodes'],
                                    max_samples=result.best_params_['max_samples'],
                                    n_estimators=result.best_params_['n_estimators'],
                                    verbose=True)

In [19]:
best_model.fit(X_train, y_train)

  best_model.fit(X_train, y_train)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


RandomForestClassifier(max_depth=8, max_leaf_nodes=64, max_samples=0.5,
                       n_estimators=10, verbose=True)

In [20]:
MSE = best_model.score(X_test, y_test)*100
print(f"MSE: {MSE}%")

MSE: 75.0%


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
