## Grid search

This notebook has the purpose of finding the best estimator for the underlying data. Therefore gridsearch is used to find the best hyperparameters for the random forests model as well as for the logistic regression. 

### 1. Import 

In [1]:
# import libraries
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from ipynb.fs.full.prep_datasets import pipeline_preparation
from ipynb.fs.full.feature_eng import pipeline_feature_eng



In [2]:
# import classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.metrics import mean_squared_error
from math import sqrt

In [3]:
# import scores and gridsearch
import random
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV

### 2. Load the data, prepare the data & feature engineering

In [None]:
df = pd.read_csv("final_df_dec.csv")
df, df_2016 = pipeline_preparation(df)
df, X_train, X_test, y_train, y_test = pipeline_feature_eng(df)

### 3. Grid search random forest

In this section we wanna tune / searching for the best model. Lets see if we could outperform the benchmark mentioned before. This grid represents the best outcome, the intermediat trials are not saved.

In [None]:
def grid_search_ranfor(df, X_train, y_train):
    
    cols = df.columns[0:-1]
    estimator = RandomForestClassifier()
    grid_search_parameter_space = {"bootstrap" : [True],
                                   "class_weight": [None],
                                   "criterion":["gini"],
                                   "max_depth" : [3, 4], 
                                   "max_features" : [10, 12, 14],
                                   "max_leaf_nodes":[None],
                                   "min_impurity_decrease":[0],
                                   "min_samples_leaf" : [9, 12, 15],
                                   "min_samples_split" : [12, 15],
                                   "min_weight_fraction_leaf":[0],
                                   "n_estimators":[10],
                                   "n_jobs":[-1],
                                   "oob_score":[False],
                                   "random_state":[None],
                                   "verbose":[0],
                                   "warm_start":[False]}
    grid_search = GridSearchCV(
                estimator,
                grid_search_parameter_space,
                cv=5,
                scoring="roc_auc", 
                return_train_score=True)
    grid_search.fit(X_train, y_train)
    best_estim = grid_search.best_estimator_
    print("best estimator:", best_estim)
    best_params = grid_search.best_params_
    print("best params:", best_params)
    best_score = grid_search.best_score_
    print("best score:", best_score)

    return best_estim, best_params, best_score

In [7]:
grid_search_ranfor(df2, X_train2, y_train2)

best estimator: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=4, max_features=14, max_leaf_nodes=None,
                       min_impurity_decrease=0, min_impurity_split=None,
                       min_samples_leaf=9, min_samples_split=12,
                       min_weight_fraction_leaf=0, n_estimators=10, n_jobs=-1,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)
best params: {'bootstrap': True, 'class_weight': None, 'criterion': 'gini', 'max_depth': 4, 'max_features': 14, 'max_leaf_nodes': None, 'min_impurity_decrease': 0, 'min_samples_leaf': 9, 'min_samples_split': 12, 'min_weight_fraction_leaf': 0, 'n_estimators': 10, 'n_jobs': -1, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
best score: 0.9279797728857034


(RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                        max_depth=4, max_features=14, max_leaf_nodes=None,
                        min_impurity_decrease=0, min_impurity_split=None,
                        min_samples_leaf=9, min_samples_split=12,
                        min_weight_fraction_leaf=0, n_estimators=10, n_jobs=-1,
                        oob_score=False, random_state=None, verbose=0,
                        warm_start=False),
 {'bootstrap': True,
  'class_weight': None,
  'criterion': 'gini',
  'max_depth': 4,
  'max_features': 14,
  'max_leaf_nodes': None,
  'min_impurity_decrease': 0,
  'min_samples_leaf': 9,
  'min_samples_split': 12,
  'min_weight_fraction_leaf': 0,
  'n_estimators': 10,
  'n_jobs': -1,
  'oob_score': False,
  'random_state': None,
  'verbose': 0,
  'warm_start': False},
 0.9279797728857034)

### 4. Grid search logistic regression

In [None]:
def grid_search_logreg(df, X_train, y_train):
    
    cols = df.columns[0:-1]
    estimator = LogisticRegression()
    grid_search_parameter_space = {"penalty": ['l1'],
                                   "C": [2, 5, 7],
                                   "class_weight":["balanced", None],
                                   "solver":["liblinear","saga"],
                                   "multi_class":["auto"],
                                   "max_iter":[20, 40, 60, 80],
                                   "n_jobs":[-1],
                                   "verbose":[0,1]}
    grid_search = GridSearchCV(estimator, 
                               grid_search_parameter_space, 
                               cv=5, 
                               scoring = "roc_auc", 
                               return_train_score = True, 
                               refit = True)
    grid_search.fit(X_train, y_train)
    best_estim = grid_search.best_estimator_
    print("best estimator:", best_estim)
    best_params = grid_search.best_params_
    print("best params:", best_params)
    best_score = grid_search.best_score_
    print("best score:", best_score)
    
    return best_estim, best_params, best_score

In [None]:
grid_search_logreg(df, X_train, y_train)