# Day 09. Exercise 01
# Gridsearch

## 0. Imports

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from tqdm.notebook import tqdm
import numpy as np
from sklearn.metrics import accuracy_score

## 1. Preprocessing

1. Read the file [`day-of-week-not-scaled.csv`](https://drive.google.com/file/d/1AlGvsJDSzPT_70caausx8bFuupIEZkfh/view?usp=sharing). It is similar to the one from the previous exercise, but this time we did not scale continuous features (we are not going to use logreg anymore).
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [3]:
df = pd.read_csv('../data/day-of-week-not-scaled.csv')
df.head()

Unnamed: 0,numTrials,hour,dayofweek,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,1,5,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2,5,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,3,5,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,4,5,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,5,5,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [4]:
X = df.drop(['dayofweek'], axis=1)
y = df['dayofweek']

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.2, 
                                                    random_state=21, 
                                                    stratify=y)

## 2. SVM gridsearch

1. Using `GridSearchCV` try different parameters of kernel (`linear`, `rbf`, `sigmoid`), C (`0.01`, `0.1`, `1`, `1.5`, `5`, `10`), gamma (`scale`, `auto`), class_weight (`balanced`, `None`) use `random_state=21` and `probability=True` and get the best combination of them in terms of accuracy.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`. Check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [5]:
%%time

param_grid = {
    'kernel': ['linear', 'rbf', 'sigmoid'],
    'C': [0.01, 0.1, 1, 1.5, 5, 10],
    'gamma': ['scale', 'auto'],
    'class_weight': ['balanced', None],
    'probability': [True]
}

svc = SVC(random_state=21)

grid_search = GridSearchCV(svc, param_grid, cv=5, scoring='accuracy', return_train_score=True)

grid_search.fit(X_train, y_train)

print("Best parameters: {}".format(grid_search.best_params_))
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

Best parameters: {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', 'probability': True}
Best cross-validation score: 0.88


In [6]:
grid_search_results_df = pd.DataFrame(grid_search.cv_results_)

sorted_grid_df = grid_search_results_df.sort_values(by='rank_test_score', ascending=True)

sorted_grid_df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_class_weight,param_gamma,param_kernel,param_probability,params,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
70,0.748789,0.032432,0.0251,0.002907,10,,auto,rbf,True,"{'C': 10, 'class_weight': None, 'gamma': 'auto...",...,0.876109,0.018419,1,0.950835,0.948052,0.958256,0.953661,0.955514,0.953264,0.003556
64,0.76197,0.023671,0.026296,0.002316,10,balanced,auto,rbf,True,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",...,0.8635,0.01087,2,0.939703,0.94898,0.943414,0.952734,0.947173,0.946401,0.004498
58,0.680838,0.014766,0.02812,0.004032,5,,auto,rbf,True,"{'C': 5, 'class_weight': None, 'gamma': 'auto'...",...,0.816018,0.008116,3,0.896104,0.904453,0.886827,0.89342,0.907322,0.897625,0.007446
52,0.714784,0.028591,0.028406,0.004212,5,balanced,auto,rbf,True,"{'C': 5, 'class_weight': 'balanced', 'gamma': ...",...,0.808608,0.021007,4,0.890538,0.895176,0.887755,0.894347,0.899907,0.893545,0.004156
63,52.987937,5.077452,0.013457,0.00033,10,balanced,auto,linear,True,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",...,0.721052,0.034438,5,0.762523,0.781076,0.769017,0.759036,0.78962,0.772254,0.011483


## 3. Decision tree

1. Using `GridSearchCV` try different parameters of `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use `random_state=21`.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [7]:
%%time

param_grid = {
    'max_depth': np.arange(1, 50),
    'class_weight': ['balanced', None],
    'criterion': ['entropy', 'gini']
}

dtree_model = DecisionTreeClassifier(random_state=21)

grid_search = GridSearchCV(dtree_model, param_grid, cv=5, scoring='accuracy', return_train_score=True)

grid_search.fit(X_train, y_train)

print("Best parameters: {}".format(grid_search.best_params_))
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

Best parameters: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 22}
Best cross-validation score: 0.87
CPU times: user 12.2 s, sys: 20 ms, total: 12.2 s
Wall time: 12.2 s


In [8]:
grid_search_results_df = pd.DataFrame(grid_search.cv_results_)

sorted_grid_df = grid_search_results_df.sort_values(by='rank_test_score', ascending=True)
sorted_grid_df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_criterion,param_max_depth,params,split0_test_score,split1_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
70,0.007853,0.001792,0.002475,0.000357,balanced,gini,22,"{'class_weight': 'balanced', 'criterion': 'gin...",0.885185,0.862963,...,0.873121,0.023998,1,0.999072,0.990724,1.0,1.0,1.0,0.997959,0.003636
69,0.009776,0.001135,0.003432,0.000553,balanced,gini,21,"{'class_weight': 'balanced', 'criterion': 'gin...",0.888889,0.859259,...,0.873121,0.0263,2,0.999072,0.986085,1.0,0.99722,0.999073,0.99629,0.005182
97,0.006901,0.000424,0.002343,0.000257,balanced,gini,49,"{'class_weight': 'balanced', 'criterion': 'gin...",0.888889,0.866667,...,0.873116,0.023911,3,1.0,1.0,1.0,1.0,1.0,1.0,0.0
95,0.006903,0.00034,0.002389,0.000228,balanced,gini,47,"{'class_weight': 'balanced', 'criterion': 'gin...",0.888889,0.866667,...,0.873116,0.023911,3,1.0,1.0,1.0,1.0,1.0,1.0,0.0
94,0.007224,0.000547,0.002873,0.000854,balanced,gini,46,"{'class_weight': 'balanced', 'criterion': 'gin...",0.888889,0.866667,...,0.873116,0.023911,3,1.0,1.0,1.0,1.0,1.0,1.0,0.0


## 4. Random forest

1. Using `GridSearchCV` try different parameters of `n_estimators` (`5`, `10`, `50`, `100`), `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use random_state=21.
2. Create a dataframe from the results of the gridsearch and sort it ascendengly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [9]:
%%time

param_grid = {
    'n_estimators': [5, 10, 50, 100],
    'max_depth': range(1, 50),
    'class_weight': ['balanced', None],
    'criterion': ['entropy', 'gini']
}

rf_model = RandomForestClassifier(random_state=21)

grid_search = GridSearchCV(rf_model, param_grid, cv=5, scoring='accuracy', return_train_score=True)

grid_search.fit(X_train, y_train)

print("Best parameters: {}".format(grid_search.best_params_))
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

Best parameters: {'class_weight': None, 'criterion': 'gini', 'max_depth': 28, 'n_estimators': 50}
Best cross-validation score: 0.90
CPU times: user 7min 59s, sys: 1 s, total: 8min
Wall time: 8min


In [10]:
grid_search_results_df = pd.DataFrame(grid_search.cv_results_)

sorted_grid_df = grid_search_results_df.sort_values(by='rank_test_score', ascending=True)
sorted_grid_df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
698,0.134274,0.01954,0.009025,0.001949,,gini,28,50,"{'class_weight': None, 'criterion': 'gini', 'm...",0.922222,...,0.90429,0.010961,1,1.0,1.0,0.999072,1.0,0.999073,0.999629,0.000454
711,0.247406,0.010215,0.012833,0.000343,,gini,31,100,"{'class_weight': None, 'criterion': 'gini', 'm...",0.918519,...,0.903547,0.01438,2,1.0,1.0,1.0,1.0,1.0,1.0,0.0
314,0.134753,0.015891,0.009195,0.001639,balanced,gini,30,50,"{'class_weight': 'balanced', 'criterion': 'gin...",0.922222,...,0.902817,0.013554,3,1.0,1.0,0.999072,1.0,0.999073,0.999629,0.000454
330,0.127449,0.002675,0.008304,0.000389,balanced,gini,34,50,"{'class_weight': 'balanced', 'criterion': 'gin...",0.922222,...,0.902809,0.01301,4,1.0,1.0,1.0,1.0,0.999073,0.999815,0.000371
702,0.122353,0.000596,0.008133,0.00028,,gini,29,50,"{'class_weight': None, 'criterion': 'gini', 'm...",0.918519,...,0.902806,0.011698,5,1.0,1.0,0.999072,1.0,0.999073,0.999629,0.000454


## 5. Progress bar

Gridsearch can be a quite long process and you may find yourself wondering when it will end.
1. Create a manual gridsearch for the same parameters values of random forest iterating through the list of the possible values and calculating `cross_val_score` for each combination. Try to increase `n_jobs`. The value `cv` for `cross_val_score` is 5.
2. Track the progress using the library `tqdm.notebook`.
3. Create a dataframe from the results of the gridsearch with the columns corresponding to the names of the parameters and `mean_accuracy` and `std_accuracy`.
4. Sort it descendingly by the `mean_accuracy`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [19]:
n_estimators_values = [5, 10, 50, 100]
max_depth_values = range(1, 50)
class_weight_values = ['balanced', None]
criterion_values = ['entropy', 'gini']

results = []

for n_estimators in n_estimators_values:
    for max_depth in max_depth_values:
        for class_weight in class_weight_values:
            for criterion in criterion_values:
                rf_model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, class_weight=class_weight, criterion=criterion, random_state=21)
                
                scores = cross_val_score(rf_model, X_train, y_train, cv=5, n_jobs=-1)
                
                mean_accuracy = np.mean(scores)
                std_accuracy = np.std(scores)
                
                results.append({
                    'n_estimators': n_estimators,
                    'max_depth': max_depth,
                    'class_weight': class_weight,
                    'criterion': criterion,
                    'mean_accuracy': mean_accuracy,
                    'std_accuracy': std_accuracy
                })

In [12]:
results_df = pd.DataFrame(results)

sorted_results_df = results_df.sort_values(by='mean_accuracy', ascending=False)
sorted_results_df.head()

Unnamed: 0,n_estimators,max_depth,class_weight,criterion,mean_accuracy,std_accuracy
503,50,28,,gini,0.90429,0.010961
711,100,31,,gini,0.903547,0.01438
509,50,30,balanced,gini,0.902817,0.013554
525,50,34,balanced,gini,0.902809,0.01301
783,100,49,,gini,0.902806,0.01046


## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.

In [14]:
model = RandomForestClassifier(**grid_search.best_params_)
model.fit(X_train, y_train)

In [15]:
accuracy_score(y_test, model.predict(X_test))

0.9260355029585798