# **Exercise 01: *gridsearch***

## Configuration:

Import necessary *Python* packages:

In [1]:
import sys

Add path to own modules:

In [2]:
sys.path.append("../../src", )

Import necessary entities:

In [3]:
from typing import Any
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from pandas import (
    Series,
    DataFrame,
    read_csv,
)
from sklearn.model_selection import (
    GridSearchCV,
    train_test_split,
)

Import own necessary entities:

In [4]:
from machine_learning_models_utilities import (
    get_classification_model_grid_search_results,
)

## Preprocessing:

Create a dictionary for `read_csv()` method callings:

In [5]:
read_csv_params: dict[str, str] = {
    "file": "day_of_week_not_scaled.csv",
    "additional_file": "day_of_week.csv",
    "file_path": "../../data/datasets/",
}

Read the file `day_of_week_no_scaled.csv` to a *Pandas* dataframe:

In [6]:
df: DataFrame = read_csv(
    read_csv_params["file_path"] + read_csv_params["file"],
)

Add a column `day_of_week` to the `df` *Pandas* dataframe:

In [7]:
df["day_of_week"] = read_csv(
    read_csv_params["file_path"] + read_csv_params["additional_file"],
)["day_of_week"]

Check `df` *Pandas* dataframe:

In [8]:
df.head()

Unnamed: 0,num_trials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1,day_of_week
0,1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
1,2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
2,3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
3,4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
4,5,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4


Prepare features and target variables:

In [9]:
X: DataFrame = df.drop(columns=["day_of_week", ], )
y: Series = df["day_of_week"]

Check `X` and `y` variables:

In [10]:
X.head()

Unnamed: 0,num_trials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,5,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [11]:
y.head()

0    4
1    4
2    4
3    4
4    4
Name: day_of_week, dtype: int64

Use `train_test_split()` function:

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=21,
)

Check `X_train`, `X_test`, `y_train`, `y_test` variables:

In [13]:
X_train.head()

Unnamed: 0,num_trials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
860,3,13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
385,6,11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
422,1,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
326,9,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
714,47,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [14]:
X_test.head()

Unnamed: 0,num_trials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
1087,67,17,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
16,1,13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
563,14,10,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1381,20,15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1199,9,13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [15]:
y_train.head()

860    6
385    4
422    5
326    6
714    1
Name: day_of_week, dtype: int64

In [16]:
y_test.head()

1087    1
16      5
563     6
1381    3
1199    2
Name: day_of_week, dtype: int64

## *SVM gridsearch*:

Create a model of *SVC*:

In [17]:
svc_model: SVC = SVC(
    random_state=21,
    probability=True,
)

Create a parameters grid `svc_model_params_grid` for the *SVC* model:

In [18]:
svc_model_params_grid: dict[str, list[Any]] = {
    "gamma": ["auto", "scale", ],
    "class_weight": [None, "balanced", ],
    "kernel": [
        "rbf",
        "linear",
        "sigmoid",
    ],
    "C": [
        0.01,
        0.1,
        1,
        1.5,
        5,
        10,
    ],
}

Create a *gridsearch* model for the *SVC* model:

In [19]:
svc_model_grid_search: GridSearchCV = GridSearchCV(
    cv=5,
    n_jobs=-1,
    scoring="accuracy",
    estimator=svc_model,
    param_grid=svc_model_params_grid,
)

Train the *gridsearch* model of the *SVC* model:

In [20]:
svc_model_grid_search.fit(X, y, );

Print the best *SVC* model parameters:

In [21]:
print(
    f"The best SVC model parameters are {svc_model_grid_search.best_params_}.",
)


The best SVC model parameters are {'C': 10, 'class_weight': 'balanced', 'gamma': 'auto', 'kernel': 'linear'}.


Print the best *SVC* model *accuracy* metric score:

In [22]:
print(
    f"The best SVC model accuracy metric score is {
        svc_model_grid_search.best_score_:.3f
    }.",
)

The best SVC model accuracy metric score is 0.450.


Create a *Pandas* dataframe from the results of the *gridsearch* model of the *SVC* model:

In [23]:
DataFrame(
    svc_model_grid_search.cv_results_,
).sort_values(
    ascending=True,
    by=["rank_test_score", ],
).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_class_weight,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
70,54.718891,13.440513,0.011882,0.00179,10.0,balanced,scale,linear,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.298817,0.534125,0.543027,0.611276,0.264095,0.450268,0.140824,1
67,66.994217,17.743572,0.013213,0.001893,10.0,balanced,auto,linear,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.298817,0.534125,0.543027,0.611276,0.264095,0.450268,0.140824,1
55,55.215419,15.025204,0.025389,0.008334,5.0,balanced,auto,linear,"{'C': 5, 'class_weight': 'balanced', 'gamma': ...",0.292899,0.519288,0.548961,0.501484,0.261128,0.424752,0.121992,3
58,54.588265,13.80985,0.021723,0.003371,5.0,balanced,scale,linear,"{'C': 5, 'class_weight': 'balanced', 'gamma': ...",0.292899,0.519288,0.548961,0.501484,0.261128,0.424752,0.121992,3
60,1.258592,0.149498,0.069074,0.007372,10.0,,auto,rbf,"{'C': 10, 'class_weight': None, 'gamma': 'auto...",0.292899,0.489614,0.433234,0.489614,0.332344,0.407541,0.081153,5


## *Decision tree gridsearch*:

Create a model of *decision tree*:

In [24]:
tree_model: DecisionTreeClassifier = DecisionTreeClassifier(random_state=21, )

Create a parameters grid `tree_model_params_grid` for the *decision tree* model:

In [25]:
tree_model_params_grid: dict[str, Any] = {
    "max_depth": range(1, 50, ),
    "criterion": ["gini", "entropy", ],
    "class_weight": [None, "balanced", ],
}

Create a *gridsearch* model for the *decision tree* model:

In [26]:
tree_model_grid_search: GridSearchCV = GridSearchCV(
    cv=10,
    n_jobs=-1,
    scoring="accuracy",
    estimator=tree_model,
    param_grid=tree_model_params_grid,
)

Train the *gridsearch* model of the *decision tree* model:

In [27]:
tree_model_grid_search.fit(X, y, );

Print the best *decision tree* model parameters:

In [28]:
print(
    f"The best decision tree model parameters are {
        tree_model_grid_search.best_params_
    }.",
)

The best decision tree model parameters are {'class_weight': None, 'criterion': 'gini', 'max_depth': 22}.


Print the best *decision tree* model *accuracy* metric score:

In [29]:
print(
    f"The best decision tree model accuracy metric score is {
        tree_model_grid_search.best_score_:.3f
    }.",
)

The best decision tree model accuracy metric score is 0.620.


Create a *Pandas* dataframe from the results of the *gridsearch* model of the *decision tree* model:

In [30]:
DataFrame(
    tree_model_grid_search.cv_results_,
).sort_values(
    ascending=True,
    by=["rank_test_score", ],
).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_criterion,param_max_depth,params,split0_test_score,split1_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
21,0.011438,0.002175,0.003533,0.000736,,gini,22,"{'class_weight': None, 'criterion': 'gini', 'm...",0.301775,0.47929,...,0.686391,0.757396,0.816568,0.827381,0.642857,0.595238,0.428571,0.620411,0.163139,1
115,0.012514,0.001703,0.003512,0.000898,balanced,gini,18,"{'class_weight': 'balanced', 'criterion': 'gin...",0.319527,0.56213,...,0.710059,0.757396,0.763314,0.767857,0.672619,0.589286,0.5,0.618657,0.136688,2
116,0.017102,0.008418,0.004177,0.002376,balanced,gini,19,"{'class_weight': 'balanced', 'criterion': 'gin...",0.319527,0.56213,...,0.710059,0.757396,0.769231,0.797619,0.672619,0.529762,0.5,0.616864,0.143008,3
117,0.016134,0.008746,0.003112,0.000989,balanced,gini,20,"{'class_weight': 'balanced', 'criterion': 'gin...",0.319527,0.56213,...,0.674556,0.757396,0.792899,0.821429,0.654762,0.529762,0.5,0.614501,0.147327,4
24,0.010819,0.00231,0.003226,0.001009,,gini,25,"{'class_weight': None, 'criterion': 'gini', 'm...",0.301775,0.455621,...,0.692308,0.763314,0.816568,0.857143,0.64881,0.595238,0.428571,0.613923,0.169782,5


## *Random forest tree gridsearch*:

Create a model of *random forest tree*:

In [31]:
tree_forest_model: RandomForestClassifier = RandomForestClassifier(
    random_state=21,
)

Create a parameters grid `tree_forest_model_params_grid` for the *random forest tree* model:

In [32]:
tree_forest_model_params_grid: dict[str, Any] = {
    "max_depth": range(1, 50, ),
    "criterion": ["gini", "entropy", ],
    "class_weight": [None, "balanced", ],
    "n_estimators": [
        5,
        10,
        50,
        100,
    ],
}

Create a *gridsearch* model for the *random forest tree* model:

In [33]:
tree_forest_model_grid_search: GridSearchCV = GridSearchCV(
    cv=10,
    n_jobs=-1,
    scoring="accuracy",
    estimator=tree_forest_model,
    param_grid=tree_forest_model_params_grid,
)

Train the *gridsearch* model of the *random forest tree* model:

In [34]:
tree_forest_model_grid_search.fit(X, y, );

Print the best *random forest tree* model parameters:

In [35]:
print(
    f"The best random forest tree model parameters are {
        tree_forest_model_grid_search.best_params_
    }.",
)

The best random forest tree model parameters are {'class_weight': None, 'criterion': 'gini', 'max_depth': 20, 'n_estimators': 100}.


Print the best *random forest tree* model *accuracy* metric score:

In [36]:
print(
    f"The best random forest tree model accuracy metric score is {
        tree_forest_model_grid_search.best_score_:.3f
    }.",
)

The best random forest tree model accuracy metric score is 0.677.


Create a *Pandas* dataframe from the results of the *gridsearch* model of the *random forest tree* model:

In [37]:
DataFrame(
    tree_forest_model_grid_search.cv_results_,
).sort_values(
    ascending=True,
    by=["rank_test_score", ],
).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
79,0.359511,0.009804,0.01266,0.000904,,gini,20,100,"{'class_weight': None, 'criterion': 'gini', 'm...",0.402367,...,0.757396,0.739645,0.840237,0.910714,0.815476,0.589286,0.470238,0.677388,0.159695,1
78,0.189072,0.008533,0.009546,0.001748,,gini,20,50,"{'class_weight': None, 'criterion': 'gini', 'm...",0.39645,...,0.775148,0.786982,0.828402,0.910714,0.809524,0.571429,0.464286,0.676779,0.166463,2
715,0.592706,0.091971,0.019443,0.003994,balanced,entropy,32,100,"{'class_weight': 'balanced', 'criterion': 'ent...",0.372781,...,0.775148,0.721893,0.840237,0.886905,0.863095,0.613095,0.47619,0.675053,0.167423,3
638,0.220526,0.020637,0.010145,0.001455,balanced,entropy,13,50,"{'class_weight': 'balanced', 'criterion': 'ent...",0.366864,...,0.804734,0.786982,0.828402,0.892857,0.809524,0.607143,0.458333,0.675011,0.171714,4
698,0.315906,0.076414,0.011679,0.003825,balanced,entropy,28,50,"{'class_weight': 'balanced', 'criterion': 'ent...",0.378698,...,0.810651,0.739645,0.828402,0.880952,0.839286,0.595238,0.47619,0.673841,0.165237,5


## Monitoring progress:

In [38]:
get_classification_model_grid_search_results(
    X=X,
    y=y,
    classification_model=RandomForestClassifier,
    classification_model_params_grid=tree_forest_model_params_grid,
)

  0%|          | 0/784 [00:00<?, ?it/s]

Unnamed: 0,class_weight,criterion,max_depth,n_estimators,std_accuracy,mean_accuracy
251,,entropy,14,100,0.152,0.562
250,,entropy,14,50,0.165,0.551
263,,entropy,17,100,0.156,0.547
491,balanced,gini,25,100,0.165,0.546
526,balanced,gini,34,50,0.158,0.546
...,...,...,...,...,...,...
404,balanced,gini,4,5,0.049,0.243
396,balanced,gini,2,5,0.048,0.224
588,balanced,entropy,1,5,0.046,0.199
392,balanced,gini,1,5,0.047,0.194


## Prediction:

Create the best model:

In [39]:
best_model: RandomForestClassifier = RandomForestClassifier(
    max_depth=14,
    random_state=21,
    n_estimators=100,
    class_weight=None,
    criterion="entropy",
)

Train the best model on the data:

In [40]:
best_model.fit(X_train, y_train, );

Calculate the best model accuracy metric for the data:

In [41]:
print(
    f"The best model accuracy metric is {
        accuracy_score(
            best_model.predict(X_test, ),
            y_test, 
        ):.3f
    }.",
)

The best model accuracy metric is 0.905.
