In [52]:
import os
import sys
import random
from typing import Any
from time import time

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# Put the parent of the current working directory on the import path
# (presumably so the project-local `src` package can be imported -- confirm
# against the repository layout). The membership test keeps re-running this
# cell from appending the same entry twice.
module_path = os.path.abspath('..')
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)

from src.Kfold.kfold import Kfold

In [2]:
# Load the semicolon-separated red-wine quality table.
df = pd.read_csv("../data/winequality-red.csv", sep=";")

# Binary target: quality <= 5 becomes class 0, anything else becomes class 1.
y = df["quality"].le(5).map({True: 0, False: 1})

# Features are every column except the last one, rescaled with MinMaxScaler.
# NOTE(review): the scaler is fit on all rows before any fold split happens,
# so held-out rows influence the scaling -- confirm this is acceptable.
X = MinMaxScaler().fit_transform(
    df.iloc[:, :-1]
)

In [3]:
def get_random_combinations_of_parameters(
        parameters: dict[str, Any],
        number_of_combinations: int
) -> list[dict[str, Any]]:
    """Draw random hyper-parameter combinations from a grid.

    Each combination picks one value per parameter with ``random.choice``.
    Draws are independent, so the same combination can appear more than once
    in the result.

    Args:
        parameters: Maps each parameter name to a non-empty sequence of
            candidate values.
        number_of_combinations: How many combinations to draw.

    Returns:
        A list with ``number_of_combinations`` dicts, each mapping every
        parameter name to one of its candidate values.

    Raises:
        IndexError: If a parameter has an empty sequence of candidates
            (raised by ``random.choice``).
    """
    # Same draw order as a nested loop (combination by combination, parameter
    # by parameter), so a given seed still produces the same combinations.
    return [
        {
            parameter: random.choice(values)
            for parameter, values in parameters.items()
        }
        for _ in range(number_of_combinations)
    ]

In [17]:

K = 10

# Search space for the random forest, defined in this cell so it does not
# depend on a name created by another cell.
random_forest_parameters_grid = {
    "n_estimators": [10, 50, 100, 300],
    "criterion": ["gini", "entropy", "log_loss"],
    "max_features": ["sqrt", "log2", 0.2, None]
}

# One record per (outer fold, tuning fold, candidate) so scores can be
# aggregated afterwards instead of being thrown away.
nested_cv_results = []

validation_kfold = Kfold(K)
# NOTE(review): x_test_validation / y_test_validation are not used yet; the
# outer evaluation of the selected parameters is still to be written.
for outer_fold, (x_train_validation,
                 x_test_validation,
                 y_train_validation,
                 y_test_validation) in enumerate(validation_kfold.split(X, y)):
    # Sample the candidates once per outer fold so every candidate is scored
    # on every tuning fold and the scores are comparable.
    parameters_combinations = get_random_combinations_of_parameters(
        random_forest_parameters_grid,
        20
    )
    tunning_kfold = Kfold(K-1)
    for tunning_fold, (x_train_tunning,
                       x_test_tunning,
                       y_train_tunning,
                       y_test_tunning) in enumerate(
            tunning_kfold.split(x_train_validation, y_train_validation)):
        for combination in parameters_combinations:
            # Time construction plus fitting, matching get_best_parameters.
            initial_time = time()
            # Unpack the single candidate dict, not the whole list.
            cls = RandomForestClassifier(**combination)
            cls.fit(x_train_tunning, y_train_tunning)
            final_time = time()
            predictions = cls.predict(x_test_tunning)
            score = f1_score(y_test_tunning, predictions)
            nested_cv_results.append(
                dict(
                    combination,
                    outer_fold=outer_fold,
                    tunning_fold=tunning_fold,
                    f1_score=score,
                    fit_time=final_time - initial_time
                )
            )






In [4]:
# Seed both random sources used by the notebook. `random` drives the
# hyper-parameter sampling (random.choice); scikit-learn estimators created
# without `random_state` draw from NumPy's global generator, so seeding only
# `random` leaves the forests non-reproducible.
SEED = 44
random.seed(SEED)
np.random.seed(SEED)

In [5]:
# Scratch check: materialise every split that Kfold(3) produces for (X, y)
# into a list so it can be inspected. Elsewhere in this notebook each split
# is unpacked as (x_train, x_test, y_train, y_test).
a = list(Kfold(3).split(X, y))

In [28]:
# Bind the estimator class itself (not an instance) to `cls`.
cls = RandomForestClassifier

# Candidate values per random-forest hyper-parameter for the random search.
parameters_grid = dict(
    n_estimators=[10, 50, 100, 300],
    criterion=["gini", "entropy", "log_loss"],
    max_features=["sqrt", "log2", 0.2, None],
)

In [51]:
# Scratch inspection: show the type (metaclass) of the RandomForestClassifier
# class object; the recorded output for this cell is abc.ABCMeta.
type(RandomForestClassifier)

abc.ABCMeta

In [60]:
def get_best_parameters(
        classifier: Any,
        parameters_grid: dict[str, Any],
        numbers_of_folds: int,
        number_of_parameters_combinations: int,
        features: Any = None,
        targets: Any = None
) -> tuple[dict[str, Any], pd.DataFrame]:
    """Random-search hyper-parameters with k-fold cross-validation.

    Samples ``number_of_parameters_combinations`` candidates from
    ``parameters_grid``, scores each one with ``f1_score`` on every split
    produced by ``Kfold(numbers_of_folds)``, and ranks the candidates by
    their mean F1 score.

    Args:
        classifier: Estimator class (not an instance); it is called as
            ``classifier(**combination)`` and must provide ``fit`` and
            ``predict``.
        parameters_grid: Maps each parameter name to its candidate values.
        numbers_of_folds: Value passed to ``Kfold``.
        number_of_parameters_combinations: Number of random candidates to
            evaluate.
        features: Feature matrix to split. Defaults to the notebook-level
            ``X`` when omitted, which is what this function always used
            before the parameter existed.
        targets: Labels to split. Defaults to the notebook-level ``y`` when
            omitted.

    Returns:
        A ``(best_parameters, grid_results)`` tuple. ``best_parameters`` is
        the sampled combination with the highest mean F1 score.
        ``grid_results`` is a DataFrame with one row per candidate, holding
        the parameter values plus ``f1_mean_score``, ``f1_score_std`` and
        ``mean_time``, sorted by ``f1_mean_score`` in descending order.
    """
    # Fall back to the notebook globals so existing calls keep working.
    if features is None:
        features = X
    if targets is None:
        targets = y

    parameters_combinations = get_random_combinations_of_parameters(
        parameters_grid,
        number_of_parameters_combinations
    )

    grid_results = []
    for combination in parameters_combinations:
        tunning_kfold = Kfold(numbers_of_folds)
        times = []
        scores = []
        for (
            x_train_tunning,
            x_test_tunning,
            y_train_tunning,
            y_test_tunning
        ) in tunning_kfold.split(features, targets):
            # The timed span covers estimator construction and fitting.
            initial_time = time()
            cls = classifier(**combination)
            cls.fit(x_train_tunning, y_train_tunning)
            final_time = time()
            times.append(final_time - initial_time)

            predictions = cls.predict(x_test_tunning)
            scores.append(f1_score(y_test_tunning, predictions))

        scores = np.asarray(scores)
        grid_results.append(
            dict(
                combination,
                f1_mean_score=scores.mean(),
                f1_score_std=scores.std(),
                mean_time=np.asarray(times).mean()
            )
        )

    # Before the index is reset, each row label is still the position of the
    # candidate in `parameters_combinations`.
    grid_results = pd.DataFrame(grid_results).sort_values(
        by=["f1_mean_score"],
        ascending=False
    )

    # Take the winner from the originally sampled dicts rather than slicing
    # DataFrame columns by position: the values keep their Python types and
    # cannot be coerced by column dtypes (e.g. None becoming NaN).
    best_parameters = dict(parameters_combinations[grid_results.index[0]])

    grid_results = grid_results.reset_index(drop=True)

    return best_parameters, grid_results

In [61]:
# Candidate values per random-forest hyper-parameter.
parameters_grid = dict(
    n_estimators=[10, 50, 100, 300],
    criterion=["gini", "entropy", "log_loss"],
    max_features=["sqrt", "log2", 0.2, None],
)

# Evaluate 20 random candidates, passing 10 as the fold count; keep both the
# winning combination and the full ranking table.
(random_forest_best_parameters,
 random_forest_tunning_results) = get_best_parameters(
    RandomForestClassifier,
    parameters_grid,
    numbers_of_folds=10,
    number_of_parameters_combinations=20,
)

In [62]:
# Create the output directory first, because to_csv does not create missing
# parent directories and would fail on a fresh checkout.
os.makedirs("../data/results", exist_ok=True)
random_forest_tunning_results.to_csv("../data/results/random_search_tunning_results.csv")

# Last expression of the cell: display the winning combination.
random_forest_best_parameters

{'n_estimators': 100, 'criterion': 'log_loss', 'max_features': 'sqrt'}

In [64]:
# Display the tuning table returned by get_best_parameters: one row per
# sampled combination, ordered by f1_mean_score with the best first.
random_forest_tunning_results

Unnamed: 0,n_estimators,criterion,max_features,f1_mean_score,f1_score_std,mean_time
0,100,log_loss,sqrt,0.832097,0.03234,0.292921
1,100,gini,0.2,0.823526,0.032258,0.212535
2,300,entropy,log2,0.823058,0.037372,0.777337
3,50,log_loss,sqrt,0.820718,0.029076,0.128911
4,300,gini,0.2,0.819135,0.018461,0.612387
5,100,log_loss,sqrt,0.818911,0.029855,0.271351
6,300,log_loss,log2,0.818422,0.021643,0.766953
7,300,gini,sqrt,0.817021,0.032035,0.676975
8,100,entropy,sqrt,0.815382,0.045687,0.257471
9,300,log_loss,log2,0.815374,0.028283,0.782174
