In [1]:
import os
import sys
import random
from time import time

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from src.Kfold.kfold import Kfold

In [2]:
df = pd.read_csv("../data/winequality-red.csv", sep=";")
y = df.quality.apply(lambda quality: 0 if quality <= 5 else 1)
X =  MinMaxScaler().fit_transform(df.iloc[:,:-1])

In [3]:
def get_random_combinations_of_parameters(
        parameters: dict,
        number_of_combinations: int
) -> list[int]:
    combinations = []
    for _ in range(number_of_combinations):
        combination = {}
        for parameter, values in parameters.items():
            combination[parameter] = random.choice(values)
        combinations.append(combination)
    return combinations

In [17]:

K = 10
validation_kfold = Kfold(K)
for (x_train_validation,
     x_test_validation,
     y_train_validation,
     y_test_validation) in validation_kfold.split(X, y):
    tunning_kfold = Kfold(K-1)
    for (x_train_tunning,
         x_test_tunning,
         y_train_tunning,
         y_test_tunning) in tunning_kfold.split(x_train_validation, y_train_validation):
        parameters_combinations = get_random_combinations_of_parameters(
            random_forest_parameters_grid,
            20
        )
        for combination in parameters_combinations:
            inicial_time = time()
            cls = RandomForestClassifier(**parameters_combinations)
            final_time = time()
            cls.fit(x_train_tunning, y_train_tunning)
            predictions = cls.predict(x_test_tunning)
            score = f1_score(y_test_tunning, predictions)






In [4]:
random.seed(44)

In [5]:
a = Kfold(3)
a = list(a.split(X, y))

In [27]:
cls = RandomForestClassifier
parameters_grid = {
    "n_estimators": [10, 50, 100, 300],
    "criterion": ["gini", "entropy", "log_loss"],
    "max_features": ["sqrt", "log2", 0.2, None]
}

In [None]:
def get_tunned_parameters(cls, parameters_grid, numbers_of_folds, number_of_parameters_combinations):

In [None]:
def get_grid_tunning(classifier, parameters_grid, numbers_of_folds, number_of_parameters_combinations):
    grid_results = []
    parameters_combinations = get_random_combinations_of_parameters(
        parameters_grid,
        number_of_parameters_combinations
    )
    for combination in parameters_combinations:
        tunning_kfold = Kfold(numbers_of_folds)
        for (
            x_train_tunning,
            x_test_tunning,
            y_train_tunning,
            y_test_tunning
        ) in tunning_kfold.split(X, y):
            times = []
            scores = []
            initial_time = time()
            cls = classifier(**combination)
            cls.fit(x_train_tunning, y_train_tunning)
            final_time = time()
            times.append(final_time-initial_time)
            predictions = cls.predict(x_test_tunning)
            score = f1_score(y_test_tunning, predictions)
            scores.append(score)
        mean_time = np.asarray(times).mean()
        scores = np.asarray(scores)
        mean_score = scores.mean()
        std_score = scores.std()
    #     print(f"""index combination: {index_combination},
    # \tparameters = {combination}
    # \tmean score: {mean_score}
    # \tstd score: {std_score}""")
        grid_results.append(
            dict(combination, f1_mean_score=mean_score, f1_score_std=std_score, mean_time=mean_time)
        )
    grid_results = pd.DataFrame(grid_results).sort_values(
        by=["f1_mean_score"],
        ascending=False
    ).reset_index(drop=True)

    return grid_results

In [7]:
random_forest_parameters_grid = {
    "n_estimators": [10, 50, 100, 300],
    "criterion": ["gini", "entropy", "log_loss"],
    "max_features": ["sqrt", "log2", 0.2, None]
}
grid_results = []
K = 10
parameters_combinations = get_random_combinations_of_parameters(
    random_forest_parameters_grid,
    20
)
for combination in parameters_combinations:
    tunning_kfold = Kfold(K)
    for (
        x_train_tunning,
        x_test_tunning,
        y_train_tunning,
        y_test_tunning
    ) in tunning_kfold.split(X, y):
        times = []
        scores = []
        initial_time = time()
        cls = RandomForestClassifier(**combination)
        cls.fit(x_train_tunning, y_train_tunning)
        final_time = time()
        times.append(final_time-initial_time)
        predictions = cls.predict(x_test_tunning)
        score = f1_score(y_test_tunning, predictions)
        scores.append(score)
    mean_time = np.asarray(times).mean()
    scores = np.asarray(scores)
    mean_score = scores.mean()
    std_score = scores.std()
#     print(f"""index combination: {index_combination},
# \tparameters = {combination}
# \tmean score: {mean_score}
# \tstd score: {std_score}""")
    grid_results.append(
        dict(combination, f1_mean_score=mean_score, f1_score_std=std_score, mean_time=mean_time)
    )

In [20]:
random_forest_tunning_results = pd.DataFrame(grid_results).sort_values(by=["f1_mean_score"], ascending=False).reset_index(drop=True)

In [25]:
random_forest_tunning_results.to_csv("../data/results/random_search_tunning_results.csv")

In [26]:
random_forest_tunning_results

Unnamed: 0,n_estimators,criterion,max_features,f1_mean_score,f1_score_std,mean_time
0,300,log_loss,0.2,0.897436,0.0,0.851525
1,300,gini,0.2,0.875,0.0,0.598849
2,50,gini,sqrt,0.835616,0.0,0.124862
3,100,entropy,sqrt,0.833333,0.0,0.259981
4,300,entropy,0.2,0.832215,0.0,0.655196
5,300,gini,0.2,0.826667,0.0,0.60648
6,300,log_loss,log2,0.823529,0.0,0.770371
7,300,gini,log2,0.813793,0.0,0.706848
8,10,log_loss,sqrt,0.811594,0.0,0.027084
9,10,log_loss,,0.811189,0.0,0.056491
