In [1]:
import random
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Seed Python's built-in `random` module so the `random.shuffle` calls
# further down produce the same order on every fresh run.
random.seed(10)

# Informações sobre o dataset

In [4]:
# Load the red-wine quality table from the UCI repository; the file uses ';'
# as its field separator.
df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv", sep=";")
# Print column names, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


# Quantidade de dados por classe

In [5]:
# Number of rows for each raw quality score.
df.quality.value_counts()

5    681
6    638
7    199
4     53
8     18
3     10
Name: quality, dtype: int64

## Mapeamento para classes binárias:

- 0: menor ou igual a 5
- 1: maior que 5

In [6]:
# Binary target: 0 when quality <= 5, 1 when quality > 5.
# A vectorised comparison replaces the per-row Python lambda; `quality` is an
# int64 column with no nulls, so the result is the same as the row-wise mapping.
y = (df["quality"] > 5).astype("int64")

In [7]:
# Number of rows in each binary class (0: quality <= 5, 1: quality > 5).
y.value_counts()

1    855
0    744
Name: quality, dtype: int64

# Distribuição dos atributos

In [8]:
# Summary statistics for every column except the last one (`quality`).
df.iloc[:,:-1].describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9


## Normalização necessária devido à grande variação entre as escalas dos atributos

In [9]:
# Rescale every feature column (all but the last, `quality`) to the [0, 1] range.
# `fit_transform` fits the scaler itself, so a separate `fit` call beforehand
# only repeats the same work.
# NOTE(review): the scaler is fitted on all rows before the folds are built, so
# rows that later act as test data influence the min/max used for scaling —
# confirm this is acceptable for the evaluation.
min_max_scaler = MinMaxScaler()
X = min_max_scaler.fit_transform(df.iloc[:, :-1])

# Estrutura de dados

Lista de tuplas (atributos, classe)

In [10]:
# Pair each scaled feature row with its binary label: [(features, label), ...].
df_values = [(features, label) for features, label in zip(X, y)]

In [11]:
# Peek at the first three (features, label) tuples.
df_values[:3]

[(array([0.24778761, 0.39726027, 0.        , 0.06849315, 0.10684474,
         0.14084507, 0.09893993, 0.56754772, 0.60629921, 0.13772455,
         0.15384615]),
  0),
 (array([0.28318584, 0.52054795, 0.        , 0.11643836, 0.14357262,
         0.33802817, 0.2155477 , 0.49412628, 0.36220472, 0.20958084,
         0.21538462]),
  0),
 (array([0.28318584, 0.43835616, 0.04      , 0.09589041, 0.13355593,
         0.1971831 , 0.16961131, 0.50881057, 0.40944882, 0.19161677,
         0.21538462]),
  0)]

In [13]:
# Number of folds to build.
K = 4

In [14]:
def initialize_folds(K: int) -> list[list]:
    """Create ``K`` independent empty folds.

    Args:
        K: Number of folds to create.

    Returns:
        A list holding ``K`` distinct empty lists, so appending to one fold
        never affects another.
    """
    empty_folds = []
    for _ in range(K):
        empty_folds.append([])
    return empty_folds

In [15]:
# Positions in `df_values` of the class-0 and class-1 examples, respectively.
indexes_with_false_label = [
    position for position, (_features, label) in enumerate(df_values) if label == 0
]
indexes_with_true_label = [
    position for position, (_features, label) in enumerate(df_values) if label == 1
]

In [16]:
# Shuffle each class's index list in place so the folds are filled in random order.
# Both calls draw from the shared `random` generator, so their order affects
# which permutation each list receives.
random.shuffle(indexes_with_false_label)
random.shuffle(indexes_with_true_label)

In [17]:
# Build K class-balanced folds: in every round each fold receives one class-0
# index and one class-1 index, taken in the (already shuffled) list order.
#
# Only complete rounds are dealt, so every fold ends up with the same size and
# a 50/50 class ratio; surplus examples of the larger class are left unused.
# The index lists are read by position instead of being consumed with
# `pop(0)`, which keeps this cell re-runnable without re-creating them.
folds = initialize_folds(K)
complete_rounds = min(len(indexes_with_false_label), len(indexes_with_true_label)) // K
for round_number in range(complete_rounds):
    for fold_number, fold in enumerate(folds):
        position = round_number * K + fold_number
        fold.extend([indexes_with_false_label[position], indexes_with_true_label[position]])

In [18]:
def test_folds_totally_disjointed(folds: list[list[int]]) -> None:
    joined_folds = set()
    for fold in folds:
        joined_folds = joined_folds.union(fold)
        
    assert len(joined_folds) == len(folds[0]) * len(folds)

In [19]:
# Sanity check on the folds built above: raises AssertionError on failure and
# produces no output otherwise.
test_folds_totally_disjointed(folds)

In [20]:
# Fraction of class-1 examples in each fold (0.5 means perfectly balanced).
# Labels are looked up directly by index instead of scanning every row of
# `df_values` with a list-membership test.
for fold in folds:
    print(np.mean([df_values[index][1] for index in fold]))

0.5
0.5
0.5
0.5


In [44]:
def generate_splits(folds: list[list[int]]) -> list[tuple[tuple[int, ...], int]]:
    """Enumerate the leave-one-fold-out train/test combinations.

    Only the number of folds is used; the contents of each fold are not read.

    Args:
        folds: The folds to combine, one list of sample indexes per fold.

    Returns:
        One ``(training_fold_indexes, test_fold_index)`` pair per fold, in fold
        order. ``training_fold_indexes`` is a tuple with the positions of all
        the other folds in ascending order. An empty ``folds`` gives ``[]``.
    """
    fold_positions = range(len(folds))
    return [
        (
            tuple(train for train in fold_positions if train != test),
            test,
        )
        for test in fold_positions
    ]

In [42]:
# (training fold indexes, test fold index) pairs, one per fold. Built with
# `generate_splits` rather than repeating its body inline.
splits = generate_splits(folds)

In [45]:
# Display the (training fold indexes, test fold index) pairs for the current folds.
generate_splits(folds)

[((1, 2, 3), 0), ((0, 2, 3), 1), ((0, 1, 3), 2), ((0, 1, 2), 3)]