## Training a classification model

In [1]:
import pandas as pd
import os
import sys
sys.path.insert(0, "../")

from src.classification.classification_models import ClassificationModel
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

### Load generated dataset

In [2]:
# Scaled feature table written by the preprocessing stage.
df_data = pd.read_csv("../results/preprocessing/05_dataset_scaled_robust.csv")
# Label table written by the clustering stage.
df_labels_clustering = pd.read_csv("../results/clustering/labels.csv")

# Use the assignments of the best-performing clustering algorithm as the
# classification target. `assign` aligns the labels to the features by index.
best_clustering_labels = df_labels_clustering["minibatch_k_means_k_4"]
df_data = df_data.assign(label=best_clustering_labels)

In [3]:
# Number of samples per cluster label before any balancing is applied.
label_counts = df_data["label"].value_counts()
label_counts

label
3    2023
2    1669
1    1555
0    1304
Name: count, dtype: int64

In [4]:
# Every class is reduced to the size of the smallest one so that all labels
# end up equally represented.
min_class_count = df_data['label'].value_counts().min()

# Undersample each class without replacement. The per-class samples are
# collected in a list and concatenated once, rather than growing a DataFrame
# inside the loop (which re-copies all accumulated rows on every iteration).
resampled_classes = [
    resample(
        df_data[df_data['label'] == label],
        replace=False,
        n_samples=min_class_count,  # match the minority class size
        random_state=42,
    )
    for label in df_data['label'].unique()
]
balanced_data = pd.concat(resampled_classes)

# Shuffle so rows are no longer grouped by class, then rebuild a clean index.
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)
print(balanced_data['label'].value_counts())

label
3    1304
1    1304
0    1304
2    1304
Name: count, dtype: int64


In [5]:
# Separate the target from the features. `pop` removes the "label" column from
# balanced_data in place, so from here on balanced_data holds features only
# (re-running this cell alone will fail because the column is already gone).
label_column = balanced_data.pop("label")
responses = label_column.values

In [6]:
# Hold out 30% of the balanced samples for validation; the fixed seed keeps
# the split reproducible across runs.
train_data, validation_data, train_response, validation_response = train_test_split(
    balanced_data,
    responses,
    test_size=.30,
    random_state=42,
)

### Training model

In [7]:
folder_export = "../results/classification/"
# Create the output folder up front so the CSV exports at the end of this cell
# cannot fail on a missing directory after the exploration has already run.
os.makedirs(folder_export, exist_ok=True)

# The validation split is handed to the model through its test_* arguments.
model_instance = ClassificationModel(
    train_values=train_data,
    train_response=train_response,
    test_values=validation_data,
    test_response=validation_response,
    folder_export=folder_export,
)

# Run the exploration and persist both result tables as CSV files.
df_exploration, df_curves = model_instance.apply_exploring()

# folder_export already ends with "/", so os.path.join is used instead of an
# f-string with another "/" to avoid a doubled separator in the output paths.
df_exploration.to_csv(os.path.join(folder_export, "results_exploration.csv"), index=False)
df_curves.to_csv(os.path.join(folder_export, "results_curves_exploration.csv"), index=False)