In [None]:
# !pip3 install -U keras-tuner
# !pip3 install tensorflow
# !pip3 install imblearn

In [32]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
import keras as k
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import classification_report, confusion_matrix
import json

In [2]:
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
for device in gpu_devices:
    tf.config.experimental.set_memory_growth(device, True)

In [3]:
dataset_url = 'https://www.openml.org/data/get_csv/4965303/flare.arff' 
dataset = np.genfromtxt(dataset_url, delimiter=',', skip_header=1)

x = dataset[:,:-4]
x_size = x.shape[1]
y = dataset[:,-1]

In [4]:
x.shape

(1066, 7)

In [5]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

ct = ColumnTransformer([("columns", OneHotEncoder(), [0,1])], remainder = 'passthrough')
x = ct.fit_transform(x)
x[0]

array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 3., 1., 3., 1., 1.])

In [6]:
x.shape

(1066, 17)

### <font color="#CA3532">Definición del modelo</font>


In [7]:
# Definid el modelo con Keras

nn = Sequential()

### -------------------------------------------------------------------------------
### Añadir la capas completamente conectadas que consideréis al modelo
### -------------------------------------------------------------------------------
nn.add(Dense(12, activation="sigmoid"))
nn.add(Dense(12, activation="sigmoid"))
nn.add(Dense(1, activation="sigmoid"))

In [8]:
metrics = [
            keras.metrics.BinaryAccuracy(name='ACC'),
            keras.metrics.Precision(name='Prec'),
            keras.metrics.Recall(name='Rec'),
            keras.metrics.AUC(name='AUC'),
          ]

nn.compile(optimizer='SGD', loss="mse", metrics=metrics)
# nn.compile(optimizer='Adam', loss="binary_crossentropy", metrics=metrics)

### <font color="#CA3532">Conjuntos de entrenamiento y validación</font>

In [9]:
# Contrucción de los conjuntos de entrenamiento y validación

from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(x, y, stratify=y, test_size=0.2) 

### <font color="#CA3532">Visualización de resultados</font>

In [10]:
from sklearn.metrics import classification_report, confusion_matrix

def show_metrics(history):
    for metric in history.history.keys():
        if not metric.startswith('val_'):
            plt.plot(history.history[metric], label=metric)
            plt.plot(history.history[f'val_{metric}'], label=f'val_{metric}')
            plt.title(metric)
            plt.ylabel('')
            plt.xlabel('Epoch')
            plt.legend(loc="upper left")
            plt.show()

### <font color="#CA3532">Entrenamiento de la red neuronal y evaluación</font>
Como podrás haber observado en problema no está balanceado (o está bastante desequilibrado), porque el número de ejemplos de cada clase es muy diferente.

In [11]:
import imblearn
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from imblearn.combine import SMOTEENN
from collections import Counter
from sklearn.metrics import f1_score

#### RandomUnderSampler

In [34]:
undersample = RandomUnderSampler()
X_under_train, y_under_train = undersample.fit_resample(x_train, y_train)
print(Counter(y_under_train))

Counter({0.0: 145, 1.0: 145})


In [35]:
epochs = 50
history = nn.fit(X_under_train, y_under_train, epochs=epochs, verbose=0, validation_data=(x_val, y_val))

y_pred_train = nn.predict(X_under_train)
y_pred_train = y_pred_train > 0.5
y_pred = nn.predict(x_val)
y_pred = y_pred > 0.5

# show_metrics(history)
print("Train")
print(classification_report(y_under_train, y_pred_train, zero_division=0))
print("Test")
print(classification_report(y_val, y_pred, zero_division=0))

print(confusion_matrix(y_val, y_pred))
# La matriz de confusión en funcion del valor se le de a los pesos tiene una 
# precisión total para los registros de una clase y precisión nula para la otra clase

Train
              precision    recall  f1-score   support

         0.0       0.70      0.74      0.72       145
         1.0       0.72      0.69      0.71       145

    accuracy                           0.71       290
   macro avg       0.71      0.71      0.71       290
weighted avg       0.71      0.71      0.71       290

Test
              precision    recall  f1-score   support

         0.0       0.90      0.68      0.78       177
         1.0       0.29      0.62      0.40        37

    accuracy                           0.67       214
   macro avg       0.59      0.65      0.59       214
weighted avg       0.79      0.67      0.71       214

[[121  56]
 [ 14  23]]


#### EditedNearestNeighbours


In [14]:
enn = EditedNearestNeighbours()
X_under_train, y_under_train = enn.fit_resample(x_train, y_train)
print(Counter(y_under_train))

Counter({0.0: 506, 1.0: 145})


In [15]:
epochs = 50
history = nn.fit(X_under_train, y_under_train, epochs=epochs, verbose=0, validation_data=(x_val, y_val))

y_pred_train = nn.predict(X_under_train)
y_pred_train = y_pred_train > 0.5
y_pred = nn.predict(x_val)
y_pred = y_pred > 0.5

# show_metrics(history)
print("Train")
print(classification_report(y_under_train, y_pred_train, zero_division=0))
print("Test")
print(classification_report(y_val, y_pred, zero_division=0))
print(confusion_matrix(y_val, y_pred))

Train
              precision    recall  f1-score   support

         0.0       0.78      1.00      0.87       506
         1.0       0.00      0.00      0.00       145

    accuracy                           0.78       651
   macro avg       0.39      0.50      0.44       651
weighted avg       0.60      0.78      0.68       651

Test
              precision    recall  f1-score   support

         0.0       0.83      1.00      0.91       177
         1.0       0.00      0.00      0.00        37

    accuracy                           0.83       214
   macro avg       0.41      0.50      0.45       214
weighted avg       0.68      0.83      0.75       214

[[177   0]
 [ 37   0]]


#### TomekLinks

In [16]:
tl = TomekLinks()
X_under_train, y_under_train = tl.fit_resample(x_train, y_train)
print(Counter(y_under_train))

Counter({0.0: 706, 1.0: 145})


In [17]:
epochs = 50
history = nn.fit(X_under_train, y_under_train, epochs=epochs, verbose=0, validation_data=(x_val, y_val))

y_pred_train = nn.predict(X_under_train)
y_pred_train = y_pred_train > 0.5
y_pred = nn.predict(x_val)
y_pred = y_pred > 0.5

# show_metrics(history)
print("Train")
print(classification_report(y_under_train, y_pred_train, zero_division=0))
print("Test")
print(classification_report(y_val, y_pred, zero_division=0))
print(confusion_matrix(y_val, y_pred))

Train
              precision    recall  f1-score   support

         0.0       0.83      1.00      0.91       706
         1.0       0.00      0.00      0.00       145

    accuracy                           0.83       851
   macro avg       0.41      0.50      0.45       851
weighted avg       0.69      0.83      0.75       851

Test
              precision    recall  f1-score   support

         0.0       0.83      1.00      0.91       177
         1.0       0.00      0.00      0.00        37

    accuracy                           0.83       214
   macro avg       0.41      0.50      0.45       214
weighted avg       0.68      0.83      0.75       214

[[177   0]
 [ 37   0]]


#### RandomOverSampler

In [18]:
oversample = RandomOverSampler()
X_over_train, y_over_train = oversample.fit_resample(x_train, y_train)
print(Counter(y_over_train))

Counter({0.0: 707, 1.0: 707})


In [19]:
epochs = 50
history = nn.fit(X_over_train, y_over_train, epochs=epochs, verbose=0, validation_data=(x_val, y_val))

y_pred_train = nn.predict(X_over_train)
y_pred_train = y_pred_train > 0.5
y_pred = nn.predict(x_val)
y_pred = y_pred > 0.5

# show_metrics(history)
print("Train")
print(classification_report(y_over_train, y_pred_train, zero_division=0))
print("Test")
print(classification_report(y_val, y_pred, zero_division=0))
print(confusion_matrix(y_val, y_pred))

Train
              precision    recall  f1-score   support

         0.0       0.71      0.72      0.72       707
         1.0       0.72      0.71      0.71       707

    accuracy                           0.71      1414
   macro avg       0.72      0.71      0.71      1414
weighted avg       0.72      0.71      0.71      1414

Test
              precision    recall  f1-score   support

         0.0       0.89      0.66      0.75       177
         1.0       0.27      0.59      0.37        37

    accuracy                           0.64       214
   macro avg       0.58      0.62      0.56       214
weighted avg       0.78      0.64      0.69       214

[[116  61]
 [ 15  22]]


#### SMOTE

In [20]:
sm = SMOTE()
X_over_train, y_over_train = sm.fit_resample(x_train, y_train)
print(Counter(y_over_train))

Counter({0.0: 707, 1.0: 707})


In [21]:
epochs = 50
history = nn.fit(X_over_train, y_over_train, epochs=epochs, verbose=0, validation_data=(x_val, y_val))

y_pred_train = nn.predict(X_over_train)
y_pred_train = y_pred_train > 0.5
y_pred = nn.predict(x_val)
y_pred = y_pred > 0.5

# show_metrics(history)
print("Train")
print(classification_report(y_over_train, y_pred_train, zero_division=0))
print("Test")
print(classification_report(y_val, y_pred, zero_division=0))
print(confusion_matrix(y_val, y_pred))

Train
              precision    recall  f1-score   support

         0.0       0.70      0.72      0.71       707
         1.0       0.71      0.69      0.70       707

    accuracy                           0.71      1414
   macro avg       0.71      0.71      0.71      1414
weighted avg       0.71      0.71      0.71      1414

Test
              precision    recall  f1-score   support

         0.0       0.88      0.65      0.75       177
         1.0       0.26      0.59      0.36        37

    accuracy                           0.64       214
   macro avg       0.57      0.62      0.56       214
weighted avg       0.78      0.64      0.68       214

[[115  62]
 [ 15  22]]


#### ADASYN

In [22]:
ada = ADASYN()
X_over_train, y_over_train = ada.fit_resample(x_train, y_train)
print(Counter(y_over_train))

Counter({0.0: 707, 1.0: 688})


In [23]:
epochs = 50
history = nn.fit(X_over_train, y_over_train, epochs=epochs, verbose=0, validation_data=(x_val, y_val))

y_pred_train = nn.predict(X_over_train)
y_pred_train = y_pred_train > 0.5
y_pred = nn.predict(x_val)
y_pred = y_pred > 0.5

# show_metrics(history)
print("Train")
print(classification_report(y_over_train, y_pred_train, zero_division=0))
print("Test")
print(classification_report(y_val, y_pred, zero_division=0))
print(confusion_matrix(y_val, y_pred))

Train
              precision    recall  f1-score   support

         0.0       0.65      0.72      0.68       707
         1.0       0.68      0.61      0.64       688

    accuracy                           0.66      1395
   macro avg       0.67      0.66      0.66      1395
weighted avg       0.67      0.66      0.66      1395

Test
              precision    recall  f1-score   support

         0.0       0.89      0.66      0.75       177
         1.0       0.27      0.59      0.37        37

    accuracy                           0.64       214
   macro avg       0.58      0.62      0.56       214
weighted avg       0.78      0.64      0.69       214

[[116  61]
 [ 15  22]]


### Conclusion
TODO:

### Uso de class_weight pero con paquete que nos proporciona

In [24]:
from sklearn.utils import class_weight
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights = {0: class_weights[0], 1: class_weights[1]}
print(class_weights)

{0: 0.6025459688826026, 1: 2.9379310344827587}


In [25]:
epochs = 50

history = nn.fit(x_train, y_train, epochs=epochs, verbose=0, class_weight=class_weights, validation_data=(x_val, y_val))

y_pred_train = nn.predict(x_train)
y_pred_train = y_pred_train > 0.5
y_pred = nn.predict(x_val)
y_pred = y_pred > 0.5

# show_metrics(history)
print("Train")
print(classification_report(y_train, y_pred_train, zero_division=0))
print("Test")
print(classification_report(y_val, y_pred, zero_division=0))
print(confusion_matrix(y_val, y_pred))

Train
              precision    recall  f1-score   support

         0.0       0.92      0.74      0.82       707
         1.0       0.35      0.70      0.47       145

    accuracy                           0.73       852
   macro avg       0.64      0.72      0.64       852
weighted avg       0.83      0.73      0.76       852

Test
              precision    recall  f1-score   support

         0.0       0.89      0.67      0.77       177
         1.0       0.28      0.59      0.38        37

    accuracy                           0.66       214
   macro avg       0.58      0.63      0.57       214
weighted avg       0.78      0.66      0.70       214

[[119  58]
 [ 15  22]]


### Conclusion
TODO:

### Aplicar todas las ténicas a la vez.


#### RandomUnderSampler + RandomOverSampler

In [26]:
print(Counter(y_train))

oversample = RandomOverSampler(sampling_strategy=0.4)
X_over_train, y_over_train = oversample.fit_resample(x_train, y_train)
print(Counter(y_over_train))

undersample = RandomUnderSampler(sampling_strategy=0.5)
X_ajust_train, y_ajust_train = undersample.fit_resample(X_over_train, y_over_train)
print(Counter(y_ajust_train))

class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_ajust_train), y=y_ajust_train)
class_weights = {0: class_weights[0], 1: class_weights[1]}
print(class_weights)

Counter({0.0: 707, 1.0: 145})
Counter({0.0: 707, 1.0: 282})
Counter({0.0: 564, 1.0: 282})
{0: 0.75, 1: 1.5}


In [27]:
epochs = 50

history = nn.fit(X_ajust_train, y_ajust_train, epochs=epochs, verbose=0, class_weight=class_weights, validation_data=(x_val, y_val))

y_pred_train = nn.predict(X_ajust_train)
y_pred_train = y_pred_train > 0.5
y_pred = nn.predict(x_val)
y_pred = y_pred > 0.5

# show_metrics(history)
print("Train")
print(classification_report(y_ajust_train, y_pred_train, zero_division=0))
print("Test")
print(classification_report(y_val, y_pred, zero_division=0))
print(confusion_matrix(y_val, y_pred))

Train
              precision    recall  f1-score   support

         0.0       0.82      0.73      0.77       564
         1.0       0.56      0.69      0.61       282

    accuracy                           0.71       846
   macro avg       0.69      0.71      0.69       846
weighted avg       0.73      0.71      0.72       846

Test
              precision    recall  f1-score   support

         0.0       0.89      0.67      0.77       177
         1.0       0.28      0.59      0.38        37

    accuracy                           0.66       214
   macro avg       0.58      0.63      0.57       214
weighted avg       0.78      0.66      0.70       214

[[119  58]
 [ 15  22]]


#### SMOTE + ENN

In [28]:
smoteen = SMOTEENN()
X_ajust_train, y_ajust_train = smoteen.fit_resample(x_train, y_train)
print(Counter(y_ajust_train))

Counter({1.0: 393, 0.0: 355})


In [29]:
epochs = 50

history = nn.fit(X_ajust_train, y_ajust_train, epochs=epochs, verbose=0, validation_data=(x_val, y_val))

y_pred_train = nn.predict(X_ajust_train)
y_pred_train = y_pred_train > 0.5
y_pred = nn.predict(x_val)
y_pred = y_pred > 0.5

# show_metrics(history)
print("Train")
print(classification_report(y_ajust_train, y_pred_train, zero_division=0))
print("Test")
print(classification_report(y_val, y_pred, zero_division=0))
print(confusion_matrix(y_val, y_pred))

Train
              precision    recall  f1-score   support

         0.0       0.79      0.79      0.79       355
         1.0       0.81      0.81      0.81       393

    accuracy                           0.80       748
   macro avg       0.80      0.80      0.80       748
weighted avg       0.80      0.80      0.80       748

Test
              precision    recall  f1-score   support

         0.0       0.90      0.68      0.78       177
         1.0       0.29      0.62      0.40        37

    accuracy                           0.67       214
   macro avg       0.59      0.65      0.59       214
weighted avg       0.79      0.67      0.71       214

[[121  56]
 [ 14  23]]


#### SMOTE + Tomek

In [30]:
smtomek = SMOTETomek()
X_ajust_train, y_ajust_train = smtomek.fit_resample(x_train, y_train)
print(Counter(y_ajust_train))

Counter({0.0: 706, 1.0: 706})


In [31]:
epochs = 50

history = nn.fit(X_ajust_train, y_ajust_train, epochs=epochs, verbose=0, validation_data=(x_val, y_val))

y_pred_train = nn.predict(X_ajust_train)
y_pred_train = y_pred_train > 0.5
y_pred = nn.predict(x_val)
y_pred = y_pred > 0.5

# show_metrics(history)
print("Train")
print(classification_report(y_ajust_train, y_pred_train, zero_division=0))
print("Test")
print(classification_report(y_val, y_pred, zero_division=0))
print(confusion_matrix(y_val, y_pred))

Train
              precision    recall  f1-score   support

         0.0       0.72      0.74      0.73       706
         1.0       0.74      0.72      0.73       706

    accuracy                           0.73      1412
   macro avg       0.73      0.73      0.73      1412
weighted avg       0.73      0.73      0.73      1412

Test
              precision    recall  f1-score   support

         0.0       0.90      0.68      0.78       177
         1.0       0.29      0.62      0.40        37

    accuracy                           0.67       214
   macro avg       0.59      0.65      0.59       214
weighted avg       0.79      0.67      0.71       214

[[121  56]
 [ 14  23]]


### Conclusion
TODO: