In [None]:
# !pip3 install -U keras-tuner
# !pip3 install tensorflow
# !pip3 install imblearn

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
import keras as k
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import classification_report, confusion_matrix
import json

In [2]:
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
for device in gpu_devices:
    tf.config.experimental.set_memory_growth(device, True)

In [3]:
dataset_url = 'https://www.openml.org/data/get_csv/4965303/flare.arff' 
dataset = np.genfromtxt(dataset_url, delimiter=',', skip_header=1)

x = dataset[:,:-4]
x_size = x.shape[1]
y = dataset[:,-1]

In [4]:
x.shape

(1066, 7)

### <font color="#CA3532">Definición del modelo</font>


In [5]:
# Definid el modelo con Keras

nn = Sequential()

### -------------------------------------------------------------------------------
### Añadir la capas completamente conectadas que consideréis al modelo
### -------------------------------------------------------------------------------
nn.add(Dense(12, activation="sigmoid"))
nn.add(Dense(12, activation="sigmoid"))
nn.add(Dense(1, activation="sigmoid"))

In [6]:
metrics = [
            keras.metrics.BinaryAccuracy(name='ACC'),
            keras.metrics.Precision(name='Prec'),
            keras.metrics.Recall(name='Rec'),
            keras.metrics.AUC(name='AUC'),
          ]

nn.compile(optimizer='SGD', loss="mse", metrics=metrics)
# nn.compile(optimizer='Adam', loss="binary_crossentropy", metrics=metrics)

### <font color="#CA3532">Conjuntos de entrenamiento y validación</font>

In [7]:
# Contrucción de los conjuntos de entrenamiento y validación

from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(x, y, stratify=y, test_size=0.2) 

### <font color="#CA3532">Visualización de resultados</font>

In [8]:
from sklearn.metrics import classification_report, confusion_matrix

def show_metrics(history):
    for metric in history.history.keys():
        if not metric.startswith('val_'):
            plt.plot(history.history[metric], label=metric)
            plt.plot(history.history[f'val_{metric}'], label=f'val_{metric}')
            plt.title(metric)
            plt.ylabel('')
            plt.xlabel('Epoch')
            plt.legend(loc="upper left")
            plt.show()

### <font color="#CA3532">Entrenamiento de la red neuronal y evaluación</font>
Como podrás haber observado en problema no está balanceado (o está bastante desequilibrado), porque el número de ejemplos de cada clase es muy diferente.

In [9]:
# !pip3 install imbalanced-learn



In [10]:
import imblearn
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from imblearn.combine import SMOTEENN
from collections import Counter
from sklearn.metrics import f1_score

#### RandomUnderSampler

In [11]:
undersample = RandomUnderSampler()
X_under_train, y_under_train = undersample.fit_resample(x_train, y_train)
print(Counter(y_under_train))

Counter({0.0: 145, 1.0: 145})


In [12]:
epochs = 50
history = nn.fit(X_under_train, y_under_train, epochs=epochs, verbose=0, validation_data=(x_val, y_val))

y_pred_train = nn.predict(X_under_train)
y_pred_train = y_pred_train > 0.5
y_pred = nn.predict(x_val)
y_pred = y_pred > 0.5

# show_metrics(history)
print("Train")
print(classification_report(y_under_train, y_pred_train, zero_division=0))
print("Test")
print(classification_report(y_val, y_pred, zero_division=0))

print(confusion_matrix(y_val, y_pred))

Train
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       145
         1.0       0.50      1.00      0.67       145

    accuracy                           0.50       290
   macro avg       0.25      0.50      0.33       290
weighted avg       0.25      0.50      0.33       290

Test
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       177
         1.0       0.17      1.00      0.29        37

    accuracy                           0.17       214
   macro avg       0.09      0.50      0.15       214
weighted avg       0.03      0.17      0.05       214

[[  0 177]
 [  0  37]]


#### Resultado
TODO:

#### EditedNearestNeighbours


In [13]:
enn = EditedNearestNeighbours()
X_under_train, y_under_train = enn.fit_resample(x_train, y_train)
print(Counter(y_under_train))

Counter({0.0: 504, 1.0: 145})


In [14]:
epochs = 50
history = nn.fit(X_under_train, y_under_train, epochs=epochs, verbose=0, validation_data=(x_val, y_val))

y_pred_train = nn.predict(X_under_train)
y_pred_train = y_pred_train > 0.5
y_pred = nn.predict(x_val)
y_pred = y_pred > 0.5

# show_metrics(history)
print("Train")
print(classification_report(y_under_train, y_pred_train, zero_division=0))
print("Test")
print(classification_report(y_val, y_pred, zero_division=0))
print(confusion_matrix(y_val, y_pred))

Train
              precision    recall  f1-score   support

         0.0       0.78      1.00      0.87       504
         1.0       0.00      0.00      0.00       145

    accuracy                           0.78       649
   macro avg       0.39      0.50      0.44       649
weighted avg       0.60      0.78      0.68       649

Test
              precision    recall  f1-score   support

         0.0       0.83      1.00      0.91       177
         1.0       0.00      0.00      0.00        37

    accuracy                           0.83       214
   macro avg       0.41      0.50      0.45       214
weighted avg       0.68      0.83      0.75       214

[[177   0]
 [ 37   0]]


#### Resultado
TODO:

#### TomekLinks

In [15]:
tl = TomekLinks()
X_under_train, y_under_train = tl.fit_resample(x_train, y_train)
print(Counter(y_under_train))

Counter({0.0: 704, 1.0: 145})


In [16]:
epochs = 50
history = nn.fit(X_under_train, y_under_train, epochs=epochs, verbose=0, validation_data=(x_val, y_val))

y_pred_train = nn.predict(X_under_train)
y_pred_train = y_pred_train > 0.5
y_pred = nn.predict(x_val)
y_pred = y_pred > 0.5

# show_metrics(history)
print("Train")
print(classification_report(y_under_train, y_pred_train, zero_division=0))
print("Test")
print(classification_report(y_val, y_pred, zero_division=0))
print(confusion_matrix(y_val, y_pred))

Train
              precision    recall  f1-score   support

         0.0       0.83      1.00      0.91       704
         1.0       0.00      0.00      0.00       145

    accuracy                           0.83       849
   macro avg       0.41      0.50      0.45       849
weighted avg       0.69      0.83      0.75       849

Test
              precision    recall  f1-score   support

         0.0       0.83      1.00      0.91       177
         1.0       0.00      0.00      0.00        37

    accuracy                           0.83       214
   macro avg       0.41      0.50      0.45       214
weighted avg       0.68      0.83      0.75       214

[[177   0]
 [ 37   0]]


#### Resultado
TODO:

#### RandomOverSampler

In [17]:
oversample = RandomOverSampler()
X_over_train, y_over_train = oversample.fit_resample(x_train, y_train)
print(Counter(y_over_train))

Counter({0.0: 707, 1.0: 707})


In [18]:
epochs = 50
history = nn.fit(X_over_train, y_over_train, epochs=epochs, verbose=0, validation_data=(x_val, y_val))

y_pred_train = nn.predict(X_over_train)
y_pred_train = y_pred_train > 0.5
y_pred = nn.predict(x_val)
y_pred = y_pred > 0.5

# show_metrics(history)
print("Train")
print(classification_report(y_over_train, y_pred_train, zero_division=0))
print("Test")
print(classification_report(y_val, y_pred, zero_division=0))
print(confusion_matrix(y_val, y_pred))

Train
              precision    recall  f1-score   support

         0.0       0.65      0.78      0.71       707
         1.0       0.72      0.59      0.65       707

    accuracy                           0.68      1414
   macro avg       0.69      0.68      0.68      1414
weighted avg       0.69      0.68      0.68      1414

Test
              precision    recall  f1-score   support

         0.0       0.87      0.79      0.82       177
         1.0       0.30      0.43      0.35        37

    accuracy                           0.72       214
   macro avg       0.58      0.61      0.59       214
weighted avg       0.77      0.72      0.74       214

[[139  38]
 [ 21  16]]


#### Resultado
TODO:

#### SMOTE

In [19]:
sm = SMOTE()
X_over_train, y_over_train = sm.fit_resample(x_train, y_train)
print(Counter(y_over_train))

Counter({0.0: 707, 1.0: 707})


In [20]:
epochs = 50
history = nn.fit(X_over_train, y_over_train, epochs=epochs, verbose=0, validation_data=(x_val, y_val))

y_pred_train = nn.predict(X_over_train)
y_pred_train = y_pred_train > 0.5
y_pred = nn.predict(x_val)
y_pred = y_pred > 0.5

# show_metrics(history)
print("Train")
print(classification_report(y_over_train, y_pred_train, zero_division=0))
print("Test")
print(classification_report(y_val, y_pred, zero_division=0))
print(confusion_matrix(y_val, y_pred))

Train
              precision    recall  f1-score   support

         0.0       0.64      0.78      0.70       707
         1.0       0.72      0.56      0.63       707

    accuracy                           0.67      1414
   macro avg       0.68      0.67      0.67      1414
weighted avg       0.68      0.67      0.67      1414

Test
              precision    recall  f1-score   support

         0.0       0.87      0.79      0.82       177
         1.0       0.30      0.43      0.35        37

    accuracy                           0.72       214
   macro avg       0.58      0.61      0.59       214
weighted avg       0.77      0.72      0.74       214

[[139  38]
 [ 21  16]]


#### Resultado
TODO:

#### ADASYN

In [21]:
ada = ADASYN()
X_over_train, y_over_train = ada.fit_resample(x_train, y_train)
print(Counter(y_over_train))

Counter({0.0: 707, 1.0: 707})


In [22]:
epochs = 50
history = nn.fit(X_over_train, y_over_train, epochs=epochs, verbose=0, validation_data=(x_val, y_val))

y_pred_train = nn.predict(X_over_train)
y_pred_train = y_pred_train > 0.5
y_pred = nn.predict(x_val)
y_pred = y_pred > 0.5

# show_metrics(history)
print("Train")
print(classification_report(y_over_train, y_pred_train, zero_division=0))
print("Test")
print(classification_report(y_val, y_pred, zero_division=0))
print(confusion_matrix(y_val, y_pred))

Train
              precision    recall  f1-score   support

         0.0       0.59      0.77      0.67       707
         1.0       0.66      0.46      0.55       707

    accuracy                           0.61      1414
   macro avg       0.63      0.61      0.61      1414
weighted avg       0.63      0.61      0.61      1414

Test
              precision    recall  f1-score   support

         0.0       0.87      0.77      0.82       177
         1.0       0.29      0.43      0.34        37

    accuracy                           0.71       214
   macro avg       0.58      0.60      0.58       214
weighted avg       0.77      0.71      0.74       214

[[137  40]
 [ 21  16]]


#### Resultado
TODO:

### Uso de class_weight pero con paquete que nos proporciona

In [23]:
from sklearn.utils import class_weight
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights = {0: class_weights[0], 1: class_weights[1]}
print(class_weights)

{0: 0.6025459688826026, 1: 2.9379310344827587}


In [24]:
epochs = 50

history = nn.fit(x_train, y_train, epochs=epochs, verbose=0, class_weight=class_weights, validation_data=(x_val, y_val))

y_pred_train = nn.predict(x_train)
y_pred_train = y_pred_train > 0.5
y_pred = nn.predict(x_val)
y_pred = y_pred > 0.5

# show_metrics(history)
print("Train")
print(classification_report(y_train, y_pred_train, zero_division=0))
print("Test")
print(classification_report(y_val, y_pred, zero_division=0))
print(confusion_matrix(y_val, y_pred))

Train
              precision    recall  f1-score   support

         0.0       0.90      0.77      0.83       707
         1.0       0.34      0.57      0.42       145

    accuracy                           0.74       852
   macro avg       0.62      0.67      0.63       852
weighted avg       0.80      0.74      0.76       852

Test
              precision    recall  f1-score   support

         0.0       0.87      0.78      0.82       177
         1.0       0.29      0.43      0.35        37

    accuracy                           0.72       214
   macro avg       0.58      0.61      0.58       214
weighted avg       0.77      0.72      0.74       214

[[138  39]
 [ 21  16]]


#### Resultado
TODO:

### Aplicar todas las ténicas a la vez.


#### RandomUnderSampler + RandomOverSampler

In [25]:
print(Counter(y_train))

oversample = RandomOverSampler(sampling_strategy=0.4)
X_over_train, y_over_train = oversample.fit_resample(x_train, y_train)
print(Counter(y_over_train))

undersample = RandomUnderSampler(sampling_strategy=0.5)
X_ajust_train, y_ajust_train = undersample.fit_resample(X_over_train, y_over_train)
print(Counter(y_ajust_train))

class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_ajust_train), y=y_ajust_train)
class_weights = {0: class_weights[0], 1: class_weights[1]}
print(class_weights)

Counter({0.0: 707, 1.0: 145})
Counter({0.0: 707, 1.0: 282})
Counter({0.0: 564, 1.0: 282})
{0: 0.75, 1: 1.5}


In [26]:
epochs = 50

history = nn.fit(X_ajust_train, y_ajust_train, epochs=epochs, verbose=0, class_weight=class_weights, validation_data=(x_val, y_val))

y_pred_train = nn.predict(X_ajust_train)
y_pred_train = y_pred_train > 0.5
y_pred = nn.predict(x_val)
y_pred = y_pred > 0.5

# show_metrics(history)
print("Train")
print(classification_report(y_ajust_train, y_pred_train, zero_division=0))
print("Test")
print(classification_report(y_val, y_pred, zero_division=0))
print(confusion_matrix(y_val, y_pred))

Train
              precision    recall  f1-score   support

         0.0       0.79      0.78      0.79       564
         1.0       0.57      0.59      0.58       282

    accuracy                           0.72       846
   macro avg       0.68      0.69      0.68       846
weighted avg       0.72      0.72      0.72       846

Test
              precision    recall  f1-score   support

         0.0       0.87      0.78      0.82       177
         1.0       0.29      0.43      0.35        37

    accuracy                           0.72       214
   macro avg       0.58      0.61      0.58       214
weighted avg       0.77      0.72      0.74       214

[[138  39]
 [ 21  16]]


#### Resultado
TODO:

#### SMOTE + ENN

In [27]:
smoteen = SMOTEENN()
X_ajust_train, y_ajust_train = smoteen.fit_resample(x_train, y_train)
print(Counter(y_ajust_train))

Counter({0.0: 501, 1.0: 313})


In [28]:
epochs = 50

history = nn.fit(X_ajust_train, y_ajust_train, epochs=epochs, verbose=0, validation_data=(x_val, y_val))

y_pred_train = nn.predict(X_ajust_train)
y_pred_train = y_pred_train > 0.5
y_pred = nn.predict(x_val)
y_pred = y_pred > 0.5

# show_metrics(history)
print("Train")
print(classification_report(y_ajust_train, y_pred_train, zero_division=0))
print("Test")
print(classification_report(y_val, y_pred, zero_division=0))
print(confusion_matrix(y_val, y_pred))

Train
              precision    recall  f1-score   support

         0.0       0.80      0.85      0.83       501
         1.0       0.74      0.66      0.70       313

    accuracy                           0.78       814
   macro avg       0.77      0.76      0.76       814
weighted avg       0.78      0.78      0.78       814

Test
              precision    recall  f1-score   support

         0.0       0.86      0.81      0.83       177
         1.0       0.28      0.35      0.31        37

    accuracy                           0.73       214
   macro avg       0.57      0.58      0.57       214
weighted avg       0.76      0.73      0.74       214

[[144  33]
 [ 24  13]]


#### Resultado
TODO:

#### SMOTE + Tomek

In [29]:
smtomek = SMOTETomek()
X_ajust_train, y_ajust_train = smtomek.fit_resample(x_train, y_train)
print(Counter(y_ajust_train))

Counter({0.0: 707, 1.0: 707})


In [30]:
epochs = 50

history = nn.fit(X_ajust_train, y_ajust_train, epochs=epochs, verbose=0, validation_data=(x_val, y_val))

y_pred_train = nn.predict(X_ajust_train)
y_pred_train = y_pred_train > 0.5
y_pred = nn.predict(x_val)
y_pred = y_pred > 0.5

# show_metrics(history)
print("Train")
print(classification_report(y_ajust_train, y_pred_train, zero_division=0))
print("Test")
print(classification_report(y_val, y_pred, zero_division=0))
print(confusion_matrix(y_val, y_pred))

Train
              precision    recall  f1-score   support

         0.0       0.66      0.76      0.70       707
         1.0       0.71      0.61      0.66       707

    accuracy                           0.68      1414
   macro avg       0.69      0.68      0.68      1414
weighted avg       0.69      0.68      0.68      1414

Test
              precision    recall  f1-score   support

         0.0       0.87      0.77      0.82       177
         1.0       0.29      0.46      0.36        37

    accuracy                           0.71       214
   macro avg       0.58      0.61      0.59       214
weighted avg       0.77      0.71      0.74       214

[[136  41]
 [ 20  17]]


#### Resultado
TODO: