# Data Augmentation

Uno de los principales obstáculos que se encuentran al intentar hallar la major solución al problema de clasificación, es la escasez de datos para el entrenamiento de los modelos.

Este notebook tiene como objetivo implementar y comparar los resultados obtenidos implementando técnicas de Data Augmentation para crear datos sintéticos y poder operar sobre un mayor volumen de datos.

In [1]:
import pandas as pd
import numpy as np

# Preparación de los datos

In [2]:
# Datos de entrenamiento
trainFNC = pd.read_csv("data/train_FNC.csv")
trainSBM = pd.read_csv("data/train_SBM.csv")
train_labels = pd.read_csv("data/train_labels.csv")

# DataFrame con ambas fuentes de datos
train = pd.merge(left=trainFNC, right=trainSBM, left_on='Id', right_on='Id')
data = pd.merge(left=train_labels, right=train, left_on='Id', right_on='Id')
data.drop("Id", inplace=True, axis=1)

# Shuffle de los datos de train
data = data.sample(frac=1, random_state=0)
data.head(5)

Unnamed: 0,Class,FNC1,FNC2,FNC3,FNC4,FNC5,FNC6,FNC7,FNC8,FNC9,...,SBM_map55,SBM_map61,SBM_map64,SBM_map67,SBM_map69,SBM_map71,SBM_map72,SBM_map73,SBM_map74,SBM_map75
2,0,0.24585,0.21662,-0.12468,-0.3538,0.1615,-0.002032,-0.13302,-0.035222,0.25904,...,-0.257114,0.597229,1.220756,-0.059213,-0.435494,-0.092971,1.09091,-0.448562,-0.508497,0.350434
13,1,0.41073,-0.031925,0.2107,0.24226,0.3201,-0.41929,-0.18714,0.16845,0.59979,...,-0.050862,0.870602,0.609465,1.181878,-2.279469,-0.013484,-0.012693,-1.244346,-1.080442,-0.788502
53,1,0.070919,0.034179,-0.011755,0.019158,0.024645,-0.032022,0.00462,0.31817,0.21255,...,-1.539922,-1.495822,1.643866,1.68778,1.521086,-1.988432,-0.267471,0.510576,1.104566,-1.067206
41,0,0.087377,-0.052462,-0.007835,-0.11283,0.38938,0.21608,0.063572,-0.25123,-0.080568,...,-0.077353,-0.459463,-0.204328,-0.619508,-1.410523,-0.304622,-1.521928,0.593691,0.073638,-0.26092
74,0,0.20275,0.19142,-0.056662,-0.15778,0.24404,0.03978,-0.001503,0.001056,-0.048222,...,0.044457,0.593326,1.063052,0.434726,1.604964,-0.359736,0.210107,0.355922,0.730287,-0.323557


In [3]:
labels = data.iloc[:, 0]
features = np.array(data.iloc[:, 1:])

In [4]:
from sklearn.model_selection import train_test_split

def data_partition(data_augmented):
    X = data_augmented.iloc[:, 1:]
    Y = data_augmented.iloc[:, 0]
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
    return (X_train, X_test, y_train, y_test)

**Precisión de un modelo de Random Forest sobre el conjunto original de datos**

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

(X_train, X_test, y_train, y_test) = data_partition(data)

model_RF = RandomForestClassifier(random_state=0)
param_grid_RF = {
    "n_estimators": [100, 250, 500, 750, 1000],
    "criterion": ["gini", "entropy"],
    "max_depth": [10, 50, 100, 200, 400]
}
grid_search_RF = GridSearchCV(estimator=model_RF, param_grid=param_grid_RF, cv=4)
grid_search_RF.fit(X_train, y_train)
model_RF_opt = grid_search_RF.best_estimator_
# Predicción en partición de test
y_pred_RF = model_RF_opt.predict(X_test)
# Precisión en partición de test
accuracy = accuracy_score(y_test, y_pred_RF)
print("Accuracy without Data Augmentation: {:0.2f}%".format(accuracy * 100))

Accuracy without Data Augmentation: 83.33%


# Ruido gaussiano

Una primera prueba de introducción de ruido artificial se basará en añadir a los datos originales, valores (ruido) que se tomarán de una distribución gaussiana de media = 0 y desviación típica = 10% del rango de valores para cada variable.

DATA AUGMENTATION: ruido (10%?, mirar por variable), librerías

In [6]:
# Para cada variable en el conjunto de datos, calculamos la desviación típica que usaremos (10% del intervalo)
max_per_var = features.max(axis=0)
min_per_var = features.min(axis=0)
std_per_var = (max_per_var - min_per_var) * 0.1
std_per_var.shape

(410,)

In [14]:
np.random.seed(0) # Reproducibilidad de resultados

def generate_noisy_sample(original_sample, data=data, std_per_var=std_per_var):
    noisy_sample = np.empty((len(std_per_var),))
    for j, var in enumerate(data.columns[1:]):
        noisy_sample[j] = original_sample[j] + np.random.normal(0, std_per_var[j])         
    return noisy_sample

# Para cada muestra conocida (y etiquetada), generaremos una muestra sintética con ruido
noisy_features = np.empty(features.shape)
for i, sample in enumerate(features):
    noisy_features[i, :] = generate_noisy_sample(sample)
    
# Volvemos a asignar las etiquetas correspondientes a cada fila
noisy_features = np.c_[labels, noisy_features]

noisy_data = pd.concat([data, pd.DataFrame(noisy_features, columns=data.columns)], axis=0)
# Shuffle de los datos con ruido
noisy_data = noisy_data.sample(frac=1, random_state=0)
print("Tamaño DataFrame original: {}".format(data.shape))
print("Tamaño DataFrame tras añadir el ruido: {}".format(noisy_data.shape))
noisy_data.head(10)

Tamaño DataFrame original: (86, 411)
Tamaño DataFrame tras añadir el ruido: (172, 411)


Unnamed: 0,Class,FNC1,FNC2,FNC3,FNC4,FNC5,FNC6,FNC7,FNC8,FNC9,...,SBM_map55,SBM_map61,SBM_map64,SBM_map67,SBM_map69,SBM_map71,SBM_map72,SBM_map73,SBM_map74,SBM_map75
10,1.0,0.12467,-0.049878,-0.13066,-0.14185,-0.14849,-0.085769,-0.12771,-0.3123,-0.13607,...,-0.897078,0.359318,-0.435161,-0.541126,0.363668,-0.545821,-0.86845,0.367415,-0.038803,1.003364
60,1.0,0.066194,0.186641,-0.010201,0.309838,0.161224,0.505312,0.017686,-0.036332,0.05233,...,-0.082067,-1.465872,-0.759942,1.348701,-0.933341,0.215661,-0.94484,0.076907,1.297998,1.255526
19,1.0,-0.13851,0.25836,-0.51077,-0.45778,-0.4404,-0.34434,-0.28348,-0.38075,-0.33913,...,-0.038365,-0.896406,-1.232269,0.785572,-0.817785,-0.419049,-1.509745,-0.946138,0.259551,-1.026065
31,0.0,0.21678,0.27011,0.087964,-0.30365,0.59275,-0.17064,-0.14309,-0.4719,-0.18045,...,-1.527436,-0.014958,-0.682168,0.114895,1.148412,0.434126,1.69683,-0.43443,0.094033,-0.119103
39,1.0,0.038565,-0.575757,-0.512312,-0.647848,-0.61033,-0.163938,0.248598,0.521718,0.094848,...,1.832051,-1.133497,0.749306,-0.126126,0.754504,0.654096,0.776412,2.151346,3.334682,-0.377582
14,1.0,-0.10328,-0.035793,-0.076679,-0.143135,-0.04572,0.04664,0.329442,-0.210291,0.251205,...,0.415332,-0.75514,0.049985,1.12818,-1.001161,0.628155,1.752479,0.238701,0.098952,1.853017
43,0.0,0.38697,0.13922,-0.22102,-0.21681,-0.017924,0.13965,0.33349,0.41221,0.30668,...,1.012167,1.308194,-1.412457,1.490609,0.081146,0.564483,0.565845,-0.988205,0.014638,0.279144
69,0.0,-0.130397,-0.180256,-0.189844,0.196897,0.393398,-0.273719,0.088168,0.510149,0.21942,...,0.2378,0.788113,0.348554,1.047856,0.798735,1.318691,-2.225403,-0.130444,0.465368,0.092125
18,1.0,0.3737,0.195405,0.187181,0.276078,0.480664,0.0024,0.086827,-0.11112,-0.184121,...,1.269346,-1.205928,-0.375727,0.036175,0.213618,1.160196,0.059257,1.70066,-0.173049,-0.069488
3,0.0,0.213273,0.016551,-0.111725,0.148859,0.205018,0.027774,0.183647,-0.225526,-0.145957,...,0.018167,-0.549698,-0.00921,-0.49132,-1.561815,-0.450735,-1.953895,0.939853,-0.303034,-0.781962


Precisión del modelo:

In [15]:
(X_train, X_test, y_train, y_test) = data_partition(noisy_data)

grid_search_RF.fit(X_train, y_train)
model_RF_opt = grid_search_RF.best_estimator_
# Predicción en partición de test
y_pred_RF = model_RF_opt.predict(X_test)
# Precisión en partición de test
accuracy = accuracy_score(y_test, y_pred_RF)
print("Accuracy with Data Augmentation: {:0.2f}%".format(accuracy * 100))

Accuracy with Data Augmentation: 94.29%
