# Data Augmentation

Uno de los principales obstáculos que se encuentran al intentar hallar la major solución al problema de clasificación, es la escasez de datos para el entrenamiento de los modelos.

Este notebook tiene como objetivo implementar y comparar los resultados obtenidos implementando técnicas de Data Augmentation para crear datos sintéticos y poder operar sobre un mayor volumen de datos.

In [1]:
import pandas as pd
import numpy as np

# Preparación de los datos

In [2]:
# Datos de entrenamiento
trainFNC = pd.read_csv("data/train_FNC.csv")
trainSBM = pd.read_csv("data/train_SBM.csv")
train_labels = pd.read_csv("data/train_labels.csv")

# DataFrame con ambas fuentes de datos
train = pd.merge(left=trainFNC, right=trainSBM, left_on='Id', right_on='Id')
data = pd.merge(left=train_labels, right=train, left_on='Id', right_on='Id')
data.drop("Id", inplace=True, axis=1)

# Shuffle de los datos de train
data = data.sample(frac=1, random_state=0)
data.head(5)

Unnamed: 0,Class,FNC1,FNC2,FNC3,FNC4,FNC5,FNC6,FNC7,FNC8,FNC9,...,SBM_map55,SBM_map61,SBM_map64,SBM_map67,SBM_map69,SBM_map71,SBM_map72,SBM_map73,SBM_map74,SBM_map75
2,0,0.24585,0.21662,-0.12468,-0.3538,0.1615,-0.002032,-0.13302,-0.035222,0.25904,...,-0.257114,0.597229,1.220756,-0.059213,-0.435494,-0.092971,1.09091,-0.448562,-0.508497,0.350434
13,1,0.41073,-0.031925,0.2107,0.24226,0.3201,-0.41929,-0.18714,0.16845,0.59979,...,-0.050862,0.870602,0.609465,1.181878,-2.279469,-0.013484,-0.012693,-1.244346,-1.080442,-0.788502
53,1,0.070919,0.034179,-0.011755,0.019158,0.024645,-0.032022,0.00462,0.31817,0.21255,...,-1.539922,-1.495822,1.643866,1.68778,1.521086,-1.988432,-0.267471,0.510576,1.104566,-1.067206
41,0,0.087377,-0.052462,-0.007835,-0.11283,0.38938,0.21608,0.063572,-0.25123,-0.080568,...,-0.077353,-0.459463,-0.204328,-0.619508,-1.410523,-0.304622,-1.521928,0.593691,0.073638,-0.26092
74,0,0.20275,0.19142,-0.056662,-0.15778,0.24404,0.03978,-0.001503,0.001056,-0.048222,...,0.044457,0.593326,1.063052,0.434726,1.604964,-0.359736,0.210107,0.355922,0.730287,-0.323557


In [3]:
from sklearn.model_selection import train_test_split

def data_partition(data_augmented):
    X = data_augmented.iloc[:, 1:]
    Y = data_augmented.iloc[:, 0]
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
    return (X_train, X_test, y_train, y_test)

In [4]:
(X_train, X_test, y_train, y_test) = data_partition(data)

In [5]:
# Datos de test
testFNC = pd.read_csv("data/test_FNC.csv")
testSBM = pd.read_csv("data/test_SBM.csv")

# DataFrame con ambas fuentes de datos
test = pd.merge(left=testFNC, right=testSBM, left_on='Id', right_on='Id')
test.drop("Id", inplace=True, axis=1)
test.head(5)

Unnamed: 0,FNC1,FNC2,FNC3,FNC4,FNC5,FNC6,FNC7,FNC8,FNC9,FNC10,...,SBM_map55,SBM_map61,SBM_map64,SBM_map67,SBM_map69,SBM_map71,SBM_map72,SBM_map73,SBM_map74,SBM_map75
0,0.476127,0.064466,0.053238,-0.608133,0.073988,-0.637038,0.113556,-0.192434,-0.004025,-0.060474,...,-0.451994,1.12377,2.083006,1.14544,-0.067608,1.202529,0.851587,0.451583,-0.159739,0.192076
1,0.013833,0.267183,0.232178,-0.167151,-0.261327,0.191869,0.406493,0.088761,0.177048,0.036718,...,0.696987,1.397832,1.046136,-0.191733,-2.192023,-0.369276,0.822225,-0.109342,-0.580476,0.17416
2,-0.435452,0.04678,0.243742,0.39703,-0.147821,0.17362,-0.461963,-0.610736,0.419753,0.400985,...,0.160145,1.906989,-2.661633,-0.193911,0.440873,0.641739,0.918397,-0.758046,0.154701,-0.476647
3,-0.20451,-0.036735,-0.760705,-0.740495,0.064668,0.349926,-0.273826,-0.174384,-0.120248,0.175618,...,0.974828,-1.997087,-2.083782,1.154107,-0.643947,2.332424,0.659124,-0.809445,0.55896,2.790871
4,0.599435,-0.166441,0.122431,0.011539,0.346906,-0.01743,-0.274734,0.21151,0.151012,-0.033434,...,-0.789153,1.578984,1.402592,-1.23044,0.296686,2.806314,0.427184,-0.240682,-0.196948,-1.544345


In [6]:
labels = data.iloc[:, 0]
# features = np.array(data.iloc[:, 1:])
features = np.array(data.iloc[:, 1:])
features_tot = np.concatenate((features, test))
print(features_tot.shape)

(119834, 410)


**Precisión de un modelo de Random Forest sobre el conjunto original de datos**

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [8]:
model_RF = RandomForestClassifier(random_state=0)
param_grid_RF = {
    "n_estimators": [100, 250, 500, 750, 1000],
    "criterion": ["gini", "entropy"],
    "max_depth": [10, 50, 100, 200, 400]
}
grid_search_RF = GridSearchCV(estimator=model_RF, param_grid=param_grid_RF, cv=4)

grid_search_RF.fit(X_train, y_train)
model_RF_opt = grid_search_RF.best_estimator_
# Predicción en partición de test
y_pred_RF = model_RF_opt.predict(X_test)
# Precisión en partición de test
accuracy = accuracy_score(y_test, y_pred_RF)
print("Accuracy without Data Augmentation: {:0.2f}%".format(accuracy * 100))

Accuracy without Data Augmentation: 83.33%


# Random noise

### Ruigo gaussiano

Una primera prueba de introducción de ruido artificial se basará en añadir a los datos originales, valores (ruido) que se tomarán de una distribución gaussiana de media = 0 y desviación típica = 10% del rango de valores para cada variable.

In [12]:
# Para cada variable en el conjunto de datos, calculamos la desviación típica que usaremos (10% del intervalo)
max_per_var = features_tot.max(axis=0)
min_per_var = features_tot.min(axis=0)
std_per_var = (max_per_var - min_per_var) * 0.1
std_per_var.shape

(410,)

In [10]:
np.random.seed(0) # Reproducibilidad de resultados

def generate_noisy_sample_gaussian(original_sample, data=data, std_per_var=std_per_var):
    noisy_sample = np.empty((len(std_per_var),))
    for j, var in enumerate(data.columns[1:]):
        noisy_sample[j] = original_sample[j] + np.random.normal(0, std_per_var[j])         
    return noisy_sample

# Para cada muestra conocida (y etiquetada), generaremos una muestra sintética con ruido
noisy_features_gaussian = np.empty(features.shape)
for i, sample in enumerate(features):
    noisy_features_gaussian[i, :] = generate_noisy_sample_gaussian(sample)
    
# Volvemos a asignar las etiquetas correspondientes a cada fila
noisy_features_gaussian = np.c_[labels, noisy_features_gaussian]

noisy_data_gaussian = pd.concat([data, pd.DataFrame(noisy_features_gaussian, columns=data.columns)], axis=0)
# Shuffle de los datos con ruido
noisy_data_gaussian = noisy_data_gaussian.sample(frac=1, random_state=0)
print("Tamaño DataFrame original: {}".format(data.shape))
print("Tamaño DataFrame tras añadir el ruido: {}".format(noisy_data_gaussian.shape))
noisy_data_gaussian.head(10)

Tamaño DataFrame original: (86, 411)
Tamaño DataFrame tras añadir el ruido: (172, 411)


Unnamed: 0,Class,FNC1,FNC2,FNC3,FNC4,FNC5,FNC6,FNC7,FNC8,FNC9,...,SBM_map55,SBM_map61,SBM_map64,SBM_map67,SBM_map69,SBM_map71,SBM_map72,SBM_map73,SBM_map74,SBM_map75
10,1.0,0.12467,-0.049878,-0.13066,-0.14185,-0.14849,-0.085769,-0.12771,-0.3123,-0.13607,...,-0.897078,0.359318,-0.435161,-0.541126,0.363668,-0.545821,-0.86845,0.367415,-0.038803,1.003364
60,1.0,0.09031,0.186641,-0.033887,0.30355,0.14426,0.53587,0.033004,-0.066027,0.068281,...,-0.064109,-1.416375,-0.588373,1.476591,-0.93143,0.215681,-1.13886,-0.119312,1.291117,1.465796
19,1.0,-0.13851,0.25836,-0.51077,-0.45778,-0.4404,-0.34434,-0.28348,-0.38075,-0.33913,...,-0.038365,-0.896406,-1.232269,0.785572,-0.817785,-0.419049,-1.509745,-0.946138,0.259551,-1.026065
31,0.0,0.21678,0.27011,0.087964,-0.30365,0.59275,-0.17064,-0.14309,-0.4719,-0.18045,...,-1.527436,-0.014958,-0.682168,0.114895,1.148412,0.434126,1.69683,-0.43443,0.094033,-0.119103
39,1.0,0.089888,-0.575757,-0.510545,-0.655821,-0.626826,-0.183842,0.286757,0.562557,0.113185,...,1.879741,-1.258914,1.126362,-0.268733,0.739928,0.638815,0.934721,2.100384,3.440071,-0.320113
14,1.0,-0.149488,-0.035793,-0.072892,-0.134532,-0.078967,0.029238,0.334085,-0.236176,0.241712,...,0.438323,-1.170183,0.237547,1.10583,-0.988333,0.614596,2.344493,0.214602,0.175995,2.29487
43,0.0,0.38697,0.13922,-0.22102,-0.21681,-0.017924,0.13965,0.33349,0.41221,0.30668,...,1.012167,1.308194,-1.412457,1.490609,0.081146,0.564483,0.565845,-0.988205,0.014638,0.279144
69,0.0,-0.173938,-0.180256,-0.183904,0.212437,0.408138,-0.287515,0.102173,0.541885,0.202692,...,0.215764,0.507154,0.734908,1.163661,0.895043,1.35835,-2.743792,-0.051985,0.379716,0.015072
18,1.0,0.385621,0.195405,0.183701,0.274933,0.49169,-0.01983,0.095519,-0.024712,-0.187834,...,1.321553,-1.59588,-0.21903,0.159802,0.080166,1.098231,-0.252857,1.797618,-0.175905,-0.074648
3,0.0,0.248241,0.016551,-0.122858,0.163344,0.174548,-2.5e-05,0.201017,-0.219028,-0.159444,...,0.027468,-0.62267,0.115314,-0.473325,-1.615781,-0.4771,-2.181088,1.033173,-0.351526,-0.954194


In [21]:
np.random.seed(0) # Reproducibilidad de resultados

def generate_noisy_sample_gaussian(original_sample, data=data, std_per_var=std_per_var):
    noisy_sample = np.empty((len(std_per_var),))
    for j, var in enumerate(data.columns[1:]):
        noisy_sample[j] = original_sample[j] + np.random.normal(0, std_per_var[j])         
    return noisy_sample

# Para cada muestra conocida (y etiquetada), generaremos una muestra sintética con ruido
noisy_features_gaussian = np.empty(features.shape)
for i, sample in enumerate(features):
    noisy_features_gaussian[i, :] = generate_noisy_sample_gaussian(sample)
    
# Volvemos a asignar las etiquetas correspondientes a cada fila
noisy_features_gaussian = np.c_[labels, noisy_features_gaussian]

noisy_data_gaussian = pd.concat([data, pd.DataFrame(noisy_features_gaussian, columns=data.columns)], axis=0)
# Shuffle de los datos con ruido
noisy_data_gaussian = noisy_data_gaussian.sample(frac=1, random_state=0)
print("Tamaño DataFrame original: {}".format(data.shape))
print("Tamaño DataFrame tras añadir el ruido: {}".format(noisy_data_gaussian.shape))
noisy_data_gaussian.head(10)

Tamaño DataFrame original: (86, 411)
Tamaño DataFrame tras añadir el ruido: (172, 411)


Unnamed: 0,Class,FNC1,FNC2,FNC3,FNC4,FNC5,FNC6,FNC7,FNC8,FNC9,...,SBM_map55,SBM_map61,SBM_map64,SBM_map67,SBM_map69,SBM_map71,SBM_map72,SBM_map73,SBM_map74,SBM_map75
10,1.0,0.12467,-0.049878,-0.13066,-0.14185,-0.14849,-0.085769,-0.12771,-0.3123,-0.13607,...,-0.897078,0.359318,-0.435161,-0.541126,0.363668,-0.545821,-0.86845,0.367415,-0.038803,1.003364
60,1.0,0.066194,0.186641,-0.010201,0.309838,0.161224,0.505312,0.017686,-0.036332,0.05233,...,-0.082067,-1.465872,-0.759942,1.348701,-0.933341,0.215661,-0.94484,0.076907,1.297998,1.255526
19,1.0,-0.13851,0.25836,-0.51077,-0.45778,-0.4404,-0.34434,-0.28348,-0.38075,-0.33913,...,-0.038365,-0.896406,-1.232269,0.785572,-0.817785,-0.419049,-1.509745,-0.946138,0.259551,-1.026065
31,0.0,0.21678,0.27011,0.087964,-0.30365,0.59275,-0.17064,-0.14309,-0.4719,-0.18045,...,-1.527436,-0.014958,-0.682168,0.114895,1.148412,0.434126,1.69683,-0.43443,0.094033,-0.119103
39,1.0,0.038565,-0.575757,-0.512312,-0.647848,-0.61033,-0.163938,0.248598,0.521718,0.094848,...,1.832051,-1.133497,0.749306,-0.126126,0.754504,0.654096,0.776412,2.151346,3.334682,-0.377582
14,1.0,-0.10328,-0.035793,-0.076679,-0.143135,-0.04572,0.04664,0.329442,-0.210291,0.251205,...,0.415332,-0.75514,0.049985,1.12818,-1.001161,0.628155,1.752479,0.238701,0.098952,1.853017
43,0.0,0.38697,0.13922,-0.22102,-0.21681,-0.017924,0.13965,0.33349,0.41221,0.30668,...,1.012167,1.308194,-1.412457,1.490609,0.081146,0.564483,0.565845,-0.988205,0.014638,0.279144
69,0.0,-0.130397,-0.180256,-0.189844,0.196897,0.393398,-0.273719,0.088168,0.510149,0.21942,...,0.2378,0.788113,0.348554,1.047856,0.798735,1.318691,-2.225403,-0.130444,0.465368,0.092125
18,1.0,0.3737,0.195405,0.187181,0.276078,0.480664,0.0024,0.086827,-0.11112,-0.184121,...,1.269346,-1.205928,-0.375727,0.036175,0.213618,1.160196,0.059257,1.70066,-0.173049,-0.069488
3,0.0,0.213273,0.016551,-0.111725,0.148859,0.205018,0.027774,0.183647,-0.225526,-0.145957,...,0.018167,-0.549698,-0.00921,-0.49132,-1.561815,-0.450735,-1.953895,0.939853,-0.303034,-0.781962


Precisión del modelo:

In [11]:
(X_train, X_test, y_train, y_test) = data_partition(noisy_data_gaussian)

model_RF = RandomForestClassifier(random_state=0)
param_grid_RF = {
    "n_estimators": [100, 250, 500, 750, 1000],
    "criterion": ["gini", "entropy"],
    "max_depth": [10, 50, 100, 200, 400]
}
grid_search_RF = GridSearchCV(estimator=model_RF, param_grid=param_grid_RF, cv=4)

grid_search_RF.fit(X_train, y_train)
model_RF_opt = grid_search_RF.best_estimator_
# Predicción en partición de test
y_pred_RF = model_RF_opt.predict(X_test)
# Precisión en partición de test
accuracy = accuracy_score(y_test, y_pred_RF)
print("Accuracy with Data Augmentation: {:0.2f}%".format(accuracy * 100))

Accuracy with Data Augmentation: 91.43%


Vamos a modificar ahora el método, de modo que los valores generados de ruido se tomen de una distribución gaussiana de media = 0 y desviación típica = 10% del rango de la diferencia entre la media de las variables para pacientes con esquizofrenia y la media de las variables de los individuos de control.

La motivación para introducir esta modificación es que podría ser que las distribuciones que definen cada variable sean diferentes cuando se considera a un individuo sano (etiqueta 0) y a un individuo enfermo (etiqueta 1). Si estas distribuciones están lo suficientemente cercanas, introducir un ruido aparentemente pequeño podría modificar una muestra originalmente correspondiente a una clase y generar otra de manera artificial con la misma etiqueta pero que se solapa con la distribución de la etiqueta opuesta.

In [16]:
control_group = data[data["Class"] == 0]
sick = data[data["Class"] == 0]

labels_control = control_group.iloc[:, 0]
features_control = np.array(control_group.iloc[:, 1:])
labels_sick = sick.iloc[:, 0]
features_sick = np.array(sick.iloc[:, 1:])

# Para cada variable en el conjunto de datos, calculamos la desviación típica que usaremos (10% del intervalo)
avg_per_var_control = features_control.mean(axis=0)
avg_per_var_sick = features_sick.mean(axis=0)
std_per_var = abs((avg_per_var_control - avg_per_var_sick)) * 0.1

In [17]:
np.random.seed(0) # Reproducibilidad de resultados

def generate_noisy_sample_gaussian2(original_sample, data=data, std_per_var=std_per_var):
    noisy_sample = np.empty((len(std_per_var),))
    for j, var in enumerate(data.columns[1:]):
        noisy_sample[j] = original_sample[j] + np.random.normal(0, std_per_var[j])         
    return noisy_sample

# Para cada muestra conocida (y etiquetada), generaremos una muestra sintética con ruido
noisy_features_gaussian_2 = np.empty(features.shape)
for i, sample in enumerate(features):
    noisy_features_gaussian_2[i, :] = generate_noisy_sample_gaussian2(sample)
    
# Volvemos a asignar las etiquetas correspondientes a cada fila
noisy_features_gaussian_2 = np.c_[labels, noisy_features_gaussian_2]

noisy_data_gaussian_2 = pd.concat([data, pd.DataFrame(noisy_features_gaussian_2, columns=data.columns)], axis=0)
# Shuffle de los datos con ruido
noisy_data_gaussian_2 = noisy_data_gaussian_2.sample(frac=1, random_state=0)
print("Tamaño DataFrame original: {}".format(data.shape))
print("Tamaño DataFrame tras añadir el ruido: {}".format(noisy_data_gaussian_2.shape))
noisy_data_gaussian_2.head(10)

Tamaño DataFrame original: (86, 411)
Tamaño DataFrame tras añadir el ruido: (172, 411)


Unnamed: 0,Class,FNC1,FNC2,FNC3,FNC4,FNC5,FNC6,FNC7,FNC8,FNC9,...,SBM_map55,SBM_map61,SBM_map64,SBM_map67,SBM_map69,SBM_map71,SBM_map72,SBM_map73,SBM_map74,SBM_map75
10,1.0,0.12467,-0.049878,-0.13066,-0.14185,-0.14849,-0.085769,-0.12771,-0.3123,-0.13607,...,-0.897078,0.359318,-0.435161,-0.541126,0.363668,-0.545821,-0.86845,0.367415,-0.038803,1.003364
60,1.0,-0.02063,0.2501,0.21083,0.42343,0.26387,0.29831,-0.088201,0.081132,-0.025001,...,-0.266496,-1.527078,-1.028776,0.437655,-0.9387,0.215549,-0.575946,0.804762,1.351451,0.619411
19,1.0,-0.13851,0.25836,-0.51077,-0.45778,-0.4404,-0.34434,-0.28348,-0.38075,-0.33913,...,-0.038365,-0.896406,-1.232269,0.785572,-0.817785,-0.419049,-1.509745,-0.946138,0.259551,-1.026065
31,0.0,0.21678,0.27011,0.087964,-0.30365,0.59275,-0.17064,-0.14309,-0.4719,-0.18045,...,-1.527436,-0.014958,-0.682168,0.114895,1.148412,0.434126,1.69683,-0.43443,0.094033,-0.119103
39,1.0,-0.14621,-0.46863,-0.5288,-0.50381,-0.51052,-0.029113,-0.015192,0.36017,0.005944,...,1.342273,-0.978412,0.158492,0.889753,0.795368,0.738788,0.475415,2.340384,2.516038,-0.55144
14,1.0,0.06308,-0.18202,-0.11202,-0.29856,0.15545,0.16452,0.29735,-0.1079,0.29723,...,0.179212,-0.241914,-0.243907,1.287397,-1.037125,0.703299,0.626871,0.328094,-0.499501,0.516312
43,0.0,0.38697,0.13922,-0.22102,-0.21681,-0.017924,0.13965,0.33349,0.41221,0.30668,...,1.012167,1.308194,-1.412457,1.490609,0.081146,0.564483,0.565845,-0.988205,0.014638,0.279144
69,0.0,0.026362,-0.19651,-0.24528,-0.08384,0.30421,-0.18027,-0.008647,0.38461,0.30052,...,0.464108,1.135537,-0.256829,0.222902,0.528734,1.098908,-1.239779,-0.42148,1.1307,0.325227
18,1.0,0.33078,0.12358,0.21965,0.29676,0.41395,0.15298,0.02674,-0.45292,-0.16612,...,0.733179,-0.723728,-0.621257,-0.844496,0.587749,1.503607,0.652687,1.341002,-0.150862,-0.053879
3,0.0,0.087377,-0.052462,-0.007835,-0.11283,0.38938,0.21608,0.063572,-0.25123,-0.080568,...,-0.077353,-0.459463,-0.204328,-0.619508,-1.410523,-0.304622,-1.521928,0.593691,0.073638,-0.26092


In [18]:
(X_train, X_test, y_train, y_test) = data_partition(noisy_data_gaussian_2)

model_RF = RandomForestClassifier(random_state=0)
param_grid_RF = {
    "n_estimators": [100, 250, 500, 750, 1000],
    "criterion": ["gini", "entropy"],
    "max_depth": [10, 50, 100, 200, 400]
}
grid_search_RF = GridSearchCV(estimator=model_RF, param_grid=param_grid_RF, cv=4)

grid_search_RF.fit(X_train, y_train)
model_RF_opt = grid_search_RF.best_estimator_
# Predicción en partición de test
y_pred_RF = model_RF_opt.predict(X_test)
# Precisión en partición de test
accuracy = accuracy_score(y_test, y_pred_RF)
print("Accuracy with Data Augmentation: {:0.2f}%".format(accuracy * 100))

Accuracy with Data Augmentation: 100.00%


### Ruido uniforme

In [19]:
avg_per_var = features_tot.mean(axis=0)

In [20]:
np.random.seed(0) # Reproducibilidad de resultados

def generate_noisy_sample_uniform(original_sample, data=data, std_per_var=std_per_var, 
                          min_per_var=min_per_var, max_per_var=max_per_var):
    noisy_sample = np.empty((len(std_per_var),))
    for j, var in enumerate(data.columns[1:]):
        noisy_sample[j] = original_sample[j] + np.random.uniform(avg_per_var[j]-std_per_var[j], avg_per_var[j]+std_per_var[j])         
    return noisy_sample

# Para cada muestra conocida (y etiquetada), generaremos una muestra sintética con ruido
noisy_features_uniform = np.empty(features.shape)
for i, sample in enumerate(features):
    noisy_features_uniform[i, :] = generate_noisy_sample_uniform(sample)
    
# Volvemos a asignar las etiquetas correspondientes a cada fila
noisy_features_uniform = np.c_[labels, noisy_features_uniform]

noisy_data_uniform = pd.concat([data, pd.DataFrame(noisy_features_uniform, columns=data.columns)], axis=0)
# Shuffle de los datos con ruido
noisy_data_uniform = noisy_data_uniform.sample(frac=1, random_state=0)
print("Tamaño DataFrame original: {}".format(data.shape))
print("Tamaño DataFrame tras añadir el ruido: {}".format(noisy_data_uniform.shape))
noisy_data_uniform.head(10)

Tamaño DataFrame original: (86, 411)
Tamaño DataFrame tras añadir el ruido: (172, 411)


Unnamed: 0,Class,FNC1,FNC2,FNC3,FNC4,FNC5,FNC6,FNC7,FNC8,FNC9,...,SBM_map55,SBM_map61,SBM_map64,SBM_map67,SBM_map69,SBM_map71,SBM_map72,SBM_map73,SBM_map74,SBM_map75
10,1.0,0.12467,-0.049878,-0.13066,-0.14185,-0.14849,-0.085769,-0.12771,-0.3123,-0.13607,...,-0.897078,0.359318,-0.435161,-0.541126,0.363668,-0.545821,-0.86845,0.367415,-0.038803,1.003364
60,1.0,0.202288,0.372532,0.15038,0.407759,0.447514,0.353758,-0.188391,0.03045,0.039601,...,-0.341341,-1.454827,-1.091146,0.337247,-0.959441,0.358669,-0.524144,0.914417,1.350999,0.751747
19,1.0,-0.13851,0.25836,-0.51077,-0.45778,-0.4404,-0.34434,-0.28348,-0.38075,-0.33913,...,-0.038365,-0.896406,-1.232269,0.785572,-0.817785,-0.419049,-1.509745,-0.946138,0.259551,-1.026065
31,0.0,0.21678,0.27011,0.087964,-0.30365,0.59275,-0.17064,-0.14309,-0.4719,-0.18045,...,-1.527436,-0.014958,-0.682168,0.114895,1.148412,0.434126,1.69683,-0.43443,0.094033,-0.119103
39,1.0,0.076708,-0.346198,-0.58925,-0.519481,-0.326876,0.026335,-0.115382,0.309488,0.070546,...,1.267428,-0.906161,0.096122,0.789345,0.774628,0.881908,0.527217,2.450039,2.515586,-0.419105
14,1.0,0.285998,-0.059588,-0.17247,-0.314231,0.339094,0.219968,0.19716,-0.158582,0.361832,...,0.104367,-0.169663,-0.306278,1.18699,-1.057866,0.846419,0.678672,0.437749,-0.499953,0.648647
43,0.0,0.38697,0.13922,-0.22102,-0.21681,-0.017924,0.13965,0.33349,0.41221,0.30668,...,1.012167,1.308194,-1.412457,1.490609,0.081146,0.564483,0.565845,-0.988205,0.014638,0.279144
69,0.0,0.24928,-0.074078,-0.30573,-0.099511,0.487854,-0.124822,-0.108838,0.333928,0.365122,...,0.389263,1.207788,-0.3192,0.122495,0.507993,1.242028,-1.187978,-0.311825,1.130247,0.457562
18,1.0,0.553698,0.246012,0.1592,0.281089,0.597594,0.208428,-0.07345,-0.503602,-0.101518,...,0.658334,-0.651477,-0.683628,-0.944904,0.567008,1.646727,0.704488,1.450657,-0.151315,0.078456
3,0.0,0.310295,0.06997,-0.068285,-0.128501,0.573024,0.271528,-0.036618,-0.301912,-0.015966,...,-0.152198,-0.387212,-0.266698,-0.719916,-1.431263,-0.161502,-1.470127,0.703346,0.073186,-0.128584


In [21]:
(X_train, X_test, y_train, y_test) = data_partition(noisy_data_uniform)

model_RF = RandomForestClassifier(random_state=0)
param_grid_RF = {
    "n_estimators": [100, 250, 500, 750, 1000],
    "criterion": ["gini", "entropy"],
    "max_depth": [10, 50, 100, 200, 400]
}
grid_search_RF = GridSearchCV(estimator=model_RF, param_grid=param_grid_RF, cv=4)

grid_search_RF.fit(X_train, y_train)
model_RF_opt = grid_search_RF.best_estimator_
# Predicción en partición de test
y_pred_RF = model_RF_opt.predict(X_test)
# Precisión en partición de test
accuracy = accuracy_score(y_test, y_pred_RF)
print("Accuracy with Data Augmentation: {:0.2f}%".format(accuracy * 100))

Accuracy with Data Augmentation: 91.43%


# Pseudo-labeling

https://towardsdatascience.com/pseudo-labeling-to-deal-with-small-datasets-what-why-how-fd6f903213af

In [9]:
import tensorflow as tf
import keras
from keras import layers, models, optimizers, callbacks, backend, preprocessing

In [26]:
tf.keras.utils.set_random_seed(0)

modelFC = models.Sequential()
modelFC.add(layers.Dense(200, activation="relu", input_shape=(410,)))
modelFC.add(layers.Dropout(0.3))
modelFC.add(layers.Dense(200, activation="relu"))
modelFC.add(layers.Dropout(0.3))
modelFC.add(layers.Dense(200, activation="relu"))
modelFC.add(layers.Dropout(0.3))
modelFC.add(layers.Dense(200, activation="relu"))
modelFC.add(layers.Dropout(0.3))
modelFC.add(layers.Dense(1, activation="sigmoid"))

modelFC.compile(loss="binary_crossentropy", optimizer="adam", metrics=["acc"])
modelFC.fit(X_train, y_train, epochs=100)

# Precisión en partición de test
loss, accuracy = modelFC.evaluate(X_test, y_test)
print("Accuracy: {:0.2f}%".format(accuracy * 100))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [18]:
def alpha_epoch(epoch, val, start, stop):
    if epoch < start:
        alpha = 0
    elif epoch < stop:
        alpha = ((epoch-start) / (stop-start)) * val
    else:
        alpha = val
    return alpha

In [24]:
tf.keras.utils.set_random_seed(0)

modelFC = models.Sequential()
modelFC.add(layers.Dense(200, activation="relu", input_shape=(410,)))
modelFC.add(layers.Dropout(0.3))
modelFC.add(layers.Dense(200, activation="relu"))
modelFC.add(layers.Dropout(0.3))
modelFC.add(layers.Dense(200, activation="relu"))
modelFC.add(layers.Dropout(0.3))
modelFC.add(layers.Dense(200, activation="relu"))
modelFC.add(layers.Dropout(0.3))
modelFC.add(layers.Dense(1, activation="sigmoid"))

modelFC.compile(loss="binary_crossentropy", optimizer="adam", metrics=["acc"])

X_tot = np.concatenate((X_train, test))
y_train = np.reshape(y_train.to_numpy(), (68, 1))
for i in range(1, 101): # 100 épocas
    pseudolabels = modelFC.predict(test)
    y_tot = np.concatenate((y_train, pseudolabels))
    alpha = alpha_epoch(i, 2, 20, 80)
    samples = np.concatenate((np.ones(len(y_train)), alpha*np.ones(len(pseudolabels))))
    modelFC.fit(X_train, y_train, sample_weight=samples, epochs=1, validation_split=0.25)

(119816, 410)


In [25]:
# Precisión en partición de test
loss, accuracy = modelFC.evaluate(X_test, y_test)
print("Accuracy: {:0.2f}%".format(accuracy * 100))

Accuracy: 77.78%


# Create submissions

In [24]:
import pathlib
from datetime import datetime

def create_submission(pred, test_id=testFNC["Id"]):
    submissionDF = pd.DataFrame(list(zip(test_id, pred)), columns=["Id", "Probability"])
    print(submissionDF.shape) # Comprobación del tamaño, debe ser: (119748, 2)
    current_time = datetime.now().strftime("%d-%m-%Y_%Hh%Mmin")
    current_path = pathlib.Path().resolve()
    submissionDF.to_csv(f"{current_path}\submissions\MLSP_submission_DataAug_{current_time}.csv", header=True, index=False)
    
create_submission(pred=model_RF_opt.predict(test))

(119748, 2)
