In [67]:
# Pandas - Numpy (import data, manipulation)
import pandas as pd
import numpy as np

# Scikit-learn (train|test split, scaler)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# TensorFlow - Keras (Model, metrics, transformations and layers)
from tensorflow.keras.layers import Dense,Input
from tensorflow.keras.metrics import RootMeanSquaredError,MeanAbsoluteError
from tensorflow.keras.models import Model
from keras.utils.np_utils import to_categorical 

In [68]:
# Load the dataset produced by the data-cleaning section.
data = pd.read_csv("data_final.csv")
data.head()

Unnamed: 0,exoplanet_type,0.3000004285720408,0.3045686178256013,0.3091368070791618,0.3137049963327223,0.3182731855862828,0.3228413748398433,0.3274095640934038,0.33197775334696433,0.33654594260052484,...,13.958886296717955,13.963454485971514,13.968022675225075,13.972590864478637,13.977159053732196,13.981727242985757,13.986295432239316,13.990863621492878,13.995431810746439,13.999999999999998
0,ST,0.300117,0.300117,0.300117,0.300117,0.300117,0.300117,0.300117,0.300117,0.300117,...,0.302737,0.302734,0.302731,0.302729,0.302727,0.302725,0.302724,0.302722,0.302721,0.30272
1,ST,0.300104,0.300104,0.300104,0.300104,0.300104,0.300104,0.300104,0.300104,0.300104,...,0.302836,0.302833,0.30283,0.302827,0.302825,0.302823,0.302821,0.30282,0.302819,0.302818
2,ST,0.30009,0.30009,0.30009,0.30009,0.30009,0.30009,0.30009,0.30009,0.30009,...,0.302849,0.302844,0.30284,0.302837,0.302833,0.30283,0.302828,0.302826,0.302824,0.302823
3,ST,0.300037,0.300037,0.300037,0.300037,0.300037,0.300037,0.300037,0.300037,0.300037,...,0.303099,0.303094,0.303089,0.303085,0.303081,0.303078,0.303075,0.303073,0.303071,0.303069
4,ST,0.300115,0.300115,0.300115,0.300115,0.300115,0.300115,0.300115,0.300115,0.300115,...,0.302595,0.302591,0.302588,0.302585,0.302582,0.302579,0.302577,0.302575,0.302574,0.302573


In [69]:
# Name of the label column, kept as a one-element list.
target = ["exoplanet_type"]

# Distinct class labels present in the label column.
target_values = data[target[0]].unique()

# One consecutive integer code (0, 1, 2, ...) per distinct label.
target_numeric = np.arange(len(target_values))

# Lookup table: class label -> integer code.
dic_target = dict(zip(target_values, target_numeric))

In [70]:
# Add the integer class code as the FIRST column of the frame.
# Note: this cell mutates `data` in place, so it can only be run once per kernel —
# a second run fails because "exoplanet_type" has already been dropped.
data.insert(
    loc=0,
    column="type_numeric",
    value=data["exoplanet_type"].apply(lambda label: dic_target[label]),
)
# The text label is redundant once the numeric code exists.
data.drop(columns=["exoplanet_type"], inplace=True)
data.head()

Unnamed: 0,type_numeric,0.3000004285720408,0.3045686178256013,0.3091368070791618,0.3137049963327223,0.3182731855862828,0.3228413748398433,0.3274095640934038,0.33197775334696433,0.33654594260052484,...,13.958886296717955,13.963454485971514,13.968022675225075,13.972590864478637,13.977159053732196,13.981727242985757,13.986295432239316,13.990863621492878,13.995431810746439,13.999999999999998
0,0,0.300117,0.300117,0.300117,0.300117,0.300117,0.300117,0.300117,0.300117,0.300117,...,0.302737,0.302734,0.302731,0.302729,0.302727,0.302725,0.302724,0.302722,0.302721,0.30272
1,0,0.300104,0.300104,0.300104,0.300104,0.300104,0.300104,0.300104,0.300104,0.300104,...,0.302836,0.302833,0.30283,0.302827,0.302825,0.302823,0.302821,0.30282,0.302819,0.302818
2,0,0.30009,0.30009,0.30009,0.30009,0.30009,0.30009,0.30009,0.30009,0.30009,...,0.302849,0.302844,0.30284,0.302837,0.302833,0.30283,0.302828,0.302826,0.302824,0.302823
3,0,0.300037,0.300037,0.300037,0.300037,0.300037,0.300037,0.300037,0.300037,0.300037,...,0.303099,0.303094,0.303089,0.303085,0.303081,0.303078,0.303075,0.303073,0.303071,0.303069
4,0,0.300115,0.300115,0.300115,0.300115,0.300115,0.300115,0.300115,0.300115,0.300115,...,0.302595,0.302591,0.302588,0.302585,0.302582,0.302579,0.302577,0.302575,0.302574,0.302573


In [71]:
# Dimensions of the prepared frame as (rows, columns).
data.shape

(180, 3001)

In [72]:
# Number of rows per numeric class code.
data.type_numeric.value_counts()

0    90
1    90
Name: type_numeric, dtype: int64

In [73]:
# Integer class codes: the first column of the frame (type_numeric), cast to int32.
y = data.to_numpy()[:, 0].astype(np.int32)
# Distinct class codes that actually occur.
np.unique(y)

array([0, 1])

In [74]:
# One-hot encode the integer class codes and save the result to CSV.
# NOTE(review): num_classes is hard-coded to 6, while the np.unique(y) output only
# shows the codes 0 and 1, so four of the six columns are always zero. Confirm whether
# downstream code expects six columns before replacing it with the real class count.
y_one_hot = to_categorical(y, num_classes=6)
pd.DataFrame(y_one_hot).to_csv("y_data.csv",index=False)

In [75]:
# Feature matrix: every column after the numeric label column, i.e. the wavelength
# samples (3000 of them according to the shapes printed in this notebook).
X = data.to_numpy()[:,1:]
# Save the features so they can be reused later.
pd.DataFrame(X).to_csv("X_data.csv",index=False)

In [76]:
# Hold out 20% of the rows for testing. random_state=500 fixes the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_one_hot, test_size=0.2, random_state=500
)
# Shapes of the four pieces, one per line.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep="\n")

(144, 3000)
(144, 6)
(36, 3000)
(36, 6)


In [77]:
# Save the one-hot labels of each split to CSV.
pd.DataFrame(y_train).to_csv("y_train.csv",index=False)
pd.DataFrame(y_test).to_csv("y_test.csv",index=False)

In [78]:
# Standardise the features with a StandardScaler.
# The scaler's statistics are learned from the training split ONLY and then re-used
# for the test split. Calling fit_transform on the test split as well would rescale
# it with its own mean/variance, so train and test would no longer share the same
# transformation and test-set statistics would leak into preprocessing.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # fit on train, then scale train
X_test_scaled = scaler.transform(X_test)  # scale test with the train statistics

In [79]:
encoding_dim = 500 # Width of the autoencoder bottleneck, i.e. the number of columns kept after the reduction.
batch_size = 32 # Mini-batch size passed to fit() in trainAutoencoder.
epochs = 100 # Number of training epochs passed to fit() in trainAutoencoder.
Metrics = [RootMeanSquaredError(name='rms'), MeanAbsoluteError(name='mae')] # Default metrics reported by the autoencoder.


def trainAutoencoder(Xtrain, metrics = Metrics, *, latent_dim=None, fit_batch_size=None, n_epochs=None):
    """Train a dense autoencoder on ``Xtrain`` and return it with its encoder half.

    Architecture: input -> Dense(2 * latent) -> Dense(latent) -> Dense(2 * latent)
    -> Dense(input). The hidden layers use ReLU, the output layer uses a sigmoid.
    The model is trained to reproduce its own input with SGD and an MSE loss.

    Parameters
    ----------
    Xtrain : array with one row per sample
        Training features; the size of the last axis sets the input/output width.
    metrics : list/tuple of metrics, a single metric, or None
        Metrics handed to ``compile``. Defaults to the module-level ``Metrics``.
    latent_dim : int, optional
        Bottleneck width. Defaults to the module-level ``encoding_dim``.
    fit_batch_size : int, optional
        Mini-batch size. Defaults to the module-level ``batch_size``.
    n_epochs : int, optional
        Number of epochs. Defaults to the module-level ``epochs``.

    Returns
    -------
    (autoencoder, encoder)
        The trained full model, and a model mapping the input to the bottleneck.
        Both are built from the same layers, so the encoder uses the trained weights.
    """
    # Fall back to the notebook-level settings at call time, so editing the
    # configuration cell still takes effect without redefining this function.
    latent_dim = encoding_dim if latent_dim is None else latent_dim
    fit_batch_size = batch_size if fit_batch_size is None else fit_batch_size
    n_epochs = epochs if n_epochs is None else n_epochs

    # compile() wants a flat list of metrics for a single-output model. Wrapping an
    # existing list in another list would produce a nested list, so normalise here:
    # keep lists/tuples flat, wrap a lone metric, and pass None through.
    if metrics is None:
        metric_list = None
    elif isinstance(metrics, (list, tuple)):
        metric_list = list(metrics)
    else:
        metric_list = [metrics]

    n_features = Xtrain.shape[-1] # Number of input columns (also the output width).
    input_ = Input(shape=(n_features,))
    encoded = Dense(units=latent_dim*2, activation="relu")(input_) # Encoder layer, twice the bottleneck width.
    bottleneck = Dense(units=latent_dim, activation="relu")(encoded) # Reduced representation.
    decoded = Dense(units=latent_dim*2, activation="relu")(bottleneck) # Decoder layer mirroring the encoder.
    # NOTE(review): a sigmoid can only output values in (0, 1). This notebook feeds
    # StandardScaler output, which contains values outside that range, so the
    # reconstruction target cannot be matched exactly. Consider a linear output
    # layer or scaling the inputs to [0, 1]; left unchanged to keep results stable.
    output = Dense(units=n_features, activation='sigmoid')(decoded)

    autoencoder = Model(inputs=input_, outputs=output)
    autoencoder.compile(optimizer='sgd', loss='mean_squared_error', metrics=metric_list)
    # Input and target are the same array: the network learns to reconstruct Xtrain.
    autoencoder.fit(Xtrain, Xtrain, batch_size=fit_batch_size, epochs=n_epochs, verbose=0)

    encoder = Model(inputs=input_, outputs=bottleneck) # Maps an input row to its bottleneck activations.
    return autoencoder, encoder

In [80]:
# Train the autoencoder on the scaled training features; only the encoder half is kept.
_,encoded = trainAutoencoder(X_train_scaled)
data_test_final = encoded.predict(X_test_scaled) # Reduced representation of the test features.
data_train_final = encoded.predict(X_train_scaled) # Reduced representation of the training features.
pd.DataFrame(data_test_final).to_csv("X_test_autoencoded1.csv",index=False) # Save the encoded test features.
pd.DataFrame(data_train_final).to_csv("X_data_autoencoded1.csv",index=False) # Save the encoded training features.



In [81]:
# Shapes of the encoded test and training sets: each row now has encoding_dim
# columns instead of the original number of wavelength columns.
print(data_test_final.shape, data_train_final.shape, sep="\n")

(36, 500)
(144, 500)
