# **Para utilizar con éxito las DNN en datos tabulares**
 

*   Requiere un poco de esfuerzo, no esperes un proceso automatizado o excelentes resultados a la primera vez.
*   No reinventes la rueda (TensorFlow o PyTorch, API = Keras, scikit-learn, pandas)

*   Procesa cada entrada de acuerdo a su tipo (numerical, categorical, fechas, etc..)
*   Crear una arquitectura DNN teniendo en cuenta
el número de ejemplos disponibles.

*   Codificar conocimiento previo (ingeniería de características)
*   Prueba y ajusta tu red usando validación cruzada (cross-validation)









## AI - Machine Learning - Deep Learning

![texto alternativo](https://www.alltechbuzz.net/wp-content/uploads/2017/08/ai-and-ml.png)

 [DeepLearning](https://playground.tensorflow.org/)

In [None]:
import numpy as np
import pandas as pd

import tensorflow as tf
from keras.utils import plot_model
import datetime
from keras.layers import Dropout


from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

In [None]:
#!pip install pandas-profiling==2.9.0rc1

## Pandas para leer el dataframe

In [None]:
# Google Colab helper used to attach Google Drive to this runtime.
from google.colab import drive

# Mount the drive under /content/gdrive so the data files can be read by path.
drive.mount('/content/gdrive')

In [None]:
ls /content/gdrive/My\ Drive/Colab\ Notebooks/data

In [None]:
# Location of the CSV file inside the mounted Google Drive folder.
_file = '/content/gdrive/My Drive/Colab Notebooks/data/data_to_deeplearning.csv'
# Load the whole CSV into a pandas DataFrame.
dataframe = pd.read_csv(_file)
# Show the last two rows as a quick check of the loaded data.
dataframe.tail(2)

In [None]:
import pandas_profiling as pdpf

In [None]:
# Build a pandas-profiling report object for the loaded dataframe.
profile = pdpf.ProfileReport(dataframe)
# NOTE(review): a bare expression that is not the last statement of a cell is
# presumably not rendered by the notebook — confirm whether this line is needed.
profile
# Save the report to the file "output_deep.html".
profile.to_file(output_file="output_deep.html")

In [None]:
dataframe.dtypes

In [None]:
# Cast the two numeric inputs and the target column to integers.
for _int_col in ('QUINTIL', 'EDAD', 'ESTADO_BIA'):
    dataframe[_int_col] = dataframe[_int_col].astype(int)

## Dividir el dataframe en train, validacion y test

In [None]:
# Hold out 20% of the rows for testing, then 20% of the remainder for
# validation. Both splits are stratified on the target, so every subset keeps
# the class ratio of the full dataset (the notebook treats ESTADO_BIA as
# imbalanced), and both are seeded so the split — and every metric computed
# from it — is reproducible from run to run.
SPLIT_SEED = 42
train, test = train_test_split(
    dataframe, test_size=0.2,
    stratify=dataframe['ESTADO_BIA'], random_state=SPLIT_SEED)
train, val = train_test_split(
    train, test_size=0.2,
    stratify=train['ESTADO_BIA'], random_state=SPLIT_SEED)
print(len(train), 'train ejemplos')
print(len(val), 'validation ejemplos')
print(len(test), 'test ejemplos')

## Balancear el Dataset

In [None]:
from sklearn.utils import class_weight

In [None]:
dataframe.ESTADO_BIA.value_counts()

In [None]:
# One weight per target class, using scikit-learn's 'balanced' heuristic.
# `classes` and `y` are passed by keyword: current scikit-learn releases
# declare them keyword-only, so the positional form raises a TypeError.
# The returned weights follow the order of `classes`, i.e. the ascending
# label order produced by np.unique.
class_weights = list(class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(dataframe['ESTADO_BIA']),
    y=dataframe['ESTADO_BIA']))

In [None]:
# The weights are deliberately NOT sorted. compute_class_weight returns them in
# the order of the classes it was given (np.unique(...), i.e. ascending labels),
# so class_weights[i] is the weight of the i-th class. Sorting the values would
# break that pairing and could hand the largest weight to the wrong class once
# the list is turned into the {class_index: weight} dict used for training.
class_weights

In [None]:
# Turn the list into {position: weight}, the mapping later passed to
# model.fit(..., class_weight=weights).
weights = dict(enumerate(class_weights))

In [None]:
weights

## Crear el input pipeline usando el módulo tf.data de TensorFlow

In [None]:
# Utility to build a tf.data dataset from a pandas DataFrame.
def df_to_dataset(dataframe, shuffle=True, batch_size=32, label_col='ESTADO_BIA'):
    """Build a batched ``tf.data.Dataset`` of ``(features, labels)`` from a DataFrame.

    Args:
        dataframe: pandas DataFrame containing the feature columns and the
            label column. It is copied first, so the caller's frame is left
            untouched.
        shuffle: when true, shuffle the examples before batching.
        batch_size: number of examples per batch.
        label_col: name of the column to split off as the label. Defaults to
            'ESTADO_BIA', the target used throughout this notebook.

    Returns:
        A ``tf.data.Dataset`` whose elements are ``(features, labels)``, where
        ``features`` is a dict keyed by column name.
    """
    features = dataframe.copy()
    labels = features.pop(label_col)
    ds = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    if shuffle:
        # The buffer covers every row, so the whole dataset is shuffled.
        ds = ds.shuffle(buffer_size=len(features))
    return ds.batch(batch_size)

In [None]:
batch_size = 5 # A small batch (for demonstration only)
# Build one dataset per split; only the training split is shuffled.
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

## Entender el input

In [None]:
# Take a single batch from the training dataset and print what it contains:
# the feature names, the values of one feature, and the labels.
for feature_batch, label_batch in train_ds.take(1):
  print('Cada feature:', list(feature_batch.keys()))
  print('Un batch de Regions:', feature_batch['REGION_PROCEDENCIA'])
  print('Un batch de targets:', label_batch )

## Demostrar varios tipos de columna 




In [None]:
# Seleccionamos un batch
example_batch = next(iter(train_ds))[0]

In [None]:
# Helper for inspecting a feature column: it applies the column to the
# sample batch and prints the resulting values.
def demo(feature_column):
    """Print the dense values `feature_column` produces for `example_batch`."""
    dense_layer = layers.DenseFeatures(feature_column)
    transformed = dense_layer(example_batch)
    print(transformed.numpy())

In [None]:
feature_columns = []

## Columnas numericas

In [None]:
# Numeric column that reads the "QUINTIL" feature; demo prints its values for
# the sample batch.
QUINTIL = feature_column.numeric_column("QUINTIL")
demo(QUINTIL)

In [None]:
# Numeric column that reads the "EDAD" feature; it is also the source column
# for the bucketized version created later in the notebook.
EDAD = feature_column.numeric_column("EDAD")
demo(EDAD)

In [None]:
numerical_features = ['QUINTIL','EDAD']

### Normalización

In [None]:
def get_scal(feature):
    """Return a min-max scaling function for `feature`, fitted on `train`.

    The minimum and maximum are read from the module-level `train` DataFrame
    once, when the scaler is created, and captured by the returned closure.

    Args:
        feature: name of a numeric column of `train`.

    Returns:
        A one-argument function mapping ``x`` to ``(x - min) / (max - min)``.
        For a constant column (``max == min``) it returns ``x - min`` instead,
        so the training values map to 0 rather than dividing by zero.
    """
    # Plain Python floats, computed once instead of on every call.
    mini = float(train[feature].min())
    maxi = float(train[feature].max())
    span = maxi - mini

    def minmax(x):
        if span == 0:
            # Constant column: scaling is undefined, so only centre the value.
            return x - mini
        return (x - mini) / span

    return minmax

In [None]:
# Register one numeric column per numerical feature, each with its own
# min-max normaliser built by get_scal.
feature_columns.extend(
    feature_column.numeric_column(name, normalizer_fn=get_scal(name))
    for name in numerical_features
)

In [None]:
#feature_columns

## Columnas Bucketized 

In [None]:
# Split the EDAD numeric column into ranges delimited by the given boundaries,
# then print the result for the sample batch.
EDAD_buckets = feature_column.bucketized_column(EDAD, boundaries=[18, 20, 22, 24, 26, 28])
demo(EDAD_buckets)

In [None]:
feature_columns.append(EDAD_buckets)

In [None]:
feature_columns

## Columnas categoricas

In [None]:
# Categorical column for 'GRUPO_DEPENDENCIA' with an explicitly listed vocabulary.
GRUPO_DEPENDENCIA = feature_column.categorical_column_with_vocabulary_list(
      'GRUPO_DEPENDENCIA', ['PARTICULAR SUBVENCIONADO', 'MUNICIPAL', 'PARTICULAR PAGADO'])

# Wrap it in an indicator column so it can be shown as a dense (one-hot) tensor.
GRUPO_DEPENDENCIA_one_hot = feature_column.indicator_column(GRUPO_DEPENDENCIA)
demo(GRUPO_DEPENDENCIA_one_hot)

In [None]:
# Categorical column for 'SEXO_ALUMNO'; the vocabulary is the set of distinct
# values found in the dataframe.
SEXO_ALUMNO = feature_column.categorical_column_with_vocabulary_list(
      'SEXO_ALUMNO', dataframe.SEXO_ALUMNO.unique())

# Wrap it in an indicator column so it can be shown as a dense (one-hot) tensor.
SEXO_ALUMNO_one_hot = feature_column.indicator_column(SEXO_ALUMNO)
demo(SEXO_ALUMNO_one_hot)

Ahora creen ustedes otra columna categórica...

In [None]:
indicator_columns = ['GRUPO_DEPENDENCIA','SEXO_ALUMNO']

In [None]:
# One-hot encode every feature listed in indicator_columns, taking each
# vocabulary from the distinct values present in the dataframe.
feature_columns.extend(
    feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            name, dataframe[name].unique()))
    for name in indicator_columns
)

In [None]:
feature_columns

## Embedding de columnas

In [None]:
# Categorical column for 'NOMBRE_PROGRAMA'; the vocabulary is the set of
# distinct values found in the dataframe.
NOMBRE_PROGRAMA = feature_column.categorical_column_with_vocabulary_list(
      'NOMBRE_PROGRAMA', dataframe.NOMBRE_PROGRAMA.unique())

# Represent each category as a 39-dimensional embedding vector and print the
# result for the sample batch.
NOMBRE_PROGRAMA_embedding = feature_column.embedding_column(NOMBRE_PROGRAMA, dimension=39)
demo(NOMBRE_PROGRAMA_embedding)

In [None]:
embedding_column  = ['NOMBRE_PROGRAMA','NOMBRE_ASIGNATURA','REGION_PROCEDENCIA','COMUNA_PROCEDENCIA'
,'REGION_ACADEMICA','COMUNA_ACADEMICA','RAMA_EDUCACIONAL']

In [None]:
def _embedding_for(name):
    """Build an embedding column for the categorical feature `name`."""
    vocab = dataframe[name].unique()
    # Embedding size: number of categories raised to the power 0.25, rounded.
    dim = round(len(vocab) ** 0.25)
    categorical = tf.feature_column.categorical_column_with_vocabulary_list(name, vocab)
    return feature_column.embedding_column(categorical, dimension=dim)


# Add one embedding column per feature listed in embedding_column.
for feature_name in embedding_column:
    feature_columns.append(_embedding_for(feature_name))

In [None]:
feature_columns

## Columnas Crossed feature 

In [None]:
# Cross the three features by their column names. crossed_column accepts raw
# feature keys as strings; the visible notebook never defines Python variables
# named REGION_PROCEDENCIA, COMUNA_PROCEDENCIA or RAMA_EDUCACIONAL, so bare
# identifiers here would raise a NameError.
# The combined values are hashed into 100 buckets.
crossed_feature = feature_column.crossed_column(
    ['REGION_PROCEDENCIA', 'COMUNA_PROCEDENCIA', 'RAMA_EDUCACIONAL'],
    hash_bucket_size=100)
# A crossed column is categorical, so wrap it in an indicator column to print it.
demo(feature_column.indicator_column(crossed_feature))

In [None]:
# DenseFeatures only accepts dense columns. A crossed column is categorical,
# so it is wrapped in an indicator column (one-hot over the hash buckets)
# before being added to the model's feature list.
feature_columns.append(feature_column.indicator_column(crossed_feature))

In [None]:
feature_columns

# Columnas usadas

In [None]:
print('Total number of feature coumns: ',len(feature_columns))

## Crear la capa de características de entrada

In [None]:
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

In [None]:
# Candidate batch size: 0.1% of the number of rows, truncated to an integer.
batch_size= int(dataframe.shape[0]*.001)
batch_size

In [None]:
# Batch size used for the actual training run.
batch_size = 24
# Rebuild the three datasets with that batch size; only training is shuffled.
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

## Crear, Compilar y entrenar el modelo

In [None]:
%load_ext tensorboard

In [None]:
!rm -rf ./logs/ 

In [None]:
# Feed-forward classifier stacked on top of the feature layer.
model = tf.keras.Sequential()
model.add(feature_layer)
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dropout(0.3))
model.add(layers.Dense(128, activation='relu'))
# Single output unit with no activation: the model emits a raw logit, which
# matches the from_logits=True loss used when compiling.
model.add(layers.Dense(1))

In [None]:
# Each run logs to its own timestamped sub-directory of logs/fit.
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# TensorBoard callback writing to that directory, with histograms every epoch.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

In [None]:
# The model outputs raw logits (its last layer has no activation), so the loss
# is told to expect logits. The accuracy metric must use the matching decision
# boundary: a logit of 0 corresponds to a probability of 0.5. The plain
# 'accuracy' string would threshold the logits at 0.5 and report a skewed
# value. The metric keeps the name 'accuracy' so history keys and the values
# returned by model.evaluate stay the same.
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)])

# Train for 10 epochs, validating on val_ds after each one. The per-class
# weights rebalance the loss and the callback logs the run for TensorBoard.
model.fit(train_ds,
          validation_data=val_ds,
          epochs=10,
          class_weight=weights,
          callbacks=[tensorboard_callback])

In [None]:
# Evaluate the trained model on the held-out test dataset; evaluate returns the
# loss followed by the metric configured at compile time.
loss, accuracy = model.evaluate(test_ds)
print("Accuracy", accuracy)

In [None]:
%tensorboard --logdir logs/fit