In [1]:
# Widen the notebook container to the full browser width.
# `display` and `HTML` are imported from the public `IPython.display` module;
# the `IPython.core.display` location is deprecated for this purpose.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
import ast, os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import plot_confusion_matrix, precision_recall_fscore_support
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import model_from_yaml
tf.__version__

Using TensorFlow backend.


'2.2.0'

In [4]:
# Load the raw CSV either from the local checkout or from Google Drive (Colab).
# A working directory starting with 'C' is treated as the local Windows machine.
running_locally = os.getcwd()[0] == 'C'
if not running_locally:
    try:
        from google.colab import drive
    except ImportError:
        # Not on Colab either (e.g. a local Linux/macOS checkout): use the local file
        # instead of failing on the missing google.colab package.
        running_locally = True

if running_locally:
    df = pd.read_csv('../Data/Flujo1.csv', sep=";")
else:
    drive.mount('/content/drive/')
    drive_route = 'drive/My Drive/Tesis/Data/'
    # os.path.join avoids the doubled '/' that plain concatenation produced.
    df = pd.read_csv(os.path.join(drive_route, 'Flujo1.csv'), sep=";")

# 'Respuesta' is read as text; literal_eval turns each cell back into a Python object
# (later cells iterate over it as a sequence of tokens).
df['Respuesta'] = df['Respuesta'].apply(ast.literal_eval)

def enumerate_dimensions(dimension, list_dimensions):
    """Return the position of `dimension` inside `list_dimensions`.

    The position of the first match is returned; `ValueError` is raised
    (by `list.index`) when `dimension` is not present.
    """
    position = list_dimensions.index(dimension)
    return position
def wordsToNumbers(tokens, vocabulary):
    """Encode `tokens` as a NumPy array of 1-based vocabulary positions.

    Each token is replaced by its index in `vocabulary` plus one, so that
    id 0 is never produced by this function. A token missing from
    `vocabulary` raises `ValueError` (from `list.index`).
    """
    return np.asarray([vocabulary.index(token) + 1 for token in tokens])

In [5]:
# Keep only the rows whose Area is 'Social Interno'. The explicit copy makes the
# column assignments below write to an independent frame rather than to a slice
# of the original one (avoids pandas' SettingWithCopyWarning).
df = df[df.Area == 'Social Interno'].copy()
# Map every dimension name to an integer class id (its position in the unique list)
list_dimensions = df.Dimension.unique().tolist()
num_classes = len(list_dimensions)
df['_dimension'] = df['Dimension'].apply(enumerate_dimensions, list_dimensions = list_dimensions)

# Build the vocabulary: every distinct token found in the answers plus the
# empty string, sorted.
vocabulary = sorted({token for tokens in df['Respuesta'] for token in tokens} | {''})
# Token ids produced by wordsToNumbers run from 1 to len(vocabulary); id 0 is
# used as the padding value below, hence the +1.
vocab_len= len(vocabulary)+1

df['NumRespuesta'] = df.Respuesta.apply (wordsToNumbers, vocabulary = vocabulary)

# Pad with 0 at the end so every sequence has length 10.
# NOTE(review): with the Keras default truncating='pre', answers longer than 10
# tokens would lose their leading tokens — confirm this is intended.
X_completo = pad_sequences(df.NumRespuesta, maxlen=10, dtype='object', padding='post', value = 0)
y_completo = np.array(df['_dimension'])

# Number of samples per class, computed once and reused by every statistic below
class_counts = df.groupby('_dimension').count().NumRespuesta
print('Media', np.array(class_counts).mean())
print('STD', np.std(class_counts))
print('Cant. datos', np.sum(class_counts))
# NOTE(review): the next line prints the minimum again under the 'Cant. datos'
# label; it is kept so the recorded output still matches.
print('Cant. datos', np.min(class_counts))
print('Cant. datos min ', np.min(class_counts))
print('Cant. datos max ', np.max(class_counts))

df.groupby('_dimension').count()

Media 5708.666666666667
STD 2110.6862496459403
Cant. datos 51378
Cant. datos 2579
Cant. datos min  2579
Cant. datos max  10057


Unnamed: 0_level_0,Area,Dimension,Respuesta,NumRespuesta
_dimension,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,5676,5676,5676,5676
1,6735,6735,6735,6735
2,2579,2579,2579,2579
3,6330,6330,6330,6330
4,2580,2580,2580,2580
5,10057,10057,10057,10057
6,6084,6084,6084,6084
7,5571,5571,5571,5571
8,5766,5766,5766,5766


# Búsqueda de hiperparámetros

In [None]:
def gen_train_model(tensor_X, tensor_y, tensor_test_x, tensor_test_y, paramsEmbedding, paramsLSTM, num_clases, optimizer, batch_size, epoch):
    """Train one Embedding -> LSTM -> softmax classifier and score it on the test split.

    Parameters
    ----------
    tensor_X, tensor_y
        Training inputs and labels; ``tensor_X.shape[1]`` is used as the
        model's input length.
    tensor_test_x, tensor_test_y
        Held-out inputs and labels used for evaluation.
    paramsEmbedding : dict
        Must contain the keys ``'input_dim'`` and ``'output_dim'``.
    paramsLSTM : dict
        Must contain the keys ``'units'``, ``'activation'``, ``'dropout'``
        and ``'recurrent_dropout'``.
    num_clases : int
        Number of units of the softmax output layer.
    optimizer
        Forwarded to ``model.compile``.
    batch_size, epoch
        Forwarded to ``model.fit`` as ``batch_size`` and ``epochs``.

    Returns
    -------
    list
        ``[test_accuracy, test_loss, medidas, paramsEmbedding, paramsLSTM,
        optimizer, batch_size, epoch]`` where ``medidas`` is the value returned
        by ``precision_recall_fscore_support`` on the test split.
    """
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Embedding(input_dim = paramsEmbedding['input_dim'],
                                        output_dim = paramsEmbedding['output_dim'],
                                        input_shape=(tensor_X.shape[1],)))
    model.add(tf.keras.layers.LSTM(units = paramsLSTM['units'],
                                   activation = paramsLSTM['activation'],
                                   dropout = paramsLSTM['dropout'],
                                   recurrent_dropout = paramsLSTM['recurrent_dropout']))
    model.add(tf.keras.layers.Dense(units = num_clases, activation='softmax'))

    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])

    # The History object returned by fit() is not needed here, so it is not kept.
    model.fit(tensor_X,
              tensor_y,
              epochs = epoch,
              batch_size = batch_size,
              verbose = 0
             )
    test_loss, test_acurracy = model.evaluate(tensor_test_x,
                                              tensor_test_y,
                                              verbose=0
                                             )

    # predict_classes() is deprecated; for a softmax head the class is the
    # argmax over the predicted probabilities.
    predictions = np.argmax(model.predict(tensor_test_x), axis=-1)
    medidas = precision_recall_fscore_support(tensor_test_y, predictions )
    # First element of the sklearn result: per-class precision
    print(medidas[0])
    return [test_acurracy, test_loss, medidas, paramsEmbedding, paramsLSTM, optimizer, batch_size, epoch]


def grid_lstm(data_x, data_y, input_dim, output_dim_embe, units_lstm, activ_lstm, drop_lstm, rec_drop_lstm, num_clases, optimizers, batch_size_, epochs):
    """Exhaustive grid search over the Embedding/LSTM hyperparameters.

    The data is split once (90% train / 10% test, ``random_state=0``) and
    ``gen_train_model`` is called for every combination of the candidate
    values.

    Parameters
    ----------
    data_x, data_y
        Full input sequences and their labels.
    input_dim
        ``input_dim`` of the Embedding layer (same for every run).
    output_dim_embe, units_lstm, activ_lstm, drop_lstm, rec_drop_lstm,
    optimizers, batch_size_, epochs
        Iterables with the candidate values of each hyperparameter.
    num_clases
        Number of output classes.

    Returns
    -------
    list
        One ``gen_train_model`` result per combination, in the order of the
        equivalent nested loops (``epochs`` varies fastest).
    """
    from itertools import product

    X, test_x, y, test_y = train_test_split(data_x, data_y, test_size = 0.1, random_state = 0)
    tensor_X      = tf.convert_to_tensor(list(X))
    tensor_test_x = tf.convert_to_tensor(list(test_x))
    tensor_y      = tf.convert_to_tensor(list(y))
    tensor_test_y = tf.convert_to_tensor(list(test_y))

    resultados = []
    # product() walks the grid with the right-most iterable changing fastest,
    # i.e. exactly like nesting the loops in this argument order.
    grid = product(output_dim_embe, units_lstm, activ_lstm, drop_lstm,
                   rec_drop_lstm, optimizers, batch_size_, epochs)
    for output_dim, units, activ, drop, rec_drop, optimizer, batch_size, epoch in grid:
        resultados.append(gen_train_model(
            tensor_X, tensor_y, tensor_test_x, tensor_test_y,
            {'input_dim': input_dim, 'output_dim': output_dim},
            {'units': units, 'activation': activ, 'dropout': drop, 'recurrent_dropout': rec_drop},
            num_clases, optimizer, batch_size, epoch))
    return resultados

# Candidate values explored by the grid search, one list per hyperparameter.
search_space = dict(
    output_dim_embe = [60, 90],
    units_lstm = [60, 90],
    activ_lstm = ['tanh', 'relu'],
    drop_lstm = [0.3, 0.5, 0.7],
    rec_drop_lstm = [0.3, 0.5, 0.7],
    optimizers = ['rmsprop', 'adam'],
    batch_size_ = [20, 40],
    epochs = [6, 8],
)

# Run the search on the full padded data set; every combination is trained once.
output = grid_lstm(
    data_x = X_completo,
    data_y = y_completo,
    input_dim = len(vocabulary) + 1,
    num_clases = len(list_dimensions),
    **search_space
)


In [None]:
# 10-fold cross-validation of one fixed hyperparameter configuration.
epoch = 6
batch_size = 40
optimizer = 'rmsprop'
recurrent_dropout = 0.7
dropout = 0.3
activation_lstm = 'relu'
lstm_units = 90
ouput_dim_embedding = 90

kf = KFold(n_splits = 10, shuffle = True, random_state = 2)
# One entry per fold: [test_loss, test_accuracy, precision_recall_fscore_support result]
resultados = []
contador = 1  # not used inside this loop; kept because it was defined here before
for valores_entrenamiento, valores_testeo in kf.split(X_completo):

    tensor_X      = tf.convert_to_tensor(list(X_completo[valores_entrenamiento]))
    tensor_test_x = tf.convert_to_tensor(list(X_completo[valores_testeo]))
    tensor_y      = tf.convert_to_tensor(list(y_completo[valores_entrenamiento]))
    tensor_test_y = tf.convert_to_tensor(list(y_completo[valores_testeo]))

    # A fresh model per fold so that no weights leak between folds
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Embedding(input_dim = vocab_len,
                                        output_dim = ouput_dim_embedding,
                                        input_shape=(X_completo.shape[1],)))

    model.add(tf.keras.layers.LSTM(units=lstm_units,
                                   activation=activation_lstm,
                                   dropout = dropout,
                                   recurrent_dropout = recurrent_dropout))

    # softmax (not sigmoid): this is single-label multi-class classification
    # trained with sparse_categorical_crossentropy, and it matches the output
    # layer used by gen_train_model during the hyperparameter search.
    model.add(tf.keras.layers.Dense(units=num_classes,
                                    activation='softmax'))

    model.compile(optimizer = optimizer,
                  loss='sparse_categorical_crossentropy',
                  metrics=['sparse_categorical_accuracy'])

    history = model.fit(tensor_X,
                        tensor_y,
                        epochs = epoch,
                        batch_size = batch_size,
                        verbose=0
                       )

    test_loss, test_acurracy = model.evaluate(tensor_test_x,
                                              tensor_test_y,
                                              verbose=0
                                             )
    # predict_classes() is deprecated; take the argmax of the class probabilities.
    predictions = np.argmax(model.predict(tensor_test_x), axis=-1)
    medidas = precision_recall_fscore_support(tensor_test_y, predictions)
    resultados.append([test_loss, test_acurracy, medidas])

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


In [None]:
def _macro_mean(metric_idx):
    """Mean over classes of the per-class mean over folds of one metric.

    `metric_idx` selects the element of each fold's third entry
    (index 2 of the items stored in `resultados`).
    """
    per_class = []
    for class_id in range(num_classes):
        fold_values = np.array([fold[2][metric_idx][class_id] for fold in resultados])
        per_class.append(fold_values.mean())
    return np.array(per_class).mean()


# Loss and accuracy averaged over the folds
fold_losses = np.array([fold[0] for fold in resultados])
fold_accuracies = np.array([fold[1] for fold in resultados])
print("loss mean", fold_losses.mean())
print("acurracy mean", fold_accuracies.mean())

# Per-class metrics averaged over folds and then over classes
for label, metric_idx in (("media precision", 0),
                          ("media recall", 1),
                          ("media fscore", 2),
                          ("media support", 3)):
    print(label, _macro_mean(metric_idx))

In [None]:
# Write whatever `model` currently holds to an HDF5 file and read it back into `model_`.
# NOTE(review): the file name says 'social_ext' while the data filtered earlier is
# 'Social Interno' — confirm the intended name.
model_path = 'complete_social_ext.h5'
model.save(model_path)
model_ = tf.keras.models.load_model(model_path)

In [None]:
import matplotlib.pyplot as plt
# Curves of the most recent fit() call stored in `history`
history_dict = history.history
acc = history_dict['sparse_categorical_accuracy']
loss = history_dict['loss']
# Validation values exist only when fit() was given validation data; the
# training curve must not be drawn a second time under a "Validation" label.
val_loss = history_dict.get('val_loss')

epochs = range(1, len(acc) + 1)

# "bo" is for "blue dot"
plt.plot(epochs, loss, 'bo', label='Training loss')
if val_loss is not None:
    # b is for "solid blue line"
    plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and validation loss')
else:
    plt.title('Training loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

In [None]:
plt.clf()   # clear figure

# Validation accuracy is only present when fit() was given validation data;
# otherwise only the training curve is drawn (it was previously plotted twice,
# the second time mislabelled as validation).
val_acc = history_dict.get('val_sparse_categorical_accuracy')

plt.plot(epochs, acc, 'bo', label='Training acc')
if val_acc is not None:
    plt.plot(epochs, val_acc, 'b', label='Validation acc')
    plt.title('Training and validation accuracy')
else:
    plt.title('Training accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')

plt.show()

In [None]:
# Multiclass ROC, one-vs-rest: roc_curve() only handles binary targets, so one
# curve is drawn per class using that class's predicted score against the
# "is this class" indicator of the true labels.
import sklearn.metrics as metrics

class_scores = model.predict(tensor_test_x)   # one score column per class
true_labels = np.asarray(tensor_test_y)

plt.title('Receiver Operating Characteristic')
for class_id in range(num_classes):
    is_class = (true_labels == class_id).astype(int)
    fpr, tpr, threshold = metrics.roc_curve(is_class, class_scores[:, class_id])
    roc_auc = metrics.auc(fpr, tpr)
    plt.plot(fpr, tpr, label = 'Class %d (AUC = %0.2f)' % (class_id, roc_auc))
plt.legend(loc = 'lower right')
# Diagonal = performance of a random classifier
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

# Antiguo

In [29]:
# Legacy cell: trains a single model on one train/test split and prints its
# confusion matrix.
# NOTE(review): X, y, test_x and test_y are not assigned at top level in any
# visible cell (they only exist as locals inside grid_lstm), so this cell
# relies on leftover kernel state — confirm before re-running.
vocab_len= len(vocabulary)+1
num_classes = len(list_dimensions) # 9
ouput_dim_embedding = 100
lstm_units = 200
ouputs_dense = num_classes
epoch = 7
batch_size = 100
optimizer = 'adam'

model = tf.keras.Sequential()
# Add the embedding layer, which plays the word2vec role
model.add(tf.keras.layers.Embedding(input_dim=vocab_len, output_dim = ouput_dim_embedding, input_shape=(X_completo.shape[1],)))
#model.add(tf.keras.layers.SpatialDropout1D(0.4))
model.add(tf.keras.layers.LSTM(units=lstm_units, activation='sigmoid', dropout=0.7, recurrent_dropout=0.7))
model.add(tf.keras.layers.Dense(units=ouputs_dense, activation='sigmoid'))
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
history = model.fit(X, y, epochs = epoch, batch_size=batch_size)
print()
print('Resultados con datos de testeo ')
print()
test_loss, test_acurracy = model.evaluate(test_x, test_y)
print()
print('Data Test accuracy: {}'.format(test_acurracy))
predictions = model.predict_classes(test_x)
# Appends a bare accuracy to the shared `resultados` list (the newer K-fold
# cell stores [loss, accuracy, metrics] entries in a list of the same name).
resultados.append(test_acurracy)
# Confusion matrix
# NOTE(review): tf.confusion_matrix and tf.Session look like TensorFlow 1.x
# names; with the 2.2.0 version reported by the import cell they would
# presumably need tf.math.confusion_matrix / tf.compat.v1.Session — confirm.
confusion = tf.confusion_matrix(labels=test_y, predictions= predictions, dtype=tf.dtypes.int32, num_classes=num_classes)
print('-Matriz de confusion')
print(confusion.eval(session=tf.Session()))
resultados


Train on 41073 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7

Resultados con datos de testeo 


Data Test accuracy: 0.7559645771980286
-Matriz de confusion
[[ 802   36   69   23   11   53   37   67   23]
 [  16 1162   15   21    5   28   31   51   20]
 [  41   27  316   18   20   37   14   36   12]
 [  18   22   25  972   11   86   45   44   25]
 [  22   28   37   33  280   45   26   30   14]
 [  42   29   38  101   13 1588   37  100   27]
 [  28   39   13   33    5   31 1042   51   11]
 [  46   80   45   47    8  102   50  684   50]
 [  15   56   13   47    3   36   23   65  917]]


[0.75800955, 0.7533353, 0.7598598, 0.75839907, 0.7559646]

In [0]:
# Legacy cell: 10-fold cross-validation that prints the test accuracy and the
# confusion matrix of every fold.
vocab_len= len(vocabulary)+1
num_classes = len(list_dimensions) # 9
ouput_dim_embedding = 60
lstm_units = 100
ouputs_dense = num_classes
epoch = 7
batch_size = 60
# NOTE(review): `optimizer` is assigned here but compile() below passes the
# literal 'rmsprop', so this value is not used in this cell.
optimizer = 'adam'

kf = KFold(n_splits = 10, shuffle = True, random_state = 2)
# One test accuracy per fold
resultados = []
# 1-based fold counter used only in the printed messages
contador = 1
for valores_entrenamiento, valores_testeo in kf.split(X_completo):
    model = tf.keras.Sequential()
    # Add the embedding layer, which plays the word2vec role
    model.add(tf.keras.layers.Embedding(input_dim=vocab_len, output_dim = ouput_dim_embedding, input_shape=(X_completo.shape[1],)))
    model.add(tf.keras.layers.LSTM(units=lstm_units, activation='sigmoid'))
    model.add(tf.keras.layers.Dense(units=ouputs_dense, activation='sigmoid'))
    model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
    history = model.fit(X_completo[valores_entrenamiento], y_completo[valores_entrenamiento], epochs = epoch, batch_size = batch_size)
    print()
    print(contador, '-Resultados con datos de testeo ')
    print()
    test_loss, test_acurracy = model.evaluate(X_completo[valores_testeo], y_completo[valores_testeo])
    print()
    print(contador, '-Data Test accuracy: {}'.format(test_acurracy))
    resultados.append(test_acurracy)
    print()
    predictions = model.predict_classes(X_completo[valores_testeo])
    # Confusion matrix
    # NOTE(review): tf.confusion_matrix and tf.Session look like TensorFlow 1.x
    # names; with the 2.2.0 version reported by the import cell they would
    # presumably need tf.math.confusion_matrix / tf.compat.v1.Session — confirm.
    confusion = tf.confusion_matrix(labels=y_completo[valores_testeo], predictions= predictions, dtype=tf.dtypes.int32, num_classes=num_classes)
    print(contador, '-Matriz de confusion')
    print(confusion.eval(session=tf.Session()))
    print()
    contador = contador + 1

Train on 46207 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7

1 -Resultados con datos de testeo 


1 -Data Test accuracy: 0.7561830282211304

1 -Matriz de confusion
[[469   6  11  27   5  40  10  18  11]
 [ 17 518   4  31   7  22  11  30  23]
 [ 32   2 150  19   9  26   6  15   4]
 [ 14   6   5 502   2  40  11   7  12]
 [ 20   6  14  30 142  25   5   7   5]
 [ 20   9   7  71   7 848   8  20   9]
 [ 43   8   4  31   4  26 492  20   3]
 [ 46  25  15  31   6  64  27 294  32]
 [ 15  14   6  36   3  22   8  17 468]]

Train on 46207 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7

2 -Resultados con datos de testeo 


2 -Data Test accuracy: 0.7518987059593201

2 -Matriz de confusion
[[413  20  63   5   9  12  24  10  12]
 [  9 617  21   3   4   7  18  14  13]
 [ 18  11 184   3   9  11   5  12   2]
 [ 14  18  40 455  10  31  23  15  23]
 [ 12  13  46   8 132  13  11   8   9]
 [ 25  30  60  42  14 709  27  29  22]
 [ 18  21  22   7  