# Clasificador basado en CNN + LSTM (GRU)
Basado en https://github.com/jiegzhan/multi-class-text-classification-cnn-rnn

Analizar esto luego: https://www.quora.com/Intuitively-how-does-mini-batch-size-affect-the-performance-of-stochastic-gradient-descent

Esto explica cómo usar el pr_curve: https://github.com/tensorflow/tensorboard/blob/master/tensorboard/plugins/pr_curve/README.md

In [1]:
#Hacer imports
import os
import json
import time
import logging
import numpy as np
import tensorflow as tf
import pandas as pd  
from text_cnn_rnn import TextCNNRNN
from tensorflow.contrib import learn
from sklearn.model_selection import train_test_split
from sklearn import metrics
from pathlib import Path
import pickle

# Let INFO-level (and more severe) records through the root logger.
logging.getLogger().setLevel(logging.INFO)

# CSV file holding the pre-processed tweets and their polarities.
archivoTweets = 'tweets_limpios.csv'

# Hyper-parameters read by the training code below.
# "filter_sizes" is a comma-separated string that is split and converted to
# integers when the model is built.
params = dict(
    num_epochs=100,
    batch_size=32,
    filter_sizes="3,5,7,9",
    embedding_dim=128,
    num_filters=128,
    l2_reg_lambda=0.1,
    evaluate_every=100,
    dropout_keep_prob=0.5,
    max_pool_size=4,
    hidden_unit=300,
)

#Cargar stopwords
#df = pd.read_csv("Stopwords.txt",header=None)
#spanish_stopwords = df[0].values.tolist()
#print("Stopwords cargados")

  from ._conv import register_converters as _register_converters


In [2]:
#Método que carga los textos pre-procesados con sus polaridades y los prepara para el entrenamiento
def cargar_datos_etiquetas(filename):
    """Load the texts and their polarity labels from a CSV file.

    Only the 'Texto' and 'Polaridad' columns are kept; rows where either of
    them is missing are dropped, the remaining rows are shuffled, and each
    polarity is encoded as a one-hot vector.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A tuple ``(x_raw, y_raw, df, labels)`` with the list of texts, the
        list of one-hot label vectors (aligned with ``x_raw``), the filtered
        and shuffled DataFrame, and the sorted list of distinct polarities.
    """
    text_col, label_col = 'Texto', 'Polaridad'
    wanted = [text_col, label_col]

    df = pd.read_csv(filename)

    # Discard every column other than the two we need.
    extra_cols = list(set(df.columns) - set(wanted))
    df = df.drop(extra_cols, axis=1)
    # Discard rows with a missing text or polarity.
    df = df.dropna(axis=0, how='any', subset=wanted)
    # Shuffle the rows.
    df = df.reindex(np.random.permutation(df.index))

    # Row i of the identity matrix is the one-hot vector of the i-th label.
    labels = sorted(set(df[label_col].tolist()))
    identity = np.eye(len(labels), dtype=int)
    label_dict = {label: identity[i] for i, label in enumerate(labels)}

    x_raw = df[text_col].tolist()
    y_raw = [label_dict[value] for value in df[label_col].tolist()]
    return x_raw, y_raw, df, labels

def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """Yield mini-batches of ``data`` for ``num_epochs`` passes.

    Args:
        data: Sequence of samples. It is converted to a NumPy array; when the
            samples are ragged (e.g. ``(x, y)`` pairs whose members have
            different lengths) an object array is used instead.
        batch_size: Maximum number of samples per batch.
        num_epochs: Number of passes over the data.
        shuffle: When true, the samples are re-shuffled at the start of
            every epoch.

    Yields:
        NumPy arrays holding at most ``batch_size`` samples; the last batch
        of an epoch may be smaller. Nothing is yielded for empty ``data``.
    """
    try:
        data = np.array(data)
    except ValueError:
        # Recent NumPy versions refuse to build an array from ragged nested
        # sequences implicitly; request an object array explicitly.
        data = np.array(data, dtype=object)

    data_size = len(data)
    if data_size == 0:
        # The batch-count formula below would otherwise produce one empty
        # batch per epoch.
        return

    num_batches_per_epoch = int((data_size - 1) / batch_size) + 1
    for _ in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            epoch_data = data[shuffle_indices]
        else:
            epoch_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min(start_index + batch_size, data_size)
            yield epoch_data[start_index:end_index]

def train_cnn():
    """Train the CNN+RNN text classifier and report its accuracy.

    Reads the tweets from ``archivoTweets``, converts them to padded id
    sequences, splits them into train/validation/test sets, and trains a
    ``TextCNNRNN`` model using the hyper-parameters in ``params``. Every
    ``params['evaluate_every']`` steps the model is evaluated on the
    validation set and a checkpoint is written whenever the accuracy is at
    least as good as the best seen so far. At the end the test set is
    evaluated with both the final weights and the last saved checkpoint.

    Side effects: writes ``labels.json`` in the working directory and creates
    a ``trained_model_<timestamp>`` directory containing the checkpoints and
    the vocabulary file.
    """
    # Step 0: load sentences and labels.
    x_raw, y_raw, _, labels = cargar_datos_etiquetas(archivoTweets)

    # Step 1: map words to ids and pad every text to the same length.
    max_document_length = max([len(x.split(' ')) for x in x_raw])
    logging.info(" Oración más larga tiene {} palabras. Se agregan 5 para tener espacio de manipulación para nuevas oraciones".format(max_document_length))
    max_document_length += 5

    vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
    vocab_file = Path("fasttext_vocabulario.dat")
    preentrenado = vocab_file.is_file()
    embedding = None  # only populated when pre-trained vectors are available
    if preentrenado:
        logging.info(" Cargando vectores generados previamente")
        # NOTE(security): pickle.load executes arbitrary code from the file;
        # only load vocabulary files produced by a trusted source.
        with open('fasttext_vocabulario.dat', 'rb') as fr:
            vocab = pickle.load(fr)
        embedding = np.load('fasttext_embeddings.npy')

        vocab_processor.fit(vocab.keys())
        x = np.array(list(vocab_processor.transform(x_raw)))
        # NOTE(review): assumes the ids produced by vocab_processor line up
        # with the rows of the embedding matrix - TODO confirm.
        vocab_size = len(vocab)
    else:
        logging.info(" Generando vectores")
        x = np.array(list(vocab_processor.fit_transform(x_raw)))
        vocab_size = len(vocab_processor.vocabulary_)

    y = np.array(y_raw)

    # Step 2: hold out 10% of the data as the test set.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=42)

    # Step 3: shuffle the training data and hold out 10% of it for validation.
    shuffle_indices = np.random.permutation(np.arange(len(y_train)))
    x_shuffled = x_train[shuffle_indices]
    y_shuffled = y_train[shuffle_indices]
    x_train, x_dev, y_train, y_dev = train_test_split(x_shuffled, y_shuffled, test_size=0.1)

    # Step 4: store the labels in labels.json so predictions can be decoded later.
    with open('labels.json', 'w') as outfile:
        json.dump(labels, outfile, indent=4)

    logging.info(' x_train: {}, x_dev: {}, x_test: {}'.format(len(x_train), len(x_dev), len(x_test)))
    logging.info(' y_train: {}, y_dev: {}, y_test: {}'.format(len(y_train), len(y_dev), len(y_test)))

    # Step 5: build the graph and the model object.
    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        # Using the session as a context manager makes it the default session
        # and closes it on exit, releasing its resources.
        with tf.Session(config=session_conf) as sess:
            cnn = TextCNNRNN(
                sequence_length=x_train.shape[1],
                num_classes=y_train.shape[1],
                vocab_size=vocab_size,
                hidden_unit=params['hidden_unit'],
                max_pool_size=params['max_pool_size'],
                embedding_size=params['embedding_dim'],
                # A real list: under Python 3 map() is a one-shot iterator
                # without len(), which breaks any consumer that needs more
                # than a single pass.
                filter_sizes=[int(size) for size in params['filter_sizes'].split(",")],
                num_filters=params['num_filters'],
                l2_reg_lambda=params['l2_reg_lambda'],
                pre_trained=preentrenado)

            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(1e-3)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(os.path.join(os.path.curdir, "trained_model_" + timestamp))

            checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables())

            def real_len(batches):
                """Return, per sequence, the index of its first 0 id scaled by the pool size."""
                # A sentinel 0 is appended so that a sequence without any 0
                # yields its full length. (The previous `batch + [0]` is an
                # element-wise addition on ndarray rows, not an append.)
                # NOTE(review): presumably id 0 is also used for unknown
                # words, in which case the length stops at the first unknown
                # word - confirm against VocabularyProcessor.
                return [np.ceil(np.argmin(np.append(batch, 0)) * 1.0 / params['max_pool_size']) for batch in batches]

            def build_feed_dict(x_batch, y_batch, keep_prob):
                """Assemble the feed dict shared by the training and evaluation steps."""
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: keep_prob,
                    cnn.batch_size: len(x_batch),
                    cnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn.real_len: real_len(x_batch),
                }
                if preentrenado:
                    feed_dict[cnn.embedding_placeholder] = embedding
                return feed_dict

            # One training step: train the model with one batch
            def train_step(x_batch, y_batch):
                feed_dict = build_feed_dict(x_batch, y_batch, params['dropout_keep_prob'])
                sess.run(train_op, feed_dict)

            # One evaluation step: evaluate the model with one batch
            # (dropout disabled) and return its accuracy.
            def dev_step(x_batch, y_batch):
                feed_dict = build_feed_dict(x_batch, y_batch, 1.0)
                return sess.run(cnn.accuracy, feed_dict)

            # Write vocabulary
            vocab_processor.save(os.path.join(out_dir, "vocab"))

            # Initialize the classifier variables.
            sess.run(tf.global_variables_initializer())

            # Training starts here
            train_batches = batch_iter(list(zip(x_train, y_train)), params['batch_size'], params['num_epochs'])
            best_accuracy, best_at_step = 0, 0
            # Path of the last checkpoint written; stays None when training
            # ends before any evaluation step has saved a model.
            path = None

            # Step 6: train the model on x_train / y_train, batch by batch.
            logging.info(" Inicio de entrenamiento")
            for train_batch in train_batches:
                x_train_batch, y_train_batch = zip(*train_batch)
                train_step(x_train_batch, y_train_batch)
                current_step = tf.train.global_step(sess, global_step)

                if current_step % params['evaluate_every'] == 0:
                    logging.info(" Etapa: {}".format(current_step))
                    # Step 6.1: evaluate the model on the validation set.
                    dev_accuracy = dev_step(x_dev, y_dev)

                    logging.critical(' Exactitud en set de validación: {}'.format(dev_accuracy))

                    if dev_accuracy >= best_accuracy:
                        # Step 6.2: save the model when it is the best so far
                        # according to validation accuracy.
                        best_accuracy, best_at_step = dev_accuracy, current_step
                        path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                        logging.critical(' Modelo guardado en {} en etapa {}'.format(path, best_at_step))
                        logging.critical(' Mejor exactitud es {} en etapa {}'.format(best_accuracy, best_at_step))

            # Step 7: evaluate the test set with the weights from the last step.
            test_accuracy = dev_step(x_test, y_test)
            logging.critical(' Exactitud en set de pruebas es {} basado en el último modelo {}'.format(test_accuracy, path))

            # Checkpoints are only written on improvement, so the latest
            # checkpoint is the best model seen during training.
            checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
            if checkpoint_file is None:
                logging.warning(' No se guardó ningún modelo; se omite la prueba con el mejor modelo')
            else:
                # Restore with the Saver that wrote the checkpoint instead of
                # importing the meta graph again into the live graph.
                saver.restore(sess, checkpoint_file)
                logging.critical(' Cargado el mejor modelo para hacer las pruebas: {}'.format(checkpoint_file))
                test_accuracy = dev_step(x_test, y_test)
                logging.critical(' Exactitud en set de pruebas es {} basado en el mejor modelo {}'.format(test_accuracy, path))
            logging.critical(' Entrenamiento completado')

In [None]:
train_cnn()

INFO:root: Oración más larga tiene 47 palabras. Se agregan 5 para tener espacio de manipulación para nuevas oraciones


Instructions for updating:
Please use tensorflow/transform or tf.data.


Instructions for updating:
Please use tensorflow/transform or tf.data.


Instructions for updating:
Please use tensorflow/transform or tf.data.


Instructions for updating:
Please use tensorflow/transform or tf.data.
INFO:root: Cargando vectores generados previamente
INFO:summarizer.preprocessing.cleaner:'pattern' package not found; tag filters are not available for English


Instructions for updating:
Please use tensorflow/transform or tf.data.


Instructions for updating:
Please use tensorflow/transform or tf.data.
INFO:root: x_train: 6255, x_dev: 696, x_test: 773
INFO:root: y_train: 6255, y_dev: 696, y_test: 773
INFO:root: Inicio de entrenamiento
INFO:root: Etapa: 100
CRITICAL:root: Exactitud en set de validación: 0.4295977056026459
CRITICAL:root: Modelo guardado en D:\respaldo joax\UCR\Maestria computacion\2018-1\NPL\Deep learning\trained_model_1528941367\checkpoints\model-100 en etapa 100
CRITICAL:root: Mejor exactitud es 0.4295977056026459 en etapa 100
INFO:root: Etapa: 200
CRITICAL:root: Exactitud en set de validación: 0.45258620381355286
CRITICAL:root: Modelo guardado en D:\respaldo joax\UCR\Maestria computacion\2018-1\NPL\Deep learning\trained_model_1528941367\checkpoints\model-200 en etapa 200
CRITICAL:root: Mejor exactitud es 0.45258620381355286 en etapa 200
INFO:root: Etapa: 300
CRITICAL:root: Exactitud en set de validación: 0.4497126340866089
INFO:root: Etapa: 400
CRITICAL:root: Exactitud en set de validación: 0.451