# Predictor basado en CNN
Basado en https://github.com/satojkovic/cnn-text-classification-tf/tree/use_fasttext y https://github.com/jiegzhan/multi-class-text-classification-cnn

## Se importan las librerías y se configura el modelo entrenado

In [6]:
import os
import sys
import json
import logging
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET
from pathlib import Path
from bs4 import BeautifulSoup
from tensorflow.contrib import learn
from Preprocesador import Preprocesador

# Set the root logger threshold to INFO so the logging.info calls below are emitted
logging.getLogger().setLevel(logging.INFO)

# Folder of the trained model; predict_unseen_data reads its checkpoints and JSON files from it
carpetaModeloEntrenado = 'trained_model_1529315717'
# Preprocessor whose tweetCleaner method is applied to every tweet text before prediction
preprocesador = Preprocesador()

[':-)', ':-))', '(-:', '((-:', ':)', ':))', '(:', '((:', ':-]', '[-:']
[':-(', ':(', ':-c', ':c', ':-<', ':<', ':-[', ':[', ':-||', '>:[']


## Correr este si se va a cargar un CSV

In [None]:
csvPrueba = 'general-tweets-test1k.csv'

# Names given to the columns of the file
cols = ['id', 'usuario', 'Texto', 'Fecha', 'Idioma', 'Polaridad']

# Load the file into memory as a DataFrame.
# The file is read as having no header row, so the columns are labelled
# with the names defined above.
df = pd.read_csv(csvPrueba, names=cols, header=None)

# Show the first rows to check that the file was loaded correctly
df.head()

## Correr este si se va a cargar un XML

In [15]:
xmlPrueba = 'TASS2017_T1_test.xml'

# Names of the columns of the DataFrame that will hold the tweets
cols = [
    'id',
    'usuario',
    'Texto',
    'Fecha',
    'Idioma',
    'Polaridad',
]
df = pd.DataFrame(columns=cols)

def getvalueofnode(node):
    """Return ``node.text``, or None when ``node`` itself is None."""
    if node is None:
        return None
    return node.text
        
# Load the XML and convert it to a DataFrame
etree = ET.parse(xmlPrueba)  # create an ElementTree object

# Element path, relative to each child of the root, that feeds each column
# of ``cols`` (same order as ``cols``).
node_paths = ['tweetid', 'user', 'content', 'date', 'lang', 'sentiment/polarity/value']

# Collect every row first and build the DataFrame once: calling
# DataFrame.append inside the loop copied the whole frame for each tweet and
# the method no longer exists in pandas 2.0.
rows = [
    [getvalueofnode(node.find(path)) for path in node_paths]
    for node in etree.getroot()
]
df = pd.DataFrame(rows, columns=cols)

# Show the first rows to check that the file was loaded correctly
df.head()

Unnamed: 0,id,usuario,Texto,Fecha,Idioma,Polaridad
0,770567971701940224,wikimiscojones,@LonelySoad mientras que no te pillen la prime...,2016-08-30 10:24:55,es,
1,770503386789711872,HLF_Metr4spt,@ceemeese ya era hora de volver al csgo y deja...,2016-08-30 06:08:17,es,
2,770502863017635840,AVazquez_C,@mireiaescribano justo cuando se terminan las ...,2016-08-30 06:06:12,es,
3,770599972102348800,minniecris,@LuisMartinez22_ pensba q iba a hacer @wxplosi...,2016-08-30 12:32:05,es,
4,770599962216390656,VI_Lelouch,"@Vic_Phantomhive Si lo encuentro, sin compañer...",2016-08-30 12:32:02,es,


## Se va a pre-procesar el texto

In [16]:
# Work on a copy so the original tweet texts stay available in ``df``
clean_df = df.copy()

print ("Pre procesando tweets...\n")
# Use the number of rows, not Series.count(): count() skips null values, which
# would leave the cleaned list shorter than the frame and make the column
# assignment below fail.
total_tweets = len(df)
clean_tweet_texts = []
# Iterate over the values themselves instead of indexing by label, so the loop
# does not depend on the frame having a 0..n-1 index.
for processed, raw_text in enumerate(df['Texto'], start=1):
    # Report progress every 1000 tweets
    if processed % 1000 == 0:
        print("%d de %d tweets procesados" % (processed, total_tweets))
    clean_tweet_texts.append(preprocesador.tweetCleaner(str(raw_text)))
print ("Pre procesamiento completado")

clean_df['Texto'] = clean_tweet_texts
clean_df.head()

Pre procesando tweets...

1000 de 1899 tweets procesados
Pre procesamiento completado


Unnamed: 0,id,usuario,Texto,Fecha,Idioma,Polaridad
0,770567971701940224,wikimiscojones,lonelysoad mientras que no te pillen la primer...,2016-08-30 10:24:55,es,
1,770503386789711872,HLF_Metr4spt,ceemeese ya era hora de volver al csgo y dejar...,2016-08-30 06:08:17,es,
2,770502863017635840,AVazquez_C,mireiaescribano justo cuando se terminan las f...,2016-08-30 06:06:12,es,
3,770599972102348800,minniecris,luismartinezxnumx pensba q iba a hacer wxplosi...,2016-08-30 12:32:05,es,
4,770599962216390656,VI_Lelouch,vicphantomhive si lo encuentro sin compañeros ...,2016-08-30 12:32:02,es,


In [17]:
def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """
    Yield successive slices of ``data`` of at most ``batch_size`` items.

    ``data`` is converted to a NumPy array and walked ``num_epochs`` times.
    When ``shuffle`` is true a new random permutation of the items is drawn
    at the start of every epoch; otherwise the original order is kept.
    The last batch of an epoch may be shorter than ``batch_size``.
    """
    samples = np.array(data)
    total = len(samples)
    batches_per_epoch = int((total - 1) / batch_size) + 1
    for _ in range(num_epochs):
        if not shuffle:
            epoch_samples = samples
        else:
            # Fresh ordering for this epoch
            order = np.random.permutation(np.arange(total))
            epoch_samples = samples[order]
        for lo in range(0, batches_per_epoch * batch_size, batch_size):
            hi = min(lo + batch_size, total)
            yield epoch_samples[lo:hi]

In [36]:
def predict_unseen_data():
    """Predict a label for every tweet in ``clean_df`` and store it in ``df``.

    Restores the latest checkpoint found under ``carpetaModeloEntrenado``,
    converts the cleaned tweet texts to word ids with the vocabulary saved
    next to the model, runs the restored graph batch by batch and writes the
    predicted label of each tweet into the ``Polaridad`` column of ``df``.

    Reads the module-level names ``carpetaModeloEntrenado``, ``clean_df`` and
    ``df`` and the helper ``batch_iter``. Returns nothing. No gold labels are
    available in this flow, so no accuracy is computed.

    Raises:
        FileNotFoundError: if no checkpoint exists under
            ``<carpetaModeloEntrenado>/checkpoints``.
    """
    # Step 0: load trained model and parameters
    checkpoint_dir = carpetaModeloEntrenado
    if not checkpoint_dir.endswith('/'):
        checkpoint_dir += '/'
    checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir + 'checkpoints')
    if checkpoint_file is None:
        # latest_checkpoint returns None when nothing is found; fail here with
        # a clear message instead of later trying to open "None.meta".
        raise FileNotFoundError(
            'No checkpoint found in {}'.format(checkpoint_dir + 'checkpoints'))
    out_dir = os.path.abspath(os.path.join(os.path.curdir, carpetaModeloEntrenado))
    with open(os.path.join(out_dir, 'parametros.json')) as params_file:
        params = json.load(params_file)
    logging.critical('Loaded the trained model: {}'.format(checkpoint_file))

    # Step 1: load data for prediction
    # labels.json was saved during training, and it has to be loaded during prediction
    with open(os.path.join(out_dir, 'labels.json')) as labels_file:
        labels = json.load(labels_file)

    x_test = [str(example) for example in clean_df.Texto]
    logging.info('The number of x_test: {}'.format(len(x_test)))
    if not x_test:
        logging.warning('There are no tweets to predict')
        return

    vocab_path = os.path.join(checkpoint_dir, "vocab")
    vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
    x_test = np.array(list(vocab_processor.transform(x_test)))

    # The pre-trained vectors are optional: keep None when they are absent so
    # the prediction loop never references an undefined name.
    embedding = None
    if Path("fasttext_vocabulario.dat").is_file():
        print('Load pre-trained word vectors')
        embedding = np.load('fasttext_embeddings.npy')

    # Step 2: compute the predictions
    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        # The context manager makes the session the default one and closes it on exit
        with tf.Session(config=session_conf) as sess:
            saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            input_x = graph.get_operation_by_name("input_x").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
            predictions = graph.get_operation_by_name("output/predictions").outputs[0]

            # Dropout is disabled at prediction time (keep probability 1.0)
            base_feed = {dropout_keep_prob: 1.0}
            if embedding is not None:
                # NOTE(review): assumes a model trained without pre-trained
                # vectors does not need this placeholder fed - confirm against
                # the training code.
                embedding_placeholder = graph.get_operation_by_name('embedding/pre_trained').outputs[0]
                base_feed[embedding_placeholder] = embedding

            batches = batch_iter(list(x_test), params['batch_size'], 1, shuffle=False)
            batch_results = []
            for x_test_batch in batches:
                feed = dict(base_feed)
                feed[input_x] = x_test_batch
                batch_results.append(sess.run(predictions, feed))

    all_predictions = np.concatenate(batch_results)

    # Map the predicted class indices back to their label names
    actual_labels = [labels[int(prediction)] for prediction in all_predictions]

    # Replace the whole column in one aligned assignment; element-wise chained
    # assignment (df.Polaridad[i] = ...) may write to a temporary copy.
    df['Polaridad'] = pd.Series(actual_labels, index=clean_df.index)

In [37]:
# Run the prediction (it writes the labels into df's Polaridad column) and preview the result
predict_unseen_data()
df.head()

CRITICAL:root:Loaded the trained model: D:\respaldo joax\UCR\Maestria computacion\2018-1\NPL\Deep learning\trained_model_1529315717\checkpoints\model-5600
INFO:root:The number of x_test: 1899


Load pre-trained word vectors
INFO:tensorflow:Restoring parameters from D:\respaldo joax\UCR\Maestria computacion\2018-1\NPL\Deep learning\trained_model_1529315717\checkpoints\model-5600


INFO:tensorflow:Restoring parameters from D:\respaldo joax\UCR\Maestria computacion\2018-1\NPL\Deep learning\trained_model_1529315717\checkpoints\model-5600


Unnamed: 0,id,usuario,Texto,Fecha,Idioma,Polaridad
0,770567971701940224,wikimiscojones,@LonelySoad mientras que no te pillen la prime...,2016-08-30 10:24:55,es,N
1,770503386789711872,HLF_Metr4spt,@ceemeese ya era hora de volver al csgo y deja...,2016-08-30 06:08:17,es,N
2,770502863017635840,AVazquez_C,@mireiaescribano justo cuando se terminan las ...,2016-08-30 06:06:12,es,N
3,770599972102348800,minniecris,@LuisMartinez22_ pensba q iba a hacer @wxplosi...,2016-08-30 12:32:05,es,N
4,770599962216390656,VI_Lelouch,"@Vic_Phantomhive Si lo encuentro, sin compañer...",2016-08-30 12:32:02,es,P


In [38]:
# Save the labelled tweets to a UTF-8 encoded CSV file
csv = 'tweets_evaluados.csv'
df.to_csv(csv,encoding='utf-8')

In [48]:
def crearXML(df, filename=None, mode='w'):
    """Serialise a tweets DataFrame as a ``<tweets>`` XML document.

    Columns are mapped by position: the first five become ``<tweetid>``,
    ``<user>``, ``<content>``, ``<date>`` and ``<lang>``; the sixth becomes
    ``<sentiment><polarity><value>``. Any further columns are ignored.
    Every value is XML-escaped, so text containing ``&``, ``<`` or ``>``
    still yields a well-formed document.

    Args:
        df: DataFrame with one tweet per row.
        filename: path of the file to write; when None the XML is returned.
        mode: mode used to open ``filename``.

    Returns:
        The XML document as a string when ``filename`` is None, else None.
    """
    # Local import: keeps the function self-contained without touching the
    # import cell of the notebook.
    from xml.sax.saxutils import escape

    # Element names for the first five columns, by position
    simple_tags = ('tweetid', 'user', 'content', 'date', 'lang')

    def row_to_xml(row):
        xml = ['\t<tweet>']
        for i, value in enumerate(row[:6]):
            text = escape('{0}'.format(value))
            if i < len(simple_tags):
                xml.append('\t\t<{0}>{1}</{0}>'.format(simple_tags[i], text))
            else:
                xml.append('\t\t<sentiment>')
                xml.append('\t\t\t<polarity><value>{0}</value></polarity>'.format(text))
                xml.append('\t\t</sentiment>')
        xml.append('\t</tweet>')
        return '\n'.join(xml)

    # Iterate the rows explicitly: df.apply(axis=1) returns a DataFrame for an
    # empty frame, which would make the join below emit column names.
    tweets = [row_to_xml(row) for row in df.itertuples(index=False, name=None)]

    res = '<?xml version="1.0" encoding="UTF-8"?>\n<tweets>\n'
    res += '\n'.join(tweets)
    res += '\n</tweets>\n'

    if filename is None:
        return res
    with open(filename, mode, encoding='utf-8') as f:
        f.write(res)

# Write the labelled tweets to an XML file
crearXML(df,'TASS2017_T1_test_lleno.xml')