In [10]:

import pickle
import os

from preprocessing_merged import *
from itertools import combinations_with_replacement, permutations

import tflearn as tf
import numpy as np
import pandas as pd
import tensorflow
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.layers.recurrent import bidirectional_rnn, BasicLSTMCell
from tensorflow.contrib.tensorboard.plugins import projector

In [3]:
#Variables
logdir='/tmp/tflearn-logs' #Path for the logs (passed to create_bidir_LSTM and tensorboard_projector in the training cell)
n_train=15000 #Number of samples from each of the classes to use in training

In [4]:

def read_tweet_to_list(positive,full,is_train=True,max_num=None):
    '''
    Read tweets from one of the files in ./twitter-datasets/, dropping every digit character.

    The file name is built from the flags:
        is_train=False            -> test_data.txt (positive and full are ignored)
        positive=True             -> train_pos.txt, or train_pos_full.txt when full is True
        positive=False            -> train_neg.txt, or train_neg_full.txt when full is True

    INPUT:
        positive: boolean, if true, read the positive tweets file
        full: boolean, if true, read the tweets file with all tweets
        is_train: boolean, if true, read labeled tweets
        max_num: integer, maximum number of tweets to read; None means no limit
    OUTPUT:
        list of strings, one per line read, with digits removed
        (the trailing newline of each line is kept)
    SIDE EFFECTS:
        prints the length of the longest line that was read (measured before digits are removed)
    '''
    #Name of the file to read depending on the options
    if not is_train:
        #Without labels: the positive/full flags do not matter
        base='test_data'
    else:
        base='train_'+('pos' if positive else 'neg')
        #Full database
        if full:
            base+='_full'
    #Add extension
    base+='.txt'

    tweets=[] #Tweets read so far
    max_len=0 #Max length of tweet found

    #Open file
    with open('./twitter-datasets/{}'.format(base)) as f:
        #Read each line
        for line in f:
            #Stop BEFORE reading past the limit, so that exactly max_num tweets are returned
            #(checking after the append would return max_num+1 tweets)
            if max_num is not None and len(tweets)>=max_num:
                break
            #Save max tweet length found
            max_len=max(max_len,len(line))
            #Add tweet to the list removing the digits
            tweets.append(''.join(ch for ch in line if not ch.isdigit()))
    print ('max len of is_positive:{} tweet is {}'.format(positive,max_len))
    return tweets

In [5]:
def create_bidir_LSTM(num_vocab, dropout_bidir=0.7, dropout_lstm=0.8, logdir='/tmp/tflearn_logs/'):
    '''
    Build a two-branch network on top of one shared embedding layer:
    a bidirectional LSTM branch and a plain LSTM branch, whose outputs are
    concatenated and fed to a 2-unit softmax layer.
    (In this notebook `tf` is the tflearn alias and `tensorflow` is TensorFlow itself.)
    INPUT:
        num_vocab: number of words in features (input dimension of the embedding)
        dropout_bidir: value passed to the dropout layer of the bidirectional branch
        dropout_lstm: value passed as `dropout` to the LSTM layer of the second branch
        logdir: path for the logs (tensorboard_dir of the model), should be the same as with the projector
    OUTPUT:
        tflearn DNN model wrapping the network

    '''
    #Discard whatever graph was built before
    tensorflow.reset_default_graph()

    #Input layer followed by the embedding shared by both branches
    embedded = tf.embedding(
        tf.input_data([None, 100], name='input_layer'),
        input_dim=num_vocab, output_dim=256, name='embedded_layer')

    #Branch 1: bidirectional LSTM -> dropout -> fully connected (150 units, relu, L2)
    bidir_branch = tf.bidirectional_rnn(embedded, BasicLSTMCell(256), BasicLSTMCell(256), return_seq=True)
    #If a list was returned instead of a single tensor, stack it along axis 1
    if isinstance(bidir_branch, list):
        bidir_branch = tensorflow.stack(bidir_branch, axis=1)
    bidir_branch = tf.fully_connected(
        tf.dropout(bidir_branch, dropout_bidir),
        150, activation='relu', name='first_fc', regularizer='L2')

    #Branch 2: LSTM with dropout -> fully connected (200 units, relu, L2)
    lstm_branch = tf.lstm(embedded, 256, dropout=dropout_lstm, name='LSTM_layer', return_seq=True)
    #If a list was returned instead of a single tensor, stack it along axis 1
    if isinstance(lstm_branch, list):
        lstm_branch = tensorflow.stack(lstm_branch, axis=1)
    lstm_branch = tf.fully_connected(
        lstm_branch, 200, activation='relu', name='second_fc', regularizer='L2')

    #Join both branches along axis 1 and classify with a softmax layer
    merged = tensorflow.concat([bidir_branch, lstm_branch], 1, name="concat")
    merged = tf.fully_connected(merged, 2, activation='softmax', name='output')

    #Training op: adam optimizer with categorical cross entropy loss
    trainable = tf.regression(
        merged, optimizer='adam', learning_rate=0.001, loss='categorical_crossentropy')

    return tf.DNN(trainable, tensorboard_verbose=0, checkpoint_path=None, tensorboard_dir=logdir)

In [11]:

def tensorboard_projector(features, log_dir='/tmp/tflearn_logs/model/'):
    '''
    Create the tensorboard projector configuration for the embedding layer.

    Writes `features` (one entry per line) to <log_dir>/metadata.tsv and saves a
    projector configuration that links the tensor 'embedded_layer/W' to that file.
    INPUT:
        features: features returned by divide_train_test
        log_dir: path to save the logs; it may be given with or without a trailing separator
    '''
    #Join the path properly: plain string concatenation would put the file
    #outside of log_dir whenever log_dir has no trailing separator
    metadata_path = os.path.join(log_dir, 'metadata.tsv')

    # Use the same LOG_DIR where you stored your checkpoint.
    summary_writer = tensorflow.summary.FileWriter(log_dir)
    try:
        # Format: tensorflow/contrib/tensorboard/plugins/projector/projector_config.proto
        config = projector.ProjectorConfig()

        # You can add multiple embeddings. Here we add only one.
        embedding = config.embeddings.add()
        embedding.tensor_name = 'embedded_layer/W'

        # Link this tensor to its metadata file (e.g. labels).
        pd.Series(features).to_csv(metadata_path,sep='\n',index=False,header=False)
        embedding.metadata_path = metadata_path

        # Saves a configuration file that TensorBoard will read during startup.
        projector.visualize_embeddings(summary_writer, config)
    finally:
        # The writer is only needed to tell the projector where to save its configuration;
        # close it so that repeated calls do not accumulate open writers
        summary_writer.close()
    

In [7]:

#Read the labelled tweets and keep the first n_train of each class
neg_tweets = read_tweet_to_list(positive=False, full=False, max_num=None)[:n_train]
pos_tweets = read_tweet_to_list(positive=True, full=False, max_num=None)[:n_train]
#With is_train=False the file read is test_data.txt regardless of the first two flags
test_tweets = read_tweet_to_list(positive=False, full=False, is_train=False, max_num=None)

#Sizes of the training (negative + positive) and test sets
num_train_tweets = len(neg_tweets) + len(pos_tweets)
num_test_tweets = len(test_tweets)

max len of is_positive:False tweet is 226
max len of is_positive:True tweet is 218
max len of is_positive:False tweet is 177


In [22]:
#Wrap each list of tweets in a dataframe with a 'smile' label column: 0.0 for negative, 1.0 for positive
df_neg_tweets = pd.DataFrame({'tweet': neg_tweets, 'smile': np.zeros(len(neg_tweets)).tolist()})
df_pos_tweets = pd.DataFrame({'tweet': pos_tweets, 'smile': np.ones(len(pos_tweets)).tolist()})
#The test tweets also get 1.0 in 'smile' (presumably just a placeholder, since they are read as unlabeled)
df_test_tweets = pd.DataFrame({'tweet': test_tweets, 'smile': np.ones(len(test_tweets)).tolist()})

#Stack negative, positive and test tweets, in that order, in a single dataframe
df_full = pd.concat([df_neg_tweets, df_pos_tweets, df_test_tweets])

In [2]:
#Names of the preprocessing switches
opts=['length', 'negative', 'stop', 'antonyms', 'slang']

#Every switch is either on or off
b=[True, False]

#One dictionary {switch name: bool} for every on/off assignment of the switches.
#combinations_with_replacement yields each multiset of len(opts) values taken from b,
#and set(permutations(...)) expands a multiset into its distinct orderings:
#[a,a,a] -> [a,a,a]; [a,a,b] -> [a,a,b], [a,b,a], [b,a,a]
#NOTE(review): the order inside each group follows set iteration order; itertools.product(b, repeat=len(opts))
#would give the same dictionaries in a fixed order, but the index of each option is used in file names
#further down, so the order is left as it is.
preprocessing_options=[
    dict(zip(opts, ordering))
    for multiset in combinations_with_replacement(b, len(opts))
    for ordering in set(permutations(multiset))
]
print(len(preprocessing_options))

32


In [28]:
#For each of the preprocessing options generate the train and validation sets and pickle the result.
#(The split itself happens inside divide_train_test; an 80%-20% split according to the original note.)
#The pickles go to ./preprocessing_res/, which is where the training cell loads them from.
prep_dir='./preprocessing_res'
os.makedirs(prep_dir, exist_ok=True)
for n,prep_option in enumerate(preprocessing_options):
    print("Preprocessing: {}, {} ".format(n, prep_option))
    #Divide into train and test set
    trainX,trainY,testX,testY,X_test,num_vocab, features=divide_train_test(df_full, num_train_tweets,  prep_option)
    #Pickle result; the with-block closes the file even if pickling fails
    with open(os.path.join(prep_dir,'train_test_prep_'+str(n)),'wb') as prep_file:
        pickle.dump([trainX,trainY,testX,testY,X_test,num_vocab,features],prep_file)


Preprocessing: 0, {'stop': True, 'length': True, 'slang': True, 'negative': True, 'antonyms': False} 
Preprocessing done
Preprocessing: 1, {'stop': False, 'length': True, 'slang': True, 'negative': True, 'antonyms': False} 
Preprocessing done
Preprocessing: 2, {'stop': True, 'length': False, 'slang': True, 'negative': True, 'antonyms': False} 
Preprocessing done
Preprocessing: 3, {'stop': True, 'length': True, 'slang': False, 'negative': True, 'antonyms': False} 
Preprocessing done
Preprocessing: 4, {'stop': True, 'length': True, 'slang': True, 'negative': False, 'antonyms': False} 
Preprocessing done
Preprocessing: 5, {'stop': True, 'length': True, 'slang': False, 'negative': False, 'antonyms': False} 
Preprocessing done
Preprocessing: 6, {'stop': False, 'length': True, 'slang': False, 'negative': True, 'antonyms': False} 
Preprocessing done
Preprocessing: 7, {'stop': False, 'length': True, 'slang': True, 'negative': False, 'antonyms': False} 
Preprocessing done
Preprocessing: 8, {'st

In [12]:

#Train and evaluate one model per preprocessing option

#Make sure the directory the models are saved into exists before the first save
os.makedirs(os.path.join(logdir,'model'), exist_ok=True)

for n in range(len(preprocessing_options)):

    #Load the data from this preprocessing option
    #NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook
    with open('./preprocessing_res/train_test_prep_'+str(n),'rb') as prep_file:
        trainX,trainY,testX,testY,X_test,num_vocab,features = pickle.load(prep_file)

    #Create LSTM model
    model=create_bidir_LSTM(num_vocab, logdir=logdir)

    #Pass the directory with a trailing separator so that the metadata file lands inside it
    tensorboard_projector(features,log_dir=os.path.join(logdir,''))

    #Train for 6 epochs with batches of 64 samples, taking a snapshot every 1500 steps and at the end of each epoch
    model.fit(trainX, trainY,validation_set=(testX,testY), run_id='model'+str(n),n_epoch=6, batch_size=64,
          show_metric=True,snapshot_step=1500,snapshot_epoch=True)

    #Save the trained model and evaluate it on the validation set
    model.save(logdir+'/model/model_saved_'+str(n)+'.ckpt')
    acc=model.evaluate (testX, testY, batch_size=64)

    #Pickle the evaluation result; the with-block closes the file even if pickling fails
    with open('model_prep_'+str(n),'wb') as acc_file:
        pickle.dump([acc],acc_file)

---------------------------------
Run id: model0
Log directory: /tmp/tflearn-logs/


KeyboardInterrupt: 