In [4]:
# include useful folders
import sys

In [5]:
sys.path.append("../vendors/mtl_girnet/data_prep/")

In [82]:
# enable or disable cuda
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [83]:
import json
import h5py
import numpy as np
import glob
import random
import pandas as pd
import re
from sklearn.utils import shuffle

# nltk
import nltk

# tokenizer
from twokenize import tokenizeRawTweetText as tokenize

# for a particular dataset
from xml.dom import minidom

In [84]:
# for trying differnet types of tokenizer

# from nltk.corpus import stopwords
# from  nltk.stem import SnowballStemmer
# from tokensize_deepmoji import tokenize
# from nltk.tokenize import TweetTokenizer
# tokenizer = TweetTokenizer(a)
# from tokenizer import tokenizer
# T = tokenizer.TweetTokenizer(preserve_handles=False, preserve_hashes=False, preserve_case=False, preserve_url=False, regularize=True)
# nltk.download('stopwords')
# stop_words = stopwords.words("english")
# stemmer = SnowballStemmer("english")
# TEXT_CLEANING_RE = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"
# def preprocess(text, stem=False):
#     # Remove link,user and special characters
#     text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
#     tokens = []
#     for token in text.split():
#         if token not in stop_words: 
#             if stem:
#                 tokens.append(stemmer.stem(token))
#             else:
#                 tokens.append(token)
#     return tokenizer.tokenize(" ".join(tokens))

## Data Preprocessing

### Sentiment140 dataset 
https://www.kaggle.com/kazanova/sentiment140

In [85]:
# essential functions/declarations
decode_map = {0: -1, 2: 0, 4: 1}

In [86]:
#1.6 millionss tweets dataset
df = pd.read_csv('../data/'+'training.1600000.processed.noemoticon.csv', encoding='ISO-8859-1' , names=["target", "ids", "date", "flag", "user", "text"])

In [87]:
# this is goning to take some time. chill 
df.target = df.target.apply(lambda x: decode_map[int(x)])
df.text = df.text.apply(lambda x: tokenize(x))

In [88]:
data = map( lambda x :{'sentiment': x[0] , 'tokens': x[-1] , } , df.to_numpy() )

In [89]:
en_sentiment140 = list(data)

### English-Spanish Code Mixed Data 

In [90]:
sents = {"N":-1 , "P" :1 , "NONE":0}

In [91]:
data = open("../vendors/mtl_girnet/data_prep/data_cm_senti/cs-corpus-with-tweets_train.txt", encoding='utf-8').read().split("\n") 
data = map( lambda x : x.split("\t") , data )
data = map( lambda x :{'sentiment': sents[x[1]] , 'tokens': tokenize(x[2]) , 'text': x[2] } , data )
en_es_wssa_data_train = data

In [92]:
data = open("../vendors/mtl_girnet/data_prep/data_cm_senti/cs-corpus-with-tweets_test.txt", encoding='utf-8').read().split("\n") 
data = map( lambda x : x.split("\t") , data )
data = map( lambda x :{'sentiment': sents[x[1]] , 'tokens': tokenize(x[2]) , 'text': x[2] } , data )
en_es_wssa_data_test = data

In [93]:
en_es_wssa_data = list(en_es_wssa_data_train) + list(en_es_wssa_data_test)

### Spanish Tweet Dataset

In [94]:
xmldoc = minidom.parse("../vendors/mtl_girnet/data_prep/data_cm_senti/general-tweets-train-tagged.xml")
tweets = xmldoc.getElementsByTagName('tweet')

sents = {"N":-1 , "P" :1 , "NEU":0 , 'NONE':0 , "P+" : 1 , "N+":-1 }


es_tass1_data = []

for i in range( len(tweets)-1) :
    if i == 6055:
        continue # bad jogar
    textt = tweets[i].getElementsByTagName('content')[0].childNodes[0].data
    words = tokenize( textt )
    sentiment = tweets[i].getElementsByTagName('polarity')[0].getElementsByTagName('value')[0].childNodes[0].data
    assert len(tweets[i].getElementsByTagName('polarity')[0].getElementsByTagName('entity'))==0
    es_tass1_data.append({'text':textt , 'tokens':words , 'sentiment': sents[sentiment] })

### Some english tweet data

In [95]:
data = open("../vendors/mtl_girnet/data_prep/data_cm_senti/twitter4242.txt", "r", encoding="utf-8",errors='ignore').read().split("\n")[1:-1]
data = map( lambda x : x.split("\t") , data )
data = map( lambda x :{'sentiment': int(np.sign(int(x[0])-int(x[1]))) , 'tokens': tokenize(x[2]) , 'text': x[2] } , data )

en_twitter_data = list(data)

### es2_twitter_data

In [96]:
data = open("../vendors/mtl_girnet/data_prep/data_cm_senti/1600_tweets_dev_complete.txt", encoding="utf-8").read().split("\n")[1:-1]
data += open("../vendors/mtl_girnet/data_prep/data_cm_senti/1600_tweets_test_average_complete.tsv", encoding="utf-8").read().split("\n")[1:-2]

data = map( lambda x : x.split("\t") , data )
data = map( lambda x :{'sentiment': int(np.sign(int(x[0])-int(x[1]))) , 'tokens': tokenize(x[2]) , 'text': x[2] } , data )

es2_twitter_data = list(data)

In [97]:
print("Code-Mixed: en_es_wssa_data: %d" % len(en_es_wssa_data))
print("Spanish: es2_twitter_data: %d" % len(es2_twitter_data))
print("Spanish: es_tass1_data: %d" % len(es_tass1_data))
print("English: en_twitter_data: %d" % len(en_twitter_data))
print("English: en_sentiment140: %d" %len(en_sentiment140))

Code-Mixed: en_es_wssa_data: 3062
Spanish: es2_twitter_data: 3202
Spanish: es_tass1_data: 7217
English: en_twitter_data: 4241
English: en_sentiment140: 1600000


# Load Embedding

In [98]:
import io
import numpy as np

In [99]:
## essential functions
def load_vec(emb_path, nmax=50000):
    vectors = []
    word2id = {}
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        next(f)
        for i, line in enumerate(f):
            word, vect = line.rstrip().split(' ', 1)
            vect = np.fromstring(vect, sep=' ')
            assert word not in word2id, 'word found twice'
            vectors.append(vect)
            word2id[word] = len(word2id)
            if len(word2id) == nmax:
                break
    id2word = {v: k for k, v in word2id.items()}
    embeddings = np.vstack(vectors)
    return embeddings, id2word, word2id

In [100]:
## NEED TO RUN MUSE BEFORE THIS and to get this path
src_path = '../vendors//MUSE/dumped/debug/4u9hakomha/vectors-en.txt'
tgt_path = '../vendors//MUSE/dumped/debug/4u9hakomha/vectors-es.txt'
nmax = 100000  # maximum number of word embeddings to load

src_embeddings, src_id2word, src_word2id = load_vec(src_path, nmax)
tgt_embeddings, tgt_id2word, tgt_word2id = load_vec(tgt_path, nmax)

In [101]:
src_embeddings, src_id2word, src_word2id = load_vec(src_path, nmax)
tgt_embeddings, tgt_id2word, tgt_word2id = load_vec(tgt_path, nmax)

# Analysis embedding

In [102]:
def get_nn(word, src_emb, src_id2word, tgt_emb, tgt_id2word, K=5):
    print("Nearest neighbors of \"%s\":" % word)
    word2id = {v: k for k, v in src_id2word.items()}
    word_emb = src_emb[word2id[word]]
    scores = (tgt_emb / np.linalg.norm(tgt_emb, 2, 1)[:, None]).dot(word_emb / np.linalg.norm(word_emb))
    k_best = scores.argsort()[-K:][::-1]
    for i, idx in enumerate(k_best):
        print('%.4f - %s' % (scores[idx], tgt_id2word[idx]))

In [103]:
# printing nearest neighbors in the source space
src_word = 'cat'
get_nn(src_word, src_embeddings, src_id2word, src_embeddings, src_id2word, K=5)

Nearest neighbors of "cat":
1.0000 - cat
0.7322 - cats
0.6453 - kitten
0.6381 - dog
0.6218 - kittens


In [104]:
# printing nearest neighbors in the target space
src_word = 'cat'
get_nn(src_word, src_embeddings, src_id2word, tgt_embeddings, tgt_id2word, K=5)

Nearest neighbors of "cat":
0.6266 - gato
0.5317 - perro
0.5213 - gatito
0.4872 - gorila
0.4767 - ratoncito


In [105]:
src_embeddings.shape

(100000, 300)

# Merge Embedding

In [106]:
from keras.preprocessing import sequence
from keras.utils import to_categorical

In [107]:
def merge_embeddings(src_embeddings, tgt_embeddings):
    
    # make combined embedding mattrix
    embedding_matrix = src_embeddings.copy().tolist()
    embedding_matrix.extend(tgt_embeddings.tolist())
    embedding_matrix = np.array(embedding_matrix)
    
    # make combined id2word and word2id
    id2word = src_id2word.copy()
    word2id = src_word2id.copy()
    
    next_id = len(id2word.keys())
    counter = len(id2word.keys())
    
    to_be_removed_id = []
    common_words = []
    
    for key in tgt_id2word:
        if tgt_id2word[key] in word2id:
            to_be_removed_id.append(counter)
            common_words.append(tgt_id2word[key])
            embedding_matrix[word2id[tgt_id2word[key]]] =  (embedding_matrix[word2id[tgt_id2word[key]]] + embedding_matrix[counter])/2
        else:
            id2word[next_id] = tgt_id2word[key]
            word2id[tgt_id2word[key]] = next_id
            next_id += 1
        counter += 1
        
    embedding_matrix = np.delete(embedding_matrix, to_be_removed_id, axis=0)
        
    return embedding_matrix, id2word, word2id, common_words

In [108]:
embedding_matrix, id2word, word2id, common_words = merge_embeddings(src_embeddings, tgt_embeddings)

In [109]:
print("embedding size: ", str(embedding_matrix.shape))

embedding size:  (161832, 300)


In [110]:
print("Number of common words in both the embedding %d" % len(common_words))

Number of common words in both the embedding 38168


In [111]:
# ADD UNK
# TODO:

# Data Preprocess Part 2

In [112]:
from  nltk.stem import SnowballStemmer

In [113]:
MAX_SEQUENCE_LENGTH = 20

In [114]:
words_not_found = 0
def from_datas_to_x_y(list_data, word2id, max_seq_len=20, max_classes=3, seed=0):
    stemmer = SnowballStemmer("english")  
    words_not_found = 0
    def to_x(sample):
        global words_not_found
        x = []
        for word in sample['tokens']:
            # lower the word 
            word = word.lower()
            if word in word2id:
                x.append(word2id[word]) 
            else:
                stem = stemmer.stem(word) # find stem
                if stem in word2id:
                    x.append(word2id[stem])
                else:
                    words_not_found = words_not_found + 1
                    pass
        return x

    def to_x_y(data):
        temp =  np.array(list(map(lambda x : [to_x(x), x['sentiment']], data)))
        x = list(sequence.pad_sequences(temp[:,0], maxlen=max_seq_len))
        y = list(to_categorical(temp[:,1],num_classes=max_classes))
        return x, y
    
    x,y = [],[]
    for data in list_data:
        x_, y_ = to_x_y(data)
        print("x: %d \t y: %d" % (len(x_),len(y_)))
        x.extend(x_)
        y.extend(y_)
    x = np.array(x)
    y = np.array(y)
    x,y = shuffle(x, y, replace=True)
    
    print("Not Found words = %f" % (float(words_not_found)/(x.shape[0])))
    return x,y

In [115]:
x_test,y_test = from_datas_to_x_y([en_es_wssa_data],word2id)

x: 3062 	 y: 3062
Not Found words = 0.000000


In [116]:
def evaluate():
    print("One-Shot Code Mixed")
    x_test,y_test = from_datas_to_x_y([en_es_wssa_data],word2id)
    print(model.evaluate(x_test,y_test,batch_size=128,verbose=0))
    
    print("One-Shot Spanish: 1")
    x_test,y_test = from_datas_to_x_y([es2_twitter_data],word2id)
    print(model.evaluate(x_test,y_test,batch_size=128,verbose=0))
    
    print("One-Shot Spanish: 2")
    x_test,y_test = from_datas_to_x_y([es_tass1_data],word2id)
    print(model.evaluate(x_test,y_test,batch_size=128,verbose=0))

In [117]:
# print(x.shape)
# print(y.shape)

# Base Model

In [118]:
from keras.layers import *
from keras.models import Sequential

In [119]:
#embedding
EMBEDDING_DIM = embedding_matrix.shape[1] 
MAX_NUM_WORDS = embedding_matrix.shape[0]

In [120]:
# for cnn_lstm

# Convolution
kernel_size = 5
filters = 64
pool_size = 1
# lstm
lstm_output_size = 70

In [121]:
model = Sequential()
model.add(Embedding(MAX_NUM_WORDS,
                     EMBEDDING_DIM,
                     weights=[embedding_matrix],
                     input_length=MAX_SEQUENCE_LENGTH,
                     trainable=True))
# model.add(Dropout(0.2))
# model.add(Conv1D(filters,
#                  kernel_size,
#                  activation='relu',
#                  strides=1))
# model.add(MaxPooling1D(pool_size=pool_size))
# model.add(LSTM(lstm_output_size))
model.add(Bidirectional(LSTM(512, dropout=0.1, recurrent_dropout=0.1)))
model.add(Dense(3))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy', f1])

In [122]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 20, 300)           48549600  
_________________________________________________________________
bidirectional_3 (Bidirection (None, 1024)              3330048   
_________________________________________________________________
dense_3 (Dense)              (None, 3)                 3075      
_________________________________________________________________
activation_3 (Activation)    (None, 3)                 0         
Total params: 51,882,723
Trainable params: 51,882,723
Non-trainable params: 0
_________________________________________________________________


In [123]:
from keras.callbacks import EarlyStopping, ModelCheckpoint
earlystop = EarlyStopping(monitor='val_f1', min_delta=0.01, patience=4, \
                          verbose=1, mode='auto', restore_best_weights=True)
checkpointer = ModelCheckpoint(filepath='weights.{epoch:02d}-{val_f1:.2f}.hdf5', verbose=1, save_best_only=True, save_weights_only=True, monitor='val_f1')

# Just english

In [124]:
x_train, y_train = from_datas_to_x_y([en_sentiment140,en_twitter_data],word2id)

x: 1600000 	 y: 1600000
x: 4241 	 y: 4241
Not Found words = 0.000000


In [125]:
get_class_weight(y_train)

{0: 273.9482581967213, 1: 0.6673159957071905, 2: 0.6676417599622448}

In [126]:
history = model.fit( x_train , y_train, epochs=10, batch_size=648, validation_split=0.3, shuffle=True, callbacks=[earlystop,checkpointer])

Train on 1122968 samples, validate on 481273 samples
Epoch 1/10

Epoch 00001: val_f1 improved from inf to 0.81003, saving model to weights.01-0.81.hdf5
Epoch 2/10

Epoch 00002: val_f1 did not improve from 0.81003
Epoch 3/10

Epoch 00003: val_f1 did not improve from 0.81003
Epoch 4/10

Epoch 00004: val_f1 did not improve from 0.81003
Epoch 5/10
Restoring model weights from the end of the best epoch

Epoch 00005: val_f1 did not improve from 0.81003
Epoch 00005: early stopping


In [127]:
evaluate()

One-Shot Code Mixed
x: 3062 	 y: 3062
Not Found words = 0.000000
[3.7075264187594166, 0.40855649919544146, 0.40830850908757815]
One-Shot Spanish: 1
x: 3202 	 y: 3202
Not Found words = 0.000000
[2.3992607895654863, 0.4216114928169894, 0.4099066541911214]
One-Shot Spanish: 2
x: 7217 	 y: 7217
Not Found words = 0.000000
[2.1086025899576812, 0.4521269225068282, 0.4442575112577784]


# Just Spanish

In [128]:
x_train_es, y_train_es =  from_datas_to_x_y([es_tass1_data, es2_twitter_data], word2id=word2id)

x: 7217 	 y: 7217
x: 3202 	 y: 3202
Not Found words = 0.000000


In [129]:
get_class_weight(y_train_es)

{0: 1.0385765550239234, 1: 0.8748110831234257, 2: 1.1185185185185185}

In [130]:
history = model.fit(x_train_es, y_train_es, epochs=10, batch_size=128, validation_split=0.1, shuffle=True, callbacks=[earlystop,checkpointer])

Train on 9377 samples, validate on 1042 samples
Epoch 1/10

Epoch 00001: val_f1 improved from 0.81003 to 0.60796, saving model to weights.01-0.61.hdf5
Epoch 2/10

Epoch 00002: val_f1 did not improve from 0.60796
Epoch 3/10

Epoch 00003: val_f1 did not improve from 0.60796
Epoch 4/10

Epoch 00004: val_f1 did not improve from 0.60796
Epoch 5/10
Restoring model weights from the end of the best epoch

Epoch 00005: val_f1 improved from 0.60796 to 0.60421, saving model to weights.05-0.60.hdf5
Epoch 00005: early stopping


In [131]:
evaluate()

One-Shot Code Mixed
x: 3062 	 y: 3062
Not Found words = 0.000000
[1.0251387802313388, 0.47028086239570577, 0.4191458797158952]
One-Shot Spanish: 1
x: 3202 	 y: 3202
Not Found words = 0.000000
[0.7380538467762249, 0.6714553404122423, 0.6372328620266721]
One-Shot Spanish: 2
x: 7217 	 y: 7217
Not Found words = 0.000000
[0.6001646815285188, 0.754745739069906, 0.7380338265110585]


## Both English and Spanish

In [132]:
x_train_both, y_train_both = from_datas_to_x_y([en_twitter_data,es2_twitter_data,es_tass1_data],word2id)

x: 4241 	 y: 4241
x: 3202 	 y: 3202
x: 7217 	 y: 7217
Not Found words = 0.000000


In [133]:
model.fit(x_train_both,y_train_both,validation_data=(x_test,y_test),batch_size=128, epochs=10, shuffle=True, callbacks=[earlystop,checkpointer])

Train on 14660 samples, validate on 3062 samples
Epoch 1/10

Epoch 00001: val_f1 improved from 0.60421 to 0.47652, saving model to weights.01-0.48.hdf5
Epoch 2/10

Epoch 00002: val_f1 did not improve from 0.47652
Epoch 3/10

Epoch 00003: val_f1 did not improve from 0.47652
Epoch 4/10

Epoch 00004: val_f1 did not improve from 0.47652
Epoch 5/10
Restoring model weights from the end of the best epoch

Epoch 00005: val_f1 did not improve from 0.47652
Epoch 00005: early stopping


<keras.callbacks.History at 0x7f30e3b49f60>

In [134]:
evaluate()

One-Shot Code Mixed
x: 3062 	 y: 3062
Not Found words = 0.000000
[0.9686988688309468, 0.5411495764141843, 0.47658517440886844]
One-Shot Spanish: 1
x: 3202 	 y: 3202
Not Found words = 0.000000
[0.5271762280744139, 0.7960649594003748, 0.7858611731734744]
One-Shot Spanish: 2
x: 7217 	 y: 7217
Not Found words = 0.000000
[0.3729386384614009, 0.8654565610217638, 0.8553500539199028]


# Cross-Lingual

In [12]:
from keras import backend as K


def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        """Recall metric.
        Only computes a batch-wise average of recall.
        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.
        Only computes a batch-wise average of precision.
        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2 * ((precision * recall) / (precision + recall + K.epsilon()))

Using TensorFlow backend.


In [13]:
from collections import Counter
from sklearn.utils.class_weight import compute_class_weight
def get_class_weight(y):
    """
    Used from: https://stackoverflow.com/a/50695814
    TODO: check validity and 'balanced' option
    :param y: A list of one-hot-encoding labels [[0,0,1,0],[0,0,0,1],..]
    :return: class-weights to be used by keras model.fit(.. class_weight="") -> {0:0.52134, 1:1.adas..}
    """
    y_integers = np.argmax(y, axis=1)
    class_weights = compute_class_weight('balanced', np.unique(y_integers), y_integers)
    d_class_weights = dict(enumerate(class_weights))
    return d_class_weights

In [None]:
# doesn't work

In [29]:
from dlblocks import text
from dlblocks.pyutils import mapArrays , loadJson , saveJson , selectKeys , oneHotVec , padList
from dlblocks.pyutils import int64Arr , floatArr

In [None]:
vocab = text.Vocabulary()

for d in es_tass1_data + en_es_wssa_data + en_twitter_data + es2_twitter_data :
    vocab.add_words( d['tokens']  )

    
vocab.keepTopK(25000)



maxSentenceL = 150

def vecc( d ):
    ret = {}
    words   = d['tokens']
    wordids = map( vocab , words )
    ret['sentence'] = int64Arr( padList( wordids , maxSentenceL , 0 , 'left') )
    ret['sentiment_val'] =  floatArr( d['sentiment'] )
    ret['sentiment_id'] =  int64Arr( d['sentiment'] + 1 )
    ret['sentiment_onehot'] =  floatArr( oneHotVec( d['sentiment']+1 , 3  ) )

    return ret





en_es_wssa_data_train_arr = mapArrays( en_es_wssa_data_train , vecc )
en_es_wssa_data_test_arr = mapArrays( en_es_wssa_data_test , vecc )

en_twitter_data_train_arr = mapArrays( en_twitter_data , vecc )
es_tass1_datatrain_arr = mapArrays( es_tass1_data , vecc )

datasets = {"en_es_wssa_data_train_arr":en_es_wssa_data_train_arr ,
           "en_es_wssa_data_test_arr":en_es_wssa_data_test_arr ,
           "en_twitter_data_train_arr":en_twitter_data_train_arr ,
           "es_tass1_datatrain_arr": es_tass1_datatrain_arr }



outFNN = "../data/senti_prepped.h5"

f = h5py.File(outFNN , "w")
for kk in datasets.keys():
    f.create_group( kk  )
    for k in datasets[kk].keys():
        f[ kk ].create_dataset( k , data=datasets[kk][k] )

print "HDF5 file created !"