In [115]:
# include useful folders
import sys

import os
os.environ["CUDA_VISIBLE_DEVICES"]=""

sys.path.append("../vendors/mtl_girnet/data_prep/")

import json
import h5py
import numpy as np
import glob
import random
import pandas as pd
import re
from sklearn.utils import shuffle
from sklearn.metrics import classification_report, confusion_matrix
from matplotlib import pyplot as plt

# nltk
import nltk

# tokenizer
from twokenize import tokenizeRawTweetText as tokenize

# for a particular dataset
from xml.dom import minidom

In [116]:

### SemEval 2017 Task A
# Tab-separated file read without a header row. The code below takes the
# polarity label from column 1 and the raw tweet text from column 2.
df = pd.read_csv(
    "../data/datastories-semeval2017-task4/dataset/Subtask_A/4A-English/SemEval2017-task4-dev.subtask-A.english.INPUT.txt",
    sep="\t",
    header=None,
)

# Polarity string -> signed integer label used by every dataset in this notebook.
decode_map = {"negative": -1, "neutral": 0, "positive": 1}

# An unknown label raises KeyError here rather than being silently dropped.
df[1] = df[1].apply(decode_map.__getitem__)
df[2] = df[2].apply(tokenize)

# One record per row. Note that 'text' is rebuilt by joining the tokens with
# single spaces, so it is not the original untokenized string.
data = (
    {'sentiment': row[1], 'tokens': row[2], 'text': ' '.join(row[2])}
    for row in df.to_numpy()
)

en_semeval_17 = list(data)


### English-Spanish Code Mixed Data

sents = {"N":-1 , "P" :1 , "NONE":0}


def _load_cm_split(path, label_map):
    """Read one tab-separated split file into a list of record dicts.

    Every non-blank line is split on tabs; field 1 is looked up in
    ``label_map`` and field 2 is kept as the raw text and also tokenized.

    :param path: path of the UTF-8 text file to read.
    :param label_map: dict mapping the label string found in field 1 to an int.
    :return: list of ``{'sentiment', 'tokens', 'text'}`` dicts in file order.
    :raises KeyError: if a label is not a key of ``label_map``.
    :raises IndexError: if a non-blank line has fewer than three fields.
    """
    # The context manager closes the handle; a bare open(...).read() leaves
    # closing to the garbage collector.
    with open(path, encoding='utf-8') as handle:
        lines = handle.read().split("\n")

    records = []
    for line in lines:
        # A trailing newline produces an empty last element; skip blank lines
        # instead of failing on the missing fields.
        if not line.strip():
            continue
        fields = line.split("\t")
        records.append({'sentiment': label_map[fields[1]], 'tokens': tokenize(fields[2]), 'text': fields[2]})
    return records


en_es_wssa_data_train = _load_cm_split("../vendors/mtl_girnet/data_prep/data_cm_senti/cs-corpus-with-tweets_train.txt", sents)
en_es_wssa_data_test = _load_cm_split("../vendors/mtl_girnet/data_prep/data_cm_senti/cs-corpus-with-tweets_test.txt", sents)

# Full corpus = train followed by test (a new list; the two splits are not mutated).
en_es_wssa_data = en_es_wssa_data_train + en_es_wssa_data_test

### Spanish Tweet Dataset

xmldoc = minidom.parse("../vendors/mtl_girnet/data_prep/data_cm_senti/general-tweets-train-tagged.xml")
tweets = xmldoc.getElementsByTagName('tweet')

# Six polarity tags are collapsed onto the three labels -1 / 0 / 1.
sents = {"N":-1 , "P" :1 , "NEU":0 , 'NONE':0 , "P+" : 1 , "N+":-1 }


es_tass1_data = []

# NOTE(review): the range stops at len(tweets) - 1, so the last <tweet> element
# is never read. Confirm whether that is intentional.
for i in range(len(tweets) - 1):
    # Index 6055 is skipped unconditionally (the original comment read
    # "bad jogar"). Presumably that entry is malformed -- TODO confirm.
    if i == 6055:
        continue

    tweet = tweets[i]
    textt = tweet.getElementsByTagName('content')[0].childNodes[0].data
    words = tokenize(textt)

    polarity = tweet.getElementsByTagName('polarity')[0]
    sentiment = polarity.getElementsByTagName('value')[0].childNodes[0].data
    # The first <polarity> element must not contain any <entity> children.
    assert len(polarity.getElementsByTagName('entity')) == 0

    es_tass1_data.append({
        'text': textt,
        'tokens': words,
        'sentiment': sents[sentiment],
    })

### Some english tweet data

# Read inside a context manager so the handle is closed deterministically.
# Undecodable bytes are dropped (errors='ignore'). The [1:-1] slice discards
# the first and the last element of the newline split.
with open("../vendors/mtl_girnet/data_prep/data_cm_senti/twitter4242.txt", "r", encoding="utf-8", errors='ignore') as twitter4242_file:
    data = twitter4242_file.read().split("\n")[1:-1]

data = map( lambda x : x.split("\t") , data )
# Label = sign of (column 0 - column 1), i.e. -1, 0 or 1; column 2 is the text.
data = map( lambda x :{'sentiment': int(np.sign(int(x[0])-int(x[1]))) , 'tokens': tokenize(x[2]) , 'text': x[2] } , data )

en_twitter_data = list(data)

### es2_twitter_data

# Both files are read inside context managers so the handles are closed.
# The dev file drops its first and last split element ([1:-1]); the test file
# drops its first and its last two ([1:-2]).
# NOTE(review): the reason for the differing tail slices is not visible here.
with open("../vendors/mtl_girnet/data_prep/data_cm_senti/1600_tweets_dev_complete.txt", encoding="utf-8") as es2_dev_file:
    data = es2_dev_file.read().split("\n")[1:-1]
with open("../vendors/mtl_girnet/data_prep/data_cm_senti/1600_tweets_test_average_complete.tsv", encoding="utf-8") as es2_test_file:
    data += es2_test_file.read().split("\n")[1:-2]

data = map( lambda x : x.split("\t") , data )
# Label = sign of (column 0 - column 1), i.e. -1, 0 or 1; column 2 is the text.
data = map( lambda x :{'sentiment': int(np.sign(int(x[0])-int(x[1]))) , 'tokens': tokenize(x[2]) , 'text': x[2] } , data )

es2_twitter_data = list(data)

def get_y(data):
    """One-hot encode the 'sentiment' field of each record into three classes.

    The column layout matches what the previous ``to_categorical`` call
    produced for labels in {-1, 0, 1}: that call placed -1 in the last column
    through negative indexing. The mapping is now explicit:

        0 (neutral)  -> column 0
        1 (positive) -> column 1
       -1 (negative) -> column 2

    :param data: iterable of dicts that each have a 'sentiment' entry
        convertible with ``int()``.
    :return: float32 array of shape (len(data), 3).
    :raises ValueError: if any label is outside {-1, 0, 1}. Previously a value
        such as -2 was silently encoded as a different class.
    """
    labels = np.array([int(row['sentiment']) for row in data], dtype=int)

    out_of_range = labels[(labels < -1) | (labels > 1)]
    if out_of_range.size:
        raise ValueError(
            "sentiment labels must be -1, 0 or 1; got %s"
            % sorted(set(out_of_range.tolist()))
        )

    y = np.zeros((labels.shape[0], 3), dtype='float32')
    # labels % 3 maps -1 -> 2 and leaves 0 and 1 unchanged.
    y[np.arange(labels.shape[0]), labels % 3] = 1.0
    return y


# Report how many records each corpus contributed.
dataset_sizes = [
    "Code-Mixed: en_es_wssa_data: %d" % len(en_es_wssa_data),
    "Spanish: es2_twitter_data: %d" % len(es2_twitter_data),
    "Spanish: es_tass1_data: %d" % len(es_tass1_data),
    "English: en_twitter_data: %d" % len(en_twitter_data),
]
print("\n".join(dataset_sizes))
# print("English: en_sentiment140: %d" %len(en_sentiment140))

# One-hot label matrices, one per corpus.
# Code-mixed English-Spanish: full corpus plus its train / test splits.
en_es_y = get_y(en_es_wssa_data)
en_es_y_train = get_y(en_es_wssa_data_train)
en_es_y_test = get_y(en_es_wssa_data_test)
# Spanish corpora.
es_twitter_y = get_y(es2_twitter_data)
es_tass_y = get_y(es_tass1_data)
# English corpora.
en_twitter_y = get_y(en_twitter_data)
en_semeval_17_y = get_y(en_semeval_17)
# en_sentiment140_y = get_y(en_sentiment140)

Code-Mixed: en_es_wssa_data: 3062
Spanish: es2_twitter_data: 3202
Spanish: es_tass1_data: 7217
English: en_twitter_data: 4241


In [117]:
from keras import backend as K


def f1(y_true, y_pred):
    """Batch-wise F1 score built from Keras backend ops.

    Both tensors are clipped to [0, 1] and rounded before counting, and the
    counts are summed over the whole batch (all elements pooled together).
    Because Keras averages metric values across batches, the number reported
    for an epoch is a mean of per-batch F1 values, not an F1 over the epoch.

    :param y_true: ground-truth tensor.
    :param y_pred: prediction tensor, element-wise compatible with ``y_true``.
    :return: scalar tensor ``2 * P * R / (P + R + epsilon)``.
    """
    eps = K.epsilon()  # keeps every division defined when a count is zero

    # Elements that are 1 in both the truth and the rounded prediction.
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    # Elements that are 1 in the truth / in the rounded prediction.
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))

    precision = hits / (predicted_positives + eps)
    recall = hits / (actual_positives + eps)
    return 2 * ((precision * recall) / (precision + recall + eps))
from collections import Counter
from sklearn.utils.class_weight import compute_class_weight
def get_class_weight(y):
    """Compute 'balanced' class weights from one-hot labels.

    Adapted from https://stackoverflow.com/a/50695814

    :param y: 2-D array-like of one-hot labels, e.g. [[0,0,1,0],[0,0,0,1],..]
    :return: dict mapping each class index that occurs in ``y`` to its weight,
        suitable for ``model.fit(..., class_weight=...)``.
    """
    y_integers = np.argmax(y, axis=1)
    classes = np.unique(y_integers)
    # Keyword arguments: recent scikit-learn releases make `classes` and `y`
    # keyword-only, so the positional form raises TypeError there.
    class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_integers)
    # Key by the actual class index, not by position in `classes`: the two
    # differ as soon as one class is absent from `y`.
    d_class_weights = {int(cls): float(weight) for cls, weight in zip(classes, class_weights)}
    return d_class_weights

In [119]:
# ! pip install bpemb

In [1]:
from bpemb import BPEmb
# Multilingual BPEmb model requested with a 1,000,000-entry vocabulary (vs)
# and 300-dimensional vectors; it is used below through multibpemb.embed(text).
# NOTE(review): this cell's execution count (In [1]) is out of sequence with
# its neighbours -- confirm the notebook still runs top to bottom.
multibpemb = BPEmb(lang="multi", vs=1000000, dim=300)

paramiko missing, opening SSH/SCP/SFTP paths will be disabled.  `pip install paramiko` to suppress


In [None]:
# Display the loaded embedding object. The cell previously held an unfinished
# attribute access (`multibpemb.`), which is a SyntaxError and stops a
# Restart & Run All at this point.
multibpemb

In [125]:
# Shape probe: embed() returns a 2-D array; for this string the recorded run
# shows (3, 300), i.e. three rows of 300 values.
multibpemb.embed("sup dfsdg").shape

(3, 300)

In [127]:
max_len = 32
# No longer used by get_x below (padding is derived from the embedding itself);
# kept so that any other cell referring to this name keeps working.
zero_vector = [0 for _ in range(300)]
def get_x(data_, embedder=None, seq_len=None):
    """Embed each record's text into a fixed-length sequence of vectors.

    Each text is embedded, truncated to ``seq_len`` rows and zero-padded at
    the end up to ``seq_len`` rows.

    Differences from the previous version:
      * truncation uses the configured length instead of a literal 32;
      * padding takes its width and dtype from the embedding output rather
        than from the global ``zero_vector``, so the result keeps the
        embedder's dtype instead of being upcast when integer padding rows
        were mixed in.

    :param data_: iterable of dicts with a 'text' entry.
    :param embedder: object with an ``embed(text)`` method returning a 2-D
        array of shape (n_pieces, dim). Defaults to the global ``multibpemb``.
    :param seq_len: number of rows per sample. Defaults to the global
        ``max_len``.
    :return: array of shape (len(data_), seq_len, dim); an empty ``data_``
        gives an empty 1-D array, as before.
    """
    if embedder is None:
        embedder = multibpemb
    if seq_len is None:
        seq_len = max_len

    x_ = []
    for sent in data_:
        vectors = np.asarray(embedder.embed(sent['text']))[:seq_len]
        missing = seq_len - vectors.shape[0]
        if missing > 0:
            # Append `missing` all-zero rows after the real vectors.
            vectors = np.pad(vectors, ((0, missing), (0, 0)), mode='constant')
        x_.append(vectors)
    return np.array(x_)
# Build one feature array per corpus with get_x, in the same order as before.
(
    en_es_x,
    es_twitter_x,
    es_tass_x,
    en_twitter_x,
    en_semeval_17_x,
    en_es_x_train,
    en_es_x_test,
) = [
    get_x(corpus)
    for corpus in (
        en_es_wssa_data,
        es2_twitter_data,
        es_tass1_data,
        en_twitter_data,
        en_semeval_17,
        en_es_wssa_data_train,
        en_es_wssa_data_test,
    )
]

In [26]:
import fasttext

In [27]:
# Load a fastText binary model from disk. `embed` is indexed by token
# (embed[token]) in the fastText version of get_x further down.
embed = fasttext.load_model('../vendors/language-models/all_p_fasttext.bin')




In [101]:
max_len = 32
# Padding row. Assumes `embed` yields 100-dimensional vectors -- TODO confirm.
zero_vector = [0] * 100
def get_x(data_):
    """Turn each record's text into a fixed-length sequence of fastText vectors.

    NOTE: this definition replaces the earlier ``get_x`` of the same name, and
    rebinds ``max_len`` and ``zero_vector``.

    Per record: tokenize with ``fasttext.tokenize``, look up at most
    ``max_len`` tokens via ``embed[token]``, append the vector of ``'</s>'``
    if there is still room, then fill the remaining rows with ``zero_vector``.
    (An earlier variant that used ``embed.get_sentence_vector`` on the text
    with newlines replaced by spaces was left commented out in this cell.)

    :param data_: iterable of dicts with a 'text' entry.
    :return: ``np.array`` built from the per-record lists of ``max_len`` rows.
    """
    x_ = []
    for sent in data_:
        tokenised = fasttext.tokenize(sent['text'])

        # zip() with range(max_len) stops after max_len tokens.
        sent_vector = [embed[token] for _, token in zip(range(max_len), tokenised)]

        # End-of-sentence marker, only when the sequence is not already full.
        if len(sent_vector) < max_len:
            sent_vector.append(embed['</s>'])

        # Zero rows up to max_len (no-op when the sequence is full).
        sent_vector.extend([zero_vector] * (max_len - len(sent_vector)))

        x_.append(sent_vector)

    return np.array(x_)

# Build one feature array per corpus with the get_x defined in this cell.
# NOTE(review): these are the same variable names the BPEmb cell assigns, so
# running this cell overwrites those features. The LSTM defined later declares
# input_shape=(32, 300) -- confirm which feature set is meant to reach it.
(
    en_es_x,
    es_twitter_x,
    es_tass_x,
    en_twitter_x,
    en_semeval_17_x,
    en_es_x_train,
    en_es_x_test,
) = [
    get_x(corpus)
    for corpus in (
        en_es_wssa_data,
        es2_twitter_data,
        es_tass1_data,
        en_twitter_data,
        en_semeval_17,
        en_es_wssa_data_train,
        en_es_wssa_data_test,
    )
]

In [129]:
from keras.layers import *
from keras.models import Sequential
from keras.preprocessing import sequence

In [38]:
# Two-layer dense classifier over a flat 100-value input. The hidden layer is
# given no activation argument; the output is a 3-way softmax.
# NOTE(review): the next cell rebinds `model` before any fit call, so this
# network is never trained in the visible notebook.
model = Sequential([
    Dense(50, input_shape=(100,)),
    Dense(3, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy', f1],
)

In [130]:
# Sequence classifier: a single LSTM over (32 steps x 300 features) with input
# and recurrent dropout, followed by a 3-unit Dense layer and a separate
# softmax activation layer.
model = Sequential([
    LSTM(150, dropout=0.3, input_shape=(32, 300), recurrent_dropout=0.3),
    Dense(3),
    Activation('softmax'),
])

# Metrics are reported in this order after the loss: accuracy, then f1.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', f1])

In [131]:
# Train on the English twitter corpus for 10 epochs, with 20% of the samples
# held out for validation via validation_split.
# Per the Keras fit() documentation the held-out part is taken from the end of
# the arrays before shuffling, so it is not a random sample.
history = model.fit(en_twitter_x, en_twitter_y, epochs=10, shuffle=True, validation_split=0.2)

Train on 3392 samples, validate on 849 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [132]:
# Fit the same `model` object on the SemEval-2017 data (10 epochs, 20% held
# out). No model re-creation is visible between these cells, so this continues
# from the weights left by the previous fit. `history` is overwritten.
history = model.fit(en_semeval_17_x, en_semeval_17_y, epochs=10, validation_split=0.2, shuffle=True)

Train on 16505 samples, validate on 4127 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [133]:
# Fit the same `model` on the Spanish TASS data (10 epochs, 20% held out);
# `history` is overwritten again.
history = model.fit(es_tass_x, es_tass_y, epochs=10, validation_split=0.2, shuffle=True)

Train on 5773 samples, validate on 1444 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [134]:
# Fit the same `model` on the second Spanish twitter corpus. This stage runs
# for 20 epochs, with 20% held out.
history = model.fit(es_twitter_x, es_twitter_y, epochs=20, validation_split=0.2, shuffle=True)

Train on 2561 samples, validate on 641 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [135]:
# Stack the four monolingual corpora along the sample axis. Features and
# labels use the same corpus order so that rows stay aligned.
x = np.concatenate((en_semeval_17_x, en_twitter_x, es_tass_x, es_twitter_x), axis=0)
y = np.concatenate((en_semeval_17_y, en_twitter_y, es_tass_y, es_twitter_y), axis=0)

In [136]:
# Fit on the combined monolingual data for up to 30 epochs, validating on the
# code-mixed *train* split (passed explicitly as validation_data).
# In the recorded run this call ended with KeyboardInterrupt during epoch 21,
# so the assignment to `history` did not complete for that run.
history = model.fit(x, y, epochs=30, validation_data=(en_es_x_train,en_es_y_train), shuffle=True)

Train on 35292 samples, validate on 2449 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30

KeyboardInterrupt: 

In [137]:
# Evaluate on the code-mixed test split before any training on code-mixed
# data. Given the compile() call, the returned list is [loss, accuracy, f1].
model.evaluate(en_es_x_test, en_es_y_test)



[1.1475409948028514, 0.5285481240290413, 0.5210127912182209]

In [138]:
# Fit the same `model` on the code-mixed train split for 20 epochs.
# NOTE(review): the code-mixed test split is passed as validation_data, and the
# same split is evaluated in the next cell. Any choice made from this
# validation curve would make that final score optimistic.
history = model.fit(en_es_x_train, en_es_y_train, epochs=20, validation_data=(en_es_x_test, en_es_y_test), shuffle=True)

Train on 2449 samples, validate on 613 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [139]:
# Evaluate on the code-mixed test split again, after the fit on the code-mixed
# train split. The returned list is [loss, accuracy, f1].
model.evaluate(en_es_x_test, en_es_y_test)



[1.1387531876952963, 0.6362153345181153, 0.6377271510065088]