In [1]:
# Restrict this process to GPU 0. The variable is set before TensorFlow is
# imported below, so it is already in the environment when TensorFlow loads.
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

import tensorflow as tf
# TF1-style session: allow_growth asks TensorFlow to grow its GPU memory
# allocation on demand rather than reserving the whole device up front.
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)

In [2]:
### include useful folders
import sys

sys.path.append("../vendors/mtl_girnet/data_prep/")

import json
import h5py
import numpy as np
import glob
import random
import pandas as pd
import re
from sklearn.utils import shuffle
from sklearn.metrics import classification_report, confusion_matrix
from matplotlib import pyplot as plt

# nltk
import nltk

# tokenizer
from twokenize import tokenizeRawTweetText as tokenize

# for a particular dataset
from xml.dom import minidom

In [3]:

### SemEval 2017 Task A (English)

# Polarity strings in column 1 are mapped to the -1 / 0 / 1 convention used by
# every other corpus in this notebook.
decode_map = {"negative": -1, "neutral": 0, "positive": 1}

# Headerless, tab-separated file: column 1 is the label, column 2 the tweet.
df = pd.read_csv(
    "../data/datastories-semeval2017-task4/dataset/Subtask_A/4A-English/SemEval2017-task4-dev.subtask-A.english.INPUT.txt",
    sep="\t",
    header=None,
)

# NOTE: both columns are overwritten in place, so this cell is not idempotent
# (re-running it without re-reading the file fails on the label lookup).
df[1] = df[1].apply(decode_map.__getitem__)
df[2] = df[2].apply(tokenize)

# 'text' is rebuilt by joining the tokens with single spaces, so it is the
# tokenised form of the tweet rather than the raw string.
data = (
    {'sentiment': row[1], 'tokens': row[2], 'text': ' '.join(row[2])}
    for row in df.to_numpy()
)

en_semeval_17 = list(data)


### English-Spanish Code Mixed Data 

sents = {"N":-1 , "P" :1 , "NONE":0}


def _read_cs_corpus(path, label_map):
    """Load one split of the English-Spanish code-mixed sentiment corpus.

    Every non-blank line is split on tabs; field 1 is the polarity tag (looked
    up in ``label_map``) and field 2 is the tweet text.

    :param path: path of the tab-separated corpus file (read as UTF-8)
    :param label_map: dict from polarity tag to integer sentiment
    :return: list of {'sentiment', 'tokens', 'text'} dicts, one per line
    :raises KeyError: if a line carries a tag missing from ``label_map``
    """
    # The context manager closes the handle; the original left it open.
    with open(path, encoding='utf-8') as corpus_file:
        # Blank lines (e.g. from a trailing newline) are skipped instead of
        # crashing with IndexError on the missing fields.
        rows = [line.split("\t") for line in corpus_file.read().split("\n") if line.strip()]
    return [
        {'sentiment': label_map[row[1]], 'tokens': tokenize(row[2]), 'text': row[2]}
        for row in rows
    ]


en_es_wssa_data_train = _read_cs_corpus(
    "../vendors/mtl_girnet/data_prep/data_cm_senti/cs-corpus-with-tweets_train.txt", sents)
en_es_wssa_data_test = _read_cs_corpus(
    "../vendors/mtl_girnet/data_prep/data_cm_senti/cs-corpus-with-tweets_test.txt", sents)

# Full corpus = train split followed by test split.
en_es_wssa_data = list(en_es_wssa_data_train) + list(en_es_wssa_data_test)

### Spanish Tweet Dataset

xmldoc = minidom.parse("../vendors/mtl_girnet/data_prep/data_cm_senti/general-tweets-train-tagged.xml")
tweets = xmldoc.getElementsByTagName('tweet')

# The strong tags P+ / N+ are folded into plain positive / negative, and both
# NEU and NONE count as neutral.
sents = {"N":-1 , "P" :1 , "NEU":0 , 'NONE':0 , "P+" : 1 , "N+":-1 }

es_tass1_data = []

# The range stops at len(tweets) - 1, so the final <tweet> element is never read.
for i in range(len(tweets) - 1):
    if i != 6055:  # entry 6055 is skipped on purpose (marked as bad in the original notebook)
        tweet = tweets[i]
        textt = tweet.getElementsByTagName('content')[0].childNodes[0].data
        words = tokenize(textt)
        polarity = tweet.getElementsByTagName('polarity')[0]
        sentiment = polarity.getElementsByTagName('value')[0].childNodes[0].data
        # The first <polarity> node must be the tweet-level one, i.e. carry no <entity>.
        assert len(polarity.getElementsByTagName('entity')) == 0
        es_tass1_data.append({'text': textt, 'tokens': words, 'sentiment': sents[sentiment]})

### Some english tweet data

# Read inside a context manager so the file handle is closed (the original
# never closed it). Undecodable bytes are dropped via errors='ignore'.
with open("../vendors/mtl_girnet/data_prep/data_cm_senti/twitter4242.txt", "r", encoding="utf-8", errors='ignore') as tweets_file:
    # [1:-1] drops the first line and the last split element
    # (presumably a header row and the empty string after the final newline).
    data = tweets_file.read().split("\n")[1:-1]

# Fields 0 and 1 are two integer scores; the label is the sign of their
# difference (-1, 0 or 1). Field 2 is the tweet text.
en_twitter_data = [
    {'sentiment': int(np.sign(int(fields[0]) - int(fields[1]))),
     'tokens': tokenize(fields[2]),
     'text': fields[2]}
    for fields in (line.split("\t") for line in data)
]

### es2_twitter_data

# Both files are read inside context managers so their handles are closed
# (the original left them open).
with open("../vendors/mtl_girnet/data_prep/data_cm_senti/1600_tweets_dev_complete.txt", encoding="utf-8") as dev_file:
    # Drop the first line and the last split element.
    data = dev_file.read().split("\n")[1:-1]
with open("../vendors/mtl_girnet/data_prep/data_cm_senti/1600_tweets_test_average_complete.tsv", encoding="utf-8") as test_file:
    # This file drops the first line and the last TWO split elements, as in the original.
    data += test_file.read().split("\n")[1:-2]

# Fields 0 and 1 are two integer scores; the label is the sign of their
# difference (-1, 0 or 1). Field 2 is the tweet text.
es2_twitter_data = [
    {'sentiment': int(np.sign(int(fields[0]) - int(fields[1]))),
     'tokens': tokenize(fields[2]),
     'text': fields[2]}
    for fields in (line.split("\t") for line in data)
]

def get_y(data):
    """One-hot encode the sentiment labels of a list of examples.

    Labels are the integers -1 (negative), 0 (neutral) and 1 (positive). They
    are shifted by +1 so the columns are ordered [negative, neutral, positive].

    The previous version passed -1 straight to ``to_categorical``, where the
    negative index wrapped around to the last column and produced the order
    [neutral, positive, negative]. ``loss_ordinal`` weights errors by the
    distance between class indices, so the columns must follow the ordinal
    order of the labels for that weighting to be meaningful.

    :param data: iterable of dicts with a 'sentiment' entry in {-1, 0, 1}
    :return: float32 array of shape (len(data), 3)
    :raises ValueError: if any label is outside {-1, 0, 1}
    """
    labels = [int(row['sentiment']) for row in data]
    unexpected = sorted({label for label in labels if label not in (-1, 0, 1)})
    if unexpected:
        # Fail loudly instead of letting an out-of-range index wrap silently.
        raise ValueError("unexpected sentiment label(s): %s" % unexpected)
    class_ids = np.asarray(labels, dtype=int) + 1
    # Row i of the identity matrix is the one-hot vector of class i.
    return np.eye(3, dtype='float32')[class_ids]


# Report the size of each corpus before building the label matrices.
for message, corpus in (
    ("Code-Mixed: en_es_wssa_data: %d", en_es_wssa_data),
    ("Spanish: es2_twitter_data: %d", es2_twitter_data),
    ("Spanish: es_tass1_data: %d", es_tass1_data),
    ("English: en_twitter_data: %d", en_twitter_data),
):
    print(message % len(corpus))
# print("English: en_sentiment140: %d" %len(en_sentiment140))

# One-hot label matrix per corpus, in the same row order as the example lists.
en_es_y = get_y(en_es_wssa_data)
en_es_y_train = get_y(en_es_wssa_data_train)
en_es_y_test = get_y(en_es_wssa_data_test)
es_twitter_y = get_y(es2_twitter_data)
es_tass_y = get_y(es_tass1_data)
en_twitter_y = get_y(en_twitter_data)
en_semeval_17_y = get_y(en_semeval_17)
# en_sentiment140_y = get_y(en_sentiment140)

Code-Mixed: en_es_wssa_data: 3062
Spanish: es2_twitter_data: 3202
Spanish: es_tass1_data: 7217
English: en_twitter_data: 4241


Using TensorFlow backend.


In [4]:
# len(en_semeval_17)

In [5]:
# sentiment_analysis = []

In [6]:
# data = en_es_wssa_data_train
# lang = "cm"
# for sent in data:
#     sentiment_analysis.append("\t".join([sent['text'], str(sent['sentiment']), lang, "\n"]))

In [7]:
# with open("train.txt", "w") as f:
#     f.writelines(sentiment_analysis)

In [8]:
from tensorflow.keras import backend as K


def f1(y_true, y_pred):
    """Batch-wise F1 score usable as a Keras metric.

    Predictions and targets are clipped to [0, 1] and rounded, then counted
    over every entry of the batch at once (no per-class averaging):

    * recall    = true positives / entries that are 1 in ``y_true``
    * precision = true positives / entries that round to 1 in ``y_pred``

    ``K.epsilon()`` is added to every denominator to avoid division by zero.
    The value is computed per batch, so the figure Keras reports for an epoch
    is an average of batch scores rather than the F1 of the whole epoch.
    """
    eps = K.epsilon()

    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))

    precision_score = hits / (predicted_positives + eps)
    recall_score = hits / (actual_positives + eps)

    # Harmonic mean of precision and recall.
    return 2 * ((precision_score * recall_score) / (precision_score + recall_score + eps))
from collections import Counter
from sklearn.utils.class_weight import compute_class_weight
def get_class_weight(y):
    """
    Compute 'balanced' class weights from one-hot encoded labels.

    Adapted from: https://stackoverflow.com/a/50695814

    :param y: A list of one-hot-encoding labels [[0,0,1,0],[0,0,0,1],..]
    :return: dict mapping each class index that occurs in ``y`` to its weight,
             suitable for keras model.fit(.. class_weight=..) -> {0: 0.52, 2: 1.4, ..}
    """
    y_integers = np.argmax(y, axis=1)
    classes = np.unique(y_integers)
    # Keyword arguments: recent scikit-learn releases no longer accept these
    # parameters positionally, while older releases accept both forms.
    class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_integers)
    # Key by the real class index. The previous dict(enumerate(...)) used the
    # position in `classes`, which mislabels weights when a class is absent
    # from y (e.g. classes [0, 2] became keys 0 and 1).
    return {int(label): float(weight) for label, weight in zip(classes, class_weights)}

from keras import backend as K
from keras import losses

def loss_ordinal(y_true, y_pred):
    """Categorical cross-entropy scaled by how far off the predicted class is.

    The scale is ``1 + |argmax(y_true) - argmax(y_pred)| / (n_classes - 1)``,
    so it ranges from 1 (correct class) to 2 (opposite end of the index range).
    The penalty is only meaningful when the class indices follow the ordinal
    order of the labels.

    :param y_true: one-hot targets, indexed as (sample, class)
    :param y_pred: predicted class probabilities, same layout as ``y_true``
    :return: per-sample weighted cross-entropy
    """
    true_idx = K.argmax(y_true, axis=1)
    pred_idx = K.argmax(y_pred, axis=1)
    n_classes = K.int_shape(y_pred)[1]
    # Index distance normalised to [0, 1].
    distance = K.abs(true_idx - pred_idx) / (n_classes - 1)
    weights = K.cast(distance, dtype='float32')
    scale = 1.0 + weights
    return scale * losses.categorical_crossentropy(y_true, y_pred)

In [9]:
# ! pip install bpemb

In [10]:
from bpemb import BPEmb
# The multilingual model is loaded with 300-dimensional vectors regardless of
# the requested size: asking for dim=200 only produced the message
# "Setting dim=300 for multilingual BPEmb". Request 300 explicitly so the code
# states the dimension that the rest of the notebook relies on.
multibpemb = BPEmb(lang="multi", vs=1000000, dim=300)

Setting dim=300 for multilingual BPEmb


paramiko missing, opening SSH/SCP/SFTP paths will be disabled.  `pip install paramiko` to suppress


In [11]:
weight_matrix = multibpemb.vectors

In [12]:
multibpemb

BPEmb(lang=multi, vs=1000000, dim=300)

In [13]:
max_len = 32
zero_vector = [0 for _ in range(300)]
def get_x(data_):
    """Embed each example's text as a fixed-length sequence of BPEmb vectors.

    ``multibpemb.embed`` is applied to ``sent['text']`` (presumably yielding
    one vector per subword); the sequence is cut to ``max_len`` entries and
    right-padded with ``zero_vector`` up to ``max_len``.

    Truncation now uses ``max_len`` as well. It used to be hard-coded to 32
    while padding used ``max_len``, so any other ``max_len`` value produced
    rows of different lengths.

    :param data_: iterable of dicts with a 'text' entry
    :return: numpy array with one (max_len, dim) block per example
    """
    x_ = []
    for sent in data_:
        pred = list(multibpemb.embed(sent['text']))[:max_len]
        # A non-positive count makes the padding list empty, so full-length
        # sequences are left untouched.
        pred.extend([zero_vector] * (max_len - len(pred)))
        x_.append(pred)
    return np.array(x_)
en_es_x =  get_x(en_es_wssa_data)
es_twitter_x = get_x(es2_twitter_data)
es_tass_x = get_x(es_tass1_data)
en_twitter_x = get_x(en_twitter_data)
en_semeval_17_x = get_x(en_semeval_17)
en_es_x_train =  get_x(en_es_wssa_data_train)
en_es_x_test =  get_x(en_es_wssa_data_test)

In [199]:
# Build X as padded BPE token-id sequences (integer ids, not embedding vectors)
max_len = 32
def get_x(data_):
    """Encode each example's text as a fixed-length sequence of BPEmb ids.

    ``multibpemb.encode_ids`` is applied to ``sent['text']``; the id sequence
    is cut to ``max_len`` entries and right-padded with 0 up to ``max_len``.

    Truncation now uses ``max_len`` as well. It used to be hard-coded to 32
    while padding used ``max_len``, so any other ``max_len`` value produced
    rows of different lengths.

    :param data_: iterable of dicts with a 'text' entry
    :return: numpy array of ids with one row of length max_len per example
    """
    x_ = []
    for sent in data_:
        pred = list(multibpemb.encode_ids(sent['text']))[:max_len]
        # A non-positive count makes the padding list empty, so full-length
        # sequences are left untouched.
        pred.extend([0] * (max_len - len(pred)))
        x_.append(pred)
    return np.array(x_)
en_es_x =  get_x(en_es_wssa_data)
es_twitter_x = get_x(es2_twitter_data)
es_tass_x = get_x(es_tass1_data)
en_twitter_x = get_x(en_twitter_data)
en_semeval_17_x = get_x(en_semeval_17)
en_es_x_train =  get_x(en_es_wssa_data_train)
en_es_x_test =  get_x(en_es_wssa_data_test)

In [32]:
import fasttext

In [33]:
embed = fasttext.load_model('../vendors/language-models/all_p_fasttext.bin')




In [34]:
max_len = 32
zero_vector = [0 for _ in range(100)]
def get_x(data_):
    """Embed each example's text as a fixed-length sequence of fastText vectors.

    The text is split with ``fasttext.tokenize`` and at most ``max_len`` tokens
    are looked up in ``embed``. If there is room left, one ``'</s>'`` vector is
    appended, and the rest is filled with ``zero_vector`` up to ``max_len``.

    :param data_: iterable of dicts with a 'text' entry
    :return: numpy array with one (max_len, dim) block per example
    """
    x_ = []
    for sent in data_:
        # Only the tokens that fit are looked up in the model.
        kept_tokens = fasttext.tokenize(sent['text'])[:max(max_len, 0)]
        sent_vector = [embed[token] for token in kept_tokens]

        # End-of-sentence marker, only when the sequence is not already full.
        if len(sent_vector) < max_len:
            sent_vector.append(embed['</s>'])

        # Zero padding up to max_len (no-op for full sequences).
        sent_vector.extend([zero_vector] * (max_len - len(sent_vector)))

        x_.append(sent_vector)

    return np.array(x_)

en_es_x =  get_x(en_es_wssa_data)
es_twitter_x = get_x(es2_twitter_data)
es_tass_x = get_x(es_tass1_data)
en_twitter_x = get_x(en_twitter_data)
en_semeval_17_x = get_x(en_semeval_17)
en_es_x_train =  get_x(en_es_wssa_data_train)
en_es_x_test =  get_x(en_es_wssa_data_test)

In [15]:
from keras.layers import *
from keras.models import Sequential
from keras.preprocessing import sequence
from attention_lstm import AttentionWithContext

In [None]:
# Baseline: one hidden Dense layer over a flat 100-dimensional input vector,
# followed by a 3-way softmax over the sentiment classes.
model = Sequential()
model.add(Dense(50, input_shape=(100,)))
model.add(Dense(3, activation='softmax'))
# f1 is the custom batch-wise metric defined earlier in this notebook.
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy', f1])

In [16]:
# BiLSTM + attention classifier over (32, 300) sequences of embedding vectors.
model = Sequential()
# return_sequences=True keeps one output per timestep for the layer that follows.
model.add(Bidirectional(LSTM(50, dropout=0.3, input_shape=(32, 300), recurrent_dropout=0.3, return_sequences=True)))
# AttentionWithContext comes from the local attention_lstm module
# (presumably it pools the timestep outputs into one vector; confirm there).
model.add(AttentionWithContext())
# 3-way softmax over the sentiment classes.
model.add(Dense(3))
model.add(Activation('softmax'))
# loss_ordinal and f1 are the custom loss / metric defined earlier in this notebook.
model.compile(loss=loss_ordinal,
              optimizer='adam',
              metrics=['accuracy', f1])

Instructions for updating:
Colocations handled automatically by placer.


In [None]:
# !pip install keras-self-attention
from keras_self_attention import SeqSelfAttention

In [None]:
# Plain BiLSTM classifier: only the final state of each direction is used.
model = Sequential()
# NOTE(review): input_shape here has three entries with a leading None. Keras
# input_shape normally excludes the batch axis, so (32, 100) was probably
# intended; confirm before relying on this cell.
model.add(Bidirectional(LSTM(50, input_shape=(None, 32, 100))))
# model.add(SeqSelfAttention(attention_activation='sigmoid'))
# model.add(Flatten())
# 3-way softmax over the sentiment classes.
model.add(Dense(3))
model.add(Activation('softmax'))
# loss_ordinal and f1 are the custom loss / metric defined earlier in this notebook.
model.compile(loss=loss_ordinal,
              optimizer='adam',
              metrics=['accuracy', f1])

In [None]:
# Convolution
kernel_size = 5
filters = 64
pool_size = 4

model = Sequential()
model.add(Conv1D(3,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1, input_shape=(32, 300)))
# model.add(MaxPooling1D(pool_size=pool_size))
# model.add(Conv1D(5,
#                  64,
#                  padding='valid',
#                  activation='relu',
#                  strides=1))
model.add(MaxPooling1D(pool_size=2))
model.add(Bidirectional(LSTM(50, dropout=0.3, recurrent_dropout=0.3)))
model.add(Dropout(0.3))
model.add(Dense(3))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy', f1])

In [20]:
# Stop once validation F1 has not improved for 5 epochs and restore the
# weights of the best epoch.
# NOTE(review): no import above this cell brings EarlyStopping into scope; it
# only arrives with `from keras.callbacks import *` in the MTL section further
# down, so on a fresh top-to-bottom run this line raises NameError.
es = EarlyStopping(monitor='val_f1', mode='max', verbose=1, patience=5, restore_best_weights=True)

In [None]:
# history = model.fit(en_es_x_train, en_es_y_train, epochs=10, shuffle=True, validation_data=(en_es_x_test
#                                                                                             ,en_es_y_test))

In [None]:
# model.evaluate(X_test, y_test)

In [21]:
history = model.fit(en_semeval_17_x, en_semeval_17_y, epochs=20, initial_epoch=0, validation_split=0.2, shuffle=True, callbacks=[es])

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Train on 16505 samples, validate on 4127 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [22]:
model.evaluate(en_es_x_test, en_es_y_test)



[2.5036796031923902, 0.5236541600639629, 0.5194763644290009]

In [None]:
history = model.fit(es_tass_x, es_tass_y, epochs=35, initial_epoch=20, validation_split=0.2, shuffle=True, callbacks=[es])

Train on 5773 samples, validate on 1444 samples
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35

In [None]:
model.evaluate(en_es_x_test, en_es_y_test)

In [None]:
history = model.fit(es_twitter_x, es_twitter_y, epochs=40, initial_epoch=35, validation_split=0.2, shuffle=True,  callbacks=[es])

In [None]:
model.evaluate(en_es_x_test, en_es_y_test)

In [None]:
x = np.concatenate([en_semeval_17_x, en_twitter_x, es_tass_x, es_twitter_x])
y = np.concatenate([en_semeval_17_y, en_twitter_y, es_tass_y, es_twitter_y])

In [None]:
history = model.fit(x, y, epochs=60, initial_epoch=40,validation_data=(en_es_x_train,en_es_y_train), shuffle=True, callbacks=[es])

In [None]:
model.evaluate(en_es_x_test, en_es_y_test)

In [None]:
# Final fine-tuning stage on the code-mixed training split (epochs 60-80).
# NOTE(review): the test split is passed as validation data while the `es`
# callback performs early stopping and best-weight restoration on it, so the
# test-set evaluation that follows is not an unbiased estimate.
history = model.fit(en_es_x_train, en_es_y_train, epochs=80, initial_epoch=60, validation_data=(en_es_x_test, en_es_y_test), shuffle=True,  callbacks=[es])

In [None]:
model.evaluate(en_es_x_test, en_es_y_test)

In [None]:
import tensorflow as tf
tf.reset_default_graph()

# MTL 

In [18]:
import tensorflow.keras as keras
from keras.layers import *
from keras.models import Model, Sequential
from attention_lstm import AttentionWithContext
from keras.callbacks import *
# from keras_self_attention import SeqSelfAttention

In [262]:
class GiretTwoCell(keras.layers.Layer):
    """Recurrent cell that mixes the updates of two wrapped cells per timestep.

    Each step's input carries two gate values in its first two features; the
    remaining features are fed to both wrapped cells. The new hidden and cell
    states are a gate-weighted combination of the two cells' states and the
    previous state.
    """

    def __init__(self, cell_1 , cell_2 , nHidden , **kwargs):
        """Store the two wrapped cells and the hidden size.

        :param cell_1: first wrapped cell; its ``call`` must return (output, [h, c])
        :param cell_2: second wrapped cell, same contract as ``cell_1``
        :param nHidden: size of the hidden state and of the cell state
        """
        self.cell_1 = cell_1
        self.cell_2 = cell_2
        self.nHidden = nHidden
        # Two state tensors (h and c), each of width nHidden.
        self.state_size = [nHidden,nHidden]
        super(GiretTwoCell, self).__init__(**kwargs)

    def build(self, input_shape):
        """Register the wrapped cells' weights on this layer.

        The wrapped cells are NOT built here (those calls are commented out),
        so their weight lists are taken as they are at this point.
        """
        
        nHidden = self.nHidden
        
        # Step-input shape without the two gate features. Only the
        # commented-out build calls below used it; it is otherwise unused.
        input_shape_n = ( input_shape[0] , input_shape[1]- 2 )
#         print "pp", input_shape_n
        
#         self.cell_1.build(input_shape_n)
#         self.cell_2.build(input_shape_n)
        
        self._trainable_weights += ( self.cell_1.trainable_weights )
        self._trainable_weights += ( self.cell_2.trainable_weights )
        
        self._non_trainable_weights += (  self.cell_1.non_trainable_weights )
        self._non_trainable_weights += (  self.cell_2.non_trainable_weights )
        
        self.built = True

    def call(self, inputs, states):
        """Run one timestep.

        :param inputs: step input; feature 0 is gate 1, feature 1 is gate 2,
                       the remaining features are the actual cell input
        :param states: previous [h, c]
        :return: (h, [h, c]) with the mixed hidden and cell states
        """
        
        nHidden = self.nHidden
        
        # Slicing with 0:1 / 1:2 keeps a trailing axis of size 1.
        gate_val_1 = inputs[ : , 0:1]
        gate_val_2 = inputs[ : , 1:2]
        
        inputs  = inputs[ : , 2: ]
                
        gate_val_1 = K.repeat_elements(gate_val_1 , nHidden , -1 ) # shape # bs , hidden
        gate_val_2 = K.repeat_elements(gate_val_2 , nHidden , -1 ) # shape # bs , hidden
        
        # Both cells see the same input and the same previous state.
        _ , [h1 , c1 ]  = self.cell_1.call( inputs , states )
        _ , [h2 , c2 ]  = self.cell_2.call( inputs , states )
        
        # Whatever weight the two gates leave over (1 - g1 - g2) carries the
        # previous state through unchanged.
        h = gate_val_1*h1 + gate_val_2*h2  + (1 - gate_val_1 -  gate_val_2 )*states[0]
        c = gate_val_1*c1 + gate_val_2*c2  + (1 - gate_val_1 -  gate_val_2 )*states[1]
        
        return h, [h , c ]

In [279]:
# Multi-task model over pre-embedded inputs of shape (32, dims): an English
# branch, a Spanish branch and a code-mixed branch that reuses both LSTM cells.
hidden = 100
dims = 300

# One LSTM per language. Their cells are shared with the code-mixed branch below.
rnn_en = LSTM(hidden, name='en_lstm', recurrent_dropout=0.3, dropout=0.3)
rnn_hi = LSTM(hidden, name='es_lstm', recurrent_dropout=0.3, dropout=0.3)

       
# en
inp_en = Input(shape=(32, dims))
rnn_en_ = rnn_en(inp_en)
x = Dropout(0.3)(rnn_en_)
out_en = Dense(3, activation='softmax', name='en')(x)


# es
inp_hi = Input(shape=(32, dims))
rnn_hi_ = rnn_hi(inp_hi)
x = Dropout(0.3)(rnn_hi_)
out_hi = Dense(3, activation='softmax', name='es')(x)


# The two LSTMs have already been called above, so their cells are passed in
# after being used once; GiretTwoCell.build does not build them itself.
cell_combined = GiretTwoCell(rnn_en.cell , rnn_hi.cell , hidden)

        
inp_enhi = Input(shape=(32, dims))
x = inp_enhi
x_att = x
# Gate network: BiLSTM -> per-timestep 3-way softmax.
x_att = Bidirectional(LSTM(32 , return_sequences=True, recurrent_dropout=0.3, dropout=0.3))( x )
# bider_h is assigned but not used again in this cell.
bider_h = x_att 
x_att = Dropout(0.3)(x_att)
x_att = TimeDistributed(Dense(3, activation='softmax') )(x_att)
# Drop the first softmax output; the remaining two are the gate values.
x_att = Lambda(lambda x : x[... , 1: ])(x_att)

# Gate values go in front of the features, which is where GiretTwoCell.call reads them.
x = Concatenate(-1)([x_att , x ])

x =  RNN(cell_combined, name='damn')(x)
# x = AttentionWithContext()(x)
# The output name 'cm' determines the metric name (val_cm_f1) monitored by EarlyStopping.
out_enhi = Dense(3, activation='softmax', name='cm')(x)
        
model = Model( [inp_en , inp_hi , inp_enhi  ] , [ out_en , out_hi , out_enhi ] ) 

In [238]:
# Variant of the multi-task model that takes integer token ids and shares one
# Embedding layer (initialised from the BPEmb weight matrix) across all branches.
hidden = 100
numwords = weight_matrix.shape[0]
hidden_emd_dim = 300


# NOTE: this rebinds `embed`, which the fastText cell above uses for its loaded
# model; the fastText get_x breaks if it is called after this cell has run.
# mask_zero=True treats id 0 as padding.
embed = Embedding(numwords, hidden_emd_dim, weights=[weight_matrix], mask_zero=True)
# conv1 = Conv1D(64, 3, activation='relu', padding='valid',strides=1)
# pool1 = MaxPooling1D(2)
# conv2 = Conv1D(64, 5, activation='relu', padding='valid', strides=1)
# pool2 = MaxPooling1D(2)
# conv3 = Conv1D(128, 5, activation='relu', padding='valid', strides=1)
# pool3 = MaxPooling1D(2)
# conv3 = Conv1D(128, 5, activation='relu', padding='valid',strides=1)
# pool3 = MaxPooling1D(35)  # global max pooling


# One LSTM per language. Their cells are shared with the code-mixed branch below.
rnn_en = LSTM(hidden, name='en_lstm', recurrent_dropout=0.3, dropout=0.3)
rnn_hi = LSTM(hidden, name="hi_lstm", recurrent_dropout=0.3, dropout=0.3)
       
# en
inp_en = Input((None, ))
x = embed(inp_en)
# x = conv1(x)
# x = pool1(x)
# x = conv2(x)
# x = pool2(x)
# x = conv3(x)
# x = pool3(x)
x = Dropout(0.3)(x)
x = rnn_en(x)
out_en = Dense(3, activation='softmax')(x)


# es
inp_hi = Input((None, ))
x = embed(inp_hi)
# x = conv1(x)
# x = pool1(x)
# x = conv2(x)
# x = pool2(x)
# x = conv3(x)
# x = pool3(x)
x = Dropout(0.3)(x)
x = rnn_hi( x )
out_hi = Dense(3, activation='softmax')(x)


# NOTE: the cell order here is (rnn_hi, rnn_en), the reverse of the
# pre-embedded variant of this model, which passes (rnn_en, rnn_hi).
cell_combined = GiretTwoCell(rnn_hi.cell , rnn_en.cell , hidden)

inp_enhi = Input((None, ))
x = embed(inp_enhi)
# x = conv1(x)
# x = pool1(x)
# x = conv2(x)
# x = pool2(x)
# x = conv3(x)
# x = pool3(x)
x_att = x
# Gate network: BiLSTM -> per-timestep 3-way softmax.
x_att = Bidirectional(LSTM(32 , return_sequences=True, recurrent_dropout=0.3, dropout=0.3))( x )
# bider_h is assigned but not used again in this cell.
bider_h = x_att 
x_att = Dropout(0.3)(x_att)
x_att = TimeDistributed(Dense(3, activation='softmax') )(x_att)
# Drop the first softmax output; the remaining two are the gate values.
x_att = Lambda(lambda x : x[... , 1: ])(x_att)

# Gate values go in front of the features, which is where GiretTwoCell.call reads them.
x = Concatenate(-1)([x_att , x ])

x =  RNN(cell_combined, name='damn')(x)
# x = AttentionWithContext()(x)
out_enhi = Dense(3, activation='softmax', name='cm')(x)
        
# NOTE: inputs and outputs are ordered (hi, en, cm) here, whereas the
# pre-embedded variant uses (en, es, cm).
model = Model( [inp_hi , inp_en , inp_enhi  ] , [ out_hi , out_en , out_enhi ] ) 

In [280]:
es = EarlyStopping(monitor='val_cm_f1', mode='max', verbose=1, patience=5, restore_best_weights=True)

In [281]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=[f1])

In [282]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_66 (InputLayer)           (None, 32, 300)      0                                            
__________________________________________________________________________________________________
bidirectional_22 (Bidirectional (None, 32, 64)       85248       input_66[0][0]                   
__________________________________________________________________________________________________
dropout_62 (Dropout)            (None, 32, 64)       0           bidirectional_22[0][0]           
__________________________________________________________________________________________________
time_distributed_18 (TimeDistri (None, 32, 3)        195         dropout_62[0][0]                 
__________________________________________________________________________________________________
input_64 (

In [283]:
en_x = np.concatenate([en_twitter_x, en_semeval_17_x])
en_y = np.concatenate([en_twitter_y, en_semeval_17_y])
es_x = np.concatenate([es_twitter_x, es_tass_x])
es_y = np.concatenate([es_twitter_y, es_tass_y])

In [284]:
# all_x = np.concatenate([en_semeval_17_x, en_twitter_x, es_tass_x, es_twitter_x, en_es_x_train])
# all_y = np.concatenate([en_semeval_17_y, en_twitter_y, es_tass_y, es_twitter_y, en_es_y_train])
all_x = np.concatenate([en_semeval_17_x, es_tass_x, es_twitter_x])
all_y = np.concatenate([en_semeval_17_y, es_tass_y, es_twitter_y])

In [285]:
import random
def train_generator(batch_size=4, lang="cm"):
    """Endlessly yield random batches for the three-input multi-task model.

    Every batch is ``(x, y)`` where both are lists of three arrays, one per
    model input / output. Rows are drawn uniformly with replacement.

    Modes:

    * ``"cm"``    - English rows, Spanish rows, and for the third slot
                    code-mixed training rows 70% of the time, otherwise
                    English or Spanish rows with equal probability.
    * ``"en"``    - the same English rows in all three slots.
    * ``"es"``    - the same Spanish rows in all three slots.
    * ``"unsup"`` - English rows, Spanish rows, and rows from ``all_x``.

    Changes from the previous version:

    * ``lang`` is compared with ``==``. It used ``is``, which tests object
      identity and only worked for strings that happened to be interned.
    * indices are drawn from the whole array. The upper bound used to be
      ``rows - batch_size``, so the last ``batch_size`` rows were never
      sampled and an array with exactly ``batch_size`` rows raised.
    * an unknown ``lang`` raises ``ValueError`` instead of failing later with
      an unbound-variable error.

    :param batch_size: number of rows per array in each batch
    :param lang: one of "cm", "en", "es", "unsup"
    :raises ValueError: on the first ``next()`` if ``lang`` is not recognised
    """
    if lang not in ("cm", "en", "es", "unsup"):
        raise ValueError("unknown lang %r; expected 'cm', 'en', 'es' or 'unsup'" % (lang,))

    def _draw(n_rows):
        # randint's upper bound is exclusive, so this covers rows 0 .. n_rows - 1.
        return np.random.randint(0, n_rows, batch_size)

    while True:
        if lang == "cm":
            n1 = _draw(en_x.shape[0])
            n2 = _draw(es_x.shape[0])
            if random.random() < 0.3:
                # Occasionally feed monolingual rows to the code-mixed slot.
                if random.random() < 0.5:
                    third_x, third_y = en_x, en_y
                else:
                    third_x, third_y = es_x, es_y
            else:
                third_x, third_y = en_es_x_train, en_es_y_train
            n3 = _draw(third_x.shape[0])
            x = [en_x[n1, :], es_x[n2, :], third_x[n3, :]]
            y = [en_y[n1, :], es_y[n2, :], third_y[n3, :]]
        elif lang == "en":
            n3 = _draw(en_x.shape[0])
            x = [en_x[n3, :], en_x[n3, :], en_x[n3, :]]
            y = [en_y[n3, :], en_y[n3, :], en_y[n3, :]]
        elif lang == "es":
            n3 = _draw(es_x.shape[0])
            x = [es_x[n3, :], es_x[n3, :], es_x[n3, :]]
            y = [es_y[n3, :], es_y[n3, :], es_y[n3, :]]
        else:  # "unsup"
            n1 = _draw(en_x.shape[0])
            n2 = _draw(es_x.shape[0])
            n3 = _draw(all_x.shape[0])
            x = [en_x[n1, :], es_x[n2, :], all_x[n3, :]]
            y = [en_y[n1, :], es_y[n2, :], all_y[n3, :]]

        yield x, y

In [286]:
gen = train_generator(32, lang="en")
model.fit_generator(
    generator=gen,
    steps_per_epoch=200,
    epochs=20,
    initial_epoch=0,
    validation_data=([en_es_x_train,en_es_x_train,en_es_x_train],[en_es_y_train,en_es_y_train,en_es_y_train]),
    callbacks=[es]
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fbfee3a3240>

In [287]:
model.evaluate([en_es_x_test,en_es_x_test,en_es_x_test],[en_es_y_test,en_es_y_test,en_es_y_test])



[3.199548551736804,
 1.0782622342980899,
 1.119319643515358,
 1.0019666784933692,
 0.4979645117849938,
 0.5129714421502529,
 0.5279189461012649]

In [288]:
gen = train_generator(32, lang="es")
model.fit_generator(
    generator=gen,
    steps_per_epoch=200,
    epochs=30,
    initial_epoch=20,
    validation_data=([en_es_x_train,en_es_x_train,en_es_x_train],[en_es_y_train,en_es_y_train,en_es_y_train]),
    callbacks=[es]
)

Epoch 21/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7fbfee3a32e8>

In [289]:
model.evaluate([en_es_x_test,en_es_x_test,en_es_x_test],[en_es_y_test,en_es_y_test,en_es_y_test])



[2.8801771301235304,
 0.9730363803236558,
 0.9758537548790165,
 0.9312869851001907,
 0.43307197920455437,
 0.414544276856676,
 0.47206871213959634]

In [290]:
gen = train_generator(32, lang="unsup")
model.fit_generator(
    generator=gen,
    steps_per_epoch=200,
    epochs=50,
    initial_epoch=30,
    validation_data=([en_es_x_train,en_es_x_train,en_es_x_train],[en_es_y_train,en_es_y_train,en_es_y_train]),
    callbacks=[es]
)

Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Restoring model weights from the end of the best epoch
Epoch 00041: early stopping


<keras.callbacks.History at 0x7fbfe64a9b00>

In [291]:
model.evaluate([en_es_x_test,en_es_x_test,en_es_x_test],[en_es_y_test,en_es_y_test,en_es_y_test])



[2.8389667408695813,
 0.9418664416131148,
 0.98001595840563,
 0.9170843568945009,
 0.5008024851416297,
 0.49733431868218675,
 0.5216923524271606]

In [295]:
gen = train_generator(32)
model.fit_generator(
    generator=gen,
    steps_per_epoch=200,
    epochs=100,
    initial_epoch=75,
    validation_data=([en_es_x_test,en_es_x_test,en_es_x_test],[en_es_y_test,en_es_y_test,en_es_y_test])
)

Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100

KeyboardInterrupt: 

In [294]:
model.evaluate([en_es_x_test,en_es_x_test,en_es_x_test],[en_es_y_test,en_es_y_test,en_es_y_test])



[3.1588352623032514,
 0.9760268991767485,
 1.0341843986005612,
 1.1486239554345317,
 0.5573675691789749,
 0.5354144665584284,
 0.6578083537142195]

# ! pip3 install keras-self-attention

# Hindi English Dataset

In [None]:
with open("../data/IIITH_Codemixed.txt") as f:
    lines = f.readlines()[:-1]
print(len(lines))

In [None]:
# Tags in this file are the strings "0" / "1" / "2", mapped to the notebook's
# -1 / 1 / 0 sentiment convention.
sents = {"0":-1 , "1" :1 , "2":0}
# Each line is tab-separated: field 1 is the text, field 3 the sentiment tag.
data = (line.strip().split("\t") for line in lines)
data = ({'sentiment': sents[fields[3]], 'text': fields[1]} for fields in data)
hi_en_data = list(data)

In [None]:
X_train = [x['text'] for x in hi_en_data]
y_train = [x['sentiment'] for x in hi_en_data]

In [None]:
from keras.utils import to_categorical

In [None]:
y_train = to_categorical(y_train,num_classes=3)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [None]:
max_len = 32
zero_vector = [0 for _ in range(300)]
def get_x(data_):
    """Embed each raw text as a fixed-length sequence of BPEmb vectors.

    Unlike the earlier ``get_x`` variants, the items of ``data_`` are the
    texts themselves rather than dicts. ``multibpemb.embed`` is applied to
    each text (presumably yielding one vector per subword); the sequence is
    cut to ``max_len`` entries and right-padded with ``zero_vector``.

    Truncation now uses ``max_len`` as well. It used to be hard-coded to 32
    while padding used ``max_len``, so any other ``max_len`` value produced
    rows of different lengths.

    :param data_: iterable of strings
    :return: numpy array with one (max_len, dim) block per text
    """
    x_ = []
    for sent in data_:
        pred = list(multibpemb.embed(sent))[:max_len]
        # A non-positive count makes the padding list empty, so full-length
        # sequences are left untouched.
        pred.extend([zero_vector] * (max_len - len(pred)))
        x_.append(pred)
    return np.array(x_)
X_train =  get_x(X_train)
X_test = get_x(X_test)