In [1]:
import fastText, keras
import math
import numpy as np 
from numpy import random
from random import sample
from keras.models import Sequential, Model
from keras.callbacks import ModelCheckpoint
from keras.layers import *
from keras import *
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.initializers import RandomUniform
import re
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score, mean_squared_error

Using TensorFlow backend.


In [2]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from keras.backend.tensorflow_backend import set_session
import tensorflow as tf
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.log_device_placement = True
set_session(tf.Session(config=config))

In [3]:
ft = fastText.load_model("/home1/zishan/raghav/emotion/wiki.hi.bin")

nb_embedding_dims = ft.get_dimension()
nb_sequence_length = 75

In [4]:
def twitter_tokenizer(textline):
    textLine = re.sub(r'http\S+', 'URL', textline)
    textline = re.sub('@[\w_]+', 'USER_MENTION', textline)
    textline = re.sub('\|LBR\|', '', textline)
    textline = re.sub('\.\.\.+', '...', textline)
    textline = re.sub('!!+', '!!', textline)
    textline = re.sub('\?\?+', '??', textline)
    words = re.compile('[\U00010000-\U0010ffff]|[\w-]+|[^ \w\U00010000-\U0010ffff]+', re.UNICODE).findall(textline.strip())
    words = [w.strip() for w in words if w.strip() != '']
    return(words)

In [5]:
word_vectors_ft = {}
def process_features_crosslingual(textline, nb_sequence_length, nb_embedding_dims, tokenize=True, transmat = None):
    if not tokenize:
        words = textline.split()
    else:
        words = twitter_tokenizer(textline)
    features_ft = np.zeros((nb_sequence_length, nb_embedding_dims))
    features_idx = np.zeros(nb_sequence_length)
    max_words = min(len(words), nb_sequence_length)
    idx = nb_sequence_length - len(words[:max_words])
    for w in words[:max_words]:
        if w in word_vectors_ft:
            wv = word_vectors_ft[w]
        else:
            wv = ft.get_word_vector(w.lower())
            wv = np.matmul(wv, transmat) # applying transformation on the word vector to make the vector in same space
            word_vectors_ft[w] = wv
        features_ft[idx] = wv
        
        idx = idx + 1
    return features_ft

In [6]:
word_vectors_ft = {}
def process_features(textline, nb_sequence_length, nb_embedding_dims, tokenize=True):
    if not tokenize:
        words = textline.split()
    else:
        words = twitter_tokenizer(textline)
    features_ft = np.zeros((nb_sequence_length, nb_embedding_dims))
    features_idx = np.zeros(nb_sequence_length)
    max_words = min(len(words), nb_sequence_length)
    idx = nb_sequence_length - len(words[:max_words])
    for w in words[:max_words]:
        if w in word_vectors_ft:
            wv = word_vectors_ft[w]
        else:
            wv = ft.get_word_vector(w.lower())
            word_vectors_ft[w] = wv
        features_ft[idx] = wv
        
        idx = idx + 1
    return features_ft

In [7]:
def sequential_generator(filename, 
                         batch_size, 
                         labels2Idx:'dict to make output labels',
                         check:'to check if all lines in file are of same length.To check enter the len of line after splitting it by tabs' = None,
                         tokenize:'specify if using twitter tokenzor to preprocess lines'=False, 
                         crosslingual:'specify if crosslingual training being done'= False, 
                         transmat= None
                        ):    
    
    f = open(filename)
    n_labels = len(labels2Idx)
    while True:
        batch_features_ft = np.zeros((batch_size, nb_sequence_length, nb_embedding_dims))
        batch_labels = np.zeros((batch_size, len(labels2Idx)))
        batch_intensity = np.zeros((batch_size, 1))
        for i in range(batch_size):
            line = f.readline()
            if ("" == line):
                f.seek(0)
                line = f.readline()
            data = line.strip().split('\t')
            if check:
                if len(data)!=check:
                    i-=1
                    continue
            if not crosslingual:
                batch_features_ft[i] = process_features(data[0], nb_sequence_length, nb_embedding_dims, tokenize= tokenize)
            else:
                batch_features_ft[i] = process_features_crosslingual(data[0], nb_sequence_length, nb_embedding_dims, tokenize, transmat)
            batch_labels[i] = to_categorical(labels2Idx[data[1]], n_labels)
            batch_intensity[i] = float(data[2])
        yield ([batch_features_ft], [batch_labels, batch_intensity])

In [8]:
def train_dev_sentences(filetrain, filedev, check:'to check if lines of file are all same lenght after separating by tab'):
    labels2Idx = {}
    train_lines = [line.strip().split("\t") for line in open(filetrain) if len(line.split('\t'))==check]
    dev_lines = [line.strip().split("\t") for line in open(filedev) if len(line.strip().split('\t'))==check]
    for dataset in [train_lines, dev_lines]:
        for line in dataset:
            label = line[1]
            if label not in labels2Idx.keys():
                labels2Idx[label]= len(labels2Idx)
    train_sentences =[]
    train_labels = []
    dev_sentences = []
    dev_labels = []
    try:
        for line_no, x in enumerate(train_lines):
            train_sentences.append(x[0])
            train_labels.append([labels2Idx[x[1]], float(x[2])])
    except Exception as e:
        print(x, line_no, end='\n\n')
    try:
        for line_no, x in enumerate(dev_lines):
            dev_sentences.append(x[0])
            dev_labels.append([labels2Idx[x[1]], float(x[2])])
    except Exception as e:
        print(x, line_no)
    return (train_sentences, train_labels, dev_sentences, dev_labels, labels2Idx)

In [9]:
train_file = '/home1/zishan/raghav/emotion/Data/train_hindi_intensity.txt'
dev_file = '/home1/zishan/raghav/emotion/Data/test_hindi_intensity.txt'
train_sentences, train_labels, dev_sentences, dev_labels, labels2Idx = train_dev_sentences(train_file, dev_file, 3)

In [10]:
n_words = 0
for sentence in train_sentences:
    n_words+=len(sentence)
print(n_words)

614264


In [11]:
from collections import Counter
print(Counter([x[0] for x in train_labels]))
print(labels2Idx)

Counter({1: 3600, 2: 998, 4: 908, 5: 409, 3: 389, 0: 257, 6: 217, 7: 155, 8: 68})
{'SURPRISE': 0, 'SADNESS': 1, 'SYMPATHY/PENSIVENESS': 2, 'JOY': 3, 'NO-EMOTION': 4, 'FEAR/ANXIETY': 5, 'OPTIMISM': 6, 'ANGER': 7, 'DISGUST': 8}


In [12]:
print(train_sentences[:2])
print(train_labels[:10])
print(labels2Idx)
print(len(train_labels))

['इसी दौरान भेड़हा गांव में एनएच-दो पर सामने से आई नीलगाय ने बाइक पर टक्कर मार दी।', 'न्यूनतम -17.7 डिग्री सेल्सियस पारे के साथ लेह में सीजन की सबसे सर्द रात बीती।']
[[0, 1.0], [1, 0.9], [2, 0.4], [1, 0.9], [3, 0.6], [2, 0.6], [1, 0.8], [4, 0.0], [1, 0.7], [1, 0.6]]
{'SURPRISE': 0, 'SADNESS': 1, 'SYMPATHY/PENSIVENESS': 2, 'JOY': 3, 'NO-EMOTION': 4, 'FEAR/ANXIETY': 5, 'OPTIMISM': 6, 'ANGER': 7, 'DISGUST': 8}
7001


In [13]:
n_labels = len(labels2Idx)

In [14]:
def compile_model_bilstm_cnn(no_labels:'total labels for classification'):
    model_input_embedding = Input(shape = (nb_sequence_length, nb_embedding_dims))
    lstm_block = Bidirectional(LSTM(100, dropout = 0.5, return_sequences=True))(model_input_embedding)
    lstm_block = LeakyReLU()(lstm_block)

    filter_sizes = (3, 4, 5)
    conv_blocks = []
    for sz in filter_sizes:
        conv = Conv1D(
            filters = 200,
            kernel_size = sz,
            padding = 'valid',
            strides = 1
        )(lstm_block)
        conv = LeakyReLU()(conv)
        conv = GlobalMaxPooling1D()(conv)
        conv = Dropout(0.5)(conv)
        conv_blocks.append(conv)
    model_concatenated = concatenate([conv_blocks[0], conv_blocks[1], conv_blocks[2]])
    model_concatenated = Dense(100)(model_concatenated)
    model_concatenated = LeakyReLU(name='final_relu')(model_concatenated)
    model_output_classification = Dense(no_labels, activation = "softmax", name='for_classification')(model_concatenated)
#     model_output_intensity = Dense(1, activation = "sigmoid", name='for_intensity')(model_concatenated)
#     new_model = Model(model_input_embedding, [model_output_classification, model_output_intensity])
    new_model = Model(model_input_embedding, model_output_classification)
    
    new_model.compile(loss=['categorical_crossentropy'], optimizer='nadam', metrics = ['accuracy'])
    new_model.summary()
    return new_model

In [33]:
def compile_model_with_intensity(no_old_labels, old_weights_file):
    model = compile_model_bilstm_cnn(4)
    model.load_weights(old_weights_file)
    model.layers.pop()
    output = Dense(len(labels2Idx), activation = 'softmax', name='classification')(model.layers[-1].output)
    model = Model(inputs=model.input, outputs=[output])
    temp = model.get_layer('final_relu').output
    dense_intensity = Dense(1, activation='sigmoid', name='intensity')(temp)
    new_model = Model(inputs=model.input, outputs=[model.output, dense_intensity])
    new_model.compile(loss=['categorical_crossentropy','mean_absolute_error'], optimizer='nadam', metrics=['accuracy'], loss_weights=[0.8,0.2])
    new_model.summary()
    return new_model
    

In [34]:
no_old_labels = len(labels2Idx)
old_weights_file = '/home1/zishan/raghav/emotion/weights/pretrain_semeval_bilstm_3cnn.h5'
new_model = compile_model_with_intensity(no_old_labels=no_old_labels, old_weights_file= old_weights_file)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            (None, 75, 300)      0                                            
__________________________________________________________________________________________________
bidirectional_6 (Bidirectional) (None, 75, 200)      320800      input_6[0][0]                    
__________________________________________________________________________________________________
leaky_re_lu_21 (LeakyReLU)      (None, 75, 200)      0           bidirectional_6[0][0]            
__________________________________________________________________________________________________
conv1d_16 (Conv1D)              (None, 73, 200)      120200      leaky_re_lu_21[0][0]             
__________________________________________________________________________________________________
conv1d_17 

In [21]:
transmat = np.loadtxt('/home1/zishan/raghav/emotion/fastText_multilingual/alignment_matrices/hi.txt')
generator = sequential_generator(filename=train_file, batch_size=len(train_sentences), labels2Idx=labels2Idx, crosslingual=False, transmat=transmat)
x,y = generator.__next__()

In [71]:
print(x[0].shape)
print(y[0].shape)
print(y[1].shape)
print(x[0][0])
print(train_sentences[0])

(7001, 75, 300)
(7001, 9)
(7001, 1)
[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 ...
 [ 0.18851146 -0.34515083 -0.29542205 ...  0.04218666 -1.04565477
  -0.26295936]
 [-0.3048825   0.02892588 -0.25457713 ...  0.14092876 -0.3737193
   0.24060485]
 [ 0.19710805  0.22378643 -0.23759533 ...  0.04695588 -0.04722783
  -0.29675263]]
इसी दौरान भेड़हा गांव में एनएच-दो पर सामने से आई नीलगाय ने बाइक पर टक्कर मार दी।


In [62]:
def get_logs_and_weights(epochs, filename_to_log, filename_to_save_weights, old_weights_file, no_labels,f1_measure, 
                         unfreezing_strategy, transmat=None, crosslingual=False, tokenize=True, pos_label = None):
    
    f = open(filename_to_log, 'w', encoding='utf-8')
    f.close()
    
    model = compile_model_with_intensity(4, old_weights_file)
    min_mse = math.inf
    max_f1 = 0
    
    model.save_weights(filename_to_save_weights)

    for ulayer in unfreezing_strategy:
            text = "************\nUnfreezing {}\n****************\n".format(model.layers[ulayer[0]].name)
            with open(filename_to_log,'a') as f:
                f.write(text)         
            print(text)
            print("---------------------------------------")
            model.load_weights(filename_to_save_weights)            
            for i, layer in enumerate(model.layers):                
                if ulayer[1] <=i <= ulayer[0]:
                    layer.trainable = True
                else:
                    layer.trainable = False                    
                print(str(i) + ' ' + layer.name + ' ' + str(layer.trainable))
                
            model.compile(loss=['categorical_crossentropy','mean_absolute_error'], optimizer='nadam', metrics=['accuracy'], loss_weights=[0.8,0.2])
            
            
            for epoch in range(epochs):
                print("Epoch {}".format(epoch))
                model.fit(x, y, epochs=1)

                testset_features = np.zeros((len(dev_sentences), nb_sequence_length, nb_embedding_dims))
                for i in range(len(dev_sentences)):
                    if crosslingual:
                        testset_features[i] = process_features_crosslingual(dev_sentences[i], nb_sequence_length, nb_embedding_dims, tokenize, transmat)
                    else:
                        testset_features[i] = process_features(dev_sentences[i], nb_sequence_length, nb_embedding_dims, tokenize)
                        
                results = model.predict(testset_features)
                class_pred = results[0].argmax(axis=1)
                intensity_pred =[i.tolist()[0] for i in results[1]]
                true_classes= [label[0] for label in dev_labels]
                true_intensity = [label[1] for label in dev_labels]
                mse = mean_squared_error(true_intensity, intensity_pred)
                correlation_coeff = np.corrcoef(true_intensity, intensity_pred)[0,1]
                f1 = f1_score(true_classes, class_pred, average='macro')
                r = recall_score(true_classes, class_pred,  average=f1_measure, pos_label=pos_label)
                p = precision_score(true_classes, class_pred, average=f1_measure, pos_label=pos_label)
                a = accuracy_score(true_classes, class_pred)
                if max_f1 < f1:
                    print("saved!")
                    max_f1 = f1
                    model.save_weights(filename_to_save_weights)
                    with open(filename_to_log,'a+') as f:
                        text = f'epoch:{epoch} f1:{f1} p:{p} r:{r} mse:{mse} a:{a} corr_coeff:{correlation_coeff}\n'
                        f.write(text)
                to_write = f'f1:{f1} p:{p} r:{r} mse:{mse} a:{a} corr_coeff:{correlation_coeff}'
                print(to_write)


In [74]:
epochs =100
single_unfreeze_bottom_up = [(19, 18), (17, 16), (15, 3), (2, 1), (19,1)] 
single_unfreeze_top_down = [(19, 18),   (2, 1),(15, 3), (17, 16), (19,1)]
all_unfreeze = [(19,1)]
gradual_unfreezing = [(19,18), (19,16), (19,3), (19,1)]
strings =['suf_bu', 'suf_td','all_unfreeze','gradual_unfreeze']
unfreeze_strategy = [single_unfreeze_bottom_up, single_unfreeze_top_down, all_unfreeze, gradual_unfreezing]

In [None]:

for i in range(len(strings)): 
    filename_to_log = f'/home1/zishan/raghav/emotion/logs/tl_semeval_{strings[i]}_with_intensity.txt'
    filename_to_save_weights = f'/home1/zishan/raghav/emotion/weights/tl_semeval_{strings[i]}_with_intensity.h5'
    print(f'log file: {filename_to_log} \n weights_file:{filename_to_save_weights}')
    transmat = np.loadtxt('/home1/zishan/raghav/emotion/fastText_multilingual/alignment_matrices/hi.txt')
    old_weights = '/home1/zishan/raghav/emotion/weights/pretrain_semeval_bilstm_3cnn.h5'
    crosslingual=True
    strategy = unfreeze_strategy[i]
    print(f'strategy:{strategy}')
    tokenize = True
    f1_measure = 'macro'
    no_labels = len(labels2Idx)
    get_logs_and_weights(epochs=epochs, filename_to_log=filename_to_log,
                         filename_to_save_weights = filename_to_save_weights, old_weights_file = old_weights, no_labels=no_labels,
                        f1_measure = f1_measure, unfreezing_strategy = strategy,  transmat = transmat, crosslingual=crosslingual, tokenize = True)

log file: /home1/zishan/raghav/emotion/logs/tl_semeval_suf_bu_with_intensity.txt 
 weights_file:/home1/zishan/raghav/emotion/weights/tl_semeval_suf_bu_with_intensity.h5
strategy:[(19, 18), (17, 16), (15, 3), (2, 1), (19, 1)]
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_20 (InputLayer)           (None, 75, 300)      0                                            
__________________________________________________________________________________________________
bidirectional_20 (Bidirectional (None, 75, 200)      320800      input_20[0][0]                   
__________________________________________________________________________________________________
leaky_re_lu_77 (LeakyReLU)      (None, 75, 200)      0           bidirectional_20[0][0]           
__________________________________________________________________________________

************
Unfreezing intensity
****************

---------------------------------------
0 input_20 False
1 bidirectional_20 False
2 leaky_re_lu_77 False
3 conv1d_58 False
4 conv1d_59 False
5 conv1d_60 False
6 leaky_re_lu_78 False
7 leaky_re_lu_79 False
8 leaky_re_lu_80 False
9 global_max_pooling1d_58 False
10 global_max_pooling1d_59 False
11 global_max_pooling1d_60 False
12 dropout_58 False
13 dropout_59 False
14 dropout_60 False
15 concatenate_20 False
16 dense_23 False
17 final_relu False
18 classification True
19 intensity True
Epoch 0
Epoch 1/1
saved!
f1:0.07830616544044422 p:0.08652391290046707 r:0.10901983218916858 mse:0.0774049796453695 a:0.5108571428571429 corr_coeff:0.0889186319468851
Epoch 1
Epoch 1/1
  32/7001 [..............................] - ETA: 22s - loss: 2.0125 - classification_loss: 2.4189 - intensity_loss: 0.3867 - classification_acc: 0.4062 - intensity_acc: 0.0938

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


saved!
f1:0.07881836261258057 p:0.11408558817885243 r:0.11167683702008643 mse:0.0787603250763668 a:0.5234285714285715 corr_coeff:-0.019270739781862473
Epoch 2
Epoch 1/1
f1:0.07657095297544735 p:0.05841269841269841 r:0.1111111111111111 mse:0.08147026832117178 a:0.5257142857142857 corr_coeff:-0.00201434341854346
Epoch 3
Epoch 1/1
saved!
f1:0.08063125334540788 p:0.14009865203895056 r:0.11299850323100613 mse:0.08390720006515533 a:0.5268571428571428 corr_coeff:-0.00877530125549349
Epoch 4
Epoch 1/1
f1:0.07657095297544735 p:0.05841269841269841 r:0.1111111111111111 mse:0.08287285345826277 a:0.5257142857142857 corr_coeff:-0.02987265199451238
Epoch 5
Epoch 1/1
f1:0.07657095297544735 p:0.05841269841269841 r:0.1111111111111111 mse:0.0829050275882988 a:0.5257142857142857 corr_coeff:-0.016414140875791638
Epoch 6
Epoch 1/1
f1:0.07657095297544735 p:0.05841269841269841 r:0.1111111111111111 mse:0.0759001074239666 a:0.5257142857142857 corr_coeff:0.01681708095125659
Epoch 7
Epoch 1/1
f1:0.076570952975447

# check


In [None]:
def train_dev_sentences_without_intensity(filetrain, filedev, check:'to check if lines of file are all same lenght after separating by tab'):
    labels2Idx = {}
    train_lines = [line.strip().split("\t") for line in open(filetrain) if len(line.split('\t'))==check]
    dev_lines = [line.strip().split("\t") for line in open(filedev) if len(line.strip().split('\t'))==check]
    train_sentences = [x[0] for x in train_lines]
    for dataset in [train_lines, dev_lines]:
        for line in dataset:
            label = line[1]
            if label not in labels2Idx.keys():
                labels2Idx[label]= len(labels2Idx)
                
    train_labels = [labels2Idx[x[1]] for x in train_lines]
    dev_sentences = [x[0] for x in dev_lines]
    dev_labels = [labels2Idx[x[1]] for x in dev_lines]
    return (train_sentences, train_labels, dev_sentences, dev_labels, labels2Idx)

In [None]:
train_file = '/home1/zishan/raghav/Data/train.txt'
dev_file = '/home1/zishan/raghav/Data/dev.txt'
train_sentences, train_labels, dev_sentences, dev_labels, labels2Idx = train_dev_sentences_without_intensity(train_file, dev_file, 2)

In [None]:
print(len(labels2Idx))

In [None]:
model = compile_model_bilstm_cnn(len(labels2Idx))
model.load_weights('/home1/zishan/raghav/weights/temp_gradual_unfreeze.h5')
transmat = np.loadtxt('/home1/zishan/raghav/fastText_multilingual/alignment_matrices/hi.txt')
testset_features = np.zeros((len(dev_sentences), nb_sequence_length, nb_embedding_dims))
for i in range(len(dev_sentences)):
    testset_features[i] = process_features_crosslingual(dev_sentences[i], nb_sequence_length, nb_embedding_dims,True, transmat)
#     testset_features[i] = process_features(dev_sentences[i], nb_sequence_length, nb_embedding_dims)

results = model.predict(testset_features)

predLabels = results.argmax(axis=-1)
print(len(dev_labels))
devLabels = dev_labels
f1 = f1_score(devLabels, predLabels, average='macro') # offen
print(f1)