In [1]:
# NOTE: import order is kept as-is on purpose; the two star-imports below can
# shadow names, so reordering this block could change what a name resolves to.
import fastText, keras
import math
import numpy as np
from numpy import random
from random import sample
from keras.models import Sequential, Model
from keras.callbacks import ModelCheckpoint
from keras.layers import *
from keras import *
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.initializers import RandomUniform
import re
from collections import Counter
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score, mean_squared_error, classification_report, confusion_matrix

Using TensorFlow backend.


In [2]:
# Select the GPU and install a TensorFlow session for Keras.
# The environment variables are set before the session is created so that the
# device selection is in place when TensorFlow initialises CUDA.
import os
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",   # enumerate devices by PCI bus id
    "CUDA_VISIBLE_DEVICES": "1",         # expose only device "1" to this process
})
from keras.backend.tensorflow_backend import set_session
import tensorflow as tf

# log_device_placement makes TensorFlow log which device each op is placed on.
config = tf.ConfigProto(log_device_placement=True)
# Allocate GPU memory on demand instead of reserving it all up front.
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))

In [3]:
# Number of token positions kept per sentence by the feature extractors.
nb_sequence_length = 75

# Pre-trained fastText model (the file name suggests Hindi Wikipedia vectors).
ft = fastText.load_model("/home1/zishan/raghav/wiki.hi.bin")
# Embedding width is taken from the loaded model rather than hard-coded.
nb_embedding_dims = ft.get_dimension()

In [4]:
# Token pattern, compiled once instead of on every call. A token is either
#   * a single character in U+10000..U+10FFFF (astral plane, e.g. most emoji),
#   * a run of word characters and hyphens, or
#   * a run of any other non-space characters (punctuation, symbols).
_TWITTER_TOKEN_RE = re.compile(
    r'[\U00010000-\U0010ffff]|[\w-]+|[^ \w\U00010000-\U0010ffff]+',
    re.UNICODE,
)


def twitter_tokenizer(textline):
    """Normalise a tweet-like line of text and split it into tokens.

    Normalisation steps, applied in order:
      1. anything starting with ``http`` up to the next whitespace -> ``URL``
      2. ``@name`` mentions -> ``USER_MENTION``
      3. the literal marker ``|LBR|`` is removed
      4. three or more dots collapse to ``...``
      5. two or more ``!`` collapse to ``!!``; two or more ``?`` to ``??``

    Args:
        textline: the raw text (``str``).

    Returns:
        list[str]: the non-empty tokens, in order of appearance.

    NOTE(review): Python's ``\\w`` does not match combining marks, so words in
    scripts that use them (e.g. Devanagari vowel signs) may be split in the
    middle. This is left unchanged because saved model weights were presumably
    trained on features produced with this tokenisation -- confirm before
    altering the token pattern.
    """
    # Bug fix: the original assigned this result to `textLine` (capital L), so
    # the URL replacement was discarded and URLs were tokenised piecemeal.
    textline = re.sub(r'http\S+', 'URL', textline)
    textline = re.sub(r'@[\w_]+', 'USER_MENTION', textline)
    textline = re.sub(r'\|LBR\|', '', textline)
    textline = re.sub(r'\.\.\.+', '...', textline)
    textline = re.sub(r'!!+', '!!', textline)
    textline = re.sub(r'\?\?+', '??', textline)
    words = _TWITTER_TOKEN_RE.findall(textline.strip())
    # The third alternative can match whitespace other than ' ' (tabs,
    # newlines); strip those and drop tokens that end up empty.
    words = [w.strip() for w in words if w.strip() != '']
    return words

In [5]:
# Cache of word -> vector, keyed by the token exactly as it appears in the text.
# NOTE(review): `process_features` uses a dict of the same name, so transformed
# and untransformed vectors can end up mixed in one cache. Reset it (as the
# evaluation cell does) before switching between the two extractors or between
# different `transmat` matrices.
word_vectors_ft = {}
def process_features_crosslingual(textline, nb_sequence_length, nb_embedding_dims, tokenize=True, transmat=None):
    """Turn one sentence into a left-padded matrix of aligned fastText vectors.

    Args:
        textline: the sentence (``str``).
        nb_sequence_length: number of rows in the result; only the first
            ``nb_sequence_length`` tokens are used.
        nb_embedding_dims: number of columns in the result.
        tokenize: if true, split with ``twitter_tokenizer``; otherwise split on
            whitespace.
        transmat: matrix that each word vector is right-multiplied by
            (``np.matmul(wv, transmat)``) to map it into the shared space.
            Assumed to be ``(nb_embedding_dims, nb_embedding_dims)`` -- TODO
            confirm. ``None`` (the default) means "no transformation";
            previously the default crashed inside ``np.matmul``.

    Returns:
        np.ndarray of shape ``(nb_sequence_length, nb_embedding_dims)``. Token
        vectors occupy the last rows; leading rows are zero padding.
    """
    if not tokenize:
        words = textline.split()
    else:
        words = twitter_tokenizer(textline)
    features_ft = np.zeros((nb_sequence_length, nb_embedding_dims))
    words = words[:nb_sequence_length]
    # Right-align the tokens: padding rows come first.
    first_row = nb_sequence_length - len(words)
    for row, w in enumerate(words, start=first_row):
        if w in word_vectors_ft:
            wv = word_vectors_ft[w]
        else:
            # The model is queried with the lower-cased token.
            wv = ft.get_word_vector(w.lower())
            if transmat is not None:
                # Project the vector into the shared cross-lingual space.
                wv = np.matmul(wv, transmat)
            word_vectors_ft[w] = wv
        features_ft[row] = wv
    return features_ft

In [6]:
# Word -> vector cache shared by the feature extractors; re-running this cell
# empties it.
word_vectors_ft = {}
def process_features(textline, nb_sequence_length, nb_embedding_dims, tokenize=True):
    """Build a left-padded matrix of raw fastText vectors for one sentence.

    Args:
        textline: the sentence (``str``).
        nb_sequence_length: number of rows in the result; tokens beyond this
            count are ignored.
        nb_embedding_dims: number of columns in the result.
        tokenize: use ``twitter_tokenizer`` when true, plain whitespace
            splitting otherwise.

    Returns:
        np.ndarray of shape ``(nb_sequence_length, nb_embedding_dims)`` whose
        trailing rows hold the token vectors and whose leading rows are zeros.
    """
    tokens = twitter_tokenizer(textline) if tokenize else textline.split()
    matrix = np.zeros((nb_sequence_length, nb_embedding_dims))
    kept = tokens[:nb_sequence_length]
    # Tokens are right-aligned, so writing starts after the padding rows.
    row = nb_sequence_length - len(kept)
    for token in kept:
        try:
            vector = word_vectors_ft[token]
        except KeyError:
            # Cache miss: look up the lower-cased token and remember the
            # result under the token's original spelling.
            vector = ft.get_word_vector(token.lower())
            word_vectors_ft[token] = vector
        matrix[row] = vector
        row += 1
    return matrix

In [7]:
def compile_model_bilstm_cnn(no_labels:'total labels for classification'):
    """Build, compile and summarise the BiLSTM + multi-width CNN classifier.

    The network reads pre-computed embeddings of shape
    ``(nb_sequence_length, nb_embedding_dims)`` (both module-level globals),
    runs them through a bidirectional LSTM, then through three parallel
    Conv1D branches (kernel sizes 3, 4 and 5) that are max-pooled over time,
    concatenated, and fed to a 100-unit dense layer and a softmax output.

    Args:
        no_labels: number of output classes (width of the softmax layer).

    Returns:
        The compiled Keras ``Model``. Its summary is printed as a side effect.
    """
    embedded_input = Input(shape=(nb_sequence_length, nb_embedding_dims))
    recurrent = Bidirectional(LSTM(100, dropout=0.5, return_sequences=True))(embedded_input)
    recurrent = LeakyReLU()(recurrent)

    # Layers are created in the same order for every branch (conv, activation,
    # pooling, dropout) so that previously saved weights keep lining up.
    pooled_branches = []
    for kernel_width in (3, 4, 5):
        branch = Conv1D(filters=200, kernel_size=kernel_width, padding='valid', strides=1)(recurrent)
        branch = LeakyReLU()(branch)
        branch = GlobalMaxPooling1D()(branch)
        branch = Dropout(0.5)(branch)
        pooled_branches.append(branch)

    hidden = concatenate(pooled_branches)
    hidden = Dense(100)(hidden)
    hidden = LeakyReLU(name='final_relu')(hidden)
    class_probs = Dense(no_labels, activation="softmax", name='for_classification')(hidden)

    # Classification head only; an additional sigmoid 'for_intensity' output
    # existed in an earlier draft and is not part of this model.
    classifier = Model(embedded_input, class_probs)
    classifier.compile(loss=['categorical_crossentropy'], optimizer='nadam', metrics=['accuracy'])
    classifier.summary()
    return classifier

In [8]:
def _read_tsv_rows(path, check, count_raw_line):
    """Read `path` and return the tab-split fields of every accepted line.

    A line is accepted when its tab-separated field count equals `check`.
    With `count_raw_line` true the count is taken on the line as read
    (including the trailing newline); otherwise on the stripped line. The
    returned fields always come from the stripped line. Lines that end up with
    fewer than two fields are skipped, because callers index field 1.
    """
    rows = []
    # Explicit UTF-8 so decoding does not depend on the machine's locale
    # (assumes the data files are UTF-8 -- TODO confirm).
    with open(path, encoding='utf-8') as handle:
        for line in handle:
            fields = line.strip().split("\t")
            counted = line.split("\t") if count_raw_line else fields
            if len(counted) == check and len(fields) > 1:
                rows.append(fields)
    return rows


def train_dev_sentences_without_intensity(filetrain, filedev, check:'to check if lines of file are all same lenght after separating by tab'):
    """Load sentences and integer class labels from two tab-separated files.

    Column 0 of each accepted line is the sentence and column 1 the label
    name; any further columns are ignored.

    Args:
        filetrain: path of the training file.
        filedev: path of the development/evaluation file.
        check: required number of tab-separated fields per line; other lines
            are silently dropped.

    Returns:
        ``(train_sentences, train_labels, dev_sentences, dev_labels,
        labels2Idx)`` where the label lists hold integer ids and
        ``labels2Idx`` maps label name -> id. Ids are assigned in order of
        first appearance, training lines first, then dev lines.

    Changes from the previous version: files are now closed deterministically
    and read as UTF-8, and a training line that passes the field count but has
    no label column after stripping is skipped instead of raising IndexError.

    NOTE(review): the two files are filtered slightly differently -- the
    training count is taken before stripping, the dev count after -- so a line
    with an empty trailing field is kept in train but dropped in dev. This is
    preserved on purpose: changing which training lines are kept could change
    the label -> id assignment that saved model weights depend on. Confirm
    whether the asymmetry is intended.
    """
    train_lines = _read_tsv_rows(filetrain, check, count_raw_line=True)
    dev_lines = _read_tsv_rows(filedev, check, count_raw_line=False)

    labels2Idx = {}
    for fields in train_lines + dev_lines:
        # First appearance of a label gets the next free id.
        labels2Idx.setdefault(fields[1], len(labels2Idx))

    train_sentences = [fields[0] for fields in train_lines]
    train_labels = [labels2Idx[fields[1]] for fields in train_lines]
    dev_sentences = [fields[0] for fields in dev_lines]
    dev_labels = [labels2Idx[fields[1]] for fields in dev_lines]
    return (train_sentences, train_labels, dev_sentences, dev_labels, labels2Idx)

In [9]:
# Data locations. Note that the "dev" split is read from test_total.txt.
train_file = '/home1/zishan/raghav/Data/train_total.txt'
dev_file = '/home1/zishan/raghav/Data/test_total.txt'

# Only lines with exactly 3 tab-separated fields are accepted by the loader.
(train_sentences, train_labels,
 dev_sentences, dev_labels,
 labels2Idx) = train_dev_sentences_without_intensity(train_file, dev_file, check=3)

In [10]:
# Sanity check: first two training sentences and their integer labels.
print(train_sentences[:2], train_labels[:2], sep='\n')

['इसके बाद से यहां पर भारतीय सेना तैनात रहती है।', 'सरकारी मीडिया ने सोमवार को बताया कि बाढ़ के भंवर में फंस कर फुजिआन प्रांत में ४३, हुनान में ७८ और गुआंगडांग में ३३ लोगों की मौत हो गई।']
[0, 0]


In [11]:
# Number of classes, followed by the label-name -> id mapping.
print(len(labels2Idx), labels2Idx, sep='\n')

9
{'SADNESS': 0, 'NO-EMOTION': 1, 'SYMPATHY/PENSIVENESS': 2, 'OPTIMISM': 3, 'JOY': 4, 'ANGER': 5, 'DISGUST': 6, 'FEAR/ANXIETY': 7, 'SURPRISE': 8}


In [23]:
# Start from an empty vector cache so that no vectors cached earlier (possibly
# without the alignment transform) are reused.
word_vectors_ft = {}

# Rebuild the architecture, then load the saved weights into it.
model = compile_model_bilstm_cnn(len(labels2Idx))
model.load_weights('/home1/zishan/raghav/weights/tl_semeval_suf_bu_3.h5')

# Alignment matrix applied to every fastText vector by the cross-lingual
# feature extractor.
transmat = np.loadtxt('/home1/zishan/raghav/fastText_multilingual/alignment_matrices/hi.txt')

# One (nb_sequence_length x nb_embedding_dims) matrix per dev sentence.
# Monolingual alternative (no alignment):
#     process_features(sentence, nb_sequence_length, nb_embedding_dims)
testset_features = np.zeros((len(dev_sentences), nb_sequence_length, nb_embedding_dims))
for i, sentence in enumerate(dev_sentences):
    testset_features[i] = process_features_crosslingual(
        sentence, nb_sequence_length, nb_embedding_dims, tokenize=True, transmat=transmat)

results = model.predict(testset_features)

# Predicted class id = index of the largest softmax output.
predLabels = np.argmax(results, axis=-1)
devLabels = dev_labels
print(len(devLabels))

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 75, 300)      0                                            
__________________________________________________________________________________________________
bidirectional_2 (Bidirectional) (None, 75, 200)      320800      input_2[0][0]                    
__________________________________________________________________________________________________
leaky_re_lu_5 (LeakyReLU)       (None, 75, 200)      0           bidirectional_2[0][0]            
__________________________________________________________________________________________________
conv1d_4 (Conv1D)               (None, 73, 200)      120200      leaky_re_lu_5[0][0]              
__________________________________________________________________________________________________
conv1d_5 (

In [24]:
# Scores on the dev split. Macro averages weight every class equally; micro
# averages weight every sample equally.
a = accuracy_score(devLabels, predLabels)
p = precision_score(devLabels, predLabels, average='macro')
r = recall_score(devLabels, predLabels, average='macro')
f1_micro = f1_score(devLabels, predLabels, average='micro')
f1_macro = f1_score(devLabels, predLabels, average='macro')
# Per-class table; also written to the log file by a later cell.
report = classification_report(devLabels, predLabels)

# The mapping is printed first so the numeric class ids in the report can be
# read back as label names.
print(labels2Idx)
print(report)
print("f1_macro:{0} f1_micro:{1} p:{2} r:{3} a:{4}".format(f1_macro, f1_micro, p, r, a))

{'SADNESS': 0, 'NO-EMOTION': 1, 'SYMPATHY/PENSIVENESS': 2, 'OPTIMISM': 3, 'JOY': 4, 'ANGER': 5, 'DISGUST': 6, 'FEAR/ANXIETY': 7, 'SURPRISE': 8}
             precision    recall  f1-score   support

          0       0.76      0.92      0.83       374
          1       0.74      0.51      0.60        61
          2       0.64      0.47      0.54        93
          3       0.52      0.53      0.53        43
          4       0.75      0.41      0.53        37
          5       0.67      0.67      0.67         6
          6       0.33      0.25      0.29        16
          7       0.61      0.48      0.54        29
          8       1.00      0.33      0.50         9

avg / total       0.71      0.72      0.70       668

f1_macro:0.5582868201208393 f1_micro:0.7200598802395209 p:0.6690132382492215 r:0.5079416719302112 a:0.7200598802395209


In [None]:
# Confusion matrix: entry [i, j] counts samples of true class i predicted as
# class j. The label order is pinned to the ids in labels2Idx so that row and
# column indices always equal class ids, even if a class is absent from both
# the true and the predicted labels (scikit-learn would otherwise drop it).
# `confusion_matrix` comes from sklearn.metrics (imported in the first cell).
class_ids = sorted(labels2Idx.values())
conf_mat = confusion_matrix(devLabels, predLabels, labels=class_ids)
print(conf_mat)

In [None]:
# Text file that the next cell appends this run's metrics to.
# NOTE(review): the name says "crowdflower_all_unfreeze_4" while the evaluated
# weights are 'tl_semeval_suf_bu_3.h5' -- confirm this is the intended log.
log_file = '/home1/zishan/raghav/logs/tl_crowdflower_all_unfreeze_4.txt'

In [None]:
# Append this run's metrics, the label mapping and the per-class report to the
# log file, preceded by a separator line.
# Fix: the original format string ended in "\n)", which wrote a stray ")" after
# every report.
with open(log_file, 'a') as f:
    f.write("********************************\n")
    text = "p:{0}\t r:{1}\t f1_micro:{2}\t f1_macro:{3}\t a:{4}\t\n\nLabels2Idx:{5}\n\nReport:{6}\n".format(p, r, f1_micro, f1_macro, a, labels2Idx, report)
    f.write(text)

# Write the predictions to a file for error analysis


In [18]:
# Reverse lookup: integer class id -> label name.
idx2labels = {class_id: label_name for label_name, class_id in labels2Idx.items()}

In [19]:
# Sanity check: first two dev sentences, their label ids, and the name of
# class id 2.
print(dev_sentences[:2], dev_labels[:2], idx2labels[2], sep='\n')

['यह जानकारी तेलंगाना के मंत्री केटी रामा राव टीवी चैनल एनडीटीवी से बात करते हुए दी।', 'बक्तोर क्षेत्र में सोमवार देर रात सेना की नैना पोस्ट हिमस्खलन की चपेट में आ गई।']
[6, 0]
SYMPATHY/PENSIVENESS


In [25]:
# Export one row per dev sentence (sentence, predicted label, true label) as a
# tab-separated file for manual error analysis.
# Changes from the previous version: the file is opened once instead of being
# reopened for every sentence, and it is written as UTF-8 explicitly so the
# Hindi text does not depend on the machine's locale encoding.
file_name = '/home1/zishan/raghav/logs/error_analysis_semeval.txt'
with open(file_name, 'w', encoding='utf-8') as f:
    f.write('sentence\tpredicted_label\tTrue_label\n')
    # dev_sentences, predLabels and devLabels are parallel sequences.
    for sentence, predicted, actual in zip(dev_sentences, predLabels, devLabels):
        f.write(f'{sentence}\t{idx2labels[predicted]}\t{idx2labels[actual]}\n')