In [1]:
import fastText
import math
import linecache
import numpy as np 
from numpy import random
from random import sample
from keras.models import Sequential, Model
from keras.callbacks import ModelCheckpoint
from keras.layers import *
from keras import *
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.initializers import RandomUniform
import re
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score

Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
# Configure the TensorFlow session that the Keras backend will use.
from keras.backend.tensorflow_backend import set_session
import tensorflow as tf
config = tf.ConfigProto()
# Let TensorFlow grow its GPU memory allocation as needed.
config.gpu_options.allow_growth = True
# Log the device each op is placed on (produces verbose output).
config.log_device_placement = True
set_session(tf.Session(config=config))

In [3]:
# Load the fastText model from a machine-specific absolute path.
ft = fastText.load_model("/home1/zishan/raghav/wiki.hi.bin")

# Width of one word vector, as reported by the loaded model.
nb_embedding_dims = ft.get_dimension()
# Fixed number of token slots per sentence (longer inputs are truncated, shorter ones padded).
nb_sequence_length = 75

In [4]:
def twitter_tokenizer(textline):
    """Normalize a line of social-media text and split it into tokens.

    Normalization steps, applied in order:
      * URLs (``http...`` up to the next whitespace) become ``URL``
      * ``@mentions`` become ``USER_MENTION``
      * the literal marker ``|LBR|`` is removed
      * runs of 3+ dots collapse to ``...``, 2+ ``!`` to ``!!``, 2+ ``?`` to ``??``

    The normalized text is then split into: single astral-plane characters
    (e.g. emoji), runs of word characters / hyphens, and runs of any other
    non-space characters.

    Args:
        textline: the raw text to tokenize.

    Returns:
        A list of non-empty, whitespace-stripped token strings.
    """
    # The original assigned this result to a misspelled name (`textLine`),
    # so URLs were never actually replaced.
    textline = re.sub(r'http\S+', 'URL', textline)
    textline = re.sub(r'@[\w_]+', 'USER_MENTION', textline)
    textline = re.sub(r'\|LBR\|', '', textline)
    textline = re.sub(r'\.\.\.+', '...', textline)
    textline = re.sub(r'!!+', '!!', textline)
    textline = re.sub(r'\?\?+', '??', textline)
    # `\\w` is escaped explicitly; the `\U...` escapes are resolved by Python
    # so the pattern contains the literal astral-plane range endpoints.
    token_pattern = re.compile(
        '[\U00010000-\U0010ffff]|[\\w-]+|[^ \\w\U00010000-\U0010ffff]+',
        re.UNICODE,
    )
    tokens = (tok.strip() for tok in token_pattern.findall(textline.strip()))
    # The third alternative can match whitespace other than ' ' (tabs,
    # newlines); drop tokens that are empty after stripping.
    return [tok for tok in tokens if tok != '']

In [5]:
# Cache of token -> fastText vector, shared across calls so each distinct
# token is only looked up in the model once.
word_vectors_ft = {}
def process_features(textline, nb_sequence_length, nb_embedding_dims, tokenize=True):
    """Turn one sentence into a fixed-size matrix of word vectors.

    The sentence is split into tokens, truncated to the first
    ``nb_sequence_length`` tokens, and written into the LAST rows of the
    result so that shorter sentences are left-padded with zero rows.

    Args:
        textline: the sentence text.
        nb_sequence_length: number of rows (token slots) in the result.
        nb_embedding_dims: number of columns (vector width) in the result.
        tokenize: when True use ``twitter_tokenizer``; otherwise split on
            whitespace.

    Returns:
        A ``(nb_sequence_length, nb_embedding_dims)`` numpy array.
    """
    if tokenize:
        words = twitter_tokenizer(textline)
    else:
        words = textline.split()
    words = words[:nb_sequence_length]

    features_ft = np.zeros((nb_sequence_length, nb_embedding_dims))
    # First row to fill; everything before it stays zero (left padding).
    first_row = nb_sequence_length - len(words)
    for row, w in enumerate(words, start=first_row):
        if w not in word_vectors_ft:
            # The model is queried with the lowercased token, but the cache
            # is keyed by the token as it appeared.
            word_vectors_ft[w] = ft.get_word_vector(w.lower())
        features_ft[row] = word_vectors_ft[w]
    return features_ft

In [6]:
def sequential_generator(filename,
                         batch_size,
                         labels2Idx,
                         check=None,
                         tokenize=False,
                        ):
    """Endlessly yield training batches read sequentially from a TSV file.

    Each usable line is ``text<TAB>label``. When the end of the file is
    reached the reader wraps around to the beginning, so the generator
    never terminates on its own.

    Args:
        filename: path of the tab-separated data file.
        batch_size: number of examples per yielded batch.
        labels2Idx: dict mapping label string -> class index. When it has
            exactly two entries, the label ``'OTHER'`` maps to class 0 and
            every other label to class 1, regardless of the dict's values.
        check: if truthy, the number of tab-separated fields a line must
            have (after stripping) to be used; other lines are skipped.
        tokenize: forwarded to ``process_features`` (True selects the
            twitter tokenizer, False a whitespace split).

    Yields:
        ``([features], labels)`` where ``features`` has shape
        ``(batch_size, nb_sequence_length, nb_embedding_dims)`` and
        ``labels`` is one-hot with shape ``(batch_size, len(labels2Idx))``.

    Raises:
        ValueError: if a full pass over the file finds no usable line.
    """
    n_labels = len(labels2Idx)
    # `with` closes the handle when the generator is closed or collected.
    with open(filename) as f:
        while True:
            batch_features_ft = np.zeros((batch_size, nb_sequence_length, nb_embedding_dims))
            batch_labels = np.zeros((batch_size, n_labels))
            filled = 0
            # Consecutive end-of-file hits without a usable line in between.
            barren_passes = 0
            # A while loop is required here: the original decremented the
            # index of a `for ... in range(...)` loop to retry a slot, which
            # has no effect and left all-zero rows in the batch.
            while filled < batch_size:
                line = f.readline()
                if line == "":
                    barren_passes += 1
                    if barren_passes > 1:
                        raise ValueError(
                            "no usable lines found in {!r}".format(filename))
                    f.seek(0)
                    continue
                data = line.strip().split('\t')
                if check and len(data) != check:
                    continue
                barren_passes = 0
                batch_features_ft[filled] = process_features(
                    data[0], nb_sequence_length, nb_embedding_dims, tokenize=tokenize)
                if n_labels == 2:
                    class_idx = 0 if data[1] == 'OTHER' else 1
                else:
                    class_idx = labels2Idx[data[1]]
                batch_labels[filled] = to_categorical(class_idx, n_labels)
                filled += 1
            yield ([batch_features_ft], batch_labels)

In [7]:
def train_dev_sentences(filetrain, filedev, check):
    """Load train and dev TSV files and build a shared label index.

    Each line is stripped and split on tabs; only lines with exactly
    ``check`` fields are kept. Field 0 is the sentence and field 1 the
    label. Both files are filtered the same way (the original filtered the
    train file on the unstripped line, which let lines such as
    ``"text\\t\\n"`` through and then failed when reading the label).

    Args:
        filetrain: path of the training file.
        filedev: path of the development file.
        check: required number of tab-separated fields per line.

    Returns:
        ``(train_sentences, train_labels, dev_sentences, dev_labels,
        labels2Idx)`` where the label lists hold integer class indices and
        ``labels2Idx`` maps label string -> index, numbered in order of
        first appearance (train file first, then dev file).
    """
    def _read_rows(path):
        # Read one file and keep only rows with the expected field count.
        with open(path) as handle:
            rows = (line.strip().split("\t") for line in handle)
            return [row for row in rows if len(row) == check]

    train_lines = _read_rows(filetrain)
    dev_lines = _read_rows(filedev)

    labels2Idx = {}
    for dataset in (train_lines, dev_lines):
        for row in dataset:
            # Assign the next free index the first time a label is seen.
            labels2Idx.setdefault(row[1], len(labels2Idx))

    train_sentences = [row[0] for row in train_lines]
    train_labels = [labels2Idx[row[1]] for row in train_lines]
    dev_sentences = [row[0] for row in dev_lines]
    dev_labels = [labels2Idx[row[1]] for row in dev_lines]
    return (train_sentences, train_labels, dev_sentences, dev_labels, labels2Idx)

In [8]:
# Machine-specific absolute paths of the train and dev files.
train_file = '/home1/zishan/raghav/Data/train_31_aug.txt'
dev_file = '/home1/zishan/raghav/Data/dev_31_aug.txt'
# Keep only lines with exactly 2 tab-separated fields (sentence, label).
train_sentences, train_labels, dev_sentences, dev_labels, labels2Idx = train_dev_sentences(train_file, dev_file, 2)

In [9]:
# Quick look at the loaded data: first sentences, label index, and training set size.
print(train_sentences[:10])
print(labels2Idx)
print(len(train_labels))

[': पश्चिम बंगाल के मिदनापुर और सुंदरवन के तटीय इलाकों में रविवार को आए तूफानी चक्रवात की वजह से हजारों लोग बेघर हो गए हैं।', 'ये मछुआरे 60 नावें लेकर ताइवान की सीमा में आए।', 'उनके जरिए सूखा प्रभावित गांवों और कस्बों में पानी की आपूर्ति की जा रही है।', 'लोगों तक दवा व खाद्य सामग्री पहुंचाई जा रही है।', 'नैनीताल की वरिष्ठ पुलिस अधीक्षक स्वीटी अग्रवाल ने बताया कि नैनीताल के जंगलों में आग बुझाने की कार्रवाई प्रभावी ढंग से की जा रही है।', 'वरिष्ठ पुलिस अधीक्षक (कुपवाड़ा) एजाज अहमद ने कहा कि तीनों आतंकवादी शिविर में घुस गए थे।', 'हालांकि अमेरिका ने किसी भी देश से मदद की गुहार नहीं लगाई थी लेकिन दर्जनों देश खुद ही राहत सामग्री और धन के जरिए मदद करने को तत्पर हैं। अमेरिकी विदेश विभाग ने कहा कि अब तक 40 से अधिक देशों और अंतरराष्ट्रीय संगठनों ने उसे मदद देने की पेशकश की है और इस संख्या में लगातार बढ़ोतरी होती जा रही है।', 'राज्य के चंदवली इलाके में २५.८ मिलीमीटर बारिश रिकॅर्ड की गयी जबकि बालासोर में १२.७ मिलीमीटर, भुवनेशवर ६.२ मिलीमीटर, पुरी में ०.२ मिलीमीटर और गोपालपुर में १९.५ मिलीमीटर बारिश 

In [10]:
# Per-class example counts for the train and dev splits (keys are class indices).
from collections import Counter
print(Counter(train_labels))
print(Counter(dev_labels))

Counter({0: 382, 1: 193, 2: 153, 6: 147, 5: 123, 3: 76, 4: 64, 7: 45, 8: 21})
Counter({0: 111, 2: 43, 1: 42, 5: 27, 6: 26, 3: 24, 4: 14, 7: 10, 8: 3})


In [11]:
# Number of distinct labels seen across train and dev; used as the model's output width.
n_labels = len(labels2Idx)

In [12]:
def compile_model_bilstm(no_labels:'total labels for classification'):
    """Build, compile and summarize a bidirectional-LSTM classifier.

    The network takes a ``(nb_sequence_length, nb_embedding_dims)`` matrix
    of word vectors and applies, in order: a bidirectional LSTM that returns
    the full sequence, LeakyReLU, Flatten, a 100-unit Dense layer and a
    softmax Dense layer with ``no_labels`` outputs.

    Args:
        no_labels: number of output classes.

    Returns:
        The compiled Keras ``Model`` (categorical cross-entropy, nadam,
        accuracy metric). The model summary is printed as a side effect.
    """
    embeddings_in = Input(shape=(nb_sequence_length, nb_embedding_dims))

    # Layers are instantiated in network order and then applied in sequence.
    stack = (
        Bidirectional(LSTM(100, dropout=0.5, return_sequences=True)),
        LeakyReLU(),
        Flatten(),
        Dense(100),
        Dense(no_labels, activation="softmax"),
    )
    tensor = embeddings_in
    for layer in stack:
        tensor = layer(tensor)

    classifier = Model(embeddings_in, tensor)
    classifier.compile(
        loss='categorical_crossentropy',
        optimizer='nadam',
        metrics=['accuracy'],
    )
    classifier.summary()
    return classifier

In [107]:
def compile_model_bilstm_cnn(no_labels:'total labels for classification'):
    """Build, compile and summarize a BiLSTM + multi-width CNN classifier.

    A bidirectional LSTM (full sequence output, followed by LeakyReLU) feeds
    three parallel 1-D convolution branches with kernel sizes 3, 4 and 5.
    Each branch is Conv1D(200) -> LeakyReLU -> global max pool ->
    Dropout(0.5). The branches are concatenated, passed through
    Dropout(0.8), a 100-unit Dense layer with LeakyReLU, and a softmax
    Dense layer with ``no_labels`` outputs.

    Args:
        no_labels: number of output classes.

    Returns:
        The compiled Keras ``Model`` (categorical cross-entropy, nadam,
        accuracy metric). The model summary is printed as a side effect.
    """
    embeddings_in = Input(shape=(nb_sequence_length, nb_embedding_dims))
    recurrent = Bidirectional(LSTM(100, dropout=0.5, return_sequences=True))(embeddings_in)
    recurrent = LeakyReLU()(recurrent)

    def _conv_branch(kernel_width):
        # One convolution branch over the recurrent sequence output.
        branch = Conv1D(
            filters=200,
            kernel_size=kernel_width,
            padding='valid',
            strides=1,
        )(recurrent)
        branch = LeakyReLU()(branch)
        branch = GlobalMaxPooling1D()(branch)
        return Dropout(0.5)(branch)

    pooled_branches = [_conv_branch(width) for width in (3, 4, 5)]

    head = concatenate(pooled_branches)
    head = Dropout(0.8)(head)
    head = Dense(100)(head)
    head = LeakyReLU()(head)
    probabilities = Dense(no_labels, activation="softmax")(head)

    classifier = Model(embeddings_in, probabilities)
    classifier.compile(
        loss='categorical_crossentropy',
        optimizer='nadam',
        metrics=['accuracy'],
    )
    classifier.summary()
    return classifier

In [13]:
# Build the plain BiLSTM baseline with one output per label in labels2Idx.
model = compile_model_bilstm(no_labels = n_labels)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 75, 300)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 75, 200)           320800    
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU)    (None, 75, 200)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 15000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               1500100   
_________________________________________________________________
dense_2 (Dense)              (None, 9)                 909       
Total params: 1,821,809
Trainable params: 1,821,809
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Run configuration for the BiLSTM baseline experiment.
train_file = '/home1/zishan/raghav/Data/train_31_aug.txt'
weights_file ='/home1/zishan/raghav/weights/bilstm.h5'
log_file = '/home1/zishan/raghav/logs/bilstm.txt'
batch_size = 16
# Number of tab-separated fields a line must have for the generator to use it.
check_for_generator = 2
# NOTE(review): self-assignment is a no-op; it only fails if labels2Idx is undefined.
labels2Idx = labels2Idx
tokenize = True
samples_per_epoch = len(train_sentences)
# Round up so every training sentence is covered once per epoch.
steps_per_epoch = math.ceil(samples_per_epoch / batch_size)

In [16]:
# The dev set does not change between epochs, so featurize it once instead
# of rebuilding it on every iteration. It is tokenized with the same
# `tokenize` setting as the training generator.
testset_features = np.zeros((len(dev_sentences), nb_sequence_length, nb_embedding_dims))
for i in range(len(dev_sentences)):
    testset_features[i] = process_features(dev_sentences[i], nb_sequence_length,
                                           nb_embedding_dims, tokenize=tokenize)
devLabels = dev_labels

# Train one epoch at a time so the dev set can be scored after each epoch;
# weights are saved (and a log line appended) only when macro-F1 improves.
max_f1 = 0
for epoch in range(200):
    print("Epoch {}".format(epoch))
    # A fresh generator per epoch restarts reading at the top of the file.
    model.fit_generator(sequential_generator(filename = train_file, batch_size = batch_size, check = check_for_generator,
                                             labels2Idx= labels2Idx,tokenize= tokenize),
                        steps_per_epoch= steps_per_epoch, epochs=1,)

    results = model.predict(testset_features)
    predLabels = results.argmax(axis=-1)

    # Macro averaging weights every class equally, whatever its frequency.
    f1 = f1_score(devLabels, predLabels, average='macro')
    r = recall_score(devLabels, predLabels, average='macro')
    p = precision_score(devLabels, predLabels, average='macro')
    a = accuracy_score(devLabels, predLabels)

    if f1> max_f1:
        model.save_weights(weights_file)
        with open(log_file,'a+') as f:
            text = str(epoch)+', a: '+str(a) +', f1:' +str(f1) +'\n'
            f.write(text)
        max_f1 = f1

    print(a,f1)

Epoch 0
Epoch 1/1


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


0.37666666666666665 0.08509590221975924
Epoch 1
Epoch 1/1
0.3233333333333333 0.13001414060976668
Epoch 2
Epoch 1/1
0.3466666666666667 0.19068468028337568
Epoch 3
Epoch 1/1
0.34 0.19040043227516742
Epoch 4
Epoch 1/1
0.3333333333333333 0.18942177128938356
Epoch 5
Epoch 1/1
0.35 0.1746347223277593
Epoch 6
Epoch 1/1
0.33666666666666667 0.1756491557196355
Epoch 7
Epoch 1/1
0.3233333333333333 0.17145000640692964
Epoch 8
Epoch 1/1
0.35333333333333333 0.177247834092365
Epoch 9
Epoch 1/1
0.36 0.19471305215636114
Epoch 10
Epoch 1/1
0.34 0.1854324494949495
Epoch 11
Epoch 1/1
0.34 0.1838408380822133
Epoch 12
Epoch 1/1
0.3433333333333333 0.19109495333404256
Epoch 13
Epoch 1/1
0.35 0.19527252734714884
Epoch 14
Epoch 1/1
0.35333333333333333 0.19971004281515448
Epoch 15
Epoch 1/1
0.34 0.1826724775502971
Epoch 16
Epoch 1/1
0.3433333333333333 0.17752789240392133
Epoch 17
Epoch 1/1
0.3433333333333333 0.1842264353169254
Epoch 18
Epoch 1/1
0.33666666666666667 0.1802366755182179
Epoch 19
Epoch 1/1
0.3533333

0.34 0.17962573019088723
Epoch 61
Epoch 1/1
0.33 0.17903819254393966
Epoch 62
Epoch 1/1
0.33666666666666667 0.18224147317639786
Epoch 63
Epoch 1/1
0.34 0.18444518048097802
Epoch 64
Epoch 1/1
0.33 0.17954956225436117
Epoch 65
Epoch 1/1
0.34 0.18004968091378326
Epoch 66
Epoch 1/1
0.3433333333333333 0.18294168713396525
Epoch 67
Epoch 1/1
0.3433333333333333 0.1773959377080143
Epoch 68
Epoch 1/1
0.3466666666666667 0.18377867464152503
Epoch 69
Epoch 1/1
0.33666666666666667 0.17168177661444747
Epoch 70
Epoch 1/1
0.3466666666666667 0.18748691198219042
Epoch 71
Epoch 1/1
0.3466666666666667 0.18640197194471916
Epoch 72
Epoch 1/1
0.3433333333333333 0.1886238647503068
Epoch 73
Epoch 1/1
0.3433333333333333 0.18834780133191895
Epoch 74
Epoch 1/1
0.34 0.18859874076238642
Epoch 75
Epoch 1/1
0.35 0.18452723785560002
Epoch 76
Epoch 1/1
0.34 0.18078390953970708
Epoch 77
Epoch 1/1
0.3466666666666667 0.1902826776922894
Epoch 78
Epoch 1/1
0.33 0.18488889752374046
Epoch 79
Epoch 1/1
0.33666666666666667 0.176

0.31666666666666665 0.16675982479981008
Epoch 120
Epoch 1/1
0.32666666666666666 0.16857850336585345
Epoch 121
Epoch 1/1
0.3233333333333333 0.163026009742053
Epoch 122
Epoch 1/1
0.31666666666666665 0.16746643035341585
Epoch 123
Epoch 1/1
0.33 0.16510138264601243
Epoch 124
Epoch 1/1
0.33 0.16117179394332953
Epoch 125
Epoch 1/1
0.32666666666666666 0.16176014601592192
Epoch 126
Epoch 1/1
0.32 0.16493890584617285
Epoch 127
Epoch 1/1
0.32 0.16532163598987848
Epoch 128
Epoch 1/1
0.31 0.1592092977541894
Epoch 129
Epoch 1/1
0.31333333333333335 0.165237266940207
Epoch 130
Epoch 1/1
0.30666666666666664 0.15581899465547264
Epoch 131
Epoch 1/1
0.31333333333333335 0.1666240593141178
Epoch 132
Epoch 1/1
0.31 0.16405557854148628
Epoch 133
Epoch 1/1
0.31 0.1647120224708474
Epoch 134
Epoch 1/1
0.30666666666666664 0.15724807365979263
Epoch 135
Epoch 1/1
0.2966666666666667 0.1581276521594508
Epoch 136
Epoch 1/1
0.3233333333333333 0.1605911237490185
Epoch 137
Epoch 1/1
0.32 0.16485984947584253
Epoch 138
Ep

0.31333333333333335 0.1604343990834474
Epoch 179
Epoch 1/1
0.31333333333333335 0.17370166500662185
Epoch 180
Epoch 1/1
0.31666666666666665 0.1621366403705477
Epoch 181
Epoch 1/1
0.31333333333333335 0.1634268451513917
Epoch 182
Epoch 1/1
0.31666666666666665 0.16782519588934253
Epoch 183
Epoch 1/1
0.32 0.16678661490059601
Epoch 184
Epoch 1/1
0.32 0.17346183609731736
Epoch 185
Epoch 1/1
0.32666666666666666 0.17590923516443915
Epoch 186
Epoch 1/1
0.33 0.17216046889505845
Epoch 187
Epoch 1/1
0.32 0.16140747343069067
Epoch 188
Epoch 1/1
0.29333333333333333 0.15431847428055714
Epoch 189
Epoch 1/1
0.30666666666666664 0.16430577334391597
Epoch 190
Epoch 1/1
0.30333333333333334 0.1625765346583139
Epoch 191
Epoch 1/1
0.32666666666666666 0.1667185888884002
Epoch 192
Epoch 1/1
0.31 0.1662528330858009
Epoch 193
Epoch 1/1
0.31666666666666665 0.16885837733025924
Epoch 194
Epoch 1/1
0.31666666666666665 0.16976978142198576
Epoch 195
Epoch 1/1
0.31666666666666665 0.16000271128496035
Epoch 196
Epoch 1/1
0

# Transfer learning: fine-tune the pretrained BiLSTM + CNN model on this dataset

In [141]:
# Rebuild the BiLSTM + CNN architecture with a 4-way output head.
# Presumably 4 is the label count of the pretraining task whose weights are loaded next; TODO confirm.
model = compile_model_bilstm_cnn(4)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_23 (InputLayer)           (None, 75, 300)      0                                            
__________________________________________________________________________________________________
bidirectional_23 (Bidirectional (None, 75, 200)      320800      input_23[0][0]                   
__________________________________________________________________________________________________
leaky_re_lu_87 (LeakyReLU)      (None, 75, 200)      0           bidirectional_23[0][0]           
__________________________________________________________________________________________________
conv1d_49 (Conv1D)              (None, 73, 200)      120200      leaky_re_lu_87[0][0]             
__________________________________________________________________________________________________
conv1d_50 

In [142]:
# Load the pretrained weights from a machine-specific path into the architecture built above.
model.load_weights('/home1/zishan/raghav/weights/pretrain_bilstm_3cnn_dropout=0.8.h5')

In [143]:
# Remove the last layer (the softmax Dense head) from the model's layer list.
# NOTE(review): this only works if `model.layers` is the live list on the installed Keras version; confirm.
model.layers.pop()

<keras.layers.core.Dense at 0x7fdc97539d68>

In [144]:
# Attach a new softmax head, sized for this task's labels, to the output of the current last layer.
# NOTE(review): assumes the preceding `model.layers.pop()` really removed the old head; otherwise this stacks on the old softmax. Confirm.
output = Dense(n_labels, activation='softmax')(model.layers[-1].output)

In [145]:
# New model sharing the pretrained model's input and layers, ending in the new head.
final_model = Model(inputs=model.input, outputs=[output])

In [146]:
# Same loss, optimizer and metric as the model-building functions use.
final_model.compile(loss='categorical_crossentropy', optimizer='nadam', metrics=['accuracy'])

In [147]:
# Run configuration for the transfer-learning experiment.
train_file = '/home1/zishan/raghav/Data/train_31_aug.txt'
weights_file ='/home1/zishan/raghav/weights/tl_bilstm_3cnn_dropout=0.8.h5'
log_file = '/home1/zishan/raghav/logs/tl_bilstm_3cnn_dropout=0.8.txt'
batch_size = 16
# Number of tab-separated fields a line must have for the generator to use it.
check_for_generator = 2
# NOTE(review): self-assignment is a no-op; it only fails if labels2Idx is undefined.
labels2Idx = labels2Idx
tokenize = True
samples_per_epoch = len(train_sentences)
# Round up so every training sentence is covered once per epoch.
steps_per_epoch = math.ceil(samples_per_epoch / batch_size)

In [None]:
# The dev set does not change between epochs, so featurize it once instead
# of rebuilding it on every iteration. It is tokenized with the same
# `tokenize` setting as the training generator.
testset_features = np.zeros((len(dev_sentences), nb_sequence_length, nb_embedding_dims))
for i in range(len(dev_sentences)):
    testset_features[i] = process_features(dev_sentences[i], nb_sequence_length,
                                           nb_embedding_dims, tokenize=tokenize)
devLabels = dev_labels

# Fine-tune one epoch at a time so the dev set can be scored after each
# epoch; weights are saved (and a log line appended) only when macro-F1
# improves.
max_f1 = 0
for epoch in range(200):
    print("Epoch {}".format(epoch))
    # A fresh generator per epoch restarts reading at the top of the file.
    final_model.fit_generator(sequential_generator(filename = train_file, batch_size = batch_size, check = check_for_generator,
                                             labels2Idx= labels2Idx,tokenize= tokenize),
                        steps_per_epoch= steps_per_epoch, epochs=1,)

    results = final_model.predict(testset_features)
    predLabels = results.argmax(axis=-1)

    # Macro averaging weights every class equally, whatever its frequency.
    f1 = f1_score(devLabels, predLabels, average='macro')
    r = recall_score(devLabels, predLabels, average='macro')
    p = precision_score(devLabels, predLabels, average='macro')
    a = accuracy_score(devLabels, predLabels)

    if f1> max_f1:
        final_model.save_weights(weights_file)
        with open(log_file,'a+') as f:
            text = str(epoch)+', a: '+str(a) +', f1:' +str(f1) +'\n'
            f.write(text)
        max_f1 = f1

    print(a,f1)

Epoch 0
Epoch 1/1


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


0.33 0.09311940659968022
Epoch 1
Epoch 1/1
0.35333333333333333 0.08621111023199796
Epoch 2
Epoch 1/1
0.37333333333333335 0.08090934406723879
Epoch 3
Epoch 1/1
0.38 0.09135211160527616
Epoch 4
Epoch 1/1
0.37333333333333335 0.08629416678952283
Epoch 5
Epoch 1/1