In [1]:
import fastText
import math
import linecache
import numpy as np 
from numpy import random
from random import sample
from keras.models import Sequential, Model
from keras.callbacks import ModelCheckpoint
from keras.layers import *
from keras import *
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.initializers import RandomUniform
import re
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score

Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
# Build a TensorFlow session with custom options and register it as the
# session used by the Keras backend.
from keras.backend.tensorflow_backend import set_session
import tensorflow as tf
config = tf.ConfigProto()
# Enable the `allow_growth` GPU option instead of the default allocation policy.
config.gpu_options.allow_growth = True
# Enable device-placement logging for the session.
config.log_device_placement = True
set_session(tf.Session(config=config))

In [3]:
# Load the fastText model from disk; `ft` is used by process_features to look
# up word vectors.
ft = fastText.load_model("/home1/zishan/raghav/wiki.hi.bin")

# Embedding width is taken from the loaded model; every example is represented
# by a fixed window of 75 token positions.
nb_embedding_dims = ft.get_dimension()
nb_sequence_length = 75

In [4]:
def twitter_tokenizer(textline):
    """Normalise a tweet-like string and split it into tokens.

    Normalisation steps, applied in order:
      * anything starting with ``http`` up to the next whitespace -> ``URL``
      * ``@name`` mentions -> ``USER_MENTION``
      * the literal marker ``|LBR|`` is removed
      * runs of 3+ dots collapse to ``...``; runs of 2+ ``!`` / ``?`` collapse
        to ``!!`` / ``??``

    The text is then split into: single characters outside the Basic
    Multilingual Plane (e.g. emoji), runs of word characters / hyphens, and
    runs of any other non-space characters.

    Args:
        textline: the raw text to tokenise.

    Returns:
        A list of non-empty token strings.
    """
    # The original assigned this result to a differently-cased name
    # (`textLine`), so URLs were never actually replaced.
    textline = re.sub(r'http\S+', 'URL', textline)
    textline = re.sub(r'@[\w_]+', 'USER_MENTION', textline)
    textline = re.sub(r'\|LBR\|', '', textline)
    textline = re.sub(r'\.\.\.+', '...', textline)
    textline = re.sub(r'!!+', '!!', textline)
    textline = re.sub(r'\?\?+', '??', textline)
    # Raw string: the `re` module itself understands the \UXXXXXXXX escapes.
    token_pattern = re.compile(
        r'[\U00010000-\U0010ffff]|[\w-]+|[^ \w\U00010000-\U0010ffff]+',
        re.UNICODE,
    )
    words = token_pattern.findall(textline.strip())
    # Tokens made only of whitespace (e.g. tabs) are dropped.
    return [w.strip() for w in words if w.strip() != '']

In [5]:
# Cache of word -> fastText vector, shared by every call to process_features.
word_vectors_ft = {}
def process_features(textline, nb_sequence_length, nb_embedding_dims, tokenize=True):
    """Turn one text line into a fixed-size matrix of word vectors.

    The line is tokenised (with ``twitter_tokenizer`` when ``tokenize`` is
    true, otherwise by whitespace), truncated to the first
    ``nb_sequence_length`` tokens, and each token is replaced by its vector.
    Vectors are written at the END of the matrix, so shorter inputs are
    left-padded with zero rows.

    Vectors are looked up with ``ft.get_word_vector`` on the lower-cased token
    and memoised in the module-level ``word_vectors_ft`` dict under the
    original (un-lowered) token.

    Args:
        textline: raw text of one example.
        nb_sequence_length: number of rows in the returned matrix.
        nb_embedding_dims: number of columns in the returned matrix.
        tokenize: use ``twitter_tokenizer`` if true, ``str.split`` otherwise.

    Returns:
        A ``(nb_sequence_length, nb_embedding_dims)`` numpy array.
    """
    words = twitter_tokenizer(textline) if tokenize else textline.split()
    words = words[:nb_sequence_length]

    features_ft = np.zeros((nb_sequence_length, nb_embedding_dims))
    # First row to fill, so that the last token lands on the last row.
    first_row = nb_sequence_length - len(words)
    for row, w in enumerate(words, start=first_row):
        if w in word_vectors_ft:
            wv = word_vectors_ft[w]
        else:
            wv = ft.get_word_vector(w.lower())
            word_vectors_ft[w] = wv
        features_ft[row] = wv
    return features_ft

In [6]:
def sequential_generator(filename, 
                         batch_size, 
                         labels2Idx:'dict to make output labels',
                         check:'to check if all lines in file are of same length.To check enter the len of line after splitting it by tabs' = None,
                         tokenize:'specify if using twitter tokenzor to preprocess lines'=False, 
                        ):    
    """Endlessly yield training batches read sequentially from a TSV file.

    Each line is stripped and split on tabs; field 0 is the text and field 1
    the label. When the end of the file is reached, reading restarts from the
    top, so the generator never terminates on its own.

    Uses the module-level ``nb_sequence_length`` / ``nb_embedding_dims`` and
    ``process_features`` to build the feature tensor.

    Args:
        filename: path of the tab-separated data file.
        batch_size: number of examples per yielded batch.
        labels2Idx: mapping from label string to class index. When it has
            exactly two entries, the label ``'OTHER'`` maps to class 0 and any
            other label to class 1, regardless of the mapping's values.
        check: if truthy, lines whose number of tab-separated fields differs
            from this value are skipped.
        tokenize: forwarded to ``process_features``.

    Yields:
        ``([features], labels)`` where ``features`` has shape
        ``(batch_size, nb_sequence_length, nb_embedding_dims)`` and ``labels``
        is a one-hot array of shape ``(batch_size, len(labels2Idx))``.

    Raises:
        ValueError: if a complete pass over the file yields no usable line
            (empty file, or every line rejected by ``check``).
    """
    n_labels = len(labels2Idx)
    # `with` ensures the handle is released when the generator is closed or
    # garbage-collected.
    with open(filename) as f:
        found_line_this_pass = False
        while True:
            batch_features_ft = np.zeros((batch_size, nb_sequence_length, nb_embedding_dims))
            batch_labels = np.zeros((batch_size, n_labels))
            # Explicit row counter: decrementing a `for ... in range` index does
            # not retry the slot, which used to leave all-zero rows in the batch
            # whenever a malformed line was skipped.
            row = 0
            while row < batch_size:
                line = f.readline()
                if line == "":
                    # End of file: wrap around, unless that would loop forever.
                    if not found_line_this_pass:
                        raise ValueError("no usable lines found in " + str(filename))
                    f.seek(0)
                    found_line_this_pass = False
                    continue
                data = line.strip().split('\t')
                if check and len(data) != check:
                    continue
                found_line_this_pass = True
                batch_features_ft[row] = process_features(data[0], nb_sequence_length, nb_embedding_dims, tokenize= tokenize)
                if n_labels == 2:
                    batch_labels[row] = to_categorical(0 if data[1] == 'OTHER' else 1, n_labels)
                else:
                    batch_labels[row] = to_categorical(labels2Idx[data[1]], n_labels)
                row += 1
            yield ([batch_features_ft], batch_labels)

In [7]:
def train_dev_sentences(filetrain, filedev, check:'to check if lines of file are all same lenght after separating by tab'):
    """Load the train and dev TSV files and build the label index.

    Every line is stripped and split on tabs; only lines with exactly
    ``check`` fields are kept. Field 0 is the sentence and field 1 the label.
    Labels are numbered in order of first appearance, scanning the train file
    first and then the dev file.

    Args:
        filetrain: path of the training file.
        filedev: path of the dev file.
        check: required number of tab-separated fields per line.

    Returns:
        ``(train_sentences, train_labels, dev_sentences, dev_labels,
        labels2Idx)`` where the label lists hold integer class indices and
        ``labels2Idx`` maps label string -> index.
    """
    def _read_rows(path):
        # Filter on the SAME stripped/split row that is kept. The train file
        # used to be filtered on the unstripped line, so "text\t\n" passed the
        # check and then failed with IndexError when its label was read.
        with open(path) as handle:
            rows = (line.strip().split("\t") for line in handle)
            return [row for row in rows if len(row) == check]

    train_lines = _read_rows(filetrain)
    dev_lines = _read_rows(filedev)

    labels2Idx = {}
    for dataset in (train_lines, dev_lines):
        for row in dataset:
            label = row[1]
            if label not in labels2Idx:
                labels2Idx[label] = len(labels2Idx)

    train_sentences = [row[0] for row in train_lines]
    train_labels = [labels2Idx[row[1]] for row in train_lines]
    dev_sentences = [row[0] for row in dev_lines]
    dev_labels = [labels2Idx[row[1]] for row in dev_lines]
    return (train_sentences, train_labels, dev_sentences, dev_labels, labels2Idx)

In [8]:
# Locations of the tab-separated train/dev files.
train_file = '/home1/zishan/raghav/Data/train_31_aug.txt'
dev_file = '/home1/zishan/raghav/Data/dev_31_aug.txt'
# Keep only lines with exactly 2 tab-separated fields (sentence, label).
train_sentences, train_labels, dev_sentences, dev_labels, labels2Idx = train_dev_sentences(train_file, dev_file, 2)

In [9]:
# Sanity check: first ten training sentences, the label mapping, and the
# number of training examples.
print(train_sentences[:10])
print(labels2Idx)
print(len(train_labels))

[': पश्चिम बंगाल के मिदनापुर और सुंदरवन के तटीय इलाकों में रविवार को आए तूफानी चक्रवात की वजह से हजारों लोग बेघर हो गए हैं।', 'ये मछुआरे 60 नावें लेकर ताइवान की सीमा में आए।', 'उनके जरिए सूखा प्रभावित गांवों और कस्बों में पानी की आपूर्ति की जा रही है।', 'लोगों तक दवा व खाद्य सामग्री पहुंचाई जा रही है।', 'नैनीताल की वरिष्ठ पुलिस अधीक्षक स्वीटी अग्रवाल ने बताया कि नैनीताल के जंगलों में आग बुझाने की कार्रवाई प्रभावी ढंग से की जा रही है।', 'वरिष्ठ पुलिस अधीक्षक (कुपवाड़ा) एजाज अहमद ने कहा कि तीनों आतंकवादी शिविर में घुस गए थे।', 'हालांकि अमेरिका ने किसी भी देश से मदद की गुहार नहीं लगाई थी लेकिन दर्जनों देश खुद ही राहत सामग्री और धन के जरिए मदद करने को तत्पर हैं। अमेरिकी विदेश विभाग ने कहा कि अब तक 40 से अधिक देशों और अंतरराष्ट्रीय संगठनों ने उसे मदद देने की पेशकश की है और इस संख्या में लगातार बढ़ोतरी होती जा रही है।', 'राज्य के चंदवली इलाके में २५.८ मिलीमीटर बारिश रिकॅर्ड की गयी जबकि बालासोर में १२.७ मिलीमीटर, भुवनेशवर ६.२ मिलीमीटर, पुरी में ०.२ मिलीमीटर और गोपालपुर में १९.५ मिलीमीटर बारिश 

In [10]:
# Per-class example counts (keyed by label index) for the train and dev sets.
from collections import Counter
print(Counter(train_labels))
print(Counter(dev_labels))

Counter({0: 382, 1: 193, 2: 153, 6: 147, 5: 123, 3: 76, 4: 64, 7: 45, 8: 21})
Counter({0: 111, 2: 43, 1: 42, 5: 27, 6: 26, 3: 24, 4: 14, 7: 10, 8: 3})


In [11]:
# Number of distinct labels found across the train and dev files.
n_labels = len(labels2Idx)

In [106]:
def compile_model_bilstm(no_labels:'total labels for classification'):
    """Build and compile the BiLSTM-only classifier.

    Architecture: input of shape (nb_sequence_length, nb_embedding_dims) ->
    bidirectional LSTM (100 units per direction, dropout 0.5, full sequence
    returned) -> LeakyReLU -> Flatten -> Dense(100) -> softmax over
    ``no_labels`` classes. The model is compiled with categorical
    cross-entropy and the 'nadam' optimizer, its summary is printed, and the
    compiled model is returned.
    """
    inputs = Input(shape=(nb_sequence_length, nb_embedding_dims))

    recurrent = Bidirectional(LSTM(100, dropout=0.5, return_sequences=True))(inputs)
    recurrent = LeakyReLU()(recurrent)

    hidden = Flatten()(recurrent)
    hidden = Dense(100)(hidden)
    class_probabilities = Dense(no_labels, activation="softmax")(hidden)

    classifier = Model(inputs, class_probabilities)
    classifier.compile(
        loss='categorical_crossentropy',
        optimizer='nadam',
        metrics=['accuracy'],
    )
    classifier.summary()
    return classifier

In [12]:
def compile_model_bilstm_cnn(no_labels:'total labels for classification'):
    """Build and compile the BiLSTM + three-branch CNN classifier.

    A bidirectional LSTM (100 units per direction, dropout 0.5) followed by
    LeakyReLU feeds three parallel Conv1D branches with kernel sizes 3, 4 and
    5 (200 filters each). Each branch applies LeakyReLU, global max pooling
    and dropout 0.5. The branches are concatenated, passed through dropout
    0.8, Dense(100) + LeakyReLU, and a softmax over ``no_labels`` classes.
    The model is compiled with categorical cross-entropy and 'nadam', its
    summary is printed, and the compiled model is returned.
    """
    inputs = Input(shape=(nb_sequence_length, nb_embedding_dims))
    recurrent = Bidirectional(LSTM(100, dropout=0.5, return_sequences=True))(inputs)
    recurrent = LeakyReLU()(recurrent)

    # One convolutional branch per kernel width, all reading the LSTM output.
    pooled_branches = []
    for kernel_width in (3, 4, 5):
        branch = Conv1D(filters=200, kernel_size=kernel_width, padding='valid', strides=1)(recurrent)
        branch = LeakyReLU()(branch)
        branch = GlobalMaxPooling1D()(branch)
        branch = Dropout(0.5)(branch)
        pooled_branches.append(branch)

    merged = concatenate(pooled_branches)
    merged = Dropout(0.8)(merged)
    merged = Dense(100)(merged)
    merged = LeakyReLU()(merged)
    class_probabilities = Dense(no_labels, activation="softmax")(merged)

    classifier = Model(inputs, class_probabilities)
    classifier.compile(
        loss='categorical_crossentropy',
        optimizer='nadam',
        metrics=['accuracy'],
    )
    classifier.summary()
    return classifier

In [13]:
# Build the BiLSTM+CNN classifier with one output unit per label in labels2Idx.
model = compile_model_bilstm_cnn(no_labels = n_labels)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 75, 300)      0                                            
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 75, 200)      320800      input_1[0][0]                    
__________________________________________________________________________________________________
leaky_re_lu_1 (LeakyReLU)       (None, 75, 200)      0           bidirectional_1[0][0]            
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 73, 200)      120200      leaky_re_lu_1[0][0]              
__________________________________________________________________________________________________
conv1d_2 (

In [15]:
# Settings for the training loop in the next cell.
train_file = '/home1/zishan/raghav/Data/train_31_aug.txt'
# Best weights (by dev macro-F1) and the matching log lines are written here.
weights_file ='/home1/zishan/raghav/weights/bilstm_3cnn_modified_dropout=0.5.h5'
log_file = '/home1/zishan/raghav/logs/bilstm_3cnn_modified_dropout=0.5.txt'
batch_size = 16
# Passed to sequential_generator as `check`: lines must have exactly 2 fields.
check_for_generator = 2
# Self-assignment; has no effect.
labels2Idx = labels2Idx
tokenize = True
# Steps per epoch = number of training sentences / batch size, rounded up.
samples_per_epoch = len(train_sentences)
steps_per_epoch = math.ceil(samples_per_epoch / batch_size)

In [16]:
# The dev-set features do not depend on the model or the epoch, so they are
# built once here instead of being recomputed inside every epoch.
testset_features = np.zeros((len(dev_sentences), nb_sequence_length, nb_embedding_dims))
for i in range(len(dev_sentences)):
    # Use the same tokenisation setting as the training generator.
    testset_features[i] = process_features(dev_sentences[i], nb_sequence_length, nb_embedding_dims, tokenize=tokenize)
devLabels = dev_labels

# Train one epoch at a time, evaluate on the dev set after each epoch, and
# keep the weights of the epoch with the best macro-F1 seen so far.
max_f1 = 0
for epoch in range(200):
    print("Epoch {}".format(epoch))
    # A fresh generator per epoch: every epoch starts reading the training
    # file from its first line.
    model.fit_generator(sequential_generator(filename = train_file, batch_size = batch_size, check = check_for_generator, 
                                             labels2Idx= labels2Idx,tokenize= tokenize),
                        steps_per_epoch= steps_per_epoch, epochs=1,)

    results = model.predict(testset_features)

    predLabels = results.argmax(axis=-1)
    # Macro averaging: every class contributes equally, whatever its frequency.
    f1 = f1_score(devLabels, predLabels, average='macro')
    r = recall_score(devLabels, predLabels, average='macro')
    p = precision_score(devLabels, predLabels, average='macro')
    a = accuracy_score(devLabels, predLabels)
    
    if f1> max_f1:
        # New best dev F1: checkpoint the weights and append a line to the log.
        model.save_weights(weights_file)
        with open(log_file,'a+') as f:
            text = str(epoch)+', a: '+str(a) +', f1:' +str(f1) +'\n'
            f.write(text)
        max_f1 = f1

    print(a,f1)

Epoch 0
Epoch 1/1


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


0.37 0.06001622060016221
Epoch 1
Epoch 1/1
0.37 0.06001622060016221
Epoch 2
Epoch 1/1
0.37 0.06001622060016221
Epoch 3
Epoch 1/1
0.37 0.06001622060016221
Epoch 4
Epoch 1/1
0.38 0.08344602208699572
Epoch 5
Epoch 1/1
0.37333333333333335 0.08876765472510154
Epoch 6
Epoch 1/1
0.38666666666666666 0.10074589471574397
Epoch 7
Epoch 1/1
0.36333333333333334 0.09241959112760147
Epoch 8
Epoch 1/1
0.36666666666666664 0.09503170850688582
Epoch 9
Epoch 1/1
0.36333333333333334 0.09944521233401289
Epoch 10
Epoch 1/1
0.38 0.1153289810547875
Epoch 11
Epoch 1/1
0.38333333333333336 0.12515114982001047
Epoch 12
Epoch 1/1
0.4033333333333333 0.15191477775587625
Epoch 13
Epoch 1/1
0.4033333333333333 0.12348703431505978
Epoch 14
Epoch 1/1
0.4 0.14795670381927747
Epoch 15
Epoch 1/1
0.4 0.14785629523168473
Epoch 16
Epoch 1/1
0.42333333333333334 0.1799681179292367
Epoch 17
Epoch 1/1
0.43 0.18183909106219823
Epoch 18
Epoch 1/1
0.43333333333333335 0.17284729379967473
Epoch 19
Epoch 1/1
0.42 0.1804070117260408
Epoch

0.4166666666666667 0.25978323506136064
Epoch 62
Epoch 1/1
0.4 0.25609686668011633
Epoch 63
Epoch 1/1
0.42333333333333334 0.2674284838048883
Epoch 64
Epoch 1/1
0.4033333333333333 0.2507752124418791
Epoch 65
Epoch 1/1
0.4066666666666667 0.24774159311259158
Epoch 66
Epoch 1/1
0.4066666666666667 0.25472339535298205
Epoch 67
Epoch 1/1
0.41333333333333333 0.26599128108116765
Epoch 68
Epoch 1/1
0.4033333333333333 0.25447299704088955
Epoch 69
Epoch 1/1
0.4033333333333333 0.2444876022015947
Epoch 70
Epoch 1/1
0.38666666666666666 0.2536250830456491
Epoch 71
Epoch 1/1
0.41 0.290613113462059
Epoch 72
Epoch 1/1
0.41333333333333333 0.2786502088587491
Epoch 73
Epoch 1/1
0.4 0.2720823032246057
Epoch 74
Epoch 1/1
0.41 0.2876831774397697
Epoch 75
Epoch 1/1
0.4033333333333333 0.273684007637496
Epoch 76
Epoch 1/1
0.41333333333333333 0.2888864889060739
Epoch 77
Epoch 1/1
0.4 0.2559645519382344
Epoch 78
Epoch 1/1
0.41 0.27176930547467243
Epoch 79
Epoch 1/1
0.39666666666666667 0.2455739716873865
Epoch 80
Epo

0.4 0.27124765495166475
Epoch 121
Epoch 1/1
0.38666666666666666 0.2590104651812358
Epoch 122
Epoch 1/1
0.38333333333333336 0.2694854943132635
Epoch 123
Epoch 1/1
0.39666666666666667 0.2674592022640642
Epoch 124
Epoch 1/1
0.3933333333333333 0.2672198691729445
Epoch 125
Epoch 1/1
0.38666666666666666 0.26302741514509304
Epoch 126
Epoch 1/1
0.37333333333333335 0.2390915248391296
Epoch 127
Epoch 1/1
0.37 0.2367851203290486
Epoch 128
Epoch 1/1
0.38 0.24168874306779317
Epoch 129
Epoch 1/1
0.37666666666666665 0.24450537527444322
Epoch 130
Epoch 1/1
0.36 0.23640825769679566
Epoch 131
Epoch 1/1
0.38666666666666666 0.25429577054761787
Epoch 132
Epoch 1/1
0.38666666666666666 0.26108277740139496
Epoch 133
Epoch 1/1
0.39 0.2776090506997313
Epoch 134
Epoch 1/1
0.37666666666666665 0.2602504377983305
Epoch 135
Epoch 1/1
0.38666666666666666 0.2636668495181299
Epoch 136
Epoch 1/1
0.4 0.26157559397993524
Epoch 137
Epoch 1/1
0.38 0.23861877691779027
Epoch 138
Epoch 1/1
0.38333333333333336 0.247287305992017

0.37666666666666665 0.24098334012759828
Epoch 179
Epoch 1/1
0.36333333333333334 0.23263561329898305
Epoch 180
Epoch 1/1
0.37666666666666665 0.2373642284341411
Epoch 181
Epoch 1/1
0.37 0.2428219976063136
Epoch 182
Epoch 1/1
0.37333333333333335 0.25355761961773404
Epoch 183
Epoch 1/1
0.38 0.24637309627488627
Epoch 184
Epoch 1/1
0.3933333333333333 0.26016650205908515
Epoch 185
Epoch 1/1
0.37 0.23101968659000693
Epoch 186
Epoch 1/1
0.37333333333333335 0.2324273329104247
Epoch 187
Epoch 1/1
0.37666666666666665 0.2352493694033819
Epoch 188
Epoch 1/1
0.36 0.23908837296868402
Epoch 189
Epoch 1/1
0.3933333333333333 0.26960236525635567
Epoch 190
Epoch 1/1
0.36666666666666664 0.25122320100927076
Epoch 191
Epoch 1/1
0.37 0.2499829410832013
Epoch 192
Epoch 1/1
0.37666666666666665 0.24669313419768882
Epoch 193
Epoch 1/1
0.38 0.24193208304309988
Epoch 194
Epoch 1/1
0.36666666666666664 0.25258222671402875
Epoch 195
Epoch 1/1
0.38333333333333336 0.26374334073532235
Epoch 196
Epoch 1/1
0.38 0.2528685281

# transfer learning

In [156]:
def test_model_tl_unfreezing(generator, 
               train_sentences, 
               devLabels, 
               number_of_tests,
               number_of_epochs,
               filename_to_log, 
               labels2Idx,
               filename_to_save_weigths,
               batch_size, 
               unfreezing_strategy: 'list containing a tuple of indices to unfreeze at each step',
               train_file:'filepath for traininig',
               f1_measure:'binary/macro etc', 
               pos_label:'only if binary f1',
               load_model_weights=False,
               model_weights_file:'give filepath as str'=None, 
               tokenize=True,
               nb_sequence_length = nb_sequence_length,
               nb_embedding_dims= nb_embedding_dims, 
               check_for_generator=None):
    """Run a transfer-learning experiment with a layer-unfreezing schedule.

    For each of ``number_of_tests`` repetitions:
      1. Build the BiLSTM+CNN model. If ``load_model_weights`` and
         ``model_weights_file`` are both set, the pretrained weights are
         loaded into a 4-class model and its output layer is replaced by a
         fresh softmax layer sized for ``labels2Idx``; otherwise a new model
         sized for ``labels2Idx`` is trained from scratch.
      2. Save the model to ``filename_to_save_weigths`` as the starting
         checkpoint.
      3. For every ``(upper, lower)`` tuple in ``unfreezing_strategy``: reload
         the best checkpoint so far, mark layers with index
         ``lower <= i <= upper`` trainable and all others frozen, recompile,
         and train for ``number_of_epochs`` epochs. After each epoch the model
         is scored on the dev set and saved whenever the F1 improves.
      4. Append the best precision/recall/F1/accuracy of the repetition to
         ``filename_to_log``.
    Finally the averages over all repetitions are printed and appended to the
    log. The log file is truncated at the start of the call.

    Dev sentences are read from the module-level ``dev_sentences``;
    ``devLabels`` must be the matching list of class indices.

    Args:
        generator: callable with the signature of ``sequential_generator``.
        train_sentences: training sentences; only their count is used, to
            derive the number of steps per epoch.
        devLabels: gold class indices for the dev set.
        number_of_tests: number of independent repetitions.
        number_of_epochs: epochs trained per unfreezing step.
        filename_to_log: path of the results log.
        labels2Idx: mapping from label string to class index.
        filename_to_save_weigths: path of the best-model checkpoint.
        batch_size: training batch size.
        unfreezing_strategy: list of ``(upper_index, lower_index)`` tuples.
        train_file: path of the training file handed to ``generator``.
        f1_measure: ``average`` argument for the sklearn metrics.
        pos_label: ``pos_label`` argument for the sklearn metrics.
        load_model_weights: whether to start from pretrained weights.
        model_weights_file: path of the pretrained weights.
        tokenize: tokenisation switch used for both train and dev text.
        nb_sequence_length: rows of each feature matrix.
        nb_embedding_dims: columns of each feature matrix.
        check_for_generator: ``check`` argument handed to ``generator``.
    """
    # Truncate (or create) the log so earlier runs do not leak into this one.
    with open(filename_to_log, 'w', encoding='utf-8'):
        pass

    total_f1=0
    total_prec=0
    total_acc=0
    total_recall=0

    epochs = number_of_epochs
    steps_per_epoch = math.ceil(len(train_sentences) / batch_size)

    # Dev features are identical for every epoch and repetition: build once.
    # `tokenize` is forwarded so dev text is processed like the training text.
    testset_features = np.zeros((len(dev_sentences), nb_sequence_length, nb_embedding_dims))
    for i, sentence in enumerate(dev_sentences):
        testset_features[i] = process_features(sentence, nb_sequence_length, nb_embedding_dims, tokenize=tokenize)

    for test_number in range(number_of_tests):
        print("Test %d/%d" %(test_number+1, number_of_tests))

        if load_model_weights and model_weights_file:
            # Transfer learning. The 4 must match the output size of the model
            # that produced `model_weights_file` -- presumably the pretraining
            # task had 4 labels; confirm against that checkpoint.
            model = compile_model_bilstm_cnn(4)
            model.load_weights(model_weights_file)
            print("removing top layer")
            # Attach the new classifier to the output of the second-to-last
            # layer, i.e. the layer feeding the old softmax. This avoids
            # mutating `model.layers` with pop().
            output = Dense(len(labels2Idx), activation = 'softmax')(model.layers[-2].output)
            final_model = Model(inputs=model.input, outputs=[output])
        else:
            # No pretrained weights: train a fresh model sized for this task
            # (previously `final_model` was left undefined on this path).
            final_model = compile_model_bilstm_cnn(len(labels2Idx))

        max_f1=0
        max_p=0
        max_r=0
        max_a=0

        # Starting checkpoint; each unfreezing step reloads the best model so far.
        final_model.save(filename_to_save_weigths)

        for ulayer in unfreezing_strategy:
            upper, lower = ulayer[0], ulayer[1]
            print("unfreezing " + final_model.layers[upper].name)
            print("---------------------------------------")
            final_model.load_weights(filename_to_save_weigths)            
            for i, layer in enumerate(final_model.layers):
                # Only layers inside the inclusive [lower, upper] index range
                # are trained in this step.
                layer.trainable = lower <= i <= upper
                print(str(i) + ' ' + layer.name + ' ' + str(layer.trainable))
            # Recompile so the changed `trainable` flags take effect.
            final_model.compile(loss='categorical_crossentropy', optimizer='nadam', metrics=['accuracy'])

            for epoch in range(epochs):
                print("Epoch: %d/%d" %(epoch+1, epochs))
                final_model.fit_generator(
                    generator(filename = train_file, batch_size = batch_size, check = check_for_generator, 
                              labels2Idx= labels2Idx,tokenize= tokenize), 
                    steps_per_epoch= steps_per_epoch, epochs=1
                )

                results = final_model.predict(testset_features)
                predLabels = results.argmax(axis=-1)

                f1 = f1_score(devLabels, predLabels, average=f1_measure, pos_label=pos_label)
                r = recall_score(devLabels, predLabels, average=f1_measure, pos_label=pos_label)
                p = precision_score(devLabels, predLabels, average=f1_measure, pos_label=pos_label)
                a = accuracy_score(devLabels, predLabels)
                if max_f1 < f1:
                    print("model saved. F1 is %f" %(f1))
                    final_model.save(filename_to_save_weigths)
                    max_f1 = f1
                    max_p = p
                    max_r = r
                    max_a = a
                print("Test-Data: Prec: %.3f, Rec: %.3f, F1: %.3f, Acc: %.3f" % (p, r, f1, a))

        to_write= "prec: "+ str(max_p)+" rec: "+str(max_r) +" f1: "+str(max_f1) +" acc: "+str(max_a)+" \n"
        print(to_write)
        with open(filename_to_log, 'a', encoding='utf-8') as f:
            f.write(to_write)
        total_f1+=max_f1
        total_prec+=max_p
        total_acc+=max_a
        total_recall+=max_r    
        print("*****************************************************************************")

    final_text = "avg_prec: " +str(total_prec/number_of_tests)+" total_rec: "+str(total_recall/number_of_tests) +" total_f1: "+str(total_f1/number_of_tests) +" total_acc: "+str(total_acc/number_of_tests)+" \n"
    print(final_text)
    with open(filename_to_log, 'a', encoding='utf-8') as f:
        f.write(final_text)

In [157]:
# Unfreezing schedules for test_model_tl_unfreezing. Each schedule is a list of
# (upper_index, lower_index) tuples; at each step the layers whose index lies in
# that inclusive range are trainable and all other layers are frozen.
# NOTE(review): for the model built by compile_model_bilstm_cnn, index 18 looks
# like the LeakyReLU feeding the output layer rather than the new softmax Dense,
# which would then never be unfrozen -- confirm against the per-layer
# "index name trainable" lines printed by test_model_tl_unfreezing.
single_unfreeze_bottom_up = [(18, 18), (17, 16), (15, 3), (2, 1), (18,1)] 
single_unfreeze_top_down = [(18, 18),   (2, 1),(15, 3), (17, 16), (18,1)]
all_unfreeze = [(18,1)]
gradual_unfreezing = [(18,18), (18,16), (18,3), (18,1)]

# Short names used in the log/weights file names; strings[k] labels
# unfreeze_strategy[k], so the two lists must stay in the same order.
strings =['suf_bu', 'suf_td','all_unfreeze','gradual_unfreeze']
# strings=['gradual_unfreeze']
unfreeze_strategy = [single_unfreeze_bottom_up, single_unfreeze_top_down, all_unfreeze, gradual_unfreezing]
# unfreeze_strategy = [gradual_unfreezing]

In [160]:
# Run one transfer-learning experiment per unfreezing schedule; each schedule
# gets its own log file and checkpoint, named after its short label.
for i, string in enumerate(strings):
    print("approach: %s" %(string))

    # Output locations for this schedule.
    log_file = '/home1/zishan/raghav/logs/tl_bilstm_3cnn_dropout=0.8_' +string+'.txt' 
    print("log file: %s" %(log_file))
    weights_file='/home1/zishan/raghav/weights/tl_bilstm_3cnn_dropout=0.8_'+string+'.h5'
    print("save weights file: %s" %(weights_file))

    # The schedule matching this label.
    strategy = unfreeze_strategy[i]
    print(strategy)

    # Data, training and evaluation settings (identical for every schedule).
    generator = sequential_generator
    devLabels = dev_labels
    labels2Id = labels2Idx
    train_file='/home1/zishan/raghav/Data/train_31_aug.txt'
    check_for_generator=2
    batch_size=16
    number_of_tests = 1
    number_of_epochs = 200
    f1_measure='macro'
    pos_label=1

    # Start from the pretrained weights.
    load_model_weights=True
    model_weights_file = '/home1/zishan/raghav/weights/pretrain_bilstm_3cnn_dropout=0.8.h5'

    test_model_tl_unfreezing(
        generator=generator,
        train_sentences=train_sentences,
        devLabels=devLabels,
        number_of_tests=number_of_tests,
        number_of_epochs=number_of_epochs,
        filename_to_log=log_file,
        labels2Idx=labels2Id,
        filename_to_save_weigths=weights_file,
        batch_size=batch_size,
        unfreezing_strategy=strategy,
        train_file=train_file,
        f1_measure=f1_measure,
        pos_label=pos_label,
        load_model_weights=load_model_weights,
        model_weights_file=model_weights_file,
        nb_sequence_length=nb_sequence_length,
        nb_embedding_dims=nb_embedding_dims,
        check_for_generator=check_for_generator,
    )

approach: suf_bu
log file: /home1/zishan/raghav/logs/tl_bilstm_3cnn_dropout=0.8_suf_bu.txt
save weights file: /home1/zishan/raghav/weights/tl_bilstm_3cnn_dropout=0.8_suf_bu.h5
[(18, 18), (17, 16), (15, 3), (2, 1), (18, 1)]
Test 1/1
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_27 (InputLayer)           (None, 75, 300)      0                                            
__________________________________________________________________________________________________
bidirectional_27 (Bidirectional (None, 75, 200)      320800      input_27[0][0]                   
__________________________________________________________________________________________________
leaky_re_lu_107 (LeakyReLU)     (None, 75, 200)      0           bidirectional_27[0][0]           
___________________________________________________________________________

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


model saved. F1 is 0.035376
Test-Data: Prec: 0.032, Rec: 0.130, F1: 0.035, Acc: 0.053
Epoch: 2/200
Epoch 1/1
Test-Data: Prec: 0.032, Rec: 0.130, F1: 0.035, Acc: 0.053
Epoch: 3/200
Epoch 1/1

KeyboardInterrupt: 

In [None]:
# Rebuild the BiLSTM+CNN architecture with a 4-way output so the checkpoint
# loaded in the next cell fits -- presumably the pretraining task had 4 labels;
# confirm against that checkpoint.
model = compile_model_bilstm_cnn(4)

In [142]:
# Load the pretrained weights into the freshly built model.
model.load_weights('/home1/zishan/raghav/weights/pretrain_bilstm_3cnn_dropout=0.8.h5')

In [143]:
# Remove the last entry of the model's layer list (the echoed result shows it
# is a Dense layer) so that `model.layers[-1]` in the next cell is the layer
# that fed it.
# NOTE(review): this relies on `model.layers` being the live list rather than
# a copy -- verify on the installed Keras version.
model.layers.pop()

<keras.layers.core.Dense at 0x7fdc97539d68>

In [144]:
# New softmax classifier sized for this task's labels, attached to the output
# of what is now the last layer in `model.layers`.
output = Dense(n_labels, activation='softmax')(model.layers[-1].output)

In [145]:
# Model that reuses the pretrained graph from its input up to the new output layer.
final_model = Model(inputs=model.input, outputs=[output])

In [146]:
# Compile with the same loss, optimizer and metric as the model builders above.
final_model.compile(loss='categorical_crossentropy', optimizer='nadam', metrics=['accuracy'])

In [147]:
# Settings for the fine-tuning loop in the next cell.
train_file = '/home1/zishan/raghav/Data/train_31_aug.txt'
# Best weights (by dev macro-F1) and the matching log lines are written here.
weights_file ='/home1/zishan/raghav/weights/tl_bilstm_3cnn_dropout=0.8.h5'
log_file = '/home1/zishan/raghav/logs/tl_bilstm_3cnn_dropout=0.8.txt'
batch_size = 16
# Passed to sequential_generator as `check`: lines must have exactly 2 fields.
check_for_generator = 2
# Self-assignment; has no effect.
labels2Idx = labels2Idx
tokenize = True
# Steps per epoch = number of training sentences / batch size, rounded up.
samples_per_epoch = len(train_sentences)
steps_per_epoch = math.ceil(samples_per_epoch / batch_size)

In [148]:
# The dev-set features do not depend on the model or the epoch, so they are
# built once here instead of being recomputed inside every epoch.
testset_features = np.zeros((len(dev_sentences), nb_sequence_length, nb_embedding_dims))
for i in range(len(dev_sentences)):
    # Use the same tokenisation setting as the training generator.
    testset_features[i] = process_features(dev_sentences[i], nb_sequence_length, nb_embedding_dims, tokenize=tokenize)
devLabels = dev_labels

# Fine-tune one epoch at a time, evaluate on the dev set after each epoch, and
# keep the weights of the epoch with the best macro-F1 seen so far.
max_f1 = 0
for epoch in range(200):
    print("Epoch {}".format(epoch))
    # A fresh generator per epoch: every epoch starts reading the training
    # file from its first line.
    final_model.fit_generator(sequential_generator(filename = train_file, batch_size = batch_size, check = check_for_generator, 
                                             labels2Idx= labels2Idx,tokenize= tokenize),
                        steps_per_epoch= steps_per_epoch, epochs=1,)

    results = final_model.predict(testset_features)

    predLabels = results.argmax(axis=-1)
    # Macro averaging: every class contributes equally, whatever its frequency.
    f1 = f1_score(devLabels, predLabels, average='macro')
    r = recall_score(devLabels, predLabels, average='macro')
    p = precision_score(devLabels, predLabels, average='macro')
    a = accuracy_score(devLabels, predLabels)
    
    if f1> max_f1:
        # New best dev F1: checkpoint the weights and append a line to the log.
        final_model.save_weights(weights_file)
        with open(log_file,'a+') as f:
            text = str(epoch)+', a: '+str(a) +', f1:' +str(f1) +'\n'
            f.write(text)
        max_f1 = f1

    print(a,f1)

Epoch 0
Epoch 1/1


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


0.33 0.09311940659968022
Epoch 1
Epoch 1/1
0.35333333333333333 0.08621111023199796
Epoch 2
Epoch 1/1
0.37333333333333335 0.08090934406723879
Epoch 3
Epoch 1/1
0.38 0.09135211160527616
Epoch 4
Epoch 1/1
0.37333333333333335 0.08629416678952283
Epoch 5
Epoch 1/1
0.37 0.08473625140291807
Epoch 6
Epoch 1/1
0.37 0.08374150265681124
Epoch 7
Epoch 1/1
0.37333333333333335 0.09682471146201715
Epoch 8
Epoch 1/1
0.38 0.10147038718467288
Epoch 9
Epoch 1/1
0.37333333333333335 0.0961418235075524
Epoch 10
Epoch 1/1
0.38 0.10663597528099207
Epoch 11
Epoch 1/1
0.38666666666666666 0.10870042959437858
Epoch 12
Epoch 1/1
0.38666666666666666 0.11961052230428801
Epoch 13
Epoch 1/1
0.38666666666666666 0.11458492975734355
Epoch 14
Epoch 1/1
0.42 0.16545342215751077
Epoch 15
Epoch 1/1
0.4 0.1367271421619248
Epoch 16
Epoch 1/1
0.4066666666666667 0.1629477006804906
Epoch 17
Epoch 1/1
0.41333333333333333 0.1519883879576597
Epoch 18
Epoch 1/1
0.41333333333333333 0.16102050977974247
Epoch 19
Epoch 1/1
0.41 0.1634053

0.4033333333333333 0.2754653506127467
Epoch 61
Epoch 1/1
0.4166666666666667 0.2725757673147752
Epoch 62
Epoch 1/1
0.4033333333333333 0.2726733175292269
Epoch 63
Epoch 1/1
0.4166666666666667 0.2658217817972259
Epoch 64
Epoch 1/1
0.39666666666666667 0.31572072934606504
Epoch 65
Epoch 1/1
0.4033333333333333 0.26747882036842274
Epoch 66
Epoch 1/1
0.4166666666666667 0.30850515724795646
Epoch 67
Epoch 1/1
0.41333333333333333 0.32416117595418564
Epoch 68
Epoch 1/1
0.41 0.33447091094222203
Epoch 69
Epoch 1/1
0.41333333333333333 0.3536818184455288
Epoch 70
Epoch 1/1
0.39666666666666667 0.2754531893428058
Epoch 71
Epoch 1/1
0.4033333333333333 0.27607974151452414
Epoch 72
Epoch 1/1
0.39 0.32141914280734873
Epoch 73
Epoch 1/1
0.4066666666666667 0.2706665581829386
Epoch 74
Epoch 1/1
0.4033333333333333 0.3283910262453312
Epoch 75
Epoch 1/1
0.38666666666666666 0.2600920482068023
Epoch 76
Epoch 1/1
0.4066666666666667 0.2735300523819024
Epoch 77
Epoch 1/1
0.4 0.27930967295874126
Epoch 78
Epoch 1/1
0.39

0.4266666666666667 0.28640305580047937
Epoch 179
Epoch 1/1
0.41333333333333333 0.2860225359389417
Epoch 180
Epoch 1/1
0.4033333333333333 0.2802087340488381
Epoch 181
Epoch 1/1
0.41 0.28040802586545527
Epoch 182
Epoch 1/1
0.38666666666666666 0.2605183038877569
Epoch 183
Epoch 1/1
0.4166666666666667 0.3012017866728012
Epoch 184
Epoch 1/1
0.43 0.311414298078663
Epoch 185
Epoch 1/1
0.4166666666666667 0.29076537956645876
Epoch 186
Epoch 1/1
0.4166666666666667 0.2959139869131202
Epoch 187
Epoch 1/1
0.41 0.28333694216815886
Epoch 188
Epoch 1/1
0.41333333333333333 0.27681916650018307
Epoch 189
Epoch 1/1
0.42333333333333334 0.2969148268404489
Epoch 190
Epoch 1/1
0.4033333333333333 0.2830199279535292
Epoch 191
Epoch 1/1
0.41 0.284636900757226
Epoch 192
Epoch 1/1
0.38666666666666666 0.2698461739002329
Epoch 193
Epoch 1/1
0.39 0.2681149560225736
Epoch 194
Epoch 1/1
0.38333333333333336 0.2607064726311873
Epoch 195
Epoch 1/1
0.38666666666666666 0.26949332109458635
Epoch 196
Epoch 1/1
0.39 0.28465197