In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# This is interactive: it prompts for an authorization code.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import pandas as pd
# Load the dataset. Later cells read a "para" column from it and slice the
# remaining columns by position.
# NOTE(review): the column layout is not visible here -- presumably an id column,
# multi-hot label columns, then the "para" text column; confirm against the CSV.
data = pd.read_csv("multi_hot_data_main.csv")

In [0]:
# Whitespace-token count of each paragraph, stored as a "len" column on `data`.
# NOTE(review): this mutates `data` in place; later cells index columns by
# position and rely on where this column ends up.
data["len"] = data["para"].str.split().str.len()

In [0]:
# Keep only paragraphs whose token count lies in [10, 1000] (both ends inclusive).
sample = data[data['len'].between(10, 1000)]

In [5]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from keras.models import Model, load_model 
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout, Conv1D,Dense, Embedding, Input, GRU, GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.optimizers import Adam, RMSprop
from keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping
from sklearn import metrics
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [0]:
# 80/20 split of the length-filtered rows; random_state is fixed so the split is repeatable.
train, test = train_test_split(sample,test_size=0.2,random_state=123)

In [0]:
# Model input is the second-to-last column of each split.
# NOTE(review): presumably this is the "para" text column, sitting just before
# the "len" helper column -- confirm against the CSV layout.
features_train, features_test = (frame.iloc[:, -2] for frame in (train, test))

In [0]:
# Targets: every column except the first and the last two, as NumPy arrays.
# NOTE(review): assumes these are the multi-hot label columns -- confirm.
y_train, y_test = (frame.iloc[:, 1:-2].values for frame in (train, test))

In [9]:
# load in pre-trained word vectors
print('Loading word vectors...')
word2vec = {}
# The encoding is given explicitly so decoding does not depend on the platform's
# default locale (the file contains non-ASCII tokens).
with open("drive/My Drive/ content glove glove.42B.300d.txt", encoding="utf-8") as f:
  # is just a space-separated text file in the format:
  # word vec[0] vec[1] vec[2] ...
  for line in f:
    values = line.split()
    if not values:
      # Skip blank lines instead of failing on values[0].
      continue
    word = values[0]
    vec = np.asarray(values[1:], dtype='float32')
    word2vec[word] = vec
print('Found %s word vectors.' % len(word2vec))

Loading word vectors...
Found 1917494 word vectors.


In [0]:
# Vocabulary cap: passed to the Tokenizer as num_words and used as the upper
# bound on rows of the embedding matrix.
MAX_VOCAB_SIZE = 20000
# Width of each embedding row.
# NOTE(review): must equal the dimensionality of the loaded word vectors
# (the file name suggests 300d) -- confirm if the vector file changes.
EMBEDDING_DIM = 300

In [0]:
# Turn raw text into integer id sequences.
# The vocabulary is fitted on the training texts only and then applied to both splits.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(features_train)
sequences_train, sequences_test = (
  tokenizer.texts_to_sequences(texts)
  for texts in (features_train, features_test)
)

In [12]:
# get word -> integer mapping
# The printed count (see the output below) exceeds MAX_VOCAB_SIZE, so this
# mapping is not truncated to num_words; later code filters indices itself.
word2idx = tokenizer.word_index
print('Found %s unique tokens.' % len(word2idx))

Found 105026 unique tokens.


In [0]:
# Fixed sequence length: used as pad_sequences' maxlen and as the model's Input shape.
MAX_SEQUENCE_LENGTH = 250

In [14]:
# Bring every id sequence to exactly MAX_SEQUENCE_LENGTH entries so each split
# becomes a rectangular 2-D array.
encoded_train, encoded_test = (
  pad_sequences(seqs, maxlen=MAX_SEQUENCE_LENGTH)
  for seqs in (sequences_train, sequences_test)
)
print('Shape of data tensor:', encoded_train.shape)
print('Shape of data tensor:', encoded_test.shape)

Shape of data tensor: (19731, 250)
Shape of data tensor: (4933, 250)


In [15]:
# Build the (num_words, EMBEDDING_DIM) lookup table used to initialise the
# Embedding layer. Rows start as zeros and stay zero for any word that has no
# pre-trained vector.
print('Filling pre-trained embeddings...')
num_words = min(MAX_VOCAB_SIZE, len(word2idx) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word2idx.items():
  if i >= MAX_VOCAB_SIZE:
    # Index falls outside the capped vocabulary: no row for it.
    continue
  embedding_vector = word2vec.get(word)
  if embedding_vector is None:
    # No pre-trained vector for this word: leave its row as zeros.
    continue
  embedding_matrix[i] = embedding_vector

print(embedding_matrix.shape)

Filling pre-trained embeddings...
(20000, 300)


In [0]:
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints


class Attention(Layer):
    """Attention-weighted sum over axis 1 of a 3D input.

    For an input ``x`` of shape ``(batch, step_dim, features)`` the layer learns
    a weight vector ``W`` of length ``features`` and, when ``bias`` is true, a
    bias ``b`` of length ``input_shape[1]``. It computes

        e = tanh(x . W + b)
        a = exp(e) / (sum over axis 1 of exp(e) + epsilon)

    and returns ``sum over axis 1 of (x * a)``, shape ``(batch, features)``.

    Args:
        step_dim: size of axis 1 of the input; used to reshape the scores.
        W_regularizer, b_regularizer: passed through ``regularizers.get``.
        W_constraint, b_constraint: passed through ``constraints.get``.
        bias: whether to add the learned bias to the scores.
        **kwargs: forwarded to the base ``Layer`` constructor.
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        # Filled in by build() once the input shape is known.
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create ``W`` (and ``b`` when enabled) for a 3D ``input_shape``."""
        assert len(input_shape) == 3

        # Keyword arguments only: passing the shape positionally relies on a
        # legacy calling convention that newer Keras releases reject.
        self.W = self.add_weight(name='{}_W'.format(self.name),
                                 shape=(input_shape[-1],),
                                 initializer=self.init,
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight(name='{}_b'.format(self.name),
                                     shape=(input_shape[1],),
                                     initializer='zero',
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        """Return ``None``: no mask is passed on to the following layers."""
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        features_dim = self.features_dim
        step_dim = self.step_dim

        # Score each step: flatten to (batch * steps, features), project onto W,
        # then fold back to (batch, steps).
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # Masked steps get zero weight before normalisation.
        if mask is not None:
            a *= K.cast(mask, K.floatx())

        # epsilon keeps the division finite when every step is masked out.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Output is ``(batch, features_dim)``."""
        return input_shape[0],  self.features_dim

    def get_config(self):
        """Return the constructor arguments so a saved model can rebuild the layer."""
        config = {
            'step_dim': self.step_dim,
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(Attention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [0]:
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(
  num_words,
  EMBEDDING_DIM,
  weights=[embedding_matrix],
  input_length=MAX_SEQUENCE_LENGTH,
  trainable=False
)

In [0]:
# Multi-label classifier: embedding lookup -> bidirectional LSTM over the
# sequence -> max over time -> one sigmoid unit per label.
input_ = Input(shape=(MAX_SEQUENCE_LENGTH,))
x = embedding_layer(input_)
x = Bidirectional(LSTM(15, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
# The output width comes from the label matrix rather than a hard-coded count,
# so it cannot drift out of sync with the label columns.
output = Dense(y_train.shape[1], activation="sigmoid")(x)

model = Model(input_, output)
# binary_crossentropy with sigmoid outputs scores each label independently.
model.compile(
  loss='binary_crossentropy',
  optimizer=Adam(lr=0.01),
  metrics=['accuracy']
)

In [0]:
# Training callbacks.
# Checkpoint: write the model to disk only when val_loss reaches a new minimum.
model_3 = ModelCheckpoint(
  'model_bilstm_multilabel_2.h5',
  monitor='val_loss',
  mode='min',
  save_best_only=True,
)
# TensorBoard event files for this run go under `logdir`.
logdir = "logs/model_bilstm_multilabel/"
tensorboard_callback = TensorBoard(log_dir=logdir)

In [23]:
# Train for 15 epochs; `r` holds the object returned by fit().
# NOTE(review): the test split is passed as validation_data, and the checkpoint
# callback monitors val_loss, so checkpoint selection is driven by the test
# split. Metrics later computed on that same split are therefore not an
# independent estimate -- consider carving a validation set out of `train`.
print('Training model...')
r = model.fit(
  encoded_train,
  y_train,
  batch_size=512,
  epochs=15,
  validation_data=(encoded_test,y_test),callbacks=[tensorboard_callback,model_3]
)

Training model...
Train on 19731 samples, validate on 4933 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [0]:
# Per-label sigmoid outputs for the test split, produced by the in-memory model
# (not the checkpoint file); thresholded in the following cells.
predictions_4 = model.predict(encoded_test)

In [25]:
# Sweep decision thresholds 0.30 .. 0.60 in steps of 0.01 and report the
# micro-averaged F1 on the test split at each one.
for thresh in np.round(np.arange(0.3, 0.601, 0.01), 2):
    predicted_labels = (predictions_4 > thresh).astype(int)
    score = metrics.f1_score(y_test, predicted_labels, average="micro")
    print("F1 score at threshold {0} is {1}".format(thresh, score))

F1 score at threshold 0.3 is 0.6240872359069224
F1 score at threshold 0.31 is 0.627346374234341
F1 score at threshold 0.32 is 0.6286886065819747
F1 score at threshold 0.33 is 0.629284120867978
F1 score at threshold 0.34 is 0.6307731826354996
F1 score at threshold 0.35 is 0.6301755116920994
F1 score at threshold 0.36 is 0.6313903236007167
F1 score at threshold 0.37 is 0.6300170794192999
F1 score at threshold 0.38 is 0.6294514995947041
F1 score at threshold 0.39 is 0.6297105406881486
F1 score at threshold 0.4 is 0.6292432521940718
F1 score at threshold 0.41 is 0.6283131521678477
F1 score at threshold 0.42 is 0.6273249915454853
F1 score at threshold 0.43 is 0.625405682400501
F1 score at threshold 0.44 is 0.6242016226480235
F1 score at threshold 0.45 is 0.6223902297179413
F1 score at threshold 0.46 is 0.619961274423517
F1 score at threshold 0.47 is 0.6174902239601848
F1 score at threshold 0.48 is 0.6161573880284638
F1 score at threshold 0.49 is 0.6156074597139236
F1 score at threshold 0.5 

In [27]:
# Sweep decision thresholds 0.30 .. 0.60 in steps of 0.01 and report the
# micro-averaged precision on the test split at each one.
for thresh in np.round(np.arange(0.3, 0.601, 0.01), 2):
    predicted_labels = (predictions_4 > thresh).astype(int)
    score = metrics.precision_score(y_test, predicted_labels, average="micro")
    print("Precision score at threshold {0} is {1}".format(thresh, score))

Precision score at threshold 0.3 is 0.5825683904389712
Precision score at threshold 0.31 is 0.5931807566557683
Precision score at threshold 0.32 is 0.6011477761836442
Precision score at threshold 0.33 is 0.609327442317133
Precision score at threshold 0.34 is 0.6184767277856136
Precision score at threshold 0.35 is 0.6261643552059615
Precision score at threshold 0.36 is 0.634870164281929
Precision score at threshold 0.37 is 0.6417309992388822
Precision score at threshold 0.38 is 0.6495650234218158
Precision score at threshold 0.39 is 0.6572796716451944
Precision score at threshold 0.4 is 0.6644905572394497
Precision score at threshold 0.41 is 0.6716774039608685
Precision score at threshold 0.42 is 0.6784103376813361
Precision score at threshold 0.43 is 0.6844466600199401
Precision score at threshold 0.44 is 0.6918367346938775
Precision score at threshold 0.45 is 0.6989289446185998
Precision score at threshold 0.46 is 0.7040245202558635
Precision score at threshold 0.47 is 0.7100422400872