In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


# **Importing Dependencies**

In [0]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from keras.models import Model
from keras.layers import Dense, Embedding, Input,LSTM, Bidirectional, Dropout,Conv1D, GlobalMaxPool1D
from keras.callbacks import TensorBoard, ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import LabelBinarizer
from keras.optimizers import Adam, RMSprop, SGD

# **Attention Mechanism**

In [0]:
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints


class Attention(Layer):
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)
 
        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        return None

    def call(self, x, mask=None):
        features_dim = self.features_dim
        step_dim = self.step_dim

        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        if mask is not None:
            a *= K.cast(mask, K.floatx())

        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0],  self.features_dim

# **Data Loading / Modelling**

In [5]:
data = pd.read_csv("drive/My Drive/MI22_cleaned2.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,para,label
0,0,18 it is further noted by us that identical is...,royalty
1,1,8 41for taxation purposes the term royalty has...,royalty
2,2,51it is found that all the issues raised by th...,royalty
3,3,the phrase computer software is commonly used ...,royalty
4,4,9 we heard rival submissions and perused the m...,royalty


In [0]:
data.drop("Unnamed: 0",axis=1,inplace=True)

In [0]:
train, test = train_test_split(data,test_size=0.2,random_state=123)

In [0]:
# Preparaing Targets
encoder = LabelBinarizer()
encoder.fit(train["label"].values)
y_train = encoder.transform(train["label"].values)
y_test = encoder.transform(test["label"].values)

In [11]:
# Text Featurization
# load in pre-trained word vectors
print('Loading word vectors...')
word2vec = {}
with open("drive/My Drive/ content glove glove.42B.300d.txt") as f:
  # is just a space-separated text file in the format:
  # word vec[0] vec[1] vec[2] ...
  for line in f:
    values = line.split()
    word = values[0]
    vec = np.asarray(values[1:], dtype='float32')
    word2vec[word] = vec
print('Found %s word vectors.' % len(word2vec))

Loading word vectors...
Found 1917494 word vectors.


In [0]:
# some configuration
MAX_VOCAB_SIZE = 20000
EMBEDDING_DIM = 300

In [0]:
# convert the sentences (strings) into integers
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(train["para"])
sequences_train = tokenizer.texts_to_sequences(train["para"])
sequences_test = tokenizer.texts_to_sequences(test["para"])

In [14]:
# get word -> integer mapping
word2idx = tokenizer.word_index
print('Found %s unique tokens.' % len(word2idx))

Found 324477 unique tokens.


In [0]:
MAX_SEQUENCE_LENGTH = 600

In [38]:
encoded_train = pad_sequences(sequences_train,maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', encoded_train.shape)
encoded_test = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', encoded_test.shape)

Shape of data tensor: (34311, 600)
Shape of data tensor: (8578, 600)


In [39]:
# prepare embedding matrix
print('Filling pre-trained embeddings...')
num_words = min(MAX_VOCAB_SIZE, len(word2idx) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word2idx.items():
  if i < MAX_VOCAB_SIZE:
    embedding_vector = word2vec.get(word)
    if embedding_vector is not None:
      # words not found in embedding index will be all zeros.
        embedding_matrix[i] = embedding_vector

print(embedding_matrix.shape)

Filling pre-trained embeddings...
(20000, 300)


# **Model Building // Training**

In [0]:
embedding_layer = Embedding(
  num_words,
  EMBEDDING_DIM,
  weights=[embedding_matrix],
  input_length=600,
  trainable=False
)
input_ = Input(shape=(600,))
x = embedding_layer(input_)
e = Bidirectional(LSTM(50, return_sequences=True))(x)
e = Conv1D(50, (1), activation='relu')(e)
e = Conv1D(50, (1), activation='relu')(e)
e = Bidirectional(LSTM(50, return_sequences=True))(e)
e = Conv1D(50, (1), activation='relu')(e)
e = Conv1D(50, (1), activation='relu')(e)
e = Attention(600)(e)
output = Dense(19, activation="softmax")(e)
model = Model(input_,output)

In [47]:
model.summary()

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, 600)               0         
_________________________________________________________________
embedding_7 (Embedding)      (None, 600, 300)          6000000   
_________________________________________________________________
bidirectional_13 (Bidirectio (None, 600, 100)          140400    
_________________________________________________________________
conv1d_25 (Conv1D)           (None, 600, 50)           5050      
_________________________________________________________________
conv1d_26 (Conv1D)           (None, 600, 50)           2550      
_________________________________________________________________
bidirectional_14 (Bidirectio (None, 600, 100)          40400     
_________________________________________________________________
conv1d_27 (Conv1D)           (None, 600, 50)           5050

In [0]:
# Callbacks
model_2 = ModelCheckpoint('model_2.h5', save_best_only=True, monitor='val_loss', mode='min')
logdir = "logs/model_/"
tensorboard_callback = TensorBoard(log_dir=logdir)

In [0]:
model.compile(
  loss='categorical_crossentropy',
  optimizer=Adam(lr=0.01),
  metrics=['accuracy']
)

In [50]:
print('Training model...')
r = model.fit(
  encoded_train,
  y_train,
  batch_size=512,
  epochs=10,
  validation_data=(encoded_test,y_test),callbacks=[tensorboard_callback,model_2]
)

Training model...
Train on 34311 samples, validate on 8578 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [0]:
%load_ext tensorboard

In [0]:
%tensorboard --logdir logs

In [0]:
predictions = model.predict(encoded_test)

In [53]:
print(metrics.classification_report(np.argmax(y_test,axis=1),np.argmax(predictions,axis=1)))

              precision    recall  f1-score   support

           0       0.52      0.60      0.56       182
           1       0.50      0.71      0.59       399
           2       0.52      0.44      0.48       204
           3       1.00      0.03      0.07        59
           4       0.33      0.37      0.34       142
           5       0.94      0.89      0.92       412
           6       0.94      0.49      0.65       206
           7       0.79      0.44      0.57       204
           8       0.92      0.89      0.90        88
           9       0.78      0.85      0.81       185
          10       0.65      0.76      0.70       201
          11       0.66      0.41      0.50       185
          12       0.75      0.27      0.40       293
          13       0.42      0.37      0.39       190
          14       0.66      0.56      0.61       300
          15       0.89      0.98      0.93      4601
          16       0.77      0.56      0.65       285
          17       0.76    