![HAN](https://github.com/ShawnyXiao/TextClassification-Keras/raw/master/image/HAN.png)

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [1]:
# Importing Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras.layers import Input, Dense, TimeDistributed, GRU, Embedding, Dropout, Bidirectional, LSTM
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.models import Model
from keras.optimizers import Adam, RMSprop, SGD
from keras.callbacks import TensorBoard, ModelCheckpoint

Using TensorFlow backend.


In [0]:
# Attention 

from keras import backend as K
from keras import initializers, regularizers, constraints
from keras.engine.topology import Layer


class Attention(Layer):
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        """
        Keras Layer that implements an Attention mechanism for temporal data.
        Supports Masking.
        Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]
        # Input shape
            3D tensor with shape: `(samples, steps, features)`.
        # Output shape
            2D tensor with shape: `(samples, features)`.
        :param kwargs:
        Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
        The dimensions are inferred based on the output shape of the RNN.
        Example:
            # 1
            model.add(LSTM(64, return_sequences=True))
            model.add(Attention())
            # next add a Dense layer (for classification/regression) or whatever...
            # 2
            hidden = LSTM(64, return_sequences=True)(words)
            sentence = Attention()(hidden)
            # next add a Dense layer (for classification/regression) or whatever...
        """
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0

        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        features_dim = self.features_dim
        step_dim = self.step_dim

        e = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), K.reshape(self.W, (features_dim, 1))), (-1, step_dim))  # e = K.dot(x, self.W)
        if self.bias:
            e += self.b
        e = K.tanh(e)

        a = K.exp(e)
        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())
        # in some cases especially in the early stages of training the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        a = K.expand_dims(a)

        c = K.sum(a * x, axis=1)
        return c

    def compute_output_shape(self, input_shape):
        return input_shape[0], self.features_dim

In [0]:
data = pd.read_csv("drive/My Drive/MI22_cleaned2.csv")

In [0]:
data.drop("Unnamed: 0",axis=1,inplace=True)

In [0]:
train, test = train_test_split(data,test_size=0.2,random_state=123)

In [0]:
# Preparaing Targets
from sklearn.preprocessing import LabelBinarizer
encoder = LabelBinarizer()
encoder.fit(train["label"].values)
y_train = encoder.transform(train["label"].values)
y_test = encoder.transform(test["label"].values)

In [0]:
max_features = 5000
maxlen_sentence = 250
maxlen_word = 10
batch_size = 512
embedding_dims = 300
epochs = 10

In [0]:
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train["para"])
sequences_train = tokenizer.texts_to_sequences(train["para"])
sequences_test = tokenizer.texts_to_sequences(test["para"])

In [33]:
print('Pad sequences (samples x #sentence x #word)...')
x_train = pad_sequences(sequences_train, maxlen=maxlen_sentence * maxlen_word)
x_test = pad_sequences(sequences_test, maxlen=maxlen_sentence * maxlen_word)
x_train = x_train.reshape((len(x_train), maxlen_sentence, maxlen_word))
x_test = x_test.reshape((len(x_test), maxlen_sentence, maxlen_word))
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

Pad sequences (samples x #sentence x #word)...
x_train shape: (34311, 250, 10)
x_test shape: (8578, 250, 10)


In [20]:
print('Loading word vectors...')
word2vec = {}
with open("drive/My Drive/ content glove glove.42B.300d.txt") as f:
  # is just a space-separated text file in the format:
  # word vec[0] vec[1] vec[2] ...
  for line in f:
    values = line.split()
    word = values[0]
    vec = np.asarray(values[1:], dtype='float32')
    word2vec[word] = vec
print('Found %s word vectors.' % len(word2vec))

Loading word vectors...
Found 1917494 word vectors.


In [21]:
# get word -> integer mapping
word2idx = tokenizer.word_index
print('Found %s unique tokens.' % len(word2idx))

Found 324477 unique tokens.


In [22]:
# prepare embedding matrix
print('Filling pre-trained embeddings...')
num_words = min(max_features, len(word2idx) + 1)
embedding_matrix = np.zeros((num_words, embedding_dims))
for word, i in word2idx.items():
  if i < max_features:
    embedding_vector = word2vec.get(word)
    if embedding_vector is not None:
      # words not found in embedding index will be all zeros.
        embedding_matrix[i] = embedding_vector

print(embedding_matrix.shape)

Filling pre-trained embeddings...
(5000, 300)


In [0]:
input_word = Input(shape=(maxlen_word,))
x_word = Embedding(max_features, embedding_dims, input_length=maxlen_word,weights=[embedding_matrix],trainable=False)(input_word)
word = Bidirectional(LSTM(128, return_sequences=True))(x_word)  # LSTM or GRU
x_word = Attention(maxlen_word)(x_word)
model_word = Model(input_word, x_word)

# Sentence part
input = Input(shape=(maxlen_sentence, maxlen_word))
x_sentence = TimeDistributed(model_word)(input)
x_sentence = Bidirectional(LSTM(128, return_sequences=True))(x_sentence)  # LSTM or GRU
x_sentence = Attention(maxlen_sentence)(x_sentence)

output = Dense(19, activation="softmax")(x_sentence)
model = Model(inputs=input, outputs=output)

In [0]:
model.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])

In [0]:
# Callbacks
model_1 = ModelCheckpoint('model_han.h5', save_best_only=True, monitor='val_loss', mode='min')
logdir = "logs/model_han/"
tensorboard_callback = TensorBoard(log_dir=logdir)

In [39]:
model.summary()

Model: "model_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         (None, 250, 10)           0         
_________________________________________________________________
time_distributed_3 (TimeDist (None, 250, 300)          1500310   
_________________________________________________________________
bidirectional_6 (Bidirection (None, 250, 256)          439296    
_________________________________________________________________
attention_6 (Attention)      (None, 256)               506       
_________________________________________________________________
dense_3 (Dense)              (None, 19)                4883      
Total params: 1,944,995
Trainable params: 444,995
Non-trainable params: 1,500,000
_________________________________________________________________


In [42]:
history = model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=20,
          callbacks=[model_1,tensorboard_callback],
          validation_data=(x_test, y_test))

Train on 34311 samples, validate on 8578 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [0]:
predictions = model.predict(x_test,batch_size=512)

In [44]:
print(metrics.classification_report(np.argmax(y_test,axis=1),np.argmax(predictions,axis=1)))

              precision    recall  f1-score   support

           0       0.69      0.45      0.54       182
           1       0.56      0.65      0.60       399
           2       0.65      0.50      0.56       204
           3       0.78      0.47      0.59        59
           4       0.37      0.34      0.35       142
           5       0.91      0.86      0.88       412
           6       0.90      0.71      0.79       206
           7       0.55      0.50      0.52       204
           8       0.90      0.91      0.90        88
           9       0.84      0.89      0.87       185
          10       0.67      0.74      0.70       201
          11       0.58      0.51      0.55       185
          12       0.53      0.57      0.55       293
          13       0.60      0.48      0.54       190
          14       0.57      0.58      0.58       300
          15       0.93      0.97      0.95      4601
          16       0.76      0.65      0.70       285
          17       0.82    