## Deep Learning Model - LSTM Multi-Label Text Classification

In [1]:
import os
os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Activation
from keras.layers import Conv1D, Conv2D, MaxPooling2D, GlobalMaxPooling1D, MaxPool1D, MaxPooling1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM
from keras.utils import to_categorical
from keras import backend as K


import pandas as pd
import numpy as np
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import spacy
# from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

Using plaidml.keras.backend backend.


### Loading Data

In [2]:
X_train_Q1 = pd.read_excel('../data/interim/X_train_Q1_clean.xlsx')
X_valid_Q1 = pd.read_excel('../data/interim/X_valid_Q1_clean.xlsx')

y_train_Q1 = pd.read_excel('../data/interim/y_train_Q1.xlsx')
y_valid_Q1 = pd.read_excel('../data/interim/y_valid_Q1.xlsx')

### Creating a Unified Dataframe for LSTM Ready Model

In [3]:
df = pd.concat([X_train_Q1, y_train_Q1.iloc[:,0:12]], axis = 1)

In [4]:
df.head()

Unnamed: 0,Comment,CPD,CB,EWC,Exec,FEW,SP,RE,Sup,SW,TEPE,VMG,OTH
0,"to be real about diversity, you need to create...",0,0,1,0,0,0,0,0,0,0,0,0
1,Keep the building warmer and provide warm wate...,0,0,0,0,0,0,0,0,0,1,0,0
2,better communication from the top down,0,0,0,1,0,0,0,0,0,0,0,0
3,It would be beneficial if Management did not m...,0,0,0,0,0,0,1,0,0,0,0,0
4,more education applicable to my job,1,0,0,0,0,0,0,0,0,0,0,0


In [5]:
data_df = df

In [6]:
data_df.shape

(10376, 13)

### Pre-processing

In [7]:
import re

def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [8]:
stopwords= set(['br', 'the', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',\
            "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
            'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
            'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
            'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
            'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
            'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
            's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
            've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',\
            "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
            "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
            'won', "won't", 'wouldn', "wouldn't"])

In [9]:
from tqdm import tqdm
preprocessed_synopsis = []
# tqdm is for printing the status bar
for sentance in data_df['Comment'].values:
    sentance = re.sub(r"http\S+", "", sentance)
    sentance = BeautifulSoup(sentance, 'lxml').get_text()
    sentance = decontracted(sentance)
    sentance = re.sub("\S*\d\S*", "", sentance).strip()
    sentance = re.sub('[^A-Za-z]+', ' ', sentance)
    # https://gist.github.com/sebleier/554280
    sentance = ' '.join(e.lower() for e in sentance.split() if e.lower() not in stopwords)
    preprocessed_synopsis.append(sentance.strip())
data_df['preprocessed_comments']=preprocessed_synopsis

In [10]:
data_df.head()

Unnamed: 0,Comment,CPD,CB,EWC,Exec,FEW,SP,RE,Sup,SW,TEPE,VMG,OTH,preprocessed_comments
0,"to be real about diversity, you need to create...",0,0,1,0,0,0,0,0,0,0,0,0,real diversity you need create seats table mea...
1,Keep the building warmer and provide warm wate...,0,0,0,0,0,0,0,0,0,1,0,0,keep building warmer provide warm water bathroom
2,better communication from the top down,0,0,0,1,0,0,0,0,0,0,0,0,better communication top
3,It would be beneficial if Management did not m...,0,0,0,0,0,0,1,0,0,0,0,0,would beneficial management not micro manage
4,more education applicable to my job,1,0,0,0,0,0,0,0,0,0,0,0,education applicable job


### Splitting into Train and Test

In [15]:
X_train = data_df[['preprocessed_comments']]
y_train = data_df.drop(['Comment', 'preprocessed_comments'], axis=1)

In [None]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

### Preparing Labels

In [12]:
vectorizer = CountVectorizer(tokenizer = lambda x: x.split(","), binary='true')

In [None]:
# y_train = vectorizer.fit_transform(y_train['tags']).toarray()
# y_test=vectorizer.transform(y_test['tags']).toarray()

In [13]:
def max_len(x):
    a=x.split()
    return len(a)

max_len = max(data_df['Comment'].apply(max_len))
max_len

150

### Vocab Size

In [16]:
vect=Tokenizer()
vect.fit_on_texts(X_train['preprocessed_comments'])
vocab_size = len(vect.word_index) + 1
print(vocab_size)

11919


## Modelling LSTM

#### Padding to make all sequences of same length

**Training Data**

In [17]:
encoded_docs_train = vect.texts_to_sequences(X_train['preprocessed_comments'])
max_length = vocab_size
padded_docs_train = pad_sequences(encoded_docs_train, maxlen=max_len, padding='post')
print(padded_docs_train)

[[  504   585    36 ...     0     0     0]
 [  134    54  3393 ...     0     0     0]
 [    7    29   234 ...     0     0     0]
 ...
 [  697     4    12 ...     0     0     0]
 [  476  2745 11917 ...     0     0     0]
 [ 1147   593   791 ...     0     0     0]]


In [18]:
padded_docs_train.shape

(10376, 150)

**Test Data**

In [19]:
encoded_docs_test =  vect.texts_to_sequences(X_test['preprocessed_comments'])
padded_docs_test = pad_sequences(encoded_docs_test, maxlen=max_len, padding='post')

#encoded_docs_cv = vect.texts_to_sequences(cv['preprocessed_plots'])
#padded_docs_cv = pad_sequences(encoded_docs_cv, maxlen=1200, padding='post')

NameError: name 'X_test' is not defined

In [None]:
padded_docs_test.shape

#### Defining Model

In [None]:
n_classes = 12

In [None]:
model = Sequential()
# Configuring the parameters
model.add(Embedding(vocab_size, output_dim=50, input_length=1200))
model.add(LSTM(128, return_sequences=True))  
# Adding a dropout layer
model.add(Dropout(0.5))
model.add(LSTM(64))
model.add(Dropout(0.5))
# Adding a dense output layer with sigmoid activation
model.add(Dense(n_classes, activation='sigmoid'))
#model.summary()

model.compile(optimizer='adam', loss='binary_crossentropy')
model.fit(padded_docs_train, y_train,
                    class_weight='balanced',
                    epochs=5,
                    batch_size=12,
                    verbose=1,
                    validation_data=(padded_docs_test, y_test))

In [None]:
padded_docs_train[0].shape

In [20]:
model = Sequential()

# Configuring the parameters
model.add(Embedding(vocab_size, output_dim=50, input_length=max_len))
model.add(LSTM(120, return_sequences=True))  
# Adding a dropout layer
model.add(Dropout(0.1))
model.add(LSTM(64))
model.add(Dropout(0.1))
# model.add(Dense(32, activation='relu'))
# model.add(Dropout(0.2))
# Adding a dense output layer with sigmoid activation
model.add(Dense(12, activation='sigmoid'))
model.summary()

INFO:plaidml:Opening device "metal_amd_radeon_pro_555x.0"


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 150, 50)           595950    
_________________________________________________________________
lstm_1 (LSTM)                (None, 150, 120)          82080     
_________________________________________________________________
dropout_1 (Dropout)          (None, 150, 120)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                47360     
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 12)                780       
Total params: 726,170
Trainable params: 726,170
Non-trainable params: 0
_________________________________________________________________


In [None]:
# model.compile(loss='sparse_categorical_crossentropy', optimizer='adam',
#              metrics=['accuracy'])

In [21]:
y_train_orignal = np.array(df.iloc[:,1:-1])

In [22]:
model.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['accuracy'])

  return array(obj, copy=False)


In [23]:
model.fit(padded_docs_train, y_train_orignal, epochs=1, validation_split=0.15)

Train on 8819 samples, validate on 1557 samples
Epoch 1/1


INFO:plaidml:Analyzing Ops: 3405 of 30423 operations complete
INFO:plaidml:Analyzing Ops: 7785 of 30423 operations complete
INFO:plaidml:Analyzing Ops: 12789 of 30423 operations complete
INFO:plaidml:Analyzing Ops: 19814 of 30423 operations complete
INFO:plaidml:Analyzing Ops: 25453 of 30423 operations complete
INFO:plaidml:Analyzing Ops: 29728 of 30423 operations complete




  return array(obj, copy=False)




INFO:plaidml:Analyzing Ops: 3045 of 30424 operations complete
INFO:plaidml:Analyzing Ops: 7510 of 30424 operations complete
INFO:plaidml:Analyzing Ops: 12298 of 30424 operations complete
INFO:plaidml:Analyzing Ops: 19224 of 30424 operations complete
INFO:plaidml:Analyzing Ops: 24973 of 30424 operations complete
INFO:plaidml:Analyzing Ops: 29114 of 30424 operations complete
INFO:plaidml:Analyzing Ops: 3901 of 12541 operations complete
INFO:plaidml:Analyzing Ops: 10744 of 12541 operations complete
INFO:plaidml:Analyzing Ops: 6532 of 12542 operations complete




<keras.callbacks.History at 0x1a2faa4890>

## LSTM with Glove

In [24]:
# load the whole embedding into memory
embeddings_index = dict()
f = open('/Users/karan/Downloads/glove/glove.6B.100d.txt')
for line in f:
	values = line.split()
	word = values[0]
	coefs = np.asarray(values[1:], dtype='float32')
	embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [25]:
# create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in vect.word_index.items():
	embedding_vector = embeddings_index.get(word)
	if embedding_vector is not None:
		embedding_matrix[i] = embedding_vector

In [26]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.11619   ,  0.45447001, -0.69216001, ..., -0.54737002,
         0.48822001,  0.32246   ],
       [-0.19103999,  0.17601   ,  0.36919999, ..., -0.59680003,
         0.080843  ,  0.27866   ],
       ...,
       [-0.34926   ,  0.27006999, -0.52661002, ...,  0.22747   ,
        -0.12559   ,  0.70643002],
       [-0.53812999,  0.72706997,  0.074018  , ..., -0.41005999,
         1.08850002,  0.75314999],
       [-1.51540005,  0.66566002,  0.23134001, ...,  0.47402   ,
         0.84129   ,  0.94787002]])

In [30]:
model = Sequential()

# Configuring the parameters
model.add(Embedding(vocab_size, 100, input_length=max_len, weights=[embedding_matrix], trainable=False))

model.add(LSTM(120, return_sequences=True))  

# Adding a dropout layer
model.add(Dropout(0.1))
model.add(LSTM(64))
model.add(Dropout(0.1))

# model.add(Dense(32, activation='relu'))
# model.add(Dropout(0.2))
# Adding a dense output layer with sigmoid activation

model.add(Dense(12, activation='sigmoid'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 150, 100)          1191900   
_________________________________________________________________
lstm_3 (LSTM)                (None, 150, 120)          106080    
_________________________________________________________________
dropout_3 (Dropout)          (None, 150, 120)          0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 64)                47360     
_________________________________________________________________
dropout_4 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 12)                780       
Total params: 1,346,120
Trainable params: 154,220
Non-trainable params: 1,191,900
____________________________________________________________

In [31]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  return array(obj, copy=False)


In [34]:
# Train Model
model.fit(padded_docs_train, y_train, batch_size=128, epochs=1, validation_split=0.15)

  return array(obj, copy=False)


Train on 8819 samples, validate on 1557 samples
Epoch 1/1


INFO:plaidml:Analyzing Ops: 1765 of 29060 operations complete
INFO:plaidml:Analyzing Ops: 6553 of 29060 operations complete
INFO:plaidml:Analyzing Ops: 10474 of 29060 operations complete
INFO:plaidml:Analyzing Ops: 17388 of 29060 operations complete
INFO:plaidml:Analyzing Ops: 24271 of 29060 operations complete




INFO:plaidml:Analyzing Ops: 3034 of 29060 operations complete
INFO:plaidml:Analyzing Ops: 7472 of 29060 operations complete
INFO:plaidml:Analyzing Ops: 12171 of 29060 operations complete
INFO:plaidml:Analyzing Ops: 19409 of 29060 operations complete
INFO:plaidml:Analyzing Ops: 25173 of 29060 operations complete
INFO:plaidml:Analyzing Ops: 5036 of 12542 operations complete
INFO:plaidml:Analyzing Ops: 11856 of 12542 operations complete
INFO:plaidml:Analyzing Ops: 4061 of 12542 operations complete
INFO:plaidml:Analyzing Ops: 11066 of 12542 operations complete




<keras.callbacks.History at 0x1a37aaef50>

## LSTM with Universal Sentence Encorder

In [35]:
import tensorflow as tf
import tensorflow_hub as hub
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

embeddings = embed(X_train['preprocessed_comments'])
embedding_matrix = np.array(embeddings)
embedding_matrix

array([[-0.03669859, -0.07417478,  0.03231422, ...,  0.01621998,
         0.00728293,  0.00265464],
       [ 0.05218186, -0.01258108, -0.04657765, ...,  0.05539099,
         0.05895472, -0.0171202 ],
       [-0.01777638, -0.04568463,  0.00473103, ..., -0.00073464,
        -0.08261821,  0.05859691],
       ...,
       [ 0.03823395,  0.01441879,  0.06019125, ...,  0.06164353,
        -0.02452262, -0.01176131],
       [-0.03514517, -0.04318713, -0.02938136, ...,  0.0648592 ,
        -0.05883494,  0.01129983],
       [ 0.02614759, -0.05074428,  0.00971   , ...,  0.01864268,
         0.05439513,  0.03895477]], dtype=float32)

In [36]:
embedding_matrix.shape

(10376, 512)

In [9]:
X_train = data_df[['Comment']]
y_train = data_df.drop(columns='Comment')

In [14]:
# saving embeddings as pickle file
import tensorflow as tf
import tensorflow_hub as hub
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
embeddings = embed(X_train['Comment'])
embedding_matrix = np.array(embeddings)

#embedding_matrix.save('../models/use_x_train_embeddings.pkl')
np.save('../models/use_x_train_embeddings', embedding_matrix)

In [16]:
# saving y_train as pickle
y_train = np.array(y_train)

In [17]:
np.save('../models/y_train', y_train)

In [18]:
# saving X_valid data

df_valid = pd.concat([X_valid_Q1, y_valid_Q1.iloc[:,:12]], axis = 1)

In [20]:
X_valid = df_valid[['Comment']]
y_valid = df_valid.drop(columns='Comment')

In [21]:
embeddings_valid = embed(X_valid['Comment'])
embedding_matrix_valid = np.array(embeddings_valid)
np.save('../models/use_x_valid_embeddings', embedding_matrix_valid)

In [22]:
# saving y_valid
y_valid = np.array(y_valid)
np.save('../models/y_valid', y_valid)

In [37]:
max_features = embedding_matrix.shape[0]
maxlen = max_len
batch_size = 128
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 1
embed_size = 512 # for universal sentence encoder
n_class = 12

In [41]:
model = Sequential()

# Configuring the parameters
model.add(Embedding(max_features, embed_size, input_length=embed_size))

model.add(LSTM(120, return_sequences=True))  

# Adding a dropout layer
model.add(Dropout(0.1))
model.add(LSTM(64))
model.add(Dropout(0.1))

# model.add(Dense(32, activation='relu'))
# model.add(Dropout(0.2))
# Adding a dense output layer with sigmoid activation

model.add(Dense(12, activation='sigmoid'))
model.summary()

  return array(obj, copy=False)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 512, 512)          5312512   
_________________________________________________________________
lstm_7 (LSTM)                (None, 512, 120)          303840    
_________________________________________________________________
dropout_7 (Dropout)          (None, 512, 120)          0         
_________________________________________________________________
lstm_8 (LSTM)                (None, 64)                47360     
_________________________________________________________________
dropout_8 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 12)                780       
Total params: 5,664,492
Trainable params: 5,664,492
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['accuracy'])

# Train Model
model.fit(embedding_matrix, y_train, batch_size=batch_size, epochs=epochs,
          validation_split=0.15)

  return array(obj, copy=False)


Train on 8819 samples, validate on 1557 samples
Epoch 1/1


INFO:plaidml:Analyzing Ops: 1475 of 103185 operations complete
INFO:plaidml:Analyzing Ops: 1475 of 103185 operations complete
INFO:plaidml:Analyzing Ops: 2621 of 103185 operations complete
INFO:plaidml:Analyzing Ops: 2621 of 103185 operations complete
INFO:plaidml:Analyzing Ops: 3827 of 103185 operations complete
INFO:plaidml:Analyzing Ops: 3827 of 103185 operations complete
INFO:plaidml:Analyzing Ops: 5081 of 103185 operations complete
INFO:plaidml:Analyzing Ops: 5081 of 103185 operations complete
INFO:plaidml:Analyzing Ops: 6409 of 103185 operations complete
INFO:plaidml:Analyzing Ops: 6409 of 103185 operations complete
INFO:plaidml:Analyzing Ops: 7882 of 103185 operations complete
INFO:plaidml:Analyzing Ops: 7882 of 103185 operations complete
INFO:plaidml:Analyzing Ops: 9506 of 103185 operations complete
INFO:plaidml:Analyzing Ops: 9506 of 103185 operations complete
INFO:plaidml:Analyzing Ops: 11226 of 103185 operations complete
INFO:plaidml:Analyzing Ops: 11226 of 103185 operations