## Deep Learning Model - LSTM Multi-Label Text Classification

In [6]:
import os
os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM
from keras import backend as K


import pandas as pd
import numpy as np
from urllib.request import urlopen
from bs4 import BeautifulSoup

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

### Loading Data

In [2]:
# Read the four interim Excel sheets: feature sheets (X_*) first, then label
# sheets (y_*), each as a train/validation pair.
X_train_Q1, X_valid_Q1 = (
    pd.read_excel(path)
    for path in ('../data/interim/X_train_Q1_clean.xlsx',
                 '../data/interim/X_valid_Q1_clean.xlsx')
)

y_train_Q1, y_valid_Q1 = (
    pd.read_excel(path)
    for path in ('../data/interim/y_train_Q1.xlsx',
                 '../data/interim/y_valid_Q1.xlsx')
)

### Creating a Unified DataFrame for the LSTM Model

In [3]:
# Place the comments and the first 12 label columns side by side.
label_columns = y_train_Q1.iloc[:, :12]
df = pd.concat([X_train_Q1, label_columns], axis=1)

In [4]:
# Preview the first rows of the combined comment + label frame.
df.head()

Unnamed: 0,Comment,CPD,CB,EWC,Exec,FEW,SP,RE,Sup,SW,TEPE,VMG,OTH
0,"to be real about diversity, you need to create...",0,0,1,0,0,0,0,0,0,0,0,0
1,Keep the building warmer and provide warm wate...,0,0,0,0,0,0,0,0,0,1,0,0
2,better communication from the top down,0,0,0,1,0,0,0,0,0,0,0,0
3,It would be beneficial if Management did not m...,0,0,0,0,0,0,1,0,0,0,0,0
4,more education applicable to my job,1,0,0,0,0,0,0,0,0,0,0,0


In [5]:
def combine_labels(list):
    """Join label values into one comma-separated string.

    Parameters
    ----------
    list : iterable
        Label values; each one is converted with ``str``. The parameter name
        shadows the builtin ``list`` but is kept so existing callers still work.

    Returns
    -------
    str
        The labels joined by ``','`` with no trailing comma, or ``''`` when
        the input is empty.
    """
    # str.join builds the result in one pass instead of repeated concatenation
    # followed by trimming the trailing comma.
    return ','.join(str(tag) for tag in list)

In [6]:
# Names of every column after the first one; computed once instead of per row.
label_names = df.columns[1:]
# Boolean matrix with one row per comment: True where that column equals 1.
label_flags = df.iloc[:, 1:].values == 1

# For each row, pick the column names whose flag is set and join them into a
# single comma-separated tag string (empty string when no flag is set).
main_list_labels = [combine_labels(label_names[row_flags]) for row_flags in label_flags]

In [7]:
# Attach the comma-joined label string of each row as a new 'tags' column.
df['tags'] = main_list_labels

In [8]:
# Keep only the comment text and its tag string. The explicit .copy() makes
# data_df an independent frame, so adding columns to it later does not trigger
# pandas' SettingWithCopyWarning.
data_df = df[['Comment', 'tags']].copy()

In [9]:
# Show the first ten comments alongside their tag strings.
data_df.head(10)

Unnamed: 0,Comment,tags
0,"to be real about diversity, you need to create...",EWC
1,Keep the building warmer and provide warm wate...,TEPE
2,better communication from the top down,Exec
3,It would be beneficial if Management did not m...,RE
4,more education applicable to my job,CPD
5,Allocating resources appropriately. It feels a...,VMG
6,Stop moving forward with open concept offices.,FEW
7,Go back to the old way of doing computer refre...,"SW,TEPE"
8,greater understanding of the importance of tra...,CPD
9,accountability /measures required when people ...,"FEW,Sup"


In [10]:
# (rows, columns) of the comment/tag frame.
data_df.shape

(10376, 2)

### Pre-processing

In [11]:
import re

# Irregular contractions whose stem changes. They are matched case-insensitively
# and must run before the generic "n't" rule; otherwise "Won't" would become
# "Wo not" and "Can't" would become "Ca not".
_SPECIFIC_CONTRACTIONS = (
    (re.compile(r"won't", re.IGNORECASE), "will not"),
    (re.compile(r"can't", re.IGNORECASE), "can not"),
)

# Generic suffix rules, applied in this order.
_GENERAL_CONTRACTIONS = (
    (r"n't", " not"),
    (r"'re", " are"),
    (r"'s", " is"),
    (r"'d", " would"),
    (r"'ll", " will"),
    (r"'t", " not"),
    (r"'ve", " have"),
    (r"'m", " am"),
)


def decontracted(phrase):
    """Expand common English contractions in ``phrase``.

    Parameters
    ----------
    phrase : str
        Text that may contain contractions such as "won't" or "they're".

    Returns
    -------
    str
        The text with contractions replaced by their expanded forms. For
        "won't"/"can't" a leading capital letter is carried over to the
        expansion ("Won't" -> "Will not").

    Notes
    -----
    Every "'s" is expanded to " is", so possessives are rewritten as well.
    """
    # Treat the typographic apostrophe (U+2019) like the ASCII one so that
    # text pasted from word processors is expanded too.
    phrase = phrase.replace("\u2019", "'")

    for pattern, expansion in _SPECIFIC_CONTRACTIONS:
        phrase = pattern.sub(
            lambda match, expansion=expansion: (
                expansion.capitalize() if match.group(0)[0].isupper() else expansion
            ),
            phrase,
        )

    for pattern, expansion in _GENERAL_CONTRACTIONS:
        phrase = re.sub(pattern, expansion, phrase)
    return phrase

In [12]:
# Words dropped from every comment during preprocessing, stored as a set for
# constant-time membership checks.
stopwords = {
    'br', 'the', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
    "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself',
    'she', "she's", 'her', 'hers', 'herself',
    'it', "it's", 'its', 'itself',
    'they', 'them', 'their', 'theirs', 'themselves',
    'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those',
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
    'a', 'an', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of',
    'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through',
    'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
    'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',
    'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how',
    'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
    'only', 'own', 'same', 'so', 'than', 'too', 'very',
    's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now',
    'd', 'll', 'm', 'o', 're', 've', 'y',
    'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't",
    'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't",
    'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't",
    'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't",
    'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't",
}

In [13]:
from tqdm import tqdm


def _clean_comment(text):
    """Turn one raw comment into a lowercase, stop-word-free string of words."""
    text = re.sub(r"http\S+", "", text)               # drop URLs
    text = BeautifulSoup(text, 'lxml').get_text()     # keep only the text of any markup
    text = decontracted(text)                         # expand contractions before stripping punctuation
    text = re.sub(r"\S*\d\S*", "", text).strip()      # drop every token that contains a digit
    text = re.sub('[^A-Za-z]+', ' ', text)            # anything that is not a letter becomes a space
    # https://gist.github.com/sebleier/554280
    kept_words = (word.lower() for word in text.split() if word.lower() not in stopwords)
    return ' '.join(kept_words).strip()


# tqdm prints a status bar while the comments are processed.
preprocessed_synopsis = [_clean_comment(comment) for comment in tqdm(data_df['Comment'].values)]

# assign() returns a new frame, so this works without a SettingWithCopyWarning
# even when data_df was created as a slice of another frame.
data_df = data_df.assign(preprocessed_comments=preprocessed_synopsis)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [14]:
# Check the new 'preprocessed_comments' column next to the original text.
data_df.head()

Unnamed: 0,Comment,tags,preprocessed_comments
0,"to be real about diversity, you need to create...",EWC,real diversity you need create seats table mea...
1,Keep the building warmer and provide warm wate...,TEPE,keep building warmer provide warm water bathroom
2,better communication from the top down,Exec,better communication top
3,It would be beneficial if Management did not m...,RE,would beneficial management not micro manage
4,more education applicable to my job,CPD,education applicable job


In [15]:
def remove_spaces(x):
    """Strip leading whitespace from each comma-separated piece of ``x``.

    Only the left side of each piece is stripped; trailing whitespace is kept.
    The pieces are re-joined with ``','``.
    """
    return ",".join(piece.lstrip() for piece in x.split(","))

In [16]:
# Display-only check: the stripped tags are shown here but not written back to data_df.
data_df['tags'].apply(remove_spaces).head(10)

0        EWC
1       TEPE
2       Exec
3         RE
4        CPD
5        VMG
6        FEW
7    SW,TEPE
8        CPD
9    FEW,Sup
Name: tags, dtype: object

### Splitting into Train and Test

In [17]:
# Model input: the cleaned text, kept as a one-column DataFrame.
X = data_df[['preprocessed_comments']]
# Target: whatever remains once both text columns are removed.
y = data_df.drop(columns=['Comment', 'preprocessed_comments'])

In [18]:
# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

### Preparing Labels

In [19]:
# Each row's tags are one comma-separated string, so the "tokenizer" is a plain
# split on ','. binary=True records presence (0/1) rather than counts.
# It must be a real bool: the string 'true' only worked because non-empty
# strings are truthy, and scikit-learn versions that validate parameters reject it.
vectorizer = CountVectorizer(tokenizer=lambda x: x.split(","), binary=True)

In [20]:
# Learn the tag vocabulary from the training tags only, then encode the test
# tags with that same vocabulary; both results are converted to dense arrays.
train_tag_matrix = vectorizer.fit_transform(y_train['tags'])
test_tag_matrix = vectorizer.transform(y_test['tags'])
y_train = train_tag_matrix.toarray()
y_test = test_tag_matrix.toarray()

In [21]:
def max_len(x):
    """Return the number of whitespace-separated words in the string ``x``."""
    return len(x.split())

# Word count of the longest comment, measured on the raw 'Comment' column
# (not on 'preprocessed_comments').
max(data_df['Comment'].apply(max_len))

150

### Vocab Size

In [22]:
# Build the word index from the training comments only.
vect=Tokenizer()
vect.fit_on_texts(X_train['preprocessed_comments'])
# One more than the number of distinct words seen — presumably so that index 0
# stays free for padding; confirm against the Keras Tokenizer documentation.
vocab_size = len(vect.word_index) + 1
print(vocab_size)

10933


## Modelling LSTM

#### Padding to make all sequences the same length

**Training Data**

In [23]:
# Convert each training comment into its list of word indices.
encoded_docs_train = vect.texts_to_sequences(X_train['preprocessed_comments'])
# NOTE(review): max_length is set to the vocabulary size but is not used in the
# cells shown here — the padding length below is the literal 1200.
max_length = vocab_size
# Bring every sequence to length 1200, adding the padding at the end ('post').
padded_docs_train = pad_sequences(encoded_docs_train, maxlen=1200, padding='post')
print(padded_docs_train)

[[5152   77  105 ...    0    0    0]
 [ 393    7 1087 ...    0    0    0]
 [6648   30 2124 ...    0    0    0]
 ...
 [   1   70  300 ...    0    0    0]
 [1471  102    6 ...    0    0    0]
 [ 116  121    6 ...    0    0    0]]


**Test Data**

In [24]:
# Encode the held-out comments with the tokenizer that was fitted on the
# training comments, then pad to the same length (1200) as the training data.
encoded_docs_test =  vect.texts_to_sequences(X_test['preprocessed_comments'])
padded_docs_test = pad_sequences(encoded_docs_test, maxlen=1200, padding='post')

# Disabled: the same encoding for a separate `cv` frame, which is not defined
# in the cells shown here.
#encoded_docs_cv = vect.texts_to_sequences(cv['preprocessed_plots'])
#padded_docs_cv = pad_sequences(encoded_docs_cv, maxlen=1200, padding='post')

#### Defining Model

In [25]:
# One output unit per column of the label matrix. Reading the width from
# y_train keeps the output layer in step with whatever the label vectoriser
# actually produced, instead of relying on a hand-typed count.
n_classes = y_train.shape[1]

In [None]:
# Two stacked LSTM layers over learned word embeddings, ending in one sigmoid
# unit per tag so that every label gets its own independent probability.

# Sequence length is read from the padded training matrix so the embedding
# layer always matches the padding that was actually applied.
sequence_length = padded_docs_train.shape[1]

model = Sequential()
# Configuring the parameters
model.add(Embedding(vocab_size, output_dim=50, input_length=sequence_length))
# return_sequences=True hands the full sequence of states to the next LSTM.
model.add(LSTM(128, return_sequences=True))
# Adding a dropout layer
model.add(Dropout(0.5))
model.add(LSTM(64))
model.add(Dropout(0.5))
# Adding a dense output layer with sigmoid activation
model.add(Dense(n_classes, activation='sigmoid'))
#model.summary()

model.compile(optimizer='adam', loss='binary_crossentropy')

# class_weight='balanced' was removed: that string is a scikit-learn convention.
# Keras' fit() expects a dict mapping class index -> weight, so the string is
# either ignored or raises, depending on the Keras version.
# The returned History object is kept so the per-epoch losses can be inspected.
history = model.fit(padded_docs_train, y_train,
                    epochs=5,
                    batch_size=12,
                    verbose=1,
                    validation_data=(padded_docs_test, y_test))

INFO:plaidml:Opening device "metal_amd_radeon_pro_555x.0"


Train on 8300 samples, validate on 2076 samples
Epoch 1/5


INFO:plaidml:Analyzing Ops: 752 of 241465 operations complete
INFO:plaidml:Analyzing Ops: 1565 of 241465 operations complete
INFO:plaidml:Analyzing Ops: 2408 of 241465 operations complete
INFO:plaidml:Analyzing Ops: 3344 of 241465 operations complete
INFO:plaidml:Analyzing Ops: 3868 of 241465 operations complete
INFO:plaidml:Analyzing Ops: 4193 of 241465 operations complete
INFO:plaidml:Analyzing Ops: 4535 of 241465 operations complete
INFO:plaidml:Analyzing Ops: 4905 of 241465 operations complete
INFO:plaidml:Analyzing Ops: 5255 of 241465 operations complete
INFO:plaidml:Analyzing Ops: 5627 of 241465 operations complete
INFO:plaidml:Analyzing Ops: 6007 of 241465 operations complete
INFO:plaidml:Analyzing Ops: 6435 of 241465 operations complete
INFO:plaidml:Analyzing Ops: 6870 of 241465 operations complete
INFO:plaidml:Analyzing Ops: 7299 of 241465 operations complete
INFO:plaidml:Analyzing Ops: 7765 of 241465 operations complete
INFO:plaidml:Analyzing Ops: 8221 of 241465 operations co