## Deep Learning Model - CNN Multi-Label Text Classification - GLOVE

In [54]:
import os
os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Activation
from keras.layers import Conv1D, Conv2D, MaxPooling2D, GlobalMaxPooling1D, MaxPool1D, MaxPooling1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM
from keras.utils import to_categorical
from keras import backend as K


import pandas as pd
import numpy as np
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import spacy
# from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

### Loading Data

In [2]:
# Comment sheets for the train and validation splits (read in that order).
X_train_Q1, X_valid_Q1 = (
    pd.read_excel(path)
    for path in ('../data/interim/X_train_Q1_clean.xlsx',
                 '../data/interim/X_valid_Q1_clean.xlsx')
)

# Label sheets for the same two splits.
y_train_Q1, y_valid_Q1 = (
    pd.read_excel(path)
    for path in ('../data/interim/y_train_Q1.xlsx',
                 '../data/interim/y_valid_Q1.xlsx')
)

### Creating a Unified DataFrame for the CNN Model

In [3]:
# Put the comment frame and the first 12 columns of the label sheet side by side
# (axis=1 joins column-wise, aligning rows on the index).
df = pd.concat([X_train_Q1, y_train_Q1.iloc[:,0:12]], axis = 1)

In [4]:
df.head()

Unnamed: 0,Comment,CPD,CB,EWC,Exec,FEW,SP,RE,Sup,SW,TEPE,VMG,OTH
0,"to be real about diversity, you need to create...",0,0,1,0,0,0,0,0,0,0,0,0
1,Keep the building warmer and provide warm wate...,0,0,0,0,0,0,0,0,0,1,0,0
2,better communication from the top down,0,0,0,1,0,0,0,0,0,0,0,0
3,It would be beneficial if Management did not m...,0,0,0,0,0,0,1,0,0,0,0,0
4,more education applicable to my job,1,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# def combine_labels(list):
#     all_labels= ''
#     for tag in list:
#         all_labels = all_labels + str(tag) + ','
#     return all_labels[:-1]

In [None]:
# main_list_labels = list()

# for i in range(len(df)):
#     labels = np.where(df.iloc[i,1:] ==1,df.iloc[0,1:].index,0)
#     names = labels[np.nonzero(labels)]
#     main_list_labels.append(combine_labels(names))

In [None]:
# df['tags'] = main_list_labels

In [None]:
# data_df = df[['Comment','tags']]

In [5]:
# data_df is a second name for the same DataFrame object as df (no copy is made),
# so columns added to data_df later are added to df as well.
data_df = df

In [6]:
data_df.shape

(10376, 13)

### Pre-processing

#### General

In [60]:
import re

def decontracted(phrase):
    """
    Expand common English contractions in a string.

    The two irregular forms ("won't", "can't") are handled first and matched
    case-insensitively, so a sentence-initial "Won't" becomes "Will not"
    instead of falling through to the generic "n't" rule (which would yield
    "Wo not"). The generic suffix rules ("n't", "'re", "'s", "'d", "'ll",
    "'t", "'ve", "'m") stay case-sensitive so that capitalised names such as
    "O'Donnell" are left alone.

    Both the ASCII apostrophe (') and the typographic apostrophe (U+2019)
    are recognised.

    Parameters
    -------------
    phrase : (str)
        the text in which contractions are expanded

    Returns
    -------------
    (str) the text with contractions expanded
    """
    apostrophe = "['\u2019]"

    def _keep_case(replacement):
        # Capitalise the expansion when the matched contraction started with a capital.
        def _substitute(match):
            if match.group(0)[0].isupper():
                return replacement.capitalize()
            return replacement
        return _substitute

    # specific (irregular forms must run before the generic "n't" rule)
    phrase = re.sub("won" + apostrophe + "t", _keep_case("will not"), phrase,
                    flags=re.IGNORECASE)
    phrase = re.sub("can" + apostrophe + "t", _keep_case("can not"), phrase,
                    flags=re.IGNORECASE)

    # general (order matters: "n't" has to be consumed before the bare "'t" rule)
    general_rules = (
        ("n" + apostrophe + "t", " not"),
        (apostrophe + "re", " are"),
        (apostrophe + "s", " is"),
        (apostrophe + "d", " would"),
        (apostrophe + "ll", " will"),
        (apostrophe + "t", " not"),
        (apostrophe + "ve", " have"),
        (apostrophe + "m", " am"),
    )
    for pattern, expansion in general_rules:
        phrase = re.sub(pattern, expansion, phrase)
    return phrase

In [61]:
# Words dropped from every comment during cleaning. Negations such as "not" are
# deliberately absent from this list, so they survive the filter.
stopwords = set("""
    br the i me my myself we our ours ourselves
    you'll you'd your yours yourself yourselves he him his himself
    she she's her hers herself it it's its itself they them their
    theirs themselves what which who whom this that that'll these those
    am is are was were be been being have has had having do does
    did doing a an the and but if or because as until while of
    at by for with about against between into through during before after
    above below to from up down in out on off over under again further
    then once here there when where why how all any both each few more
    most other some such only own same so than too very
    s t can will just don don't should should've now d ll m o re
    ve y ain aren aren't couldn couldn't didn didn't doesn doesn't hadn
    hadn't hasn hasn't haven haven't isn isn't ma mightn mightn't mustn
    mustn't needn needn't shan shan't shouldn shouldn't wasn wasn't weren weren't
    won won't wouldn wouldn't
""".split())

In [9]:
from tqdm import tqdm

# Clean every training comment and store the result in a new column.
preprocessed_synopsis = []
# tqdm wraps the iterable so a status bar is printed while the loop runs
for sentence in tqdm(data_df['Comment'].values):
    # drop URLs, then strip any HTML markup
    sentence = re.sub(r"http\S+", "", sentence)
    sentence = BeautifulSoup(sentence, 'lxml').get_text()
    sentence = decontracted(sentence)
    # remove every whitespace-delimited chunk that contains a digit
    # (raw string: "\S" and "\d" are regex classes, not string escapes)
    sentence = re.sub(r"\S*\d\S*", "", sentence).strip()
    # keep letters only; any run of other characters becomes a single space
    sentence = re.sub('[^A-Za-z]+', ' ', sentence)
    # https://gist.github.com/sebleier/554280
    sentence = ' '.join(e.lower() for e in sentence.split() if e.lower() not in stopwords)
    preprocessed_synopsis.append(sentence.strip())
data_df['preprocessed_comments']=preprocessed_synopsis

In [10]:
data_df.head()

Unnamed: 0,Comment,CPD,CB,EWC,Exec,FEW,SP,RE,Sup,SW,TEPE,VMG,OTH,preprocessed_comments
0,"to be real about diversity, you need to create...",0,0,1,0,0,0,0,0,0,0,0,0,real diversity you need create seats table mea...
1,Keep the building warmer and provide warm wate...,0,0,0,0,0,0,0,0,0,1,0,0,keep building warmer provide warm water bathroom
2,better communication from the top down,0,0,0,1,0,0,0,0,0,0,0,0,better communication top
3,It would be beneficial if Management did not m...,0,0,0,0,0,0,1,0,0,0,0,0,would beneficial management not micro manage
4,more education applicable to my job,1,0,0,0,0,0,0,0,0,0,0,0,education applicable job


#### Our Pre-processor

In [None]:
def preprocess_comments(text,
                        min_token_len = 2,
                        irrelevant_pos = ('PRON', 'SPACE', 'PUNCT', 'ADV',
                                          'ADP', 'CCONJ', 'AUX', 'PRP'),
                        avoid_entities = ('PERSON', 'ORG', 'LOC', 'GPE')):
    """
    Given text, min_token_len, irrelevant_pos and avoid_entities, carries out
    preprocessing of the text and returns list of preprocessed text.

    Each comment is lower-cased, the names of several social networks are
    replaced by "social media", and the result is parsed with the
    module-level spaCy pipeline ``nlp``. A token is dropped when it looks
    like an e-mail address or URL, its POS tag is in `irrelevant_pos`, it is
    part of an entity whose label is in `avoid_entities`, its lemma is in a
    small fixed list ("'s", "the", "that", "this", "to", "-PRON-"), or it is
    shorter than `min_token_len` characters. The lemmas of the remaining
    tokens are joined with single spaces.

    Parameters
    -------------
    text : (list)
        the list of text to be preprocessed (any iterable of strings; a
        single string is treated as a one-element list)
    min_token_len : (int)
        min_token_length required
    irrelevant_pos : (list)
        a list of irrelevant pos tags
    avoid_entities : (list)
        a list of entity labels to be avoided

    Returns
    -------------
    (list) list of preprocessed text

    Notes
    -------------
    The following tags are intentionally not part of `irrelevant_pos`:
      - 'PROPN' because it erases proper names as 'George', but also words
        as orange.
      - 'DET' since it removes the word 'no', which changes the meaning of a
        sentence.
    For more information see: https://universaldependencies.org/u/pos/

    Example
    -------------
    >>> example = ["Hello, I'm George and I love swimming!",
                   "I am a really good cook; what about you?",
                   "Contact me at george23@gmail.com"]

    >>> preprocess_comments(example)
    (output:) ['hello love swimming', 'good cook', 'contact']
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(text, str):
        text = [text]

    # "-PRON-" is the lemma that erases "my", "your", etc.; the alternative of
    # adding 'DET' to irrelevant_pos would erase the word 'no' too.
    others = {"'s", "the", "that", "this", "to", "-PRON-"}

    # One pass replaces every listed network name (same result as six
    # sequential substitutions, since the replacement contains none of them).
    social_media = re.compile(r"facebook|twitter|instagram|whatsapp|linkedin|snapchat")

    result = []
    for sent in text:
        doc = nlp(social_media.sub("social media", sent.lower()))

        # token.ent_type_ is checked per token, so every token of a
        # multi-word entity (e.g. a first name + surname) is filtered, not
        # only entities that consist of a single token.
        kept_lemmas = [
            token.lemma_
            for token in doc
            if not (token.like_email
                    or token.like_url
                    or token.pos_ in irrelevant_pos
                    or token.ent_type_ in avoid_entities
                    or token.lemma_ in others
                    or len(token) < min_token_len)
        ]
        result.append(" ".join(kept_lemmas))
    return result

In [None]:
# NOTE(review): this assigns the same 'preprocessed_comments' column that the regex-based
# cleaning loop earlier in the notebook fills, so running this cell replaces that result
# with the output of the spaCy-based preprocess_comments.
data_df['preprocessed_comments']=preprocess_comments(data_df['Comment'])

In [None]:
data_df.head()

In [None]:
data_df.shape

**Extra Code to Clean Labels : Not needed as of now**

In [None]:
# def remove_spaces(x):
#     x=x.split(",")
#     nospace=[]
#     for item in x:
#         item=item.lstrip()
#         nospace.append(item)
#     return (",").join(nospace)

In [None]:
# data_df['tags'].apply(remove_spaces).head(10)

### Splitting into Train and Test

In [None]:
data_df.head()

In [11]:
# Features: the cleaned text, kept as a one-column DataFrame.
X_train = data_df.loc[:, ['preprocessed_comments']]
# Targets: whatever columns remain once the raw and cleaned text columns are removed.
y_train = data_df.drop(columns=['Comment', 'preprocessed_comments'])

In [None]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

### Preparing Labels

**Maximum comment length (in words)**

In [12]:
def count_words(x):
    """Return the number of whitespace-separated words in the string `x`."""
    return len(x.split())

# The helper has its own name so that the integer `max_len` below does not overwrite it;
# the cell can therefore be re-run without a "'int' object is not callable" error.
# NOTE(review): the length is measured on the raw 'Comment' column, not on
# 'preprocessed_comments', which is the column that is tokenized and padded later.
max_len = max(data_df['Comment'].apply(count_words))
max_len

150

### Vocab Size

**For General Use**

In [13]:
# Fit a Keras Tokenizer on the cleaned training comments; vect.word_index then maps each
# word to an integer id.
vect=Tokenizer()
vect.fit_on_texts(X_train['preprocessed_comments'])
# One more than the number of distinct words — presumably because word ids start at 1 and
# 0 is left for padding (TODO confirm against the Keras Tokenizer documentation).
vocab_size = len(vect.word_index) + 1
print(vocab_size)

11919


**Bag of Words**

In [None]:
# vectorizer = CountVectorizer(tokenizer = lambda x: x.split(","), binary='true')

In [None]:
# y_train = vectorizer.fit_transform(y_train['tags']).toarray()
# y_test=vectorizer.transform(y_test['tags']).toarray()

**Glove**

In [37]:
# load the whole embedding into memory
embeddings_index = dict()
f = open('/Users/karan/Downloads/glove/glove.6B.100d.txt')
for line in f:
	values = line.split()
	word = values[0]
	coefs = np.asarray(values[1:], dtype='float32')
	embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [38]:
# create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in vect.word_index.items():
	embedding_vector = embeddings_index.get(word)
	if embedding_vector is not None:
		embedding_matrix[i] = embedding_vector

In [39]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.11619   ,  0.45447001, -0.69216001, ..., -0.54737002,
         0.48822001,  0.32246   ],
       [-0.19103999,  0.17601   ,  0.36919999, ..., -0.59680003,
         0.080843  ,  0.27866   ],
       ...,
       [-0.34926   ,  0.27006999, -0.52661002, ...,  0.22747   ,
        -0.12559   ,  0.70643002],
       [-0.53812999,  0.72706997,  0.074018  , ..., -0.41005999,
         1.08850002,  0.75314999],
       [-1.51540005,  0.66566002,  0.23134001, ...,  0.47402   ,
         0.84129   ,  0.94787002]])

**Universal Sentence Encoder**

In [70]:
import tensorflow as tf
import tensorflow_hub as hub
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [71]:
embeddings = embed(X_train['preprocessed_comments'])

In [72]:
# NOTE(review): this rebinds `embedding_matrix`, the name the GloVe cell above uses for its
# word-level weight table. After this cell runs, any cell reading `embedding_matrix` gets
# this array instead, which has one row per entry of X_train['preprocessed_comments']
# (i.e. per comment, not per vocabulary word). On a top-to-bottom run that includes the
# GloVe model cells further down.
embedding_matrix = np.array(embeddings)

In [73]:
embedding_matrix

array([[-0.03669859, -0.07417478,  0.03231422, ...,  0.01621998,
         0.00728293,  0.00265464],
       [ 0.05218186, -0.01258108, -0.04657765, ...,  0.05539099,
         0.05895472, -0.0171202 ],
       [-0.01777638, -0.04568463,  0.00473103, ..., -0.00073464,
        -0.08261821,  0.05859691],
       ...,
       [ 0.03823395,  0.01441879,  0.06019125, ...,  0.06164353,
        -0.02452262, -0.01176131],
       [-0.03514517, -0.04318713, -0.02938136, ...,  0.0648592 ,
        -0.05883494,  0.01129983],
       [ 0.02614759, -0.05074428,  0.00971   , ...,  0.01864268,
         0.05439513,  0.03895477]], dtype=float32)

In [74]:
embedding_matrix.shape

(10376, 512)

## Modelling CNN

#### Padding so that all sequences have the same length

**Training Data**

In [40]:
# Turn each cleaned training comment into its list of tokenizer word ids.
encoded_docs_train = vect.texts_to_sequences(X_train['preprocessed_comments'])
# NOTE(review): max_length is not read by the padding call below (it uses max_len);
# it looks like a leftover — confirm before removing.
max_length = vocab_size
# padding='post' places the fill zeros after the word ids; every row gets width max_len.
padded_docs_train = pad_sequences(encoded_docs_train, maxlen=max_len, padding='post')
print(padded_docs_train)

[[  504   585    36 ...     0     0     0]
 [  134    54  3393 ...     0     0     0]
 [    7    29   234 ...     0     0     0]
 ...
 [  697     4    12 ...     0     0     0]
 [  476  2745 11917 ...     0     0     0]
 [ 1147   593   791 ...     0     0     0]]


In [19]:
padded_docs_train.shape

(10376, 150)

**Test Data**

In [None]:
padded_docs_train.shape

In [None]:
# NOTE(review): X_test is not defined by any cell above (the train_test_split call is
# commented out), so this cell cannot run until that split is restored or another frame
# is substituted.
encoded_docs_test =  vect.texts_to_sequences(X_test['preprocessed_comments'])
# Pad to max_len — the width used for padded_docs_train and for the model's input_length —
# instead of a hard-coded 1200, which the model could not accept.
padded_docs_test = pad_sequences(encoded_docs_test, maxlen=max_len, padding='post')

#encoded_docs_cv = vect.texts_to_sequences(cv['preprocessed_plots'])
#padded_docs_cv = pad_sequences(encoded_docs_cv, maxlen=1200, padding='post')

#### Defining Model

## Glove

In [41]:
y_train = np.array(y_train)

In [42]:
# Hyper-parameters for the GloVe CNN.
# NOTE(review): max_features is taken from whatever `embedding_matrix` is bound to when this
# cell runs. The Universal Sentence Encoder section earlier in the notebook assigns the same
# name, so on a top-to-bottom run this would not be the GloVe table — confirm execution order.
max_features = embedding_matrix.shape[0]
maxlen = max_len          # sequence width; same value used when padding the inputs
batch_size = 128
filters = 250             # passed to both Conv1D layers
kernel_size = 3
hidden_dims = 250         # units of the Dense layer before the output layer
epochs = 1
embed_size = 100 # for glove we are using 100d dataset
n_class = 12              # units of the final sigmoid layer

In [55]:
# GloVe CNN: frozen pre-trained embedding -> dropout -> two Conv1D + MaxPooling1D stages
# -> flatten -> dense hidden layer with dropout -> one sigmoid unit per class.
model = Sequential([
    Embedding(max_features, embed_size, weights=[embedding_matrix],
              trainable=False, input_length=maxlen),
    Dropout(0.2),
    Conv1D(filters, kernel_size, padding='valid', activation='relu', strides=1),
    MaxPooling1D(),
    Conv1D(filters, kernel_size, padding='valid', activation='relu'),
    MaxPooling1D(),
    Flatten(),
    Dense(hidden_dims, activation='relu'),
    Dropout(0.5),
    Dense(n_class, activation='sigmoid'),
])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_15 (Embedding)     (None, 150, 100)          1191900   
_________________________________________________________________
dropout_16 (Dropout)         (None, 150, 100)          0         
_________________________________________________________________
conv1d_33 (Conv1D)           (None, 148, 250)          75250     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 74, 250)           0         
_________________________________________________________________
conv1d_34 (Conv1D)           (None, 72, 250)           187750    
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 36, 250)           0         
_________________________________________________________________
flatten_6 (Flatten)          (None, 9000)              0         
__________

In [56]:
# Binary cross-entropy is applied to the sigmoid outputs of the final layer.
# NOTE(review): with one sigmoid output per label, 'accuracy' is presumably computed per
# label slot, so it can look high when most labels are 0 — confirm and consider a
# per-label precision/recall/F1 report as well.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train Model
model.fit(
    padded_docs_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.15,
)

Train on 8819 samples, validate on 1557 samples
Epoch 1/1


<keras.callbacks.History at 0x1a2d1d4390>

**Evaluation on Validation data**

In [57]:
# Validation counterpart of the training frame: the validation comments next to the first
# 12 columns of the validation label sheet (joined column-wise on the index).
df_valid = pd.concat([X_valid_Q1, y_valid_Q1.iloc[:,0:12]], axis = 1)

In [62]:
# pre-processing test data
from tqdm import tqdm

# Clean every validation comment with the same steps applied to the training comments.
preprocessed_synopsis = []
# tqdm wraps the iterable so a status bar is printed while the loop runs
for sentence in tqdm(df_valid['Comment'].values):
    # drop URLs, then strip any HTML markup
    sentence = re.sub(r"http\S+", "", sentence)
    sentence = BeautifulSoup(sentence, 'lxml').get_text()
    sentence = decontracted(sentence)
    # remove every whitespace-delimited chunk that contains a digit
    # (raw string: "\S" and "\d" are regex classes, not string escapes)
    sentence = re.sub(r"\S*\d\S*", "", sentence).strip()
    # keep letters only; any run of other characters becomes a single space
    sentence = re.sub('[^A-Za-z]+', ' ', sentence)
    # https://gist.github.com/sebleier/554280
    sentence = ' '.join(e.lower() for e in sentence.split() if e.lower() not in stopwords)
    preprocessed_synopsis.append(sentence.strip())

df_valid['preprocessed_comments']=preprocessed_synopsis

In [63]:
df_valid.head()

Unnamed: 0,Comment,CPD,CB,EWC,Exec,FEW,SP,RE,Sup,SW,TEPE,VMG,OTH,preprocessed_comments
0,Change management and articulating a clear vis...,0,0,0,1,0,0,0,0,0,0,0,0,change management articulating clear vision fu...
1,"More interaction with Management, not this us ...",0,0,0,0,0,0,0,1,0,0,0,0,interaction management not us
2,We switched to safetyline for field monitoring...,0,0,0,0,0,0,0,0,1,0,0,0,switched safetyline field monitoring not allow...
3,JCM's used to be treated as the Managers they ...,0,0,1,0,0,0,0,0,0,0,0,0,jcm used treated managers past plus years trea...
4,actively providing the adequate training/knowl...,0,0,1,0,0,0,0,0,0,0,0,0,actively providing adequate training knowledge...


In [65]:
# creating X and Y

# Features: the cleaned validation text as a one-column DataFrame.
X_valid = df_valid[['preprocessed_comments']]
# Targets: the columns left after removing the raw and cleaned text columns.
y_valid = df_valid.drop(columns=['Comment', 'preprocessed_comments'])

In [66]:
y_valid = np.array(y_valid)

In [67]:
# creating padded dataset for x_valid
encoded_docs_valid = vect.texts_to_sequences(X_valid['preprocessed_comments'])
padded_docs_valid = pad_sequences(encoded_docs_valid, maxlen=max_len, padding='post')
print(padded_docs_valid)

[[  64   16 9255 ...    0    0    0]
 [1778   16    2 ...    0    0    0]
 [3795 6456  324 ...    0    0    0]
 ...
 [ 689    1   71 ...    0    0    0]
 [ 232   45  591 ...    0    0    0]
 [  31  688  534 ...    0    0    0]]


In [68]:
score = model.evaluate(padded_docs_valid,y_valid)



In [69]:
score

[0.30951790101452803, 0.8948856521406445]

## Universal Sentence Encoder

In [76]:
# Hyper-parameters for the Universal Sentence Encoder variant.
y_train = np.array(y_train)

# NOTE(review): at this point `embedding_matrix` is the array built from
# embed(X_train['preprocessed_comments']) — one 512-d row per training comment (recorded
# shape (10376, 512)), not one row per vocabulary word. max_features therefore equals the
# number of comments, while the padded inputs hold tokenizer word ids (vocab_size printed
# 11919, and id 11917 is visible in the padded training output). Ids >= max_features fall
# outside this table, and row i is the encoding of comment i rather than of word i —
# confirm this is intended; feeding the sentence vectors to a dense model directly may be
# what was meant.
max_features = embedding_matrix.shape[0]
maxlen = max_len
batch_size = 128
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 1
embed_size = 512 # for universal sentence encoder
n_class = 12

In [77]:
# Same layer stack as the GloVe CNN above, with the Embedding layer initialised from the
# Universal Sentence Encoder array.
# NOTE(review): the Embedding layer looks rows up by word id, but `embedding_matrix` here
# has one row per training comment — verify that this lookup is meaningful before relying
# on the scores of this model.
model = Sequential()

model.add(Embedding(max_features, embed_size, weights=[embedding_matrix],
                        trainable=False, input_length=maxlen))

model.add(Dropout(0.2))
model.add(Conv1D(filters, kernel_size, padding='valid', activation='relu',
                 strides=1))
model.add(MaxPooling1D())
model.add(Conv1D(filters, kernel_size, padding='valid',activation='relu'))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(hidden_dims, activation = 'relu'))
model.add(Dropout(0.5))
# one sigmoid unit per class
model.add(Dense(n_class, activation = 'sigmoid'))


model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_17 (Embedding)     (None, 150, 512)          5312512   
_________________________________________________________________
dropout_18 (Dropout)         (None, 150, 512)          0         
_________________________________________________________________
conv1d_35 (Conv1D)           (None, 148, 250)          384250    
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 74, 250)           0         
_________________________________________________________________
conv1d_36 (Conv1D)           (None, 72, 250)           187750    
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 36, 250)           0         
_________________________________________________________________
flatten_7 (Flatten)          (None, 9000)              0         
__________

In [78]:
# Same loss, optimizer and metric as the GloVe run.
model.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['accuracy'])

# Train Model
# validation_split is 0.05 here, whereas the GloVe run above used 0.15, so the two
# training logs are not computed on the same hold-out rows.
model.fit(padded_docs_train, y_train, batch_size=batch_size, epochs=epochs,
          validation_split=0.05)

Train on 9857 samples, validate on 519 samples
Epoch 1/1
1024/9857 [==>...........................] - ETA: 58:04 - loss: 0.4702 - acc: 0.8188

ERROR:plaidml:Caused GPU Timeout Error (IOAF code 2)
ERROR:plaidml:Caused GPU Timeout Error (IOAF code 2)




  return array(obj, copy=False)




<keras.callbacks.History at 0x1abdaad510>

**Score on Validation Data**

In [79]:
score = model.evaluate(padded_docs_valid,y_valid)



In [80]:
score

[0.3709800519342136, 0.8837702603365885]