# Getting the Embeddings
### ...for the Advance Models

## Loadading
### Load dependencies

In [1086]:
# General
import pandas as pd
import numpy as np
import time

# Preprocess
import re
import string
import spacy
# from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

# Embeddings
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer  #TD-IDF & Bag of Words
from tensorflow.keras.preprocessing.text import Tokenizer  #GloVe
import tensorflow as tf  #Universal Sentence Encoder
import tensorflow_hub as hub  #Universal Sentence Encoder
from sentence_transformers import SentenceTransformer  #BERT

# Models
from keras.preprocessing.sequence import pad_sequences #Glove of CNN
# from keras.models import Sequential #Glove of CNN
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Activation
from tensorflow.keras.layers import Conv1D, Conv2D, MaxPooling2D, GlobalMaxPooling1D, MaxPool1D, MaxPooling1D
from tensorflow.keras.layers import Embedding, LSTM, Input, Lambda
from sklearn.metrics import precision_score, recall_score, f1_score #Precision & Recall


# To use GPU-Accelerated Machine Learning on MacOS
import os
os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"

### Load datasets

In [1087]:
X_train_Q1 = pd.read_csv('../data/X_train.csv')
X_valid_Q1 = pd.read_csv('../data/X_valid.csv')

y_train_Q1 = pd.read_csv('../data/y_train.csv')
y_valid_Q1 = pd.read_csv('../data/y_valid.csv')

Requirement from BC Stats:$^{1}$

| Level | Goal | Precision | Recall |
|--------|------|-------------|-------|
| Themes | primary (min) | 67% | 67% |
| Themes | stretch (:tada:) | 80% | 80% |
| | | |
| Sub-themes | primary (min) | 50% | 50% |
| Sub-themes | stretch (:tada:) | 67% | 67% |

_(1) These standards were provided as a reference to shape the proposal of the project, but are not mandatory._

_Note:  BC stats comment that they didn’t include Accuracy as a key performance indicator (KPI) because it is so hard to gauge what a "good" result would be._

## Preprocess data

### Preprocess data for: GLOVE, BoW and TF-IDF

In [1088]:
# This is from the previous preprocess made by Sukriti & Vic
X_train_Q1_pp1 = pd.read_csv('../data/X_train_pp.csv')
X_valid_Q1_pp1 = pd.read_csv('../data/X_valid_pp.csv')

### Preprocess data for: USE and BERT

In [1089]:
def pp_light(text, 
             irrelevant_pos = ['SPACE'],
             avoid_entities = ['PERSON', 'ORG', 'LOC', 'GPE']):
    """
    Given text, min_token_len, irrelevant_pos and avoid_entities, carries out 
    preprocessing of the text and returns list of preprocessed text. 
    
    Parameters
    -------------
    text : (list) 
        the list of text to be preprocessed
    irrelevant_pos : (list) 
        a list of irrelevant pos tags
    avoid_entities : (list)
        a list of entity labels to be avoided
    
    Returns
    -------------
    (list) list of preprocessed text
    
    Example
    -------------
    >>> example = ["Hello, I'm George and I love swimming!",
                   "I am a really good cook; what about you?",
                   "Contact me at george23@gmail.com"]

    >>> preprocess(example)
    (output:) ["Hello, I 'm and I love swimming!", 
               'I am a really good cook; what about you?',
               'Contact me at']
    """
    result = []
    
    for sent in text:
        sent = sent.lower()
        sent = re.sub(r"facebook", "social media", sent)
        sent = re.sub(r"twitter", "social media", sent)
        sent = re.sub(r"instagram", "social media", sent)
        sent = re.sub(r"whatsapp", "social media", sent)
        sent = re.sub(r"linkedin", "social media", sent)
        sent = re.sub(r"snapchat", "social media", sent)
        
        result_sent = []
        doc = nlp(sent)
        entities = [str(ent) for ent in doc.ents if ent.label_ in avoid_entities]
        
        for token in doc:            
            if (token.like_email or
                token.like_url or
                token.pos_ in irrelevant_pos or
                str(token) in entities):
                continue
            else:
                if str(token) in string.punctuation:
                    try:
                        result_sent[-1] = str(result_sent[-1]) + str(token)
                    except:
                        result_sent.append(str(token))
                else:
                    result_sent.append(str(token))
        result.append(" ".join(result_sent))
    return result

In [1090]:
X_train_Q1_pp2 = pp_light(X_train_Q1['Comment'])
X_valid_Q1_pp2 = pp_light(X_valid_Q1['Comment'])

## Word Embeddings

In [1091]:
times_embed = []

### Bag of Words

In [1092]:
# Bag of Words
t = time.time()
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(X_train_Q1_pp1['Comment'].values.astype('U'))#
X_valid_bow = vectorizer.transform(X_valid_Q1_pp1['Comment'].values.astype('U'))

In [1093]:
time_bow = time.time() - t
case= {'Embedding': "Bag of Words",
       'Time': time_bow}    
times_embed.append(case)

### TF-IDF

In [1094]:
# Tf-idf
t = time.time()
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train_Q1_pp1['Comment'].values.astype('U'))#
X_valid_tfidf = vectorizer.transform(X_valid_Q1_pp1['Comment'].values.astype('U'))

In [1095]:
time_tfidf = time.time() - t
case= {'Embedding': "TF-IDF",
       'Time': time_tfidf}    
times_embed.append(case)

### GloVe

#### Preparing hyperparameters

In [1096]:
t = time.time()

#Max length of sentence
def max_len(x):
    a=str(x).split()
    return len(a)

max_len = max(X_train_Q1_pp1['Comment'].apply(max_len))
print("Max lenght of comment:", max_len) 

# Vocab Size
vect=Tokenizer()
vect.fit_on_texts(X_train_Q1_pp1['Comment'].astype(str))
vocab_size = len(vect.word_index) + 1
print("Vocabulary size:", vocab_size)

Max lenght of comment: 87
Vocabulary size: 8639


#### load embeddings

In [1097]:
# load the whole Glove embedding into memory
embeddings_index = dict()
f = open('/Users/vcuspinera/Documents/UBC/B7_Capstone/Documents_capstone/11_Glove/glove.6B.300d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


#### matrix of weights

In [1098]:
# To save which we won't see in Glove embedding
not_in_glove = []

# create a weight matrix for words in training docs
embedding_matrix_GLOVE = np.zeros((vocab_size, 300))
for word, i in vect.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix_GLOVE[i] = embedding_vector
    else:
        not_in_glove.append(word)

In [1099]:
embedding_matrix_GLOVE.shape

(8639, 300)

In [1100]:
time_glove = time.time() - t
case= {'Embedding': "GloVe",
       'Time': time_glove}    
times_embed.append(case)

#### GloVe: Padding to make all sequences of same length

In [1101]:
encoded_docs_train = vect.texts_to_sequences(X_train_Q1_pp1['Comment'].astype(str))
max_length = vocab_size
padded_docs_train = pad_sequences(encoded_docs_train, maxlen=max_len, padding='post')
print(padded_docs_train)

[[ 485  532    5 ...    0    0    0]
 [ 108   82 1245 ...    0    0    0]
 [  24   42  265 ...    0    0    0]
 ...
 [ 396    8   23 ...    0    0    0]
 [ 255 2108 8637 ...    0    0    0]
 [ 361  538  596 ...    0    0    0]]


In [1102]:
encoded_docs_valid = vect.texts_to_sequences(X_valid_Q1_pp1['Comment'].astype(str))
max_length = vocab_size
padded_docs_valid = pad_sequences(encoded_docs_valid, maxlen=max_len, padding='post')
print(padded_docs_valid)

[[  29   32 1740 ...    0    0    0]
 [   4  976   32 ...    0    0    0]
 [1176 4630  318 ...    0    0    0]
 ...
 [  52    1  109 ...    0    0    0]
 [  76  850  133 ...    0    0    0]
 [  26  436  445 ...    0    0    0]]


In [1103]:
padded_docs_train.shape

(10376, 87)

#### Saving the encoded information

In [1104]:
# saving the encoded of train and validatin datasets
np.save("../data/embeddings/padded_docs_train", padded_docs_train)
np.save("../data/embeddings/padded_docs_valid", padded_docs_valid)

# saving the embeddings with GloVe
np.save("../data/embeddings/embedding_matrix_GLOVE", embedding_matrix_GLOVE)

### Universal Sentence Encoder

In [1105]:
t = time.time()
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [1106]:
embeddings_USE_train = embed(X_train_Q1_pp2)#.astype(str).tolist())
embeddings_USE_valid = embed(X_valid_Q1_pp2)#.astype(str).tolist())

In [1107]:
embeddings_USE_train

<tf.Tensor: shape=(10376, 512), dtype=float32, numpy=
array([[-0.03905287, -0.07600362,  0.02124164, ...,  0.04003076,
         0.00880638,  0.0135172 ],
       [ 0.05754621,  0.02805524, -0.03187463, ...,  0.05339389,
         0.06803529, -0.02249984],
       [-0.00135626,  0.04400793, -0.01373925, ...,  0.02239931,
        -0.03765979,  0.07158894],
       ...,
       [-0.02539974, -0.05343602,  0.0577434 , ...,  0.01711884,
         0.01079335, -0.00586297],
       [-0.03075003, -0.04403489,  0.01170156, ...,  0.07016593,
        -0.05456427,  0.0272686 ],
       [ 0.02614759, -0.05074428,  0.00971   , ...,  0.01864268,
         0.05439513,  0.03895477]], dtype=float32)>

In [1108]:
embedding_matrix_USE_train = np.array(embeddings_USE_train)
embedding_matrix_USE_valid = np.array(embeddings_USE_valid)

In [1109]:
time_use = time.time() - t
case= {'Embedding': "Univ. Sentence Encoder",
       'Time': time_use}    
times_embed.append(case)

In [1110]:
embedding_matrix_USE_train

array([[-0.03905287, -0.07600362,  0.02124164, ...,  0.04003076,
         0.00880638,  0.0135172 ],
       [ 0.05754621,  0.02805524, -0.03187463, ...,  0.05339389,
         0.06803529, -0.02249984],
       [-0.00135626,  0.04400793, -0.01373925, ...,  0.02239931,
        -0.03765979,  0.07158894],
       ...,
       [-0.02539974, -0.05343602,  0.0577434 , ...,  0.01711884,
         0.01079335, -0.00586297],
       [-0.03075003, -0.04403489,  0.01170156, ...,  0.07016593,
        -0.05456427,  0.0272686 ],
       [ 0.02614759, -0.05074428,  0.00971   , ...,  0.01864268,
         0.05439513,  0.03895477]], dtype=float32)

#### Saving the encoded information

In [1002]:
# saving the embeddings with USE
np.save("../data/embeddings/embedding_matrix_USE_train", embedding_matrix_USE_train)
np.save("../data/embeddings/embedding_matrix_USE_valid", embedding_matrix_USE_valid)

### BERT encoder

In [1003]:
# BERT
t = time.time()
bert_encoder = SentenceTransformer('bert-large-nli-mean-tokens')

In [1004]:
embeddings_BERT_train = bert_encoder.encode(X_train_Q1_pp2)
embeddings_BERT_valid = bert_encoder.encode(X_valid_Q1_pp2)

In [1005]:
# Themes - Vectorize preprocessed X train pre-processed dataset
embedding_matrix_BERT_train = np.asarray(embeddings_BERT_train)
embedding_matrix_BERT_valid = np.asarray(embeddings_BERT_valid)

In [1006]:
time_bert = time.time() - t
case= {'Embedding': "BERT",
       'Time': time_bert}    
times_embed.append(case)

In [1007]:
embedding_matrix_BERT_train.shape

(10376, 1024)

In [1008]:
embedding_matrix_BERT_train

array([[ 1.4034353 , -0.03421297,  0.6793261 , ..., -0.97726643,
        -0.38727477, -0.11952855],
       [ 0.44806713,  0.3930342 ,  0.5015124 , ..., -0.3754626 ,
        -0.36841637, -0.25009534],
       [ 0.4140735 , -0.41724584,  0.50169885, ...,  0.05167035,
        -0.11021139, -0.57716227],
       ...,
       [ 0.50822234, -0.2311309 ,  0.4302179 , ..., -0.7286546 ,
        -0.24566731,  0.6600919 ],
       [ 0.28324887, -0.0622227 ,  0.19700417, ..., -0.6093314 ,
        -0.21683584,  0.1300633 ],
       [-0.23951472, -0.00656399,  0.4725369 , ..., -0.02723284,
        -1.3340077 , -0.06195428]], dtype=float32)

#### Saving the encoded information

In [1009]:
# saving the embeddings with BERT
np.save("../data/embeddings/embedding_matrix_BERT_train", embedding_matrix_BERT_train)
np.save("../data/embeddings/embedding_matrix_BERT_valid", embedding_matrix_BERT_valid)

### Running Times for embeddings:

In [1010]:
pd.DataFrame(times_embed)

Unnamed: 0,Embedding,Time
0,Bag of Words,0.382224
1,TF-IDF,0.368383
2,GloVe,28.344412
3,Univ. Sentence Encoder,10.390384
4,BERT,2619.691087


## Format

### `y` as array

In [1011]:
# # convert all y to an array
# y_train = (np.array(y_train_Q1))
# y_valid = (np.array(y_valid_Q1))

# ### DO WE NEED THIS ONE??? I THINK WE DON'T USE 

### `y` by themes and sub-themes

In [1012]:
# Slice `y` to themes and subthemes.
# Note: we never use 'Unrelated' as Theme netiher Sub-theme

#y_train
y_train_thm = y_train_Q1.loc[:, 'CPD':'OTH']
y_train_sub = y_train_Q1.loc[:, 'CPD_Improve_new_employee_orientation':'OTH_Covid']

#y_valid
y_valid_thm = y_valid_Q1.loc[:, 'CPD':'OTH']
y_valid_sub = y_valid_Q1.loc[:, 'CPD_Improve_new_employee_orientation':'OTH_Covid']


# y's as arrays
y_train_thm = (np.array(y_train_thm))
y_train_sub = (np.array(y_train_sub))
y_valid_thm = (np.array(y_valid_thm))
y_valid_sub = (np.array(y_valid_sub))

#shape check: 13 themes and 62 subthemes
print('Theme columns:',y_train_thm.shape[1], ", shape:", np.shape(y_train_thm))
print('Subtheme columns:', y_train_sub.shape[1], ", shape:", np.shape(y_train_sub))

Theme columns: 12 , shape: (10376, 12)
Subtheme columns: 62 , shape: (10376, 62)


### Saving `y` as numpy objects

In [1013]:
# saving THEME and SUB-THEME targets
np.save("../data/embeddings/y_train_thm", y_train_thm)
np.save("../data/embeddings/y_train_sub", y_train_sub)
np.save("../data/embeddings/y_valid_thm", y_valid_thm)
np.save("../data/embeddings/y_valid_sub", y_valid_sub)

In [None]:
############################################################
###                                                      ###
###   SECTION WITH BASIC MODELS, DON'T RUN AFTER THIS.   ###
###                                                      ###
############################################################

## Modelling with CNN - only for themes

### Model: Glove + CNN

In [1014]:
max_features = embedding_matrix_GLOVE.shape[0]
maxlen = max_len
batch_size = 128
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 1
embed_size = 300 # for glove we are using 300d dataset
n_class = 12

In [1015]:
model1 = Sequential()

model1.add(Embedding(max_features, embed_size, weights=[embedding_matrix_GLOVE],
                    trainable=False, input_length=maxlen))
model1.add(Dropout(0.2))
model1.add(Conv1D(filters, kernel_size, padding='valid', activation='relu',
                 strides=1))
model1.add(MaxPooling1D())
model1.add(Conv1D(filters, kernel_size, padding='valid',activation='relu'))
model1.add(MaxPooling1D())
model1.add(Flatten())
model1.add(Dense(hidden_dims, activation = 'relu'))
model1.add(Dropout(0.5))
model1.add(Dense(n_class, activation = 'sigmoid'))

model1.summary()

Model: "sequential_98"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_31 (Embedding)     (None, 87, 300)           2591700   
_________________________________________________________________
dropout_97 (Dropout)         (None, 87, 300)           0         
_________________________________________________________________
conv1d_84 (Conv1D)           (None, 85, 250)           225250    
_________________________________________________________________
max_pooling1d_87 (MaxPooling (None, 42, 250)           0         
_________________________________________________________________
conv1d_85 (Conv1D)           (None, 40, 250)           187750    
_________________________________________________________________
max_pooling1d_88 (MaxPooling (None, 20, 250)           0         
_________________________________________________________________
flatten_47 (Flatten)         (None, 5000)            

In [1016]:
model1.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['accuracy'])

# Train Model
model1.fit(padded_docs_train, y_train_thm, batch_size=batch_size, epochs=epochs,
          validation_split=0.15)

Train on 8819 samples, validate on 1557 samples


<tensorflow.python.keras.callbacks.History at 0x1aaa790a90>

#### Evaluation on Validation data

In [1017]:
# from tqdm import tqdm
# encoded_docs_valid = vect.texts_to_sequences(X_valid_Q1['Comment'])
# padded_docs_valid = pad_sequences(encoded_docs_valid, maxlen=max_len, padding='post')

score = model1.evaluate(padded_docs_valid, y_valid_thm)
score



[0.2939400156369647, 0.8960099]

#### Precision & Recall

In [1018]:
predictions = model1.predict(padded_docs_valid, batch_size=batch_size, verbose=1)



In [1019]:
predictions

array([[0.13142534, 0.12893617, 0.12386531, ..., 0.04853222, 0.1706842 ,
        0.04150677],
       [0.12033603, 0.19773522, 0.09779354, ..., 0.0637963 , 0.13004135,
        0.04049535],
       [0.04034825, 0.01286739, 0.02769243, ..., 0.93513274, 0.06735138,
        0.02334217],
       ...,
       [0.18681641, 0.16545677, 0.10535166, ..., 0.1027528 , 0.16285451,
        0.03809162],
       [0.15256792, 0.3459331 , 0.07776302, ..., 0.06837911, 0.18199798,
        0.02498189],
       [0.06668898, 0.08049981, 0.06052296, ..., 0.35278776, 0.08763526,
        0.04004364]], dtype=float32)

In [1020]:
predictions_glove = []
thresholds=np.arange(0, 1, 0.05).tolist()

for val in thresholds:
    pred=predictions.copy()
    pred[pred>=val]=1
    pred[pred<val]=0

    precision = precision_score(y_valid_thm, pred, average='micro')
    recall = recall_score(y_valid_thm, pred, average='micro')
    f1 = f1_score(y_valid_thm, pred, average='micro')
   
    case= {'Threshold': val,
           'Precision': precision,
           'Recall': recall,
           'F1-measure': f1}
    predictions_glove.append(case)

print("Micro-average quality numbers:")
pd.DataFrame(predictions_glove)

Micro-average quality numbers:


Unnamed: 0,Threshold,Precision,Recall,F1-measure
0,0.0,0.11623,1.0,0.208254
1,0.05,0.145198,0.964898,0.252413
2,0.1,0.185112,0.877004,0.305699
3,0.15,0.2506,0.692095,0.367965
4,0.2,0.358316,0.470426,0.406788
5,0.25,0.502216,0.313156,0.385768
6,0.3,0.672457,0.22471,0.336855
7,0.35,0.787289,0.167772,0.276601
8,0.4,0.847682,0.141515,0.242539
9,0.45,0.867647,0.130459,0.226814


### Model: USE + CNN

In [1021]:
max_features = embedding_matrix_USE_train.shape[0]
maxlen = max_len
batch_size = 128
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 1
embed_size = 512 # for universal sentence encoder
n_class = 12

In [1022]:
max_features
maxlen

87

In [1023]:
# Since the values are normalized, the inner product of encodings
# can be treated as a SIMILARITY MATRIX. This show us the similarity
# between comments.
print("Shape of the EMBEDDING MATRIX:", embedding_matrix_USE_train.shape)
similarity_matrix_USE = np.inner(embedding_matrix_USE_train, embedding_matrix_USE_train)
print("Shape of SIMILARITY MATRIX:", similarity_matrix_USE.shape)
similarity_matrix_USE

Shape of the EMBEDDING MATRIX: (10376, 512)
Shape of SIMILARITY MATRIX: (10376, 10376)


array([[ 0.99999994,  0.03254884,  0.02044772, ...,  0.19620545,
         0.18911791,  0.02506309],
       [ 0.03254884,  0.99999976,  0.06175764, ...,  0.04994562,
         0.08648334, -0.03285146],
       [ 0.02044772,  0.06175764,  0.9999999 , ...,  0.11044698,
         0.0220431 ,  0.08513585],
       ...,
       [ 0.19620545,  0.04994562,  0.11044698, ...,  0.99999976,
         0.12881677,  0.04329847],
       [ 0.18911791,  0.08648334,  0.0220431 , ...,  0.12881677,
         1.0000004 ,  0.01171911],
       [ 0.02506309, -0.03285146,  0.08513585, ...,  0.04329847,
         0.01171911,  0.9999997 ]], dtype=float32)

In [1024]:
# #TOY EXAMPLE 1
# model2 = Sequential()
# model2.add(Dense(128, activation = 'relu'))
# model2.add(Dense(n_class, activation = 'softmax'))

# model2.build((None, max_features, embed_size))
# model2.summary()

In [1025]:
# #TOY EXAMPLE 2
# model2 = Sequential()
# model2.add(Dense(128, activation = 'relu'))

# model2.add(Dropout(0.2))
# # model2.add(Conv1D(filters, kernel_size, padding='valid', activation='relu',
# #                  strides=1))
# # model2.add(MaxPooling1D())
# # model2.add(Conv1D(filters, kernel_size, padding='valid',activation='relu'))
# # model2.add(MaxPooling1D())
# # model2.add(Flatten())
# model2.add(Dense(hidden_dims, activation = 'relu'))
# model2.add(Dropout(0.5))
# model2.add(Dense(n_class, activation = 'softmax'))

# model2.build((None, max_features, embed_size))
# model2.summary()

In [1026]:
# TOY EXAMPLE 3
model2 = Sequential()
model2.add(Dense(max_features, input_shape=(embed_size,)))
model2.add(Activation('relu'))
model2.add(Dropout(0.5))
model2.add(Dense(n_class))
model2.add(Activation('sigmoid'))

model2.summary()

Model: "sequential_99"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_193 (Dense)            (None, 10376)             5322888   
_________________________________________________________________
activation_19 (Activation)   (None, 10376)             0         
_________________________________________________________________
dropout_99 (Dropout)         (None, 10376)             0         
_________________________________________________________________
dense_194 (Dense)            (None, 12)                124524    
_________________________________________________________________
activation_20 (Activation)   (None, 12)                0         
Total params: 5,447,412
Trainable params: 5,447,412
Non-trainable params: 0
_________________________________________________________________


In [1027]:
model2.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['accuracy'])

# Train Model
model2.fit(embedding_matrix_USE_train, y_train_thm, batch_size=batch_size, epochs=epochs,
          validation_split=0.15)

Train on 8819 samples, validate on 1557 samples


<tensorflow.python.keras.callbacks.History at 0x1a828a7690>

#### Evaluation on Validation data

In [1028]:
score = model2.evaluate(embedding_matrix_USE_valid, y_valid_thm)
score



[0.20086015897617032, 0.9242805]

#### Precision & Recall

In [1029]:
predictions = model2.predict(embedding_matrix_USE_valid, batch_size=batch_size, verbose=1)



In [1030]:
predictions

array([[0.06698422, 0.00153247, 0.00859099, ..., 0.00305651, 0.3634413 ,
        0.02548531],
       [0.01808216, 0.00555592, 0.23882768, ..., 0.01634414, 0.0769001 ,
        0.05658474],
       [0.03724623, 0.01348698, 0.09376555, ..., 0.9153991 , 0.05189299,
        0.15749778],
       ...,
       [0.0523813 , 0.10979683, 0.03114   , ..., 0.17261714, 0.01866454,
        0.04359911],
       [0.11188969, 0.954266  , 0.02390652, ..., 0.01514343, 0.01504079,
        0.02143048],
       [0.09486086, 0.00264859, 0.01919152, ..., 0.0058036 , 0.24384539,
        0.0464747 ]], dtype=float32)

In [1051]:
predictions_use = []
thresholds=np.arange(0, 1, 0.05).tolist()

model2.build((None, max_features, embed_size))
for val in thresholds:
    pred=predictions.copy()
    pred[pred>=val]=1
    pred[pred<val]=0

    precision = precision_score(y_valid_thm, pred, average='micro')
    recall = recall_score(y_valid_thm, pred, average='micro')
    f1 = f1_score(y_valid_thm, pred, average='micro')
   
    case= {'Threshold': val,
           'Precision': precision,
           'Recall': recall,
           'F1-measure': f1}
    predictions_use.append(case)

print("\nMicro-average quality numbers:")
pd.DataFrame(predictions_use)


Micro-average quality numbers:


Unnamed: 0,Threshold,Precision,Recall,F1-measure
0,0.0,0.11623,1.0,0.208254
1,0.05,0.266344,0.927861,0.413882
2,0.1,0.376007,0.851575,0.521673
3,0.15,0.456157,0.775014,0.574296
4,0.2,0.523263,0.705638,0.600918
5,0.25,0.57857,0.646213,0.610524
6,0.3,0.63396,0.596462,0.61464
7,0.35,0.686051,0.547816,0.60919
8,0.4,0.723817,0.494748,0.587752
9,0.45,0.763405,0.44859,0.565111


### Model: BERT + CNN

In [1052]:
max_features = embedding_matrix_BERT_train.shape[0]
maxlen = max_len
batch_size = 128
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 1
embed_size = 1024 # for BERT Large
n_class = 12

In [1053]:
# model3 = Sequential()
# # model3.add(Embedding(max_features, embed_size, #weights=[embedding_matrix_USE],
# #                         trainable=False, input_length=embed_size))#maxlen))
# #model3.add(Dropout(0.2))
# model3.add(Conv1D(filters, kernel_size, padding='valid', activation='relu',
#                  strides=1))
# model3.add(MaxPooling1D())
# model3.add(Conv1D(filters, kernel_size, padding='valid',activation='relu'))
# model3.add(MaxPooling1D())
# model3.add(Flatten())
# model3.add(Dense(hidden_dims, activation = 'relu'))
# model3.add(Dropout(0.5))
# model3.add(Dense(n_class, activation = 'sigmoid'))
# model3.summary()


model3 = Sequential()
model3.add(Dense(max_features, input_shape=(embed_size,)))
model3.add(Activation('relu'))
model3.add(Dropout(0.5))
model3.add(Dense(n_class))
model3.add(Activation('sigmoid'))

model3.summary()

Model: "sequential_101"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_197 (Dense)            (None, 10376)             10635400  
_________________________________________________________________
activation_23 (Activation)   (None, 10376)             0         
_________________________________________________________________
dropout_101 (Dropout)        (None, 10376)             0         
_________________________________________________________________
dense_198 (Dense)            (None, 12)                124524    
_________________________________________________________________
activation_24 (Activation)   (None, 12)                0         
Total params: 10,759,924
Trainable params: 10,759,924
Non-trainable params: 0
_________________________________________________________________


In [1054]:
model3.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['accuracy'])

In [1055]:
# Train Model
model3.fit(embedding_matrix_BERT_train, y_train_thm, batch_size=batch_size, epochs=epochs,
          validation_split=0.15)

Train on 8819 samples, validate on 1557 samples


<tensorflow.python.keras.callbacks.History at 0x1a82da0910>

#### Evaluation on Validation data

In [1056]:
score = model3.evaluate(embedding_matrix_BERT_valid, y_valid_thm)
score



[0.2108168209986955, 0.919558]

In [1066]:
embedding_matrix_BERT_valid

array([[ 0.25941345,  0.36210942,  0.49052876, ..., -0.05945268,
        -0.59524155, -0.42596203],
       [ 0.12129721, -0.91398495,  0.38285968, ..., -0.02158303,
        -0.81365967, -0.6090022 ],
       [ 0.03514948,  0.1167607 ,  0.02573913, ..., -0.64796126,
        -0.8018599 , -0.10790344],
       ...,
       [ 0.55087906,  0.22932735,  0.298204  , ..., -0.34927025,
        -0.6513062 , -0.21662723],
       [ 0.59616655,  0.86357677,  0.5159346 , ..., -0.25301898,
        -0.12649104, -0.02165654],
       [ 0.13343003, -0.38848475,  0.1278247 , ..., -0.4058949 ,
        -0.1268155 , -0.28904745]], dtype=float32)

#### Precision & Recall

In [1057]:
predictions = model3.predict(embedding_matrix_BERT_valid, batch_size=batch_size, verbose=1)



In [1059]:
predictions_bert = []
thresholds=np.arange(0, 1, 0.05).tolist()

for val in thresholds:
    pred=predictions.copy()
    pred[pred>=val]=1
    pred[pred<val]=0

    precision = precision_score(y_valid_thm, pred, average='micro')
    recall = recall_score(y_valid_thm, pred, average='micro')
    f1 = f1_score(y_valid_thm, pred, average='micro')
   
    case= {'Threshold': val,
           'Precision': precision,
           'Recall': recall,
           'F1-measure': f1}
    predictions_bert.append(case)

print("\nMicro-average quality numbers:")
pd.DataFrame(predictions_bert)


Micro-average quality numbers:


Unnamed: 0,Threshold,Precision,Recall,F1-measure
0,0.0,0.11623,1.0,0.208254
1,0.05,0.25806,0.927032,0.403732
2,0.1,0.360747,0.843836,0.505422
3,0.15,0.43904,0.773355,0.560104
4,0.2,0.509683,0.705638,0.591863
5,0.25,0.564066,0.647319,0.602831
6,0.3,0.610538,0.605307,0.607911
7,0.35,0.659734,0.561083,0.606423
8,0.4,0.698666,0.521006,0.596897
9,0.45,0.724801,0.478994,0.576801


## USE + CNN -> Varada's model

In [1115]:
from absl import logging

import tensorflow as tf

import tensorflow_hub as hub
# import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
# import seaborn as sns
import tensorflow_datasets as tfds
from tensorflow.keras import Input, layers
from tensorflow.keras.models import Model

module_url = "https://tfhub.dev/google/universal-sentence-encoder/4" #@param ["https://tfhub.dev/google/universal-sentence-encoder/4", "https://tfhub.dev/google/universal-sentence-encoder-large/5"]
model = hub.load(module_url)
print ("module %s loaded" % module_url)
def embed(input):
    return model(input)

module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [1157]:
# load comments
X_train = pd.read_csv('../data/X_train.csv')['Comment'].to_numpy()
X_valid = pd.read_csv('../data/X_valid.csv')['Comment'].to_numpy()

# load labels
y_train = pd.read_csv('../data/y_train.csv')
y_valid = pd.read_csv('../data/y_valid.csv')
# select only themes' labels
y_train = y_train.loc[:, 'CPD':'OTH'].to_numpy()
y_valid = y_valid.loc[:, 'CPD':'OTH'].to_numpy()

In [1158]:
print("Training entries: {}, test entries: {}".format(len(X_train), len(X_valid)))

Training entries: 10376, test entries: 2594


In [1159]:
print("X_train   ", "y_train\n",
      X_train.shape,
      y_train.shape,
      "\n\nX_valid   ", "y_valid\n",
      X_valid.shape,
      y_valid.shape)

X_train    y_train
 (10376,) (10376, 12) 

X_valid    y_valid
 (2594,) (2594, 12)


In [1160]:
X_train[:2]
# type(X_train)

array(['to be real about diversity, you need to create seats at the table. That means affirmative action type programs.  We still see men advancing in leadership at MCFD, then white women, people of color are left behind and have token representation ',
       'Keep the building warmer and provide warm water in the bathroom.'],
      dtype=object)

In [1161]:
y_train

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [1162]:
### Use embeddings given by universal sentence encoder 
model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
hub_layer = hub.KerasLayer(model, output_shape=[512], input_shape=[], 
                           dtype=tf.string, trainable=True)

In [1163]:
hub_layer(X_train[:3]).shape

TensorShape([3, 512])

In [1164]:
# Let's build a CNN on the top of USE embeddings. The difference
# with Varada model is that we already will give the embeddings.
input = Input(shape=(), name="Input", dtype=tf.string)
x = hub_layer(input)
x = tf.keras.layers.Reshape(input_shape=(512,), target_shape=(512, 1))(x)
x = tf.keras.layers.Conv1D(128, 2, activation='relu', padding='same')(x)
x = tf.keras.layers.MaxPooling1D(5, padding='same')(x)
x = tf.keras.layers.Conv1D(128, 3, activation='relu', padding='same')(x)
x = tf.keras.layers.MaxPooling1D(5, padding='same')(x)
x = tf.keras.layers.Conv1D(128, 4, activation='relu', padding='same')(x)
x = tf.keras.layers.MaxPooling1D(40, padding='same')(x)
x = tf.keras.layers.Flatten()(x)
x = tf.keras.layers.Dropout(0.5)(x)
x = tf.keras.layers.Dense(128, activation='relu')(x)
output = tf.keras.layers.Dense(12, activation='sigmoid')(x)
m = Model(input, output)
m.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Input (InputLayer)           [(None,)]                 0         
_________________________________________________________________
keras_layer_1 (KerasLayer)   (None, 512)               256797824 
_________________________________________________________________
reshape_2 (Reshape)          (None, 512, 1)            0         
_________________________________________________________________
conv1d_92 (Conv1D)           (None, 512, 128)          384       
_________________________________________________________________
max_pooling1d_95 (MaxPooling (None, 103, 128)          0         
_________________________________________________________________
conv1d_93 (Conv1D)           (None, 103, 128)          49280     
_________________________________________________________________
max_pooling1d_96 (MaxPooling (None, 21, 128)           0   

In [1167]:
m.compile(optimizer='adam',
              loss=tf.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [1168]:
history = m.fit(X_train,
                y_train,
                epochs=10, #20,
                batch_size=512,
                validation_data=(X_valid, y_valid),
                verbose=1)

Train on 10376 samples, validate on 2594 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [1169]:
# plot
import altair as alt
# alt.renderers.enable('default')
# alt.data_transformers.enable('json')

# funtion to plot
def plot_metrics(hist):
    '''
    Returns an Altair plot of the loss and accuracy for the train and 
    validation datasets based in the history of the model

    Input:
    ------
    hist (object) tensorflow.python.keras.callbacks.History

    Output:
    -------
    Altair plot
    '''
    df = pd.DataFrame(hist.history.values(), hist.history.keys())\
        .T.rename(columns={"loss":"train_loss",
                          "accuracy":"train_accuracy",
                          "val_loss":"valid_loss",
                          "val_accuracy":"valid_accuracy"})
    df = pd.DataFrame(df.stack()).reset_index().drop(columns=[])\
        .rename(columns={"level_0":'epoch', 'level_1':'metric', 0:'value'})
    plot = alt.Chart(df).mark_line().encode(
        x='epoch:Q',
        y='value:Q',
        color='metric'
    ).properties(
        title='Loss and Accuracy'
    )
    return plot

plot_metrics(history)

In [1171]:
# HYPERPARAMS
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, hamming_loss

max_features = X_train.shape[0] # comments
batch_size = 128
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 20
embed_size = 512 # for USE
n_class = 12 # 12 for themes and 62 for sub-themes

# PREDICTIONS
# get the prediction
y_pred = m.predict(X_valid, batch_size=batch_size, verbose=1)
# rounding the predictions
y_pred_binary = (y_pred > 0.5) * 1


# PRECISION & RECALL
predictions_results = []
thresholds=np.arange(.5, 1, 0.1).tolist()

for val in thresholds:
    pred = y_pred.copy()
    pred[pred>=val]=1
    pred[pred<val]=0

    accuracy = accuracy_score(y_valid, pred, normalize=True, sample_weight=None)
    res = []
    for i in range(0, y_valid.shape[1]):
        res.append(accuracy_score(y_valid[:,i], y_pred_binary[:,i]))
    accuracy_keras = np.mean(res)
    hamming = hamming_loss(y_valid, pred)
    precision = precision_score(y_valid, pred, average='micro')
    recall = recall_score(y_valid, pred, average='micro')
    f1 = f1_score(y_valid, pred, average='micro')
   
    case= {'Threshold': val,
           'Accuracy all model': accuracy,
           'Accuracy average (keras)': accuracy_keras,
           'Hamming loss': hamming,
           'Precision': precision,
           'Recall': recall,
           'F1-measure': f1}
    predictions_results.append(case)

print("Micro-average quality numbers:")
pd.DataFrame(predictions_results)

Micro-average quality numbers:


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Threshold,Accuracy all model,Accuracy average (keras),Hamming loss,Precision,Recall,F1-measure
0,0.5,0.002699,0.88377,0.11623,0.0,0.0,0.0
1,0.6,0.002699,0.88377,0.11623,0.0,0.0,0.0
2,0.7,0.002699,0.88377,0.11623,0.0,0.0,0.0
3,0.8,0.002699,0.88377,0.11623,0.0,0.0,0.0
4,0.9,0.002699,0.88377,0.11623,0.0,0.0,0.0


In [1172]:
# RESULTS PER LABEL
# Last's year function
def theme_results(Ytrue, Ypred):
    '''Calculate accuracies for theme classification
    Parameters
    ----------
    Ytrue : array of shape (n_obeservations, n_labels)
        Correct labels for the 12 text classifications
    Ypred : array of shape (n_obeservations, n_labels)
        Predicted labels for the 12 text classifications
    Returns
    -------
    overall_results : dataframes of overall evaluation metrics
    theme_results : dataframe of evaluation metrics by class
    '''
    # Calculate individual accuracies and evaluation metrics for each class
    labels = ['CPD', 'CB', 'EWC', 'Exec', 'FWE', 'SP', 'RE', 'Sup', 'SW',
              'TEPE', 'VMG', 'OTH']
    Y_count = []
    pred_count = []
    error = []
    #dummy_diff = []
    accuracies = []
    precision = []
    recall = []
    for i in np.arange(Ytrue.shape[1]):
        Y_count.append(np.sum(Ytrue[:, i] == 1))
        pred_count.append(np.sum(Ypred[:, i] == 1))
        error.append(1 - accuracy_score(Ytrue[:, i], Ypred[:, i]))
        #dummy_diff.append((np.mean(Ytrue[:, i] == 1)) - error[i])
        accuracies.append(accuracy_score(Ytrue[:, i], Ypred[:, i]))
        precision.append(precision_score(Ytrue[:, i], Ypred[:, i]))
        recall.append(recall_score(Ytrue[:, i], Ypred[:, i]))
    theme_results = pd.DataFrame({'Label': labels,
                                  'Y_count': Y_count,
                                  'Pred_count': pred_count,
                                  'Error': error,
                                 # 'Dummy_Diff': dummy_diff,
                                  'Accuarcy': accuracies,
                                  'Precision': precision,
                                  'Recall': recall})
    return theme_results


print(np.mean(theme_results(y_valid, y_pred_binary)['Accuarcy']))
theme_results(y_valid, y_pred_binary)

0.8837702390131073


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Label,Y_count,Pred_count,Error,Accuarcy,Precision,Recall
0,CPD,344,0,0.132614,0.867386,0.0,0.0
1,CB,317,0,0.122205,0.877795,0.0,0.0
2,EWC,231,0,0.089052,0.910948,0.0,0.0
3,Exec,353,0,0.136083,0.863917,0.0,0.0
4,FWE,187,0,0.072089,0.927911,0.0,0.0
5,SP,252,0,0.097147,0.902853,0.0,0.0
6,RE,204,0,0.078643,0.921357,0.0,0.0
7,Sup,258,0,0.09946,0.90054,0.0,0.0
8,SW,396,0,0.15266,0.84734,0.0,0.0
9,TEPE,605,0,0.233231,0.766769,0.0,0.0
