# Load dependencies

In [1]:
# LOAD DEPENDENCIES
# General
import pandas as pd
import numpy as np
import time
import tensorflow as tf
import tensorflow_hub as tensorflow_hub
from tensorflow import keras

# Embeddings
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer  #TD-IDF & Bag of Words
from tensorflow.keras.preprocessing.text import Tokenizer  #GloVe
# from sentence_transformers import SentenceTransformer  #BERT

# Models
from keras.preprocessing.sequence import pad_sequences #Glove of CNN
# from keras.models import Sequential #Glove of CNN
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Activation, Reshape
from tensorflow.keras.layers import Conv1D, Conv2D, MaxPooling2D, GlobalMaxPooling1D, MaxPool1D, MaxPooling1D
from tensorflow.keras.layers import Embedding, LSTM, Input, Lambda, InputLayer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, hamming_loss

# To use GPU-Accelerated Machine Learning on MacOS
# import os
# os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"

# plot
import altair as alt
# alt.renderers.enable('default')
# alt.renderers.enable('mimetype')
# alt.data_transformers.enable('json')

Using TensorFlow backend.


In [None]:
# funtion to plot
def plot_metrics(hist):
  '''
  Returns an Altair plot of the loss and accuracy for the train and 
  validation datasets based in the history of the model

  Input:
  ------
  hist (object) tensorflow.python.keras.callbacks.History

  Output:
  -------
  Altair plot
  '''
  df = pd.DataFrame(hist.history.values(), hist.history.keys())\
        .T.rename(columns={"loss":"train_loss",
                          "accuracy":"train_accuracy",
                          "val_loss":"valid_loss",
                          "val_accuracy":"valid_accuracy"})
  df = pd.DataFrame(df.stack()).reset_index().drop(columns=[])\
        .rename(columns={"level_0":'epoch', 'level_1':'metric', 0:'value'})
  plot = alt.Chart(df).mark_line().encode(
      x='epoch:Q',
      y='value:Q',
      color='metric'
  ).properties(
    title='Loss and Accuracy'
  )
  return plot

In [None]:
# Last's year function
def theme_results(Ytrue, Ypred):
    '''Calculate accuracies for theme classification
    Parameters
    ----------
    Ytrue : array of shape (n_obeservations, n_labels)
        Correct labels for the 12 text classifications
    Ypred : array of shape (n_obeservations, n_labels)
        Predicted labels for the 12 text classifications
    Returns
    -------
    overall_results : dataframes of overall evaluation metrics
    theme_results : dataframe of evaluation metrics by class
    '''
    # Calculate individual accuracies and evaluation metrics for each class
    labels = ['CPD', 'CB', 'EWC', 'Exec', 'FWE', 'SP', 'RE', 'Sup', 'SW',
              'TEPE', 'VMG', 'OTH']
    Y_count = []
    pred_count = []
    error = []
    #dummy_diff = []
    accuracies = []
    precision = []
    recall = []
    for i in np.arange(Ytrue.shape[1]):
        Y_count.append(np.sum(Ytrue[:, i] == 1))
        pred_count.append(np.sum(Ypred[:, i] == 1))
        error.append(1 - accuracy_score(Ytrue[:, i], Ypred[:, i]))
        #dummy_diff.append((np.mean(Ytrue[:, i] == 1)) - error[i])
        accuracies.append(accuracy_score(Ytrue[:, i], Ypred[:, i]))
        precision.append(precision_score(Ytrue[:, i], Ypred[:, i]))
        recall.append(recall_score(Ytrue[:, i], Ypred[:, i]))
    theme_results = pd.DataFrame({'Label': labels,
                                  'Y_count': Y_count,
                                  'Pred_count': pred_count,
                                  'Error': error,
                                 # 'Dummy_Diff': dummy_diff,
                                  'Accuarcy': accuracies,
                                  'Precision': precision,
                                  'Recall': recall})
    return theme_results

# GloVe - CNN

## Load datasets (Glove)

In [None]:
# LOAD DEPENDENCIES
import os
#os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"
import keras
from keras.datasets import mnist
from keras import models
from keras.models import Sequential
from keras import layers
from keras.layers import Dense, Dropout, Flatten, Activation
from keras.layers import Conv1D, Conv2D, MaxPooling2D, GlobalMaxPooling1D, MaxPool1D, MaxPooling1D, GlobalMaxPool1D, SpatialDropout1D, GlobalAveragePooling1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM
from keras.utils import to_categorical
import tensorflow as tf
from keras import backend as K

import pandas as pd
import numpy as np
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import spacy
# from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, hamming_loss
from tensorflow.keras.metrics import Recall, Precision


In [6]:
#LOAD DATASETS
from google.colab import drive
drive.mount('/content/drive')

root = "/content/drive/My Drive/Colab Notebooks/models_mds/data/"
X_train = np.load(root + "padded_docs_train.npy")
X_valid = np.load(root + "padded_docs_valid.npy")
glove_embeddings = np.load(root + "embedding_matrix_GLOVE.npy")

# retrieve targets
y_train = np.load(root + "y_train_thm.npy") #"y_train_sub.npy")
y_valid = np.load(root + "y_valid_thm.npy") #"y_train_sub.npy")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
# SHAPES
print("glove_embeddings", glove_embeddings.shape,
      "\n\nX_train", X_train.shape,
      "\ny_train", y_train.shape,
      "\n\nX_valid", X_valid.shape,
      "\ny_valid", y_valid.shape)

glove_embeddings (8639, 300) 

X_train (10376, 87) 
y_train (10376, 12) 

X_valid (2594, 87) 
y_valid (2594, 12)


## Based in Karan's model

In [None]:
# MODEL

max_features = glove_embeddings.shape[0] # vocabulary
maxlen = 87 # longest comment
batch_size = 128
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 20
embed_size = 300 # glove 300d dataset
n_class = 12

In [9]:
model = Sequential()

model.add(Embedding(max_features, embed_size, weights=[glove_embeddings], trainable=False, input_length=maxlen))

model.add(Dropout(0.2))
model.add(Conv1D(filters, kernel_size, padding='valid', activation='relu',
                 strides=1))
model.add(MaxPooling1D())
model.add(Conv1D(filters, kernel_size, padding='valid',activation='relu'))
model.add(MaxPooling1D())
model.add(Flatten())

# L2 regularization
model.add(Dense(hidden_dims, activation = 'relu', kernel_regularizer=keras.regularizers.l2(0.001)))
model.add(Dense(hidden_dims, activation = 'relu', kernel_regularizer=keras.regularizers.l2(0.001)))
model.add(Dense(n_class, activation = 'sigmoid'))

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 87, 300)           2591700   
_________________________________________________________________
dropout_1 (Dropout)          (None, 87, 300)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 85, 250)           225250    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 42, 250)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 40, 250)           187750    
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 20, 250)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 5000)             

In [10]:
from sklearn.utils import class_weight
model.compile(loss='binary_crossentropy', optimizer='adam', 
              metrics=['accuracy'])

# Train Model
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, 
          class_weight='auto', validation_data=(X_valid, y_valid))#validation_split=0.15)

Train on 10376 samples, validate on 2594 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.callbacks.History at 0x7f4a04244c50>

### Comparing accuracy

In [11]:
# KERAS ACCURACY FOR VALIDATION DATASET
score = model.evaluate(X_valid, y_valid)
score[1]



0.9280391931533813

This result are the same to the results from the previous run for the validation dataset.

In [12]:
# KERAS ACCURACY MANUALLY OBTAINED
manual_obtained = (model.predict(X_valid) > 0.5)
np.mean(manual_obtained==y_valid)

0.9280390645078386

In [13]:
# PREDICTIONS
# get the prediction
y_pred = model.predict(X_valid, batch_size=batch_size, verbose=1)
# rounding the predictions
y_pred_binary = (y_pred > 0.5) * 1
# obtaining the Scikit-learn's accuracy for each label
res = []
for i in range(0, y_valid.shape[1]):
  res.append(accuracy_score(y_valid[:,i], y_pred_binary[:,i]))
# getting the mean
same = np.mean(res)
print(same, "this number should be the same as the Keras' accuracy for validation dataset")

0.9280390645078386 this number should be the same as the Keras' accuracy for validation dataset


In [14]:
# SCIKIT-LEARN METRICS FOR THE WHOLE MODEL
sklearn_accuracy = accuracy_score(y_valid, y_pred_binary)
print('sklearn_accuracy', sklearn_accuracy)

sklearn_accuracy 0.4637625289128759


In [19]:
# PREDICTIONS
# get the prediction
y_pred = model.predict(X_valid, batch_size=batch_size, verbose=1)
# rounding the predictions
y_pred_binary = (y_pred > 0.5) * 1

# PRECISION & RECALL

predictions_results = []
thresholds=np.arange(.5, 1, 0.1).tolist()

for val in thresholds:
    pred = y_pred.copy()
    pred[pred>=val]=1
    pred[pred<val]=0

    accuracy = accuracy_score(y_valid, pred, normalize=True, sample_weight=None)
    res = []
    for i in range(0, y_valid.shape[1]):
       res.append(accuracy_score(y_valid[:,i], y_pred_binary[:,i]))
    accuracy_keras = np.mean(res)
    hamming = hamming_loss(y_valid, pred)
    precision = precision_score(y_valid, pred, average='micro')
    recall = recall_score(y_valid, pred, average='micro')
    f1 = f1_score(y_valid, pred, average='micro')
   
    case= {'Threshold': val,
           'Accuracy all model': accuracy,
           'Accuracy average (keras)': accuracy_keras,
           'Hamming loss': hamming,
           'Precision': precision,
           'Recall': recall,
           'F1-measure': f1}
    predictions_results.append(case)

print("Micro-average quality numbers:")
pd.DataFrame(predictions_results)

Micro-average quality numbers:


Unnamed: 0,Threshold,Accuracy all model,Accuracy average (keras),Hamming loss,Precision,Recall,F1-measure
0,0.5,0.463763,0.928039,0.071961,0.705182,0.654505,0.678899
1,0.6,0.475328,0.928039,0.068877,0.740222,0.627695,0.67933
2,0.7,0.472244,0.928039,0.067753,0.769176,0.595909,0.671546
3,0.8,0.468003,0.928039,0.066724,0.805149,0.561913,0.661892
4,0.9,0.457209,0.928039,0.067014,0.849772,0.514373,0.64084


In [20]:
# RESULTS PER LABEL
theme_results(y_valid, y_pred_binary)

Unnamed: 0,Label,Y_count,Pred_count,Error,Accuarcy,Precision,Recall
0,CPD,344,349,0.076715,0.923285,0.707736,0.718023
1,CB,317,345,0.046261,0.953739,0.785507,0.85489
2,EWC,231,132,0.062066,0.937934,0.765152,0.437229
3,Exec,353,399,0.101773,0.898227,0.611529,0.691218
4,FWE,187,177,0.037008,0.962992,0.757062,0.716578
5,SP,252,312,0.072475,0.927525,0.602564,0.746032
6,RE,204,166,0.070933,0.929067,0.560241,0.455882
7,Sup,258,185,0.075173,0.924827,0.67027,0.48062
8,SW,396,375,0.101388,0.898612,0.677333,0.641414
9,TEPE,605,575,0.062452,0.937548,0.885217,0.841322


# USE - CNN
*USE: Universal Sentence Encoder*

## Load datasets (USE)

In [32]:
# LOAD DATASETS
from google.colab import drive
drive.mount('/content/drive')

root = "/content/drive/My Drive/Colab Notebooks/models_mds/data/"
X_train = np.load(root + "embedding_matrix_USE_train.npy")
X_valid = np.load(root + "embedding_matrix_USE_valid.npy")

# retrieve targets
y_train = np.load(root + "y_train_thm.npy") #"y_train_sub.npy")
y_valid = np.load(root + "y_valid_thm.npy") #"y_valid_sub.npy")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# SHAPES
print("X_train", X_train.shape,
      "\ny_train", y_train.shape,
      "\n\nX_valid", X_valid.shape,
      "\ny_valid", y_valid.shape)

X_train (10376, 512) 
y_train (10376, 12) 

X_valid (2594, 512) 
y_valid (2594, 12)


## Dense model

In [None]:
max_features = X_train.shape[0] # =10376
# maxlen = padded_docs_train.shape[1] # =150
batch_size = 128
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 50
embed_size = 512 # for universal sentence encoder
n_class = 12

In [29]:
model = Sequential()
model.add(Dense(max_features, input_shape=(embed_size,), activation = 'relu'))
model.add(Dropout(0.2))
model.add(Dense(128, activation='relu'))
# model.add(Dense(hidden_dims, activation = 'relu', 
#                 kernel_regularizer=tf.keras.regularizers.l1(0.01)))
model.add(Dropout(0.4))
model.add(Dense(64, activation='relu'))
model.add(Dense(hidden_dims, activation = 'relu', 
                kernel_regularizer=tf.keras.regularizers.l2(0.01)))
model.add(Dense(128, activation='relu'))
model.add(Dense(n_class, activation='sigmoid'))

model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_10 (Dense)             (None, 10376)             5322888   
_________________________________________________________________
dropout_4 (Dropout)          (None, 10376)             0         
_________________________________________________________________
dense_11 (Dense)             (None, 128)               1328256   
_________________________________________________________________
dropout_5 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_12 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_13 (Dense)             (None, 250)               16250     
_________________________________________________________________
dense_14 (Dense)             (None, 128)              

In [30]:
model.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['accuracy'])

# Train Model
history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs,
                    validation_data=(X_valid, y_valid))

Train on 10376 samples, validate on 2594 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [31]:
plot_metrics(history)

In [32]:
score = model.evaluate(X_valid, y_valid)
score



[0.48812206763897664, 0.9222564697265625]

In [33]:
# PREDICTIONS
# get the prediction
y_pred = model.predict(X_valid, batch_size=batch_size, verbose=1)
# rounding the predictions
y_pred_binary = (y_pred > 0.5) * 1


# PRECISION & RECALL
predictions_results = []
thresholds=np.arange(.5, 1, 0.1).tolist()

for val in thresholds:
    pred = y_pred.copy()
    pred[pred>=val]=1
    pred[pred<val]=0

    accuracy = accuracy_score(y_valid, pred, normalize=True, sample_weight=None)
    res = []
    for i in range(0, y_valid.shape[1]):
       res.append(accuracy_score(y_valid[:,i], y_pred_binary[:,i]))
    accuracy_keras = np.mean(res)
    hamming = hamming_loss(y_valid, pred)
    precision = precision_score(y_valid, pred, average='micro')
    recall = recall_score(y_valid, pred, average='micro')
    f1 = f1_score(y_valid, pred, average='micro')
   
    case= {'Threshold': val,
           'Accuracy all model': accuracy,
           'Accuracy average (keras)': accuracy_keras,
           'Hamming loss': hamming,
           'Precision': precision,
           'Recall': recall,
           'F1-measure': f1}
    predictions_results.append(case)

print("Micro-average quality numbers:")
pd.DataFrame(predictions_results)


Micro-average quality numbers:


Unnamed: 0,Threshold,Accuracy all model,Accuracy average (keras),Hamming loss,Precision,Recall,F1-measure
0,0.5,0.462606,0.927213,0.077744,0.696008,0.587894,0.637399
1,0.6,0.467232,0.926593,0.076266,0.715225,0.57131,0.635218
2,0.7,0.469545,0.926112,0.075109,0.735467,0.552515,0.630997
3,0.8,0.468389,0.925726,0.074402,0.755094,0.532615,0.624635
4,0.9,0.460293,0.925411,0.074306,0.783572,0.498342,0.609225


In [34]:
# RESULTS PER LABEL
theme_results(y_valid, y_pred_binary)


Unnamed: 0,Label,Y_count,Pred_count,Error,Accuarcy,Precision,Recall
0,CPD,344,315,0.075944,0.924056,0.733333,0.671512
1,CB,317,297,0.049345,0.950655,0.818182,0.766562
2,EWC,231,206,0.0798,0.9202,0.558252,0.497835
3,Exec,353,282,0.089052,0.910948,0.716312,0.572238
4,FWE,187,200,0.052043,0.947957,0.63,0.673797
5,SP,252,160,0.064765,0.935235,0.7625,0.484127
6,RE,204,130,0.079414,0.920586,0.492308,0.313725
7,Sup,258,173,0.082884,0.917116,0.624277,0.418605
8,SW,396,255,0.107556,0.892444,0.729412,0.469697
9,TEPE,605,587,0.067078,0.932922,0.867121,0.841322


## Based in Karan's model

In [None]:
# RESHAPE
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_valid = X_valid.reshape(X_valid.shape[0], X_valid.shape[1], 1)

# SHAPES AFTER RESHAPING
print("X_train", X_train.shape,
      "\ny_train", y_train.shape,
      "\n\nX_valid", X_valid.shape,
      "\ny_valid", y_valid.shape)

X_train (10376, 512, 1) 
y_train (10376, 12) 

X_valid (2594, 512, 1) 
y_valid (2594, 12)


In [None]:
# MODEL

max_features = X_train.shape[0] # comments
batch_size = 128
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 10
embed_size = 512 # for USE
n_class = 12 # 12 for themes and 62 for sub-themes

In [None]:
model = Sequential()
# model.add(Embedding(max_features, embed_size, weights=[glove_embeddings], trainable=False, input_length=maxlen))
# model.add(Conv1D(filters, kernel_size, input_shape=(embed_size,1), padding='valid',
#                  activation='relu', strides=1))
model.add(InputLayer(input_shape=(embed_size, 1)))

model.add(Dropout(0.2))
model.add(Conv1D(filters, kernel_size, padding='valid', activation='relu',
                 strides=1))
model.add(MaxPooling1D())
model.add(Conv1D(filters, kernel_size, padding='valid',activation='relu'))
model.add(MaxPooling1D())
model.add(Flatten())

# L2 regularization
model.add(Dense(hidden_dims, activation = 'relu', kernel_regularizer=tf.keras.regularizers.l2(0.001)))
model.add(Dense(hidden_dims, activation = 'relu', kernel_regularizer=tf.keras.regularizers.l2(0.001)))
model.add(Dense(n_class, activation = 'sigmoid'))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dropout (Dropout)            (None, 512, 1)            0         
_________________________________________________________________
conv1d (Conv1D)              (None, 510, 250)          1000      
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 255, 250)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 253, 250)          187750    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 126, 250)          0         
_________________________________________________________________
flatten (Flatten)            (None, 31500)             0         
_________________________________________________________________
dense (Dense)                (None, 250)               7

In [None]:
model.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['accuracy'])

# Train Model
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs,
         validation_data=(X_valid, y_valid))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fccb454d198>

In [None]:
score = model.evaluate(X_valid, y_valid)
score



[0.23982980847358704, 0.5898226499557495]

This result are the same to the results from the previous run for the validation dataset.

In [None]:
# PREDICTIONS
predictions = model.predict(X_valid, batch_size=batch_size, verbose=1)
predictions



array([[0.0366813 , 0.00433928, 0.02611529, ..., 0.01506607, 0.5530528 ,
        0.02992156],
       [0.02273999, 0.00434622, 0.39789572, ..., 0.03834131, 0.22976474,
        0.06598599],
       [0.00565804, 0.0042494 , 0.02848427, ..., 0.7188056 , 0.07095838,
        0.09541337],
       ...,
       [0.06886956, 0.09172915, 0.04480894, ..., 0.22587998, 0.05839407,
        0.04719027],
       [0.14577872, 0.8476381 , 0.05708137, ..., 0.08634239, 0.02931948,
        0.01928886],
       [0.02575373, 0.01254091, 0.0086386 , ..., 0.05569469, 0.4540201 ,
        0.09189149]], dtype=float32)

In [None]:
# PRECISION & RECALL
predictions_results = []
thresholds=np.arange(.5, 1, 0.1).tolist()

for val in thresholds:
    pred=predictions.copy()
    pred[pred>=val]=1
    pred[pred<val]=0

    accuracy = accuracy_score(y_valid, pred, normalize=True, sample_weight=None)#average='micro')
    hamming = hamming_loss(y_valid, pred)
    precision = precision_score(y_valid, pred, average='micro')
    recall = recall_score(y_valid, pred, average='micro')
    f1 = f1_score(y_valid, pred, average='micro')
   
    case= {'Threshold': val,
           'Accuracy': accuracy,
           'Hamming loss': hamming,
           'Precision': precision,
           'Recall': recall,
           'F1-measure': f1}
    predictions_results.append(case)

print("Micro-average quality numbers:")
pd.DataFrame(predictions_results)


Micro-average quality numbers:


Unnamed: 0,Threshold,Precision,Recall,F1-measure
0,0.6,0.836374,0.405473,0.546165
1,0.65,0.860144,0.362078,0.509628
2,0.7,0.885649,0.318961,0.46901
3,0.75,0.907942,0.278054,0.42573
4,0.8,0.930921,0.23466,0.374834
5,0.85,0.953103,0.190989,0.318213
6,0.9,0.97053,0.13654,0.239399
7,0.95,0.992126,0.069652,0.130165


## Model Toy Story 1

In [35]:
# RESHAPE
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1])#, 1)
X_valid = X_valid.reshape(X_valid.shape[0], X_valid.shape[1])#, 1)

# SHAPES AFTER RESHAPING
print("X_train", X_train.shape,
      "\ny_train", y_train.shape,
      "\n\nX_valid", X_valid.shape,
      "\ny_valid", y_valid.shape)

X_train (10376, 512) 
y_train (10376, 12) 

X_valid (2594, 512) 
y_valid (2594, 12)


In [None]:
# MODEL

max_features = X_train.shape[0] # comments
batch_size = 128
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 20
embed_size = 512 # for USE
n_class = 12 # 12 for themes and 62 for sub-themes

In [63]:
model = Sequential()
model.add(Dense(50, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(20, activation='relu'))
model.add(Reshape((10, 2)))
model.add(Conv1D(filters, kernel_size, padding='same',activation='relu'))
model.add(Conv1D(filters, kernel_size=5, padding='valid',activation='relu'))
model.add(Conv1D(7, kernel_size=3))
model.add(Conv1D(4, kernel_size=2, padding='same', activation='relu'))
model.add(Reshape((4, 2, 2)))
model.add(Conv2D(5, kernel_size=(11,11), padding='same', activation='sigmoid'))

model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Flatten())
# model.add(Dense(hidden_dims, activation = 'relu', kernel_regularizer=tf.keras.regularizers.l2(0.001)))
model.add(Dense(hidden_dims, activation = 'relu', kernel_regularizer=tf.keras.regularizers.l2(0.001)))

model.add(Dense(y_train.shape[1], activation='softmax'))
model.summary()

Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_48 (Dense)             (None, 50)                25650     
_________________________________________________________________
dense_49 (Dense)             (None, 20)                1020      
_________________________________________________________________
reshape_26 (Reshape)         (None, 10, 2)             0         
_________________________________________________________________
conv1d_51 (Conv1D)           (None, 10, 250)           1750      
_________________________________________________________________
conv1d_52 (Conv1D)           (None, 6, 250)            312750    
_________________________________________________________________
conv1d_53 (Conv1D)           (None, 4, 7)              5257      
_________________________________________________________________
conv1d_54 (Conv1D)           (None, 4, 4)            

In [64]:
model.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['accuracy'])

# Train Model
history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs,
                    validation_data=(X_valid, y_valid))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [65]:
plot_metrics(history)

In [67]:
score = model.evaluate(X_valid, y_valid)
score[1]



0.5273708701133728

In [68]:
# PREDICTIONS
# get the prediction
y_pred = model.predict(X_valid, batch_size=batch_size, verbose=1)
# rounding the predictions
y_pred_binary = (y_pred > 0.5) * 1


# PRECISION & RECALL
predictions_results = []
thresholds=np.arange(.5, 1, 0.1).tolist()

for val in thresholds:
    pred = y_pred.copy()
    pred[pred>=val]=1
    pred[pred<val]=0

    accuracy = accuracy_score(y_valid, pred, normalize=True, sample_weight=None)#average='micro')
    res = []
    for i in range(0, y_valid.shape[1]):
       res.append(accuracy_score(y_valid[:,i], y_pred_binary[:,i]))
    accuracy_keras = np.mean(res)
    hamming = hamming_loss(y_valid, pred)
    precision = precision_score(y_valid, pred, average='micro')
    recall = recall_score(y_valid, pred, average='micro')
    f1 = f1_score(y_valid, pred, average='micro')
   
    case= {'Threshold': val,
           'Accuracy all model': accuracy,
           'Accuracy average (keras)': accuracy_keras,
           'Hamming loss': hamming,
           'Precision': precision,
           'Recall': recall,
           'F1-measure': f1}
    predictions_results.append(case)

print("Micro-average quality numbers:")
pd.DataFrame(predictions_results)

Micro-average quality numbers:


Unnamed: 0,Threshold,Accuracy all model,Accuracy average (keras),Hamming loss,Precision,Recall,F1-measure
0,0.5,0.352352,0.924387,0.086867,0.80589,0.332781,0.471049
1,0.6,0.320355,0.923522,0.088923,0.843851,0.288281,0.429749
2,0.7,0.27872,0.92278,0.091879,0.884381,0.241017,0.378801
3,0.8,0.215883,0.922137,0.09689,0.925141,0.181039,0.30282
4,0.9,0.126831,0.921574,0.105532,0.946381,0.097568,0.176898


In [69]:
# RESULTS PER LABEL
theme_results(y_valid, y_pred_binary)

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Label,Y_count,Pred_count,Error,Accuarcy,Precision,Recall
0,CPD,344,181,0.089823,0.910177,0.80663,0.424419
1,CB,317,194,0.056669,0.943331,0.938144,0.574132
2,EWC,231,0,0.089052,0.910948,0.0,0.0
3,Exec,353,161,0.104086,0.895914,0.757764,0.345609
4,FWE,187,72,0.056669,0.943331,0.777778,0.299465
5,SP,252,80,0.081727,0.918273,0.75,0.238095
6,RE,204,0,0.078643,0.921357,0.0,0.0
7,Sup,258,0,0.09946,0.90054,0.0,0.0
8,SW,396,162,0.12953,0.87047,0.685185,0.280303
9,TEPE,605,473,0.09175,0.90825,0.887949,0.694215


## Model Toy Story 2
[Keras meets Universal Sentence Encoder](https://www.dlology.com/blog/keras-meets-universal-sentence-encoder-transfer-learning-for-text-data/)

In [None]:
# MODEL

max_features = X_train.shape[0] # comments
batch_size = 128
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 20
embed_size = 512 # for USE
n_class = 12 # 12 for themes and 62 for sub-themes

In [71]:
model = Sequential()
model.add(Dense(16, input_dim=X_train.shape[1], activation='relu'))
model.add(Reshape((8, 2)))
model.add(Conv1D(filters, kernel_size,  
                 padding='same',activation='relu'))
model.add(Reshape((125, 4, 4)))
model.add(Conv2D(5, kernel_size=(11,11), padding='same', activation='sigmoid'))
model.add(Dense(hidden_dims, activation = 'relu', 
                kernel_regularizer=tf.keras.regularizers.l2(0.001)))
model.add(Dropout(0.2))
model.add(MaxPooling2D(pool_size=(4,4)))
model.add(Reshape((50, 155)))
model.add(Conv1D(20, kernel_size=5, padding='valid', activation='relu'))
model.add(Conv1D(7, kernel_size=3, activation='relu'))
model.add(Flatten())
model.add(Dense(y_train.shape[1], activation='softmax'))
model.summary()


Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_52 (Dense)             (None, 16)                8208      
_________________________________________________________________
reshape_28 (Reshape)         (None, 8, 2)              0         
_________________________________________________________________
conv1d_55 (Conv1D)           (None, 8, 250)            1750      
_________________________________________________________________
reshape_29 (Reshape)         (None, 125, 4, 4)         0         
_________________________________________________________________
conv2d_13 (Conv2D)           (None, 125, 4, 5)         2425      
_________________________________________________________________
dense_53 (Dense)             (None, 125, 4, 250)       1500      
_________________________________________________________________
dropout (Dropout)            (None, 125, 4, 250)     

In [73]:
model.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['accuracy'])

# Train Model
history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs,
         validation_data=(X_valid, y_valid))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [74]:
plot_metrics(history)

In [75]:
score = model.evaluate(X_valid, y_valid)
score



[0.21426476538181305, 0.6044718623161316]

In [76]:
# PREDICTIONS
# get the prediction
y_pred = model.predict(X_valid, batch_size=batch_size, verbose=1)
# rounding the predictions
y_pred_binary = (y_pred > 0.5) * 1


# PRECISION & RECALL
predictions_results = []
thresholds=np.arange(.5, 1, 0.1).tolist()

for val in thresholds:
    pred = y_pred.copy()
    pred[pred>=val]=1
    pred[pred<val]=0

    accuracy = accuracy_score(y_valid, pred, normalize=True, sample_weight=None)
    res = []
    for i in range(0, y_valid.shape[1]):
       res.append(accuracy_score(y_valid[:,i], y_pred_binary[:,i]))
    accuracy_keras = np.mean(res)
    hamming = hamming_loss(y_valid, pred)
    precision = precision_score(y_valid, pred, average='micro')
    recall = recall_score(y_valid, pred, average='micro')
    f1 = f1_score(y_valid, pred, average='micro')
   
    case= {'Threshold': val,
           'Accuracy all model': accuracy,
           'Accuracy average (keras)': accuracy_keras,
           'Hamming loss': hamming,
           'Precision': precision,
           'Recall': recall,
           'F1-measure': f1}
    predictions_results.append(case)

print("Micro-average quality numbers:")
pd.DataFrame(predictions_results)

Micro-average quality numbers:


Unnamed: 0,Threshold,Accuracy all model,Accuracy average (keras),Hamming loss,Precision,Recall,F1-measure
0,0.5,0.395143,0.921465,0.080281,0.857052,0.3712,0.518033
1,0.6,0.342328,0.921368,0.08494,0.897227,0.304035,0.45417
2,0.7,0.281419,0.921281,0.090112,0.931987,0.242399,0.384733
3,0.8,0.209715,0.921203,0.097276,0.966772,0.168878,0.287529
4,0.9,0.114495,0.921132,0.106464,0.990323,0.084854,0.156314


In [88]:
# RESULTS PER LABEL
print(np.mean(theme_results(y_valid, y_pred_binary)['Accuarcy']))
theme_results(y_valid, y_pred_binary)

0.9197185813415576


Unnamed: 0,Label,Y_count,Pred_count,Error,Accuarcy,Precision,Recall
0,CPD,344,126,0.09175,0.90825,0.920635,0.337209
1,CB,317,217,0.054742,0.945258,0.903226,0.618297
2,EWC,231,75,0.071704,0.928296,0.8,0.25974
3,Exec,353,110,0.103701,0.896299,0.881818,0.274788
4,FWE,187,70,0.048959,0.951041,0.928571,0.347594
5,SP,252,77,0.077487,0.922513,0.831169,0.253968
6,RE,204,28,0.075559,0.924441,0.642857,0.088235
7,Sup,258,68,0.09175,0.90825,0.647059,0.170543
8,SW,396,159,0.119892,0.880108,0.767296,0.308081
9,TEPE,605,492,0.077487,0.922513,0.910569,0.740496


## Based in Varada's Model
[Varada's tutorial](https://www.dlology.com/blog/keras-meets-universal-sentence-encoder-transfer-learning-for-text-data/)

In [None]:
from absl import logging

import tensorflow as tf

import tensorflow_hub as hub
# import matplotlib.pyplot as plt
import numpy as np
# import os
import pandas as pd
# import re
# import seaborn as sns
# import tensorflow_datasets as tfds
from tensorflow.keras import Input, layers
from tensorflow.keras.models import Model

In [34]:
# SHAPES
print("X_train", X_train.shape,
      "\ny_train", y_train.shape,
      "\n\nX_valid", X_valid.shape,
      "\ny_valid", y_valid.shape)

X_train (10376, 512) 
y_train (10376, 12) 

X_valid (2594, 512) 
y_valid (2594, 12)


In [None]:
# WE WON'T USE THIS, WE WILL GIVE DIRECTLY THE EMBEDDINGS.

# ### Use embeddings given by universal sentence encoder 
# model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
# hub_layer = hub.KerasLayer(model, output_shape=[512], input_shape=[], 
#                            dtype=tf.string, trainable=True)



In [58]:
# Let's build a CNN on the top of USE embeddings. The difference
# with Varada model is that we already will give the embeddings.
input = Input(shape=(512,), name="Input")#, dtype=tf.string)
# x = hub_layer(input)
x = Sequential()(input)
x = tf.keras.layers.Reshape(input_shape=(512,), target_shape=(512, 1))(x)
x = tf.keras.layers.Conv1D(128, 2, activation='relu', padding='same')(x)
x = tf.keras.layers.MaxPooling1D(5, padding='same')(x)
x = tf.keras.layers.Conv1D(128, 3, activation='relu', padding='same')(x)
x = tf.keras.layers.MaxPooling1D(5, padding='same')(x)
x = tf.keras.layers.Conv1D(128, 4, activation='relu', padding='same')(x)
x = tf.keras.layers.MaxPooling1D(40, padding='same')(x)
x = tf.keras.layers.Flatten()(x)
x = tf.keras.layers.Dropout(0.05)(x)
x = tf.keras.layers.Dense(128, activation='relu')(x)
output = tf.keras.layers.Dense(12, activation='sigmoid')(x)
m = Model(input, output)
m.summary()

Model: "model_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Input (InputLayer)           [(None, 512)]             0         
_________________________________________________________________
sequential_14 (Sequential)   (None, 512)               0         
_________________________________________________________________
reshape_13 (Reshape)         (None, 512, 1)            0         
_________________________________________________________________
conv1d_35 (Conv1D)           (None, 512, 128)          384       
_________________________________________________________________
max_pooling1d_33 (MaxPooling (None, 103, 128)          0         
_________________________________________________________________
conv1d_36 (Conv1D)           (None, 103, 128)          49280     
_________________________________________________________________
max_pooling1d_34 (MaxPooling (None, 21, 128)           0  

In [None]:
m.compile(optimizer='adam',
              loss=tf.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [60]:
history = m.fit(X_train,
                y_train,
                epochs=20,
                batch_size=512,
                validation_data=(X_valid, y_valid),
                verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [39]:
plot_metrics(history)

In [None]:
# MODEL

max_features = X_train.shape[0] # comments
batch_size = 128
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 20
embed_size = 512 # for USE
n_class = 12 # 12 for themes and 62 for sub-themes

In [30]:
# PREDICTIONS
# get the prediction
y_pred = m.predict(X_valid, batch_size=batch_size, verbose=1)
# rounding the predictions
y_pred_binary = (y_pred > 0.5) * 1


# PRECISION & RECALL
predictions_results = []
thresholds=np.arange(.5, 1, 0.1).tolist()

for val in thresholds:
    pred = y_pred.copy()
    pred[pred>=val]=1
    pred[pred<val]=0

    accuracy = accuracy_score(y_valid, pred, normalize=True, sample_weight=None)
    res = []
    for i in range(0, y_valid.shape[1]):
       res.append(accuracy_score(y_valid[:,i], y_pred_binary[:,i]))
    accuracy_keras = np.mean(res)
    hamming = hamming_loss(y_valid, pred)
    precision = precision_score(y_valid, pred, average='micro')
    recall = recall_score(y_valid, pred, average='micro')
    f1 = f1_score(y_valid, pred, average='micro')
   
    case= {'Threshold': val,
           'Accuracy all model': accuracy,
           'Accuracy average (keras)': accuracy_keras,
           'Hamming loss': hamming,
           'Precision': precision,
           'Recall': recall,
           'F1-measure': f1}
    predictions_results.append(case)

print("Micro-average quality numbers:")
pd.DataFrame(predictions_results)

Micro-average quality numbers:


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Threshold,Accuracy all model,Accuracy average (keras),Hamming loss,Precision,Recall,F1-measure
0,0.5,0.002699,0.88377,0.11623,0.0,0.0,0.0
1,0.6,0.002699,0.88377,0.11623,0.0,0.0,0.0
2,0.7,0.002699,0.88377,0.11623,0.0,0.0,0.0
3,0.8,0.002699,0.88377,0.11623,0.0,0.0,0.0
4,0.9,0.002699,0.88377,0.11623,0.0,0.0,0.0


In [31]:
# RESULTS PER LABEL
print(np.mean(theme_results(y_valid, y_pred_binary)['Accuarcy']))
theme_results(y_valid, y_pred_binary)

0.8837702390131073


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Label,Y_count,Pred_count,Error,Accuarcy,Precision,Recall
0,CPD,344,0,0.132614,0.867386,0.0,0.0
1,CB,317,0,0.122205,0.877795,0.0,0.0
2,EWC,231,0,0.089052,0.910948,0.0,0.0
3,Exec,353,0,0.136083,0.863917,0.0,0.0
4,FWE,187,0,0.072089,0.927911,0.0,0.0
5,SP,252,0,0.097147,0.902853,0.0,0.0
6,RE,204,0,0.078643,0.921357,0.0,0.0
7,Sup,258,0,0.09946,0.90054,0.0,0.0
8,SW,396,0,0.15266,0.84734,0.0,0.0
9,TEPE,605,0,0.233231,0.766769,0.0,0.0


# BERT - CNN

## Load Datasets (BERT)

In [None]:
# LOAD DEPENDENCIES
# General
import pandas as pd
import numpy as np
import time

# Embeddings
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer  #TD-IDF & Bag of Words
from tensorflow.keras.preprocessing.text import Tokenizer  #GloVe
import tensorflow as tf  #Universal Sentence Encoder
import tensorflow_hub as hub  #Universal Sentence Encoder
# from sentence_transformers import SentenceTransformer  #BERT

# Models
from keras.preprocessing.sequence import pad_sequences #Glove of CNN
# from keras.models import Sequential #Glove of CNN
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Activation
from tensorflow.keras.layers import Conv1D, Conv2D, MaxPooling2D, GlobalMaxPooling1D, MaxPool1D, MaxPooling1D
from tensorflow.keras.layers import Embedding, LSTM, Input, Lambda
from sklearn.metrics import precision_score, recall_score, f1_score #Precision & Recall
from tensorflow import keras


# To use GPU-Accelerated Machine Learning on MacOS
import os
os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"

In [None]:
# LOAD DATASETS
from google.colab import drive
drive.mount('/content/drive')

root = "/content/drive/My Drive/Colab Notebooks/models_mds/data/"
X_train = np.load(root + "embedding_matrix_BERT_train.npy")
X_valid = np.load(root + "embedding_matrix_BERT_valid.npy")

# retrieve targets
y_train = np.load(root + "y_train_thm.npy") #"y_train_sub.npy")
y_valid = np.load(root + "y_valid_thm.npy") #"y_valid_sub.npy")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# SHAPES
print("X_train", X_train.shape,
      "\ny_train", y_train.shape,
      "\n\nX_valid", X_valid.shape,
      "\ny_valid", y_valid.shape)

X_train (10376, 1024) 
y_train (10376, 12) 

X_valid (2594, 1024) 
y_valid (2594, 12)


In [None]:
# RESHAPE
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_valid = X_valid.reshape(X_valid.shape[0], X_valid.shape[1], 1)

# SHAPES AFTER RESHAPING
print("X_train", X_train.shape,
      "\ny_train", y_train.shape,
      "\n\nX_valid", X_valid.shape,
      "\ny_valid", y_valid.shape)

X_train (10376, 1024, 1) 
y_train (10376, 12) 

X_valid (2594, 1024, 1) 
y_valid (2594, 12)


## Dense model

In [None]:
# MODEL

max_features = X_train.shape[0] # comments
batch_size = 128
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 50
embed_size = 1024 # for BERT Large
n_class = 12 # 12 for themes and 62 for sub-themes

In [None]:
# model = Sequential()
# model.add(Dense(max_features, input_shape=(embed_size,), activation = 'relu'))
# model.add(Dropout(0.2))
# model.add(Dense(128, activation='relu'))
# model.add(Dense(64, activation='relu'))
# model.add(Dense(128, activation='relu'))
# model.add(Dense(n_class, activation='sigmoid'))

# model.summary()

model = Sequential()
model.add(Dense(max_features, input_shape=(embed_size,), activation = 'relu'))
model.add(Dropout(0.2))
model.add(Dense(128, activation='relu'))
# model.add(Dense(hidden_dims, activation = 'relu', 
#                 kernel_regularizer=tf.keras.regularizers.l1(0.01)))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dense(hidden_dims, activation = 'relu', 
                kernel_regularizer=tf.keras.regularizers.l2(0.001)))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(64, activation='relu'))
model.add(Dense(hidden_dims, activation = 'relu', 
                kernel_regularizer=tf.keras.regularizers.l2(0.001)))
model.add(Dropout(0.1))
model.add(Dense(64, activation='relu'))
model.add(Dense(n_class, activation='sigmoid'))

model.summary()

Model: "sequential_18"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_96 (Dense)             (None, 10376)             10635400  
_________________________________________________________________
dropout_26 (Dropout)         (None, 10376)             0         
_________________________________________________________________
dense_97 (Dense)             (None, 128)               1328256   
_________________________________________________________________
dropout_27 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_98 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_99 (Dense)             (None, 250)               16250     
_________________________________________________________________
dense_100 (Dense)            (None, 128)             

In [None]:
model.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['accuracy'])

# Train Model
history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs,
                    validation_data=(X_valid, y_valid))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
plot_metrics(history)

In [None]:
score = model.evaluate(X_valid, y_valid)
score



[0.2755376696586609, 0.6083269119262695]

In [None]:
# PREDICTIONS
predictions = model.predict(X_valid, batch_size=batch_size, verbose=1)
predictions



array([[1.63318347e-07, 5.96215244e-10, 1.00545625e-08, ...,
        6.97441055e-06, 9.99782622e-01, 1.57422910e-05],
       [1.53109049e-02, 1.37575649e-08, 1.22498311e-01, ...,
        1.96328383e-05, 5.21600668e-05, 9.04885121e-04],
       [3.64776952e-07, 9.23696016e-12, 1.88340800e-05, ...,
        9.99766886e-01, 1.58461611e-04, 4.04832140e-02],
       ...,
       [6.39006495e-03, 8.31575040e-03, 1.29553606e-03, ...,
        9.42052994e-03, 2.92370562e-03, 7.66272703e-03],
       [3.28869522e-02, 9.99804556e-01, 3.80805624e-03, ...,
        4.30172251e-04, 2.74885306e-03, 2.77867205e-02],
       [6.55099211e-05, 8.22515034e-08, 1.26015383e-03, ...,
        1.13762647e-01, 2.90754903e-03, 2.34790128e-02]], dtype=float32)

In [None]:
# PRECISION & RECALL
predictions_results = []
thresholds=np.arange(.5, 1, 0.1).tolist()

for val in thresholds:
    pred=predictions.copy()
    pred[pred>=val]=1
    pred[pred<val]=0

    accuracy = accuracy_score(y_valid, pred, normalize=True, sample_weight=None)#average='micro')
    hamming = hamming_loss(y_valid, pred)
    precision = precision_score(y_valid, pred, average='micro')
    recall = recall_score(y_valid, pred, average='micro')
    f1 = f1_score(y_valid, pred, average='micro')
   
    case= {'Threshold': val,
           'Accuracy': accuracy,
           'Hamming loss': hamming,
           'Precision': precision,
           'Recall': recall,
           'F1-measure': f1}
    predictions_results.append(case)

print("Micro-average quality numbers:")
pd.DataFrame(predictions_results)


Micro-average quality numbers:


Unnamed: 0,Threshold,Precision,Recall,F1-measure
0,0.6,0.737074,0.555556,0.63357
1,0.7,0.76254,0.521006,0.619048
2,0.8,0.791629,0.480929,0.598349
3,0.9,0.827605,0.425926,0.562409


## Based in Karan's model

In [None]:
# MODEL

max_features = X_train.shape[0] # comments
batch_size = 128
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 10
embed_size = 1024 # for BERT Large
n_class = 12 # 12 for themes and 62 for sub-themes

In [None]:
model = Sequential()
# model.add(Embedding(max_features, embed_size, weights=[glove_embeddings], trainable=False, input_length=maxlen))
# model.add(Conv1D(filters, kernel_size, input_shape=(embed_size,1), padding='valid',
#                  activation='relu', strides=1))
model.add(InputLayer(input_shape=(embed_size, 1)))

model.add(Dropout(0.2))
model.add(Conv1D(filters, kernel_size, padding='valid', activation='relu',
                 strides=1))
model.add(MaxPooling1D())
model.add(Conv1D(filters, kernel_size, padding='valid',activation='relu'))
model.add(MaxPooling1D())
model.add(Flatten())

# L2 regularization
model.add(Dense(hidden_dims, activation = 'relu', kernel_regularizer=tf.keras.regularizers.l2(0.001)))
model.add(Dense(hidden_dims, activation = 'relu', kernel_regularizer=tf.keras.regularizers.l2(0.001)))
model.add(Dense(n_class, activation = 'sigmoid'))
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dropout_1 (Dropout)          (None, 1024, 1)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 1022, 250)         1000      
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 511, 250)          0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 509, 250)          187750    
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 254, 250)          0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 63500)             0         
_________________________________________________________________
dense_3 (Dense)              (None, 250)              

In [None]:
model.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['accuracy'])

# Train Model
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs,
         validation_data=(X_valid, y_valid))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fccb423d0f0>

In [None]:
score = model.evaluate(X_valid, y_valid)
score



[0.24324536323547363, 0.5963762402534485]

This result are the same to the results from the previous run for the validation dataset.

In [None]:
# PRECISION & RECALL
predictions_results = []
thresholds=np.arange(.6, 1, 0.05).tolist()

model.build((None, max_features, embed_size))
for val in thresholds:
    pred=predictions.copy()
    pred[pred>=val]=1
    pred[pred<val]=0

    precision = precision_score(y_valid, pred, average='micro')
    recall = recall_score(y_valid, pred, average='micro')
    f1 = f1_score(y_valid, pred, average='micro')
   
    case= {'Threshold': val,
           'Precision': precision,
           'Recall': recall,
           'F1-measure': f1}
    predictions_results.append(case)

print("\nMicro-average quality numbers:")
pd.DataFrame(predictions_results)


Micro-average quality numbers:


Unnamed: 0,Threshold,Precision,Recall,F1-measure
0,0.6,0.836374,0.405473,0.546165
1,0.65,0.860144,0.362078,0.509628
2,0.7,0.885649,0.318961,0.46901
3,0.75,0.907942,0.278054,0.42573
4,0.8,0.930921,0.23466,0.374834
5,0.85,0.953103,0.190989,0.318213
6,0.9,0.97053,0.13654,0.239399
7,0.95,0.992126,0.069652,0.130165


## Model Toy Story 1

In [None]:
# RESHAPE
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1])#, 1)
X_valid = X_valid.reshape(X_valid.shape[0], X_valid.shape[1])#, 1)

In [None]:
# MODEL

max_features = X_train.shape[0] # comments
batch_size = 128
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 20
embed_size = 1024 # for BERT Large
n_class = 12 # 12 for themes and 62 for sub-themes

In [None]:
model = Sequential()
model.add(Dense(50, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(20, activation='relu'))
model.add(Reshape((100, 2)))
model.add(Conv1D(filters, kernel_size, padding='same',activation='relu'))
model.add(Conv1D(filters, kernel_size=5, padding='valid',activation='relu'))
model.add(Conv1D(7, kernel_size=3))
model.add(Conv1D(4, kernel_size=2, padding='same', activation='relu'))
model.add(Reshape((47, 4, 2)))
model.add(Conv2D(5, kernel_size=(11,11), padding='same', activation='sigmoid'))

model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Flatten())
# model.add(Dense(hidden_dims, activation = 'relu', kernel_regularizer=tf.keras.regularizers.l2(0.001)))
model.add(Dense(hidden_dims, activation = 'relu', kernel_regularizer=tf.keras.regularizers.l2(0.001)))

model.add(Dense(y_train.shape[1], activation='softmax'))
model.summary()

Model: "sequential_15"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_54 (Dense)             (None, 500)               512500    
_________________________________________________________________
dense_55 (Dense)             (None, 200)               100200    
_________________________________________________________________
reshape_32 (Reshape)         (None, 100, 2)            0         
_________________________________________________________________
conv1d_54 (Conv1D)           (None, 100, 250)          1750      
_________________________________________________________________
conv1d_55 (Conv1D)           (None, 96, 250)           312750    
_________________________________________________________________
conv1d_56 (Conv1D)           (None, 94, 7)             5257      
_________________________________________________________________
conv1d_57 (Conv1D)           (None, 94, 4)           

In [None]:
model.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['accuracy'])

# Train Model
history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs,
                    validation_data=(X_valid, y_valid))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
plot_metrics(history)

In [None]:
score = model.evaluate(X_valid, y_valid)
score



[0.2374565750360489, 0.6006168127059937]

In [None]:
# PREDICTIONS
predictions = model.predict(X_valid, batch_size=batch_size, verbose=1)
predictions



array([[3.4064411e-03, 3.0930381e-04, 1.7257120e-03, ..., 9.7238581e-04,
        7.8552586e-01, 2.8116783e-02],
       [1.0330575e-02, 1.8721510e-03, 5.0459463e-02, ..., 3.6506937e-03,
        9.3132332e-02, 6.1431386e-02],
       [1.9235390e-03, 6.8461767e-04, 1.9523524e-03, ..., 4.5186769e-02,
        5.2513488e-02, 8.8590878e-01],
       ...,
       [4.0681539e-03, 1.8738000e-02, 1.7338801e-02, ..., 3.2560520e-02,
        6.4045372e-03, 8.3810594e-03],
       [4.5915279e-03, 7.1370983e-01, 4.8158597e-03, ..., 2.7405489e-02,
        1.1430513e-03, 3.5889978e-03],
       [1.7970559e-03, 1.2436571e-03, 1.0067811e-03, ..., 9.0027995e-02,
        7.4820630e-02, 2.5949683e-02]], dtype=float32)

In [None]:
# PRECISION & RECALL
predictions_results = []
thresholds=np.arange(.6, 1, 0.1).tolist()

model.build((None, max_features, embed_size))
for val in thresholds:
    pred=predictions.copy()
    pred[pred>=val]=1
    pred[pred<val]=0

    precision = precision_score(y_valid, pred, average='micro')
    recall = recall_score(y_valid, pred, average='micro')
    f1 = f1_score(y_valid, pred, average='micro')
   
    case= {'Threshold': val,
           'Precision': precision,
           'Recall': recall,
           'F1-measure': f1}
    predictions_results.append(case)

print("\nMicro-average quality numbers:")
pd.DataFrame(predictions_results)


Micro-average quality numbers:


Unnamed: 0,Threshold,Precision,Recall,F1-measure
0,0.6,0.822341,0.382532,0.522166
1,0.7,0.854453,0.326147,0.472094
2,0.8,0.885766,0.270039,0.413895
3,0.9,0.920635,0.192371,0.318244


## Model Toy Story 2

In [None]:
# # RESHAPE
# X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
# X_valid = X_valid.reshape(X_valid.shape[0], X_train.shape[1], 1)

# SHAPES AFTER RESHAPING
print("X_train", X_train.shape,
      "\ny_train", y_train.shape,
      "\n\nX_valid", X_valid.shape,
      "\ny_valid", y_valid.shape)

X_train (10376, 1024) 
y_train (10376, 12) 

X_valid (2594, 1024) 
y_valid (2594, 12)


In [None]:
# MODEL

max_features = X_train.shape[0] # comments
batch_size = 128
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 20
embed_size = 1024 # for BERT Large
n_class = 12 # 12 for themes and 62 for sub-themes

In [None]:
model = Sequential()
model.add(InputLayer(input_shape=(embed_size, 1)))
model.add(Reshape((16, 8, 8)))
model.add(Conv2D(50, kernel_size=(11,11), padding='same', activation='relu'))
model.add(Dropout(0.4))
# model.add(Conv2D(5, kernel_size=(11,11), input_shape=(milad.shape[0],milad.shape[1],3), padding='same', activation='sigmoid'))
# model.add(Dense(200, activation='relu'))
# model.add(Reshape((100, 2)))
# model.add(Conv1D(filters, kernel_size, padding='same',activation='relu'))
# model.add(Conv1D(filters, kernel_size=5, padding='valid',activation='relu'))
# model.add(Conv1D(7, kernel_size=3))
# model.add(Conv1D(4, kernel_size=2, padding='same', activation='relu'))
# model.add(Reshape((47, 4, 2)))
# model.add(Conv2D(5, kernel_size=(11,11), padding='same', activation='sigmoid'))

# model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Flatten())
# model.add(Dense(hidden_dims, activation = 'relu', kernel_regularizer=tf.keras.regularizers.l2(0.001)))
# model.add(Dense(hidden_dims, activation = 'relu', kernel_regularizer=tf.keras.regularizers.l2(0.001)))

model.add(Dense(y_train.shape[1], activation='softmax'))
model.summary()

Model: "sequential_23"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
reshape_7 (Reshape)          (None, 16, 8, 8)          0         
_________________________________________________________________
conv2d_20 (Conv2D)           (None, 16, 8, 50)         48450     
_________________________________________________________________
dropout_12 (Dropout)         (None, 16, 8, 50)         0         
_________________________________________________________________
flatten_14 (Flatten)         (None, 6400)              0         
_________________________________________________________________
dense_16 (Dense)             (None, 12)                76812     
Total params: 125,262
Trainable params: 125,262
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['accuracy'])

# Train Model
history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs,
                    validation_data=(X_valid, y_valid))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
plot_metrics(history)

In [None]:
score = model.evaluate(X_valid, y_valid)
score



[0.2116403728723526, 0.6233615875244141]

In [None]:
# PREDICTIONS
predictions = model.predict(X_valid, batch_size=batch_size, verbose=1)
predictions



array([[7.6820201e-05, 2.8273737e-06, 7.1895454e-05, ..., 9.5831538e-03,
        6.8012220e-01, 4.1924794e-03],
       [1.1198854e-02, 9.2376041e-04, 7.3881827e-02, ..., 3.4707699e-02,
        6.7660086e-02, 9.6595483e-03],
       [1.5851082e-03, 1.0148665e-04, 2.7357673e-03, ..., 6.1893713e-01,
        1.0451363e-01, 2.6430476e-01],
       ...,
       [3.1342328e-02, 1.4586991e-02, 1.5204502e-03, ..., 1.9623322e-02,
        2.0681049e-03, 7.6039229e-03],
       [9.6801907e-04, 9.7208643e-01, 2.2694515e-04, ..., 9.2990638e-04,
        3.9620136e-05, 2.0838689e-04],
       [1.1233550e-02, 9.0272668e-05, 8.4557505e-05, ..., 7.0597172e-01,
        1.1349391e-02, 2.9937169e-02]], dtype=float32)

In [None]:
# PRECISION & RECALL
predictions_results = []
thresholds=np.arange(.6, 1, 0.1).tolist()

model.build((None, max_features, embed_size))
for val in thresholds:
    pred=predictions.copy()
    pred[pred>=val]=1
    pred[pred<val]=0

    precision = precision_score(y_valid, pred, average='micro')
    recall = recall_score(y_valid, pred, average='micro')
    f1 = f1_score(y_valid, pred, average='micro')
   
    case= {'Threshold': val,
           'Precision': precision,
           'Recall': recall,
           'F1-measure': f1}
    predictions_results.append(case)

print("\nMicro-average quality numbers:")
pd.DataFrame(predictions_results)


Micro-average quality numbers:


Unnamed: 0,Threshold,Precision,Recall,F1-measure
0,0.6,0.868935,0.38115,0.529875
1,0.7,0.890719,0.328911,0.48042
2,0.8,0.918594,0.274461,0.422643
3,0.9,0.940431,0.205086,0.336737
