# Model with pretrained embeddings
    - GloVe embeddings. Vectors of 100 dim (glove.6B.100d).
    - Model: Basic LSTM.


In [0]:
# Colab setup: install a pinned TensorFlow 2.0 alpha build and enable TensorBoard in the notebook.

# CPU runtime: uncomment the line below (and comment out the GPU install) instead.
#!pip install -q tensorflow==2.0.0-alpha0

# GPU runtime: pinned GPU build of TensorFlow 2.0 alpha (-q keeps pip's output short).
!pip install -q tensorflow-gpu==2.0.0-alpha0

# Load the TensorBoard notebook extension; the %tensorboard magic is used further down.
%load_ext tensorboard.notebook

In [0]:
# Header: libraries shared by the cells below
import os

import numpy as np
import tensorflow as tf
print('Tensorflow version: ', tf.__version__)  # record which TF build this runtime actually loaded
import time

# Show images: draw matplotlib figures inline, with a 10x10 inch default figure size
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (10, 10)



## Preprocess data

In [0]:
# Link Google Drive: mount it at /content/gdrive so the data files can be read like local files.
# Later cells use the relative path './gdrive/...', which only resolves to this mount
# when the working directory is /content.
from google.colab import drive
drive.mount('/content/gdrive')


In [0]:
# Load the pre-split train and test arrays stored in the Drive folder
save_path = './gdrive/My Drive/text_mining'

data_files = ('X_train.npy', 'y_train.npy', 'X_test.npy', 'y_test.npy')
X_train, y_train, X_test, y_test = [
    np.load(os.path.join(save_path, file_name)) for file_name in data_files
]

# Report the shapes of each (inputs, labels) pair: train first, then test
for inputs, labels in ((X_train, y_train), (X_test, y_test)):
    print(inputs.shape, labels.shape)


## Load embeddings and join with the current dictionary

In [0]:
# download embedings
! wget https://s3-eu-west-1.amazonaws.com/text-mining-course/glove.6B.100d.txt.zip
! unzip glove.6B.100d.txt.zip


In [0]:
# Load the GloVe vectors and keep only the words that also appear in the corpus dictionary
import pandas as pd
import csv
import pickle

# Load worddict: its keys are matched against the GloVe words below and its
# values are used as the integer ids that key the final `glove` dictionary.
# NOTE(security): pickle.load can execute arbitrary code stored in the file;
# only load a worddict.pickle that you produced yourself.
with open(os.path.join(save_path, 'worddict.pickle'), 'rb') as pfile:
    worddict = pickle.load(pfile)

embed_dim = 100
# One row per word: the first field becomes the index, the remaining fields
# become columns labelled 1..embed_dim. QUOTE_NONE makes pandas treat quote
# characters as ordinary text instead of CSV quoting.
df_glove = pd.read_csv("glove.6B.100d.txt", index_col=0 ,sep=' ',
                   header = None, quoting=csv.QUOTE_NONE, encoding='utf-8')

# Merge with the dictionary of the current texts: inner join on the word, so only
# words present both in the corpus and in GloVe survive. The dictionary values
# arrive as an extra column labelled 0.
df_glove = df_glove.merge(pd.DataFrame.from_dict(worddict, orient='index'), left_index=True, right_index=True)
print('Merged words: ', df_glove.shape[0])

# Create dictionary: word_number_id --> [glove vector associated]
# Built from whole column arrays instead of DataFrame.iterrows(), which creates a
# Series per row and then needs embed_dim label lookups for each of them.
vector_columns = list(range(1, embed_dim + 1))
word_ids = df_glove[0].values
word_vectors = df_glove[vector_columns].values
glove = {int(word_id): vector.tolist() for word_id, vector in zip(word_ids, word_vectors)}
print('Dictionary length: ', len(glove))

## Prepare sequences to model


In [0]:
#Create embeddings 3D tensors
max_len = 100

def embedd(x, vectors=None, seq_len=None, dim=None):
    """Turn a sequence of word ids into a fixed-size matrix of word vectors.

    The sequence is read from its last element backwards and the matrix is
    filled from its last row upwards. As a result the output is left-padded
    with zero rows, and when more than `seq_len` ids have a vector only the
    last `seq_len` of them are kept. Ids without a vector are skipped and do
    not consume a row.

    Parameters
    ----------
    x : sequence of hashable ids
        Word ids of one text, in reading order.
    vectors : mapping id -> sequence of `dim` floats, optional
        Lookup table. Defaults to the notebook-level `glove` dictionary.
    seq_len : int, optional
        Number of rows of the output. Defaults to the notebook-level `max_len`.
    dim : int, optional
        Number of columns of the output. Defaults to the notebook-level `embed_dim`.

    Returns
    -------
    numpy.ndarray of shape (seq_len, dim), float64.

    Raises
    ------
    TypeError
        If an element of `x` is not hashable (for instance when `x` is already
        an embedded matrix). The lookup no longer hides such errors.
    """
    # Resolve the defaults at call time so the notebook globals are only
    # required when the caller does not pass explicit values.
    vectors = glove if vectors is None else vectors
    seq_len = max_len if seq_len is None else seq_len
    dim = embed_dim if dim is None else dim

    r = np.zeros((seq_len, dim))
    pos = seq_len - 1
    for token in reversed(x):
        if pos < 0:
            # Matrix is full: earlier tokens cannot be stored any more.
            break
        vector = vectors.get(token)
        if vector is None:
            # Unknown word: skip it without leaving an empty row.
            continue
        r[pos, :] = vector
        pos -= 1
    return r
        
def _embed_all(sequences):
    """Embed every sequence with `embedd` and stack the results into one float32 array."""
    return np.array([embedd(sequence) for sequence in sequences], dtype=np.float32)

# Replace the id sequences of each split by their embedded tensors.
# Note: X_train / X_test are rebound here, so this cell expects the freshly
# loaded id arrays as input.
X_train = _embed_all(X_train)
print('Train shape:', X_train.shape)

X_test = _embed_all(X_test)
print('Test shape:', X_test.shape)

# Save data in HDF5 to use with a batch generator

```
import h5py
with h5py.File(data_path + 'sentiment_glove_data.h5') as hdf5_f:
    hdf5_f.create_dataset('X_train', data=np.array(X_train))
    hdf5_f.create_dataset('y_train', data=np.array(y_train))
    hdf5_f.create_dataset('X_test' , data=np.array(X_test ))
    hdf5_f.create_dataset('y_test' , data=np.array(y_test ))

```



## Build model

In [0]:
# Model: two stacked LSTM layers over pre-embedded sequences, softmax over 2 outputs
num_hidden_rnn = 128  # number of units in each recurrent layer

print('Build model 1 - Basic model...')

# Both recurrent layers use the same dropout settings
lstm_options = dict(dropout=0.3, recurrent_dropout=0.3)

# Input: one (max_len, embed_dim) float matrix per text. The word vectors are
# already part of the input, so the network has no Embedding layer.
seq_prev_input = tf.keras.layers.Input(shape=(max_len, embed_dim), dtype='float32')

# Recurrent stack: the first LSTM returns the whole sequence so the second one
# can consume it; the second returns only its final output.
forward = tf.keras.layers.LSTM(
    num_hidden_rnn, return_sequences=True, name='Forward1', **lstm_options
)(seq_prev_input)
rnn_out = tf.keras.layers.LSTM(
    num_hidden_rnn, return_sequences=False, name='Forward2', **lstm_options
)(forward)

# Output: dense layer with softmax activation over 2 units
output = tf.keras.layers.Dense(2, activation='softmax')(rnn_out)

# Assemble the model and print its architecture
model_1 = tf.keras.models.Model(inputs=seq_prev_input, outputs=output)
model_1.summary()

# Compile: integer labels (sparse loss), Adam optimizer, accuracy as metric
model_1.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='Adam',
    metrics=['accuracy'],
)


In [0]:
# Train for 20 epochs, validating on the test split and logging to TensorBoard
batch_size = 128

print("Train...")
tbCallBack = tf.keras.callbacks.TensorBoard(
    log_dir='./tensorboard/sentiment/PretrainedEmbeds'
)
history = model_1.fit(
    x=X_train,
    y=y_train,
    batch_size=batch_size,
    epochs=20,
    validation_data=(X_test, y_test),
    callbacks=[tbCallBack],
)


In [0]:
# Start TensorBoard inside the notebook, reading the logs stored under ./tensorboard/sentiment
# (the training cell writes its logs to the PretrainedEmbeds subfolder of that directory).
%tensorboard --logdir ./tensorboard/sentiment


In [0]:
# Draw the per-epoch accuracy curves stored in the training history:
# first the training accuracy, then the validation accuracy.
for metric_name in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[metric_name])
plt.show()


## Validate it

In [0]:
# Score the test split: pred_test holds the model output for every test example.
# The printed shape is a quick check that there is one row per example.
pred_test = model_1.predict(X_test)
print(pred_test.shape)


In [0]:
# Metrics from scikit-learn
from sklearn.metrics import roc_curve, auc, accuracy_score

# Score of the second output column for every test example
positive_scores = pred_test[:, 1]

# Accuracy with hard labels: 1 when the score is above 0.5, otherwise 0
hard_labels = (positive_scores > 0.5).astype(int)
print('Accuracy: ', accuracy_score(y_test, hard_labels))

# ROC curve and the area under it, computed from the raw scores
fpr, tpr, _ = roc_curve(y_test, positive_scores)
roc_auc = auc(fpr, tpr)
print('AUC: ', roc_auc)

# Plot the ROC curve
plt.plot(fpr, tpr)
