In [1]:
import os
# Switch the working directory to src/models; the relative paths used later in
# this notebook are written relative to that location.
# NOTE(review): this is a *relative* chdir, so running this cell a second time
# in the same kernel moves the working directory again — restart the kernel
# before re-running.
os.chdir('../src/models/')
import sys
from dotenv import load_dotenv, find_dotenv
import numpy as np

# Put the directory two levels above the working directory on sys.path
# (presumably the project root, so the `src.` imports below resolve — confirm).
sys.path.append(os.path.abspath("../.."))
# find .env automagically by walking up directories until it's found
dotenv_path = find_dotenv()
# load up the entries as environment variables
load_dotenv(dotenv_path)
# import pandas as pd

from src.code_snippets.dataprep.embeddings_preprocessing.data_preparation import (
    sentences_to_indices,
    pretrained_embedding_layer
)

from src.code_snippets.utils.abstract_classes import Trainer
from src.code_snippets.dataprep.embeddings_preprocessing.glove.reader import read_glove_file,get_word_index_dicts
# NOTE(review): pretrained_embedding_layer is already imported above; this
# second import is redundant.
from src.code_snippets.dataprep.embeddings_preprocessing.data_preparation import pretrained_embedding_layer
import random


In [2]:

class ManyToOneSeqModel(Trainer):
    """Trainer scaffold for a many-to-one sequence model using GloVe embeddings.

    The constructor loads the embedding file, derives the word/index lookup
    dictionaries and records the shapes of the training inputs.  `set_model`
    builds the pretrained embedding layer; the remaining `Trainer` hooks are
    still placeholders.
    """

    def __init__(self,train_data,val_data,embedding_dir):
        """Load the embeddings and cache the training-input dimensions.

        Args:
            train_data: mapping that must contain 'X_indices_train' and
                'X_aux_train'; both values must expose a two-element
                ``.shape`` (rows, columns).
            val_data: validation mapping; stored as-is and not inspected here.
            embedding_dir: path handed to ``read_glove_file``.
                NOTE(review): despite the name, the notebook passes a path to
                a single GloVe .txt file — confirm what the reader expects.
        """
        self.train_data = train_data
        self.val_data = val_data
        self.embedding_dir = embedding_dir
        # Whatever read_glove_file returns is kept and later passed to
        # pretrained_embedding_layer (presumably a gensim KeyedVectors — confirm).
        self.gensim_model = read_glove_file(self.embedding_dir)
        self.word_to_index,self.index_to_words = get_word_index_dicts(self.gensim_model)

        # (number of examples, number of columns) for each training input.
        self.m_X, self.n_X = self.train_data['X_indices_train'].shape
        self.m_X_aux, self.n_X_aux = self.train_data['X_aux_train'].shape

        # Filled in by set_model(); None until then.
        self.embedding_layer = None

    def set_model(self):
        """Build the pretrained embedding layer and keep it on the instance.

        The layer used to be created and immediately thrown away, which made
        this method a no-op.  It is now stored in ``self.embedding_layer`` and
        also returned for convenience.

        Returns:
            The object produced by ``pretrained_embedding_layer``.
        """
        self.embedding_layer = pretrained_embedding_layer(self.gensim_model, self.word_to_index)
        return self.embedding_layer

    def save_model(self):
        """Placeholder: persisting the model is not implemented yet."""
        pass

    def fit_model(self):
        """Placeholder: training is not implemented yet."""
        pass

    def generate_metrics(self):
        """Placeholder: metric generation is not implemented yet."""
        pass

In [3]:
import tensorflow as tf
from tensorflow.keras.layers import Input,LSTM,Conv1D,Dropout, Dense
from tensorflow.keras import Model
import tensorflow_addons as tfa

In [24]:
# Seed every random number generator this notebook relies on so that a
# Restart & Run All reproduces the same sample and the same initial weights.
seed = 100
random.seed(seed)
np.random.seed(seed)
# TensorFlow keeps its own global RNG (used for weight initialisation and for
# shuffling inside fit); seeding only `random` and NumPy leaves it unseeded.
tf.random.set_seed(seed)

In [6]:
# Load the preprocessed training arrays from disk.  The index matrix is cast
# to int32 right after loading.
train_data = dict(
    X_aux_train=np.load('../../data/processed/X_aux_train.npy'),
    X_indices_train=np.load('../../data/processed/X_indices_train.npy').astype('int32'),
    y_train=np.load('../../data/processed/y_train.npy'),
)

# Same three arrays for the validation split.
val_data = dict(
    X_aux_val=np.load('../../data/processed/X_aux_val.npy'),
    X_indices_val=np.load('../../data/processed/X_indices_val.npy').astype('int32'),
    y_val=np.load('../../data/processed/y_val.npy'),
)

In [30]:
# Fraction of the training rows to keep for quick experimentation.
sample_portion = 0.01

# Row count of the training set and the matching array of row positions.
num_observations = train_data['X_aux_train'].shape[0]
idx = np.arange(num_observations)

# Number of rows that fraction corresponds to (truncated towards zero).
sample_size = int(sample_portion * num_observations)
print(sample_size)

1531


In [42]:
# Re-seed right before drawing so the selected rows do not depend on any
# NumPy random calls made since the seed cell.
np.random.seed(seed)
sampled_idx = np.random.choice(idx, sample_size, replace=False)

# Keep the same sampled rows in every training array (overwrites in place).
for _key in ('X_indices_train', 'X_aux_train', 'y_train'):
    train_data[_key] = train_data[_key][sampled_idx]

In [44]:
# Build the trainer; constructing it reads the GloVe file given below.
trainer = ManyToOneSeqModel(
    train_data=train_data,
    val_data=val_data,
    embedding_dir="../../../../pretrained_embeddings/glove.twitter.27B/glove.twitter.27B.25d.txt",
)


In [45]:
# Model input: one fixed-length row of word indices per example.
# The shape must be a 1-tuple; `(trainer.n_X)` without the trailing comma is
# just an int, which only works where Keras happens to coerce it.
sentence_indices = Input(shape=(trainer.n_X,), dtype='int32')

# Embedding layer built from the loaded GloVe vectors
# (whether it is trainable is decided inside pretrained_embedding_layer — confirm).
embedding_layer = pretrained_embedding_layer(trainer.gensim_model,
                                             trainer.word_to_index)

# Map every word index to its embedding vector.
embeddings = embedding_layer(sentence_indices)

# First recurrent layer returns the full sequence so it can feed a second LSTM.
X = LSTM(128, return_sequences=True)(embeddings)
X = Dropout(0.1, seed=seed)(X)

# Second recurrent layer collapses the sequence to its final state
# (the "many-to-one" step).
X = LSTM(128, return_sequences=False)(X)
X = Dropout(0.1, seed=seed)(X)

# Small dense head.
X = Dense(16, activation='relu')(X)
X = Dropout(0.1, seed=seed)(X)

# Single sigmoid unit: one probability per example.
X = Dense(1, activation='sigmoid')(X)

model = Model(sentence_indices, X)

In [46]:
# Print the layer-by-layer summary of the model (output shapes and parameter counts).
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 3000)]            0         
_________________________________________________________________
embedding (Embedding)        (None, 3000, 25)          29837875  
_________________________________________________________________
lstm (LSTM)                  (None, 3000, 128)         78848     
_________________________________________________________________
dropout (Dropout)            (None, 3000, 128)         0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 16)                2064  

In [47]:
import tensorflow.keras.backend as K

def f1_metric(y_true, y_pred):
    """F1 score of rounded predictions against the labels.

    Values are clipped to [0, 1] and rounded before counting, so a prediction
    counts as positive once it rounds to 1.  The score is computed over exactly
    the tensors passed in.  A backend epsilon is added to every denominator so
    that empty classes yield 0 instead of a division by zero.

    Args:
        y_true: tensor of ground-truth labels.
        y_pred: tensor of predicted scores, same shape as ``y_true``.

    Returns:
        Scalar tensor holding the F1 value.
    """
    eps = K.epsilon()

    def _count_ones(values):
        # Clip to [0, 1], round to 0/1 and count the ones.
        return K.sum(K.round(K.clip(values, 0, 1)))

    hits = _count_ones(y_true * y_pred)
    actual_positives = _count_ones(y_true)
    flagged_positives = _count_ones(y_pred)

    precision = hits / (flagged_positives + eps)
    recall = hits / (actual_positives + eps)
    return 2*(precision*recall)/(precision+recall+eps)


In [48]:
# Optimizer spelled out explicitly: 'rmsprop' is what compile() uses when no
# optimizer is passed, so this is the same configuration as before.
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=[f1_metric])


In [None]:
# Train on the sampled rows: 50 passes, mini-batches of 32, reshuffled each epoch.
model.fit(x=trainer.train_data['X_indices_train'],
          y=trainer.train_data['y_train'],
          batch_size=32,
          epochs=50,
          shuffle=True)

Train on 1531 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50