In [1]:
import tensorflow as tf
# Log the device every TensorFlow op is placed on. This is what produces the
# long "Executing op ... in device ..." listings in the cell outputs below.
tf.debugging.set_log_device_placement(True)
import logging


def allow_soft_placement():
    """Enable on-demand GPU memory growth on every visible GPU.

    Despite its name, this helper does not configure TensorFlow's soft device
    placement; it only asks TensorFlow to grow GPU memory as needed.

    A warning is logged (and nothing else happens) when no GPU is visible, or
    when TensorFlow rejects the setting with a ``RuntimeError``.
    """
    physical_devices = tf.config.experimental.list_physical_devices('GPU')
    if not physical_devices:
        logging.warning("Not enough GPU hardware devices available")
        return

    try:
        # TensorFlow requires the memory-growth flag to be the same on all
        # GPUs, so configure every device rather than only the first one.
        for device in physical_devices:
            tf.config.experimental.set_memory_growth(device, True)
    except RuntimeError as err:
        # set_memory_growth raises RuntimeError once the runtime has been
        # initialised (e.g. this cell is re-run in a live kernel). Report it
        # instead of aborting the notebook.
        logging.warning("Could not enable GPU memory growth: %s", err)

# Apply the GPU memory-growth setup now, at the top of the notebook.
allow_soft_placement()

import os
# Switch the working directory so the relative '../../data/...' paths used in
# later cells resolve.
# NOTE(review): the path is relative to the *current* directory, so re-running
# this cell in a live kernel changes directory again (or raises) — run it only
# once per kernel.
os.chdir('../src/models/')

import sys
from dotenv import load_dotenv, find_dotenv
import numpy as np
import pandas as pd
from hyperopt import space_eval
# Put the directory two levels up (presumably the repository root) on the
# import path so that the `src.*` imports below can be resolved.
sys.path.append(os.path.abspath("../.."))
# find .env automagically by walking up directories until it's found
dotenv_path = find_dotenv()
# load up the entries as environment variables
load_dotenv(dotenv_path)
# import pandas as pd

import random
import matplotlib.pyplot as plt
from src.code_snippets.utils.data_handler import read_pickle,save_to_pickle
from src.code_snippets.models.many_to_one_lstm import BidirectionalLSTM
from src.code_snippets.models.hyperparameter_tuning import safeHyperopt,extract_trial_results
import seaborn as sns
#Input layer
from tensorflow.keras.layers import (Input)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import Model
from src.code_snippets.dataprep.embeddings_preprocessing.data_preparation import pretrained_embedding_layer
from src.code_snippets.evaluation.model_evaluation import f1_metric, plot_metrics



In [2]:
from tensorflow.keras.layers import GlobalAveragePooling1D

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
# Seed every random number generator the notebook relies on so that a
# Restart & Run All is as reproducible as possible.
seed = 100
random.seed(seed)         # Python's built-in RNG
np.random.seed(seed)      # NumPy's global RNG
tf.random.set_seed(seed)  # TensorFlow's global seed (weight init, dropout, ...)

In [5]:
# Load the three pre-processed splits (train / val / test) from disk.
train_data, val_data, test_data = (
    read_pickle('../../data/processed/processed_data_train.pickle'),
    read_pickle('../../data/processed/processed_data_val.pickle'),
    read_pickle('../../data/processed/processed_data_test.pickle'),
)


Create the train, val and test pandas DataFrames.

In [6]:
# The test rows keep their original split.
test = pd.DataFrame(test_data['X_indices'])

# Pool the original train and val rows, then draw a fresh validation split
# with exactly as many rows as the test set.
train, val = train_test_split(
    pd.concat(
        [pd.DataFrame(train_data['X_indices']),
         pd.DataFrame(val_data['X_indices'])],
        axis=0,
    ),
    test_size=len(test),
    random_state=100,
)



## Let's predict the probability of data coming from test

Let's create our labels

In [6]:
# Label every row with its origin: 0 = training rows, 1 = test rows.
# `assign` returns labelled copies, so the frames produced by
# `train_test_split` are not modified in place (no SettingWithCopyWarning,
# and `train` / `test` stay free of the label column).
df = pd.concat([train.assign(is_test=0), test.assign(is_test=1)], axis=0)

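About 40% of the rows come from test (see the next cell), so always predicting the majority class would already yield roughly 60% accuracy.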

In [7]:
df['is_test'].mean() # share of rows labelled as test: about 40% (see the output below)

0.4038127178171534

In [8]:
# Hold out 10% of the labelled rows for evaluating the classifier.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='is_test'),
    df['is_test'],
    test_size=0.1,
    random_state=100,
)

In [9]:
# Package the splits as the dicts handed to the model wrapper below.
# 'X_aux' is a 1x1 array of zeros — presumably a placeholder for an unused
# auxiliary input (TODO confirm against BidirectionalLSTM).
train_data = dict(X_indices=X_train, y=y_train, X_aux=np.zeros((1, 1)))
test_data = dict(X_indices=X_test, y=y_test, X_aux=np.zeros((1, 1)))

In [10]:
# Pre-trained GloVe Twitter embeddings file (200d) passed to the model wrapper.
glove_path = "../../data/raw/pretrained_embeddings/glove.twitter.27B/glove.twitter.27B.200d.txt"
trainer = BidirectionalLSTM(train_data, test_data, glove_path)


In [11]:
# Model hyper-parameters, collected in one place before being passed on.
model_params = dict(
    n_units=256,
    add_recurrent_layer=True,
    dropout=0.4,
    spatial_dropout=0.4,
    hidden_dense_units=128,
    learning_rate=0.0017436142055397088,
    global_avg_pool=False,
    global_max_pool=False,
)
trainer.set_model(**model_params)



Executing op RandomUniform in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Sub in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Mul in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Add in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op VarIsInitializedOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op LogicalNot in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Assert in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op AssignVariableOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RandomUniform in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Sub in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Mul in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Add in device /job:localhost/replica:0/task:0/device:GPU:

In [12]:
# Fit the train-vs-test classifier: 6 epochs, batch size 128, early stopping
# switched off (as per the keyword arguments).
trainer.fit_model(epochs=6,batch_size = 128,use_early_stopping=False)

Executing op RangeDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RepeatDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op MapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op PrefetchDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op FlatMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op TensorDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RepeatDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ZipDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ParallelMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RangeDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RepeatDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op MapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Prefet

In [13]:
# Score the model on the trainer's `val_data` (presumably the second dict
# passed to the constructor — TODO confirm). A trailing axis is added to the
# label array before scoring.
eval_split = trainer.val_data
trainer.generate_metrics(
    eval_split['X_indices'].values,
    np.expand_dims(eval_split['y'].values, axis=-1),
)

Executing op RangeDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RepeatDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op MapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op PrefetchDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op FlatMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op TensorDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RepeatDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ZipDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ParallelMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ModelDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op AnonymousIteratorV2 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op __inference_distributed_function_55479 in device /job:localhost/replica:0/task:

0.7997021

An 80% F1 score seems too high for distributions that are supposed to be very similar... Let's compare by predicting the probability of data coming from validation instead. The validation set was sampled randomly from training, so the train and val distributions must be very similar. The score is not 100% because there must be some similarities between the distributions that make some comments hard to distinguish.

In [23]:
from scikitplot.metrics import plot_confusion_matrix

## Lets predict the probability of data coming from validation

In [24]:
# Reload the three pre-processed splits; `train_data` and `test_data` were
# rebound to the model input dicts in the previous experiment.
train_data, val_data, test_data = (
    read_pickle('../../data/processed/processed_data_train.pickle'),
    read_pickle('../../data/processed/processed_data_val.pickle'),
    read_pickle('../../data/processed/processed_data_test.pickle'),
)

In [25]:
# The test frame is only needed here for its length.
test = pd.DataFrame(test_data['X_indices'])

# Pool the original train and val rows, then draw a fresh validation split
# with exactly as many rows as the test set.
train, val = train_test_split(
    pd.concat(
        [pd.DataFrame(train_data['X_indices']),
         pd.DataFrame(val_data['X_indices'])],
        axis=0,
    ),
    test_size=len(test),
    random_state=100,
)



In [26]:
# Label every row with its origin: 0 = training rows, 1 = validation rows.
# `assign` returns labelled copies, so the frames produced by
# `train_test_split` are not modified in place (no SettingWithCopyWarning,
# and `train` / `val` stay free of the label column).
df = pd.concat([train.assign(is_val=0), val.assign(is_val=1)], axis=0)

In [27]:
df['is_val'].mean() # share of rows labelled as val: about 40% (see the output below)

0.4038127178171534

In [28]:
# Hold out 10% of the labelled rows for evaluating the classifier.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='is_val'),
    df['is_val'],
    test_size=0.1,
    random_state=100,
)

In [29]:
# Package the splits as the dicts handed to the model wrapper below.
# 'X_aux' is a 1x1 array of zeros — presumably a placeholder for an unused
# auxiliary input (TODO confirm against BidirectionalLSTM).
train_data = dict(X_indices=X_train, y=y_train, X_aux=np.zeros((1, 1)))
test_data = dict(X_indices=X_test, y=y_test, X_aux=np.zeros((1, 1)))

In [30]:
# Pre-trained GloVe Twitter embeddings file (200d) passed to the model wrapper.
glove_path = "../../data/raw/pretrained_embeddings/glove.twitter.27B/glove.twitter.27B.200d.txt"
trainer = BidirectionalLSTM(train_data, test_data, glove_path)


In [31]:
# Same hyper-parameters as in the train-vs-test experiment, so that the two
# runs differ only in the data they are given.
model_params = dict(
    n_units=256,
    add_recurrent_layer=True,
    dropout=0.4,
    spatial_dropout=0.4,
    hidden_dense_units=128,
    learning_rate=0.0017436142055397088,
    global_avg_pool=False,
    global_max_pool=False,
)
trainer.set_model(**model_params)



In [32]:
# Fit the train-vs-val classifier: 6 epochs, batch size 128, early stopping
# switched off (as per the keyword arguments).
trainer.fit_model(epochs=6,batch_size = 128,use_early_stopping=False)

Executing op RangeDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RepeatDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op MapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op PrefetchDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op FlatMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op TensorDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RepeatDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ZipDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ParallelMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RangeDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RepeatDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op MapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Prefet

In [34]:
# Score the model on the trainer's `val_data` (presumably the second dict
# passed to the constructor — TODO confirm). A trailing axis is added to the
# label array before scoring.
eval_split = trainer.val_data
trainer.generate_metrics(
    eval_split['X_indices'].values,
    np.expand_dims(eval_split['y'].values, axis=-1),
)

Executing op RangeDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RepeatDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op MapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op PrefetchDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op FlatMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op TensorDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RepeatDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ZipDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ParallelMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ModelDataset in device /job:localhost/replica:0/task:0/device:CPU:0


0.0

In [35]:
# Keep the raw model outputs for the evaluation rows so their distribution
# can be inspected in the next cell.
preds = trainer.model.predict(trainer.val_data['X_indices'].values)

Executing op RangeDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RepeatDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op MapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op PrefetchDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op FlatMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op TensorDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RepeatDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ZipDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ParallelMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ModelDataset in device /job:localhost/replica:0/task:0/device:CPU:0


In [41]:
# Summary statistics (count, mean, std, quartiles, min/max) of the predictions.
pd.DataFrame(preds,columns=['pred_proba']).describe()

Unnamed: 0,pred_proba
count,12941.0
mean,0.431288
std,0.009289
min,0.219932
25%,0.428412
50%,0.428499
75%,0.429808
max,0.47575


The model has absolutely no idea what to do, so it is effectively random guessing! We know this because almost all predictions (at least 75% of them) are very close to the random-guessing probability of selecting a validation row from the data (i.e. 40%).

In other words, the model is not able to differentiate between the validation and training splits because they come from the same distribution! In fact, since 60% of the data comes from the training split, the model always predicts the majority class, which is is_val = 0. This is why we get an F1 score of 0: there are no true positives, making recall and precision both 0.