In [1]:
import tensorflow as tf
# Log the device each TensorFlow op is placed on. This is the source of the
# "Executing op ... in device ..." lines that fill the cell outputs below.
tf.debugging.set_log_device_placement(True)
import logging


def allow_soft_placement():
    """Enable on-demand GPU memory growth on every visible GPU.

    Despite its name, this helper does not touch soft device placement; it
    only switches the GPUs to allocate memory as needed. When TensorFlow
    reports no GPU, a warning is logged and nothing is configured.

    Returns:
        None
    """
    physical_devices = tf.config.experimental.list_physical_devices('GPU')
    if not physical_devices:
        logging.warning("Not enough GPU hardware devices available")
        return

    # The memory-growth setting has to be the same on all GPUs, so apply it
    # to each device instead of only the first one.
    for device in physical_devices:
        tf.config.experimental.set_memory_growth(device, True)

# Apply the GPU memory-growth configuration defined above.
allow_soft_placement()

import os
# NOTE: this is a relative chdir, so it is resolved from the current working
# directory. Re-running this cell in the same kernel starts from the new
# directory and will not end up in the same place.
os.chdir('../src/models/')

import sys
from dotenv import load_dotenv, find_dotenv
import numpy as np
import pandas as pd
from hyperopt import space_eval
# Add the directory two levels above the new working directory to sys.path;
# presumably this is the project root that makes the `src.*` imports resolve.
sys.path.append(os.path.abspath("../.."))
# find .env automagically by walking up directories until it's found
dotenv_path = find_dotenv()
# load up the entries as environment variables
load_dotenv(dotenv_path)

import random
import matplotlib.pyplot as plt
from src.code_snippets.utils.data_handler import read_pickle,save_to_pickle
from src.code_snippets.models.many_to_one_lstm import BidirectionalLSTM
from src.code_snippets.models.hyperparameter_tuning import safeHyperopt,extract_trial_results
import seaborn as sns
#Input layer
from tensorflow.keras.layers import (Input)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import Model
from src.code_snippets.dataprep.embeddings_preprocessing.data_preparation import pretrained_embedding_layer
from src.code_snippets.evaluation.model_evaluation import f1_metric, plot_metrics



In [2]:
from tensorflow.keras.layers import GlobalAveragePooling1D

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
# Seed every random number generator the notebook relies on so that sampling,
# splitting and model training are repeatable after a kernel restart.
seed = 100
random.seed(seed)
np.random.seed(seed)
# TensorFlow keeps its own RNG (weight initialisation, dropout, shuffling);
# without this the training runs below differ from run to run.
tf.random.set_seed(seed)

In [16]:
# Load the three processed splits in one pass: train, validation, test.
train_data, val_data, test_data = map(
    read_pickle,
    (
        '../../data/processed/processed_data_train.pickle',
        '../../data/processed/processed_data_val.pickle',
        '../../data/processed/processed_data_test.pickle',
    ),
)


Create the train, validation and test pandas DataFrames

In [17]:
# Undersample train to balance the dataset a bit.
# random_state pins the subsample, so re-running this cell (or running cells
# out of order) always selects the same 50,000 rows instead of depending on
# whatever state the global NumPy RNG happens to be in.
train = pd.DataFrame(train_data['X_indices'])
train = train.sample(50000, random_state=seed)

test = pd.DataFrame(test_data['X_indices'])
val = pd.DataFrame(val_data['X_indices'])

## Let's predict the probability of data coming from test

Let's create our labels

In [18]:
# Tag each row with its origin: 0 for the train subsample, 1 for the test set.
for frame, origin_flag in ((train, 0), (test, 1)):
    frame['is_test'] = origin_flag

# Stack both sources row-wise into a single frame for the classifier.
df = pd.concat([train, test], axis=0)

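Baseline: only about 15% of the rows come from test (see the next cell), so always predicting "train" would already yield roughly 85% accuracy.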

In [19]:
df['is_test'].mean() # share of rows that come from test (about 15% in the recorded output)

0.15346065285114452

In [20]:
# Hold out 10% of the combined rows to evaluate the train-vs-test classifier.
features = df.drop('is_test', axis=1)
labels = df['is_test']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.1,
    random_state=100,
)

In [21]:
# Package the splits in the dict layout passed to BidirectionalLSTM below.
# X_aux is filled with a 1x1 zero placeholder for both splits.
train_data = dict(X_indices=X_train, y=y_train, X_aux=np.zeros((1, 1)))
test_data = dict(X_indices=X_test, y=y_test, X_aux=np.zeros((1, 1)))

In [22]:
# Path to the pretrained GloVe Twitter vectors handed to the model wrapper.
glove_path = "../../data/raw/pretrained_embeddings/glove.twitter.27B/glove.twitter.27B.200d.txt"

# The second argument is read back later as trainer.val_data, so it presumably
# serves as the evaluation split — confirm against BidirectionalLSTM.
trainer = BidirectionalLSTM(train_data, test_data, glove_path)


In [23]:
# Architecture and optimiser settings for the train-vs-test classifier.
model_params = dict(
    n_units=256,
    add_recurrent_layer=True,
    dropout=0.4,
    spatial_dropout=0.4,
    hidden_dense_units=128,
    learning_rate=0.0017436142055397088,
    global_avg_pool=False,
    global_max_pool=False,
)
trainer.set_model(**model_params)



Executing op DestroyResourceOp in device /job:localhost/replica:0/task:0/device:GPU:0


In [24]:
# Train for a fixed six epochs; early stopping is switched off.
trainer.fit_model(
    epochs=6,
    batch_size=128,
    use_early_stopping=False,
)

Executing op RangeDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RepeatDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op MapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op PrefetchDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op FlatMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op TensorDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RepeatDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ZipDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ParallelMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ModelDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Epoch 1/6
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing 

Executing op RepeatDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ZipDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ParallelMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ModelDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Epoch 5/6
Executing op RepeatDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op MapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op PrefetchDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op FlatMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op TensorDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RepeatDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ZipDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ParallelMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Exe

In [25]:
# Score the held-out split; the labels get a trailing axis before scoring.
holdout_inputs = trainer.val_data['X_indices'].values
holdout_labels = np.expand_dims(trainer.val_data['y'].values, axis=-1)
trainer.generate_metrics(holdout_inputs, holdout_labels)

Executing op RangeDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RepeatDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op MapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op PrefetchDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op FlatMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op TensorDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RepeatDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ZipDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ParallelMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ModelDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op __inference_predict_function_92608 in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op ConcatV2 in device /job:localhost/replica:0/task:0/device:GPU:0


0.0

In [26]:
# Raw model outputs for the held-out rows.
holdout_inputs = trainer.val_data['X_indices'].values
preds = trainer.model.predict(holdout_inputs)

Executing op RangeDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RepeatDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op MapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op PrefetchDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op FlatMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op TensorDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RepeatDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ZipDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ParallelMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ModelDataset in device /job:localhost/replica:0/task:0/device:CPU:0


In [27]:
# Summary statistics of the predicted values for the held-out rows.
pred_summary = pd.DataFrame(preds, columns=['pred_proba']).describe()
pred_summary

Unnamed: 0,pred_proba
count,5907.0
mean,0.157273
std,0.016767
min,0.129713
25%,0.139107
50%,0.166048
75%,0.172365
max,0.244734


## Let's predict the probability of data coming from validation

In [None]:
# Remove the label left over from the train-vs-test experiment.
# errors='ignore' plus rebinding (instead of inplace=True) keeps this cell
# idempotent: running it again when the column is already gone no longer
# raises a KeyError.
train = train.drop(columns='is_test', errors='ignore')

In [35]:
# Tag each row with its origin: 0 for the train subsample, 1 for validation.
train['is_val'], val['is_val'] = 0, 1

# Stack both sources row-wise into a single frame for the classifier.
df = pd.concat([train, val], axis=0)

In [36]:
df['is_val'].mean() # share of rows that come from validation (about 15% in the recorded output)

0.15327428832703935

In [37]:
# Hold out 10% of the combined rows to evaluate the train-vs-val classifier.
split_parts = train_test_split(
    df.drop('is_val', axis=1),
    df['is_val'],
    test_size=0.1,
    random_state=100,
)
X_train, X_test, y_train, y_test = split_parts

In [38]:
# Package the splits in the dict layout passed to BidirectionalLSTM below.
# X_aux is filled with a 1x1 zero placeholder for both splits.
train_data = dict(X_indices=X_train, y=y_train, X_aux=np.zeros((1, 1)))
test_data = dict(X_indices=X_test, y=y_test, X_aux=np.zeros((1, 1)))

In [39]:
# Fresh model wrapper for the train-vs-val experiment, using the same
# pretrained GloVe Twitter vectors file.
embeddings_file = "../../data/raw/pretrained_embeddings/glove.twitter.27B/glove.twitter.27B.200d.txt"
trainer = BidirectionalLSTM(train_data, test_data, embeddings_file)


In [40]:
# Architecture and optimiser settings for the train-vs-val classifier.
val_model_params = {
    'n_units': 256,
    'add_recurrent_layer': True,
    'dropout': 0.4,
    'spatial_dropout': 0.4,
    'hidden_dense_units': 128,
    'learning_rate': 0.0017436142055397088,
    'global_avg_pool': False,
    'global_max_pool': False,
}
trainer.set_model(**val_model_params)



In [41]:
# Train for a fixed six epochs; early stopping is switched off.
trainer.fit_model(
    epochs=6,
    batch_size=128,
    use_early_stopping=False,
)

Executing op RangeDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RepeatDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op MapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op PrefetchDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op FlatMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op TensorDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RepeatDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ZipDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ParallelMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ModelDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Epoch 1/6
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing 

Executing op TensorDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RepeatDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ZipDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ParallelMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ModelDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Epoch 5/6
Executing op RepeatDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op MapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op PrefetchDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op FlatMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op TensorDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RepeatDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ZipDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executin

In [45]:
# Score the held-out split; the labels get a trailing axis before scoring.
eval_inputs = trainer.val_data['X_indices'].values
eval_labels = np.expand_dims(trainer.val_data['y'].values, axis=-1)
trainer.generate_metrics(eval_inputs, eval_labels)

Executing op RangeDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RepeatDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op MapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op PrefetchDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op FlatMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op TensorDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RepeatDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ZipDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ParallelMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ModelDataset in device /job:localhost/replica:0/task:0/device:CPU:0


0.0

In [46]:
# Raw model outputs for the held-out rows.
eval_inputs = trainer.val_data['X_indices'].values
preds = trainer.model.predict(eval_inputs)

Executing op RangeDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RepeatDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op MapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op PrefetchDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op FlatMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op TensorDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RepeatDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ZipDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ParallelMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ModelDataset in device /job:localhost/replica:0/task:0/device:CPU:0


In [47]:
# Summary statistics of the predicted values for the held-out rows.
val_pred_summary = pd.DataFrame(preds, columns=['pred_proba']).describe()
val_pred_summary

Unnamed: 0,pred_proba
count,5906.0
mean,0.044612
std,0.024397
min,0.039269
25%,0.040603
50%,0.041242
75%,0.041821
max,0.285428
