In [199]:
# Enable IPython's autoreload extension in mode 2 so that edits to the imported
# project modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [200]:
import itertools
import os
import shutil

# Set before tensorflow is imported in the next cell; "-1" exposes no CUDA
# device (presumably to force CPU-only execution for this search -- confirm).
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

# Root directory that receives every log written by this notebook.
log_dir = './logs/hyperparameter_tuning_BaseModel/'

In [201]:
import tensorflow as tf
from tensorboard.plugins.hparams import api as hp

from KAGGLE_NLP_with_disaster_tweets.model import utils, constants
from KAGGLE_NLP_with_disaster_tweets.data_preparation.utils import DataPipeline

from tensorboard import notebook

In [202]:
# File names handed to the data pipeline.
train_file_name = 'train.csv'
kaggle_test_file_name = 'test.csv'
sample_submission_file_name = 'sample_submission.csv'

# Vocabulary cap and GloVe embedding width shared with the model.
max_vocabulary_size = 20000
glove_embedding_dim = 200

# Build the pipeline, then materialise the training dataset including the
# "keyword" and "location" columns.
data_pipeline = DataPipeline(
    train_file_name,
    kaggle_test_file_name,
    sample_submission_file_name,
    max_vocabulary_size=max_vocabulary_size,
    output_sequence_length=30,
    glove_embedding_dim=glove_embedding_dim,
    glove_url=constants.glove_url['twitter.27B'],
)
dataset = data_pipeline.prepare_train_dataset(
    include_cols=["keyword", "location"],
    extract_extras=True,
)

Getting the file: ../data/sample_submission.csv
Getting the file: ../data/train.csv
Dataframe size before eliminating too short texts: 7613
Dataframe size after eliminating too short texts: 7529
         id keyword location  \
0         1                    
1         4                    
2         5                    
3         6                    
4         7                    
...     ...     ...      ...   
7608  10869                    
7609  10870                    
7610  10871                    
7611  10872                    
7612  10873                    

                                                   text  target  
0     our deeds are the reason of this earthquake ma...       1  
1                 forest fire near la ronge sask canada       1  
2     all residents asked to shelter in place are be...       1  
3     13  0 people receive wildfires evacuation orde...       1  
4     just got sent this photo from ruby alaska as s...       1  
...                     

In [203]:
# Class weights derived from the pipeline's dataframe; passed to the
# hyperparameter search below as `balanced_class_weights`.
# NOTE(review): presumably a {class_label: weight} mapping that compensates for
# label imbalance -- confirm against utils.get_balanced_class_weights.
balanced_class_weights = utils.get_balanced_class_weights(data_pipeline.dataframe)

In [204]:
# Hyperparameter search space. Each key is an HParam whose Discrete domain lists
# the candidate values to try; the paired value is stored alongside it
# (presumably the initial/default setting -- confirm in HyperparameterManager).
hparams = dict([
    (hp.HParam("optimizer", hp.Discrete(["adamw"])), "adamw"),
    (hp.HParam("batch_size", hp.Discrete([16, 32, 64])), 16),
    (hp.HParam("learning_rate", hp.Discrete([5e-4, 1e-4])), 1e-4),
    (hp.HParam("class_weights", hp.Discrete(["none", "balanced"])), "balanced"),
    (hp.HParam("dropout", hp.Discrete([0.1, 0.4])), 0.1),
])

# Wrapper class that makes the hyperparameters easier to manage.
hyperparameter_manager = utils.HyperparameterManager(hparams=hparams)

In [205]:
# Size of the full grid: the product of every hyperparameter's domain size.
session_len = 1
for hparam in hparams:
    domain_size = len(hparam.domain.values)
    session_len = session_len * domain_size

print(session_len)

24


In [206]:
# Wipe logs left over from a previous run, then recreate the empty directory.
if os.path.exists(log_dir):
    shutil.rmtree(log_dir)

os.makedirs(log_dir)

# Record the hyperparameter domains and the tracked metrics at the log root.
with tf.summary.create_file_writer(log_dir).as_default():
    hp.hparams_config(hparams=list(hparams), metrics=constants.metrics)

In [207]:
from KAGGLE_NLP_with_disaster_tweets.model.base_model import BaseModel
from KAGGLE_NLP_with_disaster_tweets.data_preparation.utils import BatchPipeline


def set_model_hparams(data_pipeline, hparam_manager, batch_pipeline, epochs: int, embeddings_initializer):
    """
    Build a BaseModel for one hyperparameter-optimization session.

    :param data_pipeline: object supplying ``vocabulary_size`` and ``glove_embedding_dim``
    :param hparam_manager: forwarded to the model as ``hparam_manager``
    :param batch_pipeline: forwarded to the model as ``batch_pipeline``
    :param epochs: forwarded to the model as ``epochs``
    :param embeddings_initializer: forwarded to the model as ``embeddings_initializer``
    :return: the newly constructed BaseModel
    """
    # Layer sizes and the class count are hard-coded here; everything else comes
    # from the arguments.
    model_kwargs = dict(
        vocabulary_size=data_pipeline.vocabulary_size,
        embedding_dim=data_pipeline.glove_embedding_dim,
        lstm_dims=[128, 64],
        hidden_dim=64,
        num_classes=2,
        epochs=epochs,
        batch_pipeline=batch_pipeline,
        hparam_manager=hparam_manager,
        embeddings_initializer=embeddings_initializer,
    )
    return BaseModel(**model_kwargs)


def find_best_hparams(dataset: tf.data.Dataset, data_pipeline, hyperparameter_manager, epochs: int,
                      balanced_class_weights: dict, log_directory: str):
    """
    Exhaustively train and evaluate one model per hyperparameter combination in the search domain.

    For every combination a BaseModel is built, fitted and evaluated, and the resulting
    accuracy / precision / recall / f1 are written (at step 1) together with the
    hyperparameter values to a per-run summary directory under ``log_directory``.

    :param dataset: tf.data.Dataset handed to BatchPipeline for every session
    :param data_pipeline: A DataPipeline instance
    :param hyperparameter_manager: A HyperparameterManager instance
    :param epochs: epochs for hparam optimization
    :param balanced_class_weights: dict of balanced class weights
    :param log_directory: where the logs should be saved
    :return: None; results are only written as summaries and printed progress
    """
    # initialize embedding initializer once and share it across all sessions
    embeddings_initializer = data_pipeline.build_embeddings_initializer()

    # Full grid, in the same order as nesting the loops
    # optimizer > batch_size > learning_rate > dropout > class_weights.
    search_space = list(itertools.product(
        hyperparameter_manager.optimizer_hparams.domain.values,
        hyperparameter_manager.batch_size_hparams.domain.values,
        hyperparameter_manager.learning_rate_hparams.domain.values,
        hyperparameter_manager.dropout_hparams.domain.values,
        hyperparameter_manager.class_weights_hparams.domain.values))
    # Derived from the manager itself, so the progress banner does not depend on
    # a global computed in another notebook cell.
    total_sessions = len(search_space)

    # look for the best hyperparameters
    for session_num, combination in enumerate(search_space, start=1):
        optimizer, batch_size, learning_rate, dropout, class_weights = combination
        print(f"\n********************    Session {session_num}/{total_sessions} started    **********************\n")
        hparams = hyperparameter_manager.set_hparams(optimizer, batch_size, learning_rate,
                                                     class_weights, dropout)
        batch_pipeline = BatchPipeline(dataset, batch_size)
        model = set_model_hparams(data_pipeline, hyperparameter_manager, batch_pipeline, epochs,
                                  embeddings_initializer)

        # os.path.join keeps the run directory inside log_directory whether or not
        # the caller included a trailing separator.
        run_directory = os.path.join(log_directory, model.run_name)
        with tf.summary.create_file_writer(run_directory).as_default():
            hp.hparams(hparams)
            # NOTE(review): balanced_class_weights is passed on every session, even
            # when the class_weights hyperparameter is "none"; presumably the model
            # consults the manager to decide whether to apply them -- confirm.
            accuracy, precision, recall, f1, _ = model.fit_and_evaluate(
                class_weights=balanced_class_weights,
                log_directory=log_directory)
            tf.summary.scalar("accuracy", accuracy, step=1)
            tf.summary.scalar("precision", precision, step=1)
            tf.summary.scalar("recall", recall, step=1)
            tf.summary.scalar("f1", f1, step=1)
        print(f"\n*********************    Session {session_num}/{total_sessions} ended    ***********************\n")


In [208]:
# Run the full grid search; every session trains for 10 epochs and logs under log_dir.
search_arguments = dict(
    dataset=dataset,
    data_pipeline=data_pipeline,
    hyperparameter_manager=hyperparameter_manager,
    epochs=10,
    balanced_class_weights=balanced_class_weights,
    log_directory=log_dir,
)
find_best_hparams(**search_arguments)


Loading the file: glove.twitter.27B.200d.txt 

Found 1193514 word vectors 

Converted 12866 words, and missed 4707 words.

********************    Session 1/24 started    **********************

Values are set: Optimizer: adamw | Learning Rate: 0.0001 | Batch Size: 16 | Class Weights: balanced | Dropout Rate: 0.1
Model: "sequential_119"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_119 (Embedding)   (None, None, 200)         3514600   
                                                                 
 dropout_238 (Dropout)       (None, None, 200)         0         
                                                                 
 bidirectional_333 (Bidirect  (None, None, 256)        336896    
 ional)                                                          
                                                                 
 bidirectional_334 (Bidirect  (None, None, 128)        164352    
 

In [209]:
# Load the TensorBoard notebook extension; the launch command in the next cell
# depends on it.
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [210]:
tensorboard --logdir logs/hyperparameter_tuning_BaseModel --port 5003

Launching TensorBoard...