# Multi Model Building

In [None]:
# Quiet TensorFlow's log output through its TF_CPP_MIN_LOG_LEVEL environment
# variable. This cell runs before the cell that imports TensorFlow, so the
# value is already in the environment when the library is loaded.
import os
os.environ.update({'TF_CPP_MIN_LOG_LEVEL': '3'})

In [None]:
import pandas as pd
import tensorflow as tf
import keras
from keras import layers
from model_helper_functions import create_fit_and_save_model
from data_pipeline import prep_data, get_train_val_test_data
from itertools import product
import re

# Import correct optimizer: macOS running on an ARM processor takes RMSprop and
# Adam from `keras.optimizers.legacy`; every other platform takes them from
# `keras.optimizers`.
import platform
if platform.system() != "Darwin" or platform.processor() != "arm":
    from keras.optimizers import RMSprop, Adam
else:
    from keras.optimizers.legacy import RMSprop, Adam

In [None]:
# Remove TF logging warnings: set the level of the logger returned by
# tf.get_logger() to ERROR. This is in addition to the TF_CPP_MIN_LOG_LEVEL
# environment variable set in the first cell.
tf.get_logger().setLevel('ERROR')

In [None]:
# Seed the TensorFlow and Keras random number generators with one shared value
# so the seed is defined in a single place.
SEED = 15
tf.random.set_seed(SEED)
keras.utils.set_random_seed(SEED)

In [None]:
# Input data location, given relative to the notebook's working directory.
data_dir = '../data'
# prep_data comes from the project's data_pipeline module; its result is passed
# unchanged to the split helper below.
df = prep_data(data_dir)
# Train / validation / test frames consumed by the model-fitting loop.
# NOTE(review): use_half_data=False presumably selects the full dataset — confirm in data_pipeline.
train_df, val_df, test_df = get_train_val_test_data(df, use_half_data=False)

---

## Hyperparameter Grid

In [None]:
# Search space. Each list is one axis of the grid; the following cell takes the
# Cartesian product of all of them, in this order.
vocab_sizes = [4000]

# [embedding size, pretrained model name] pairs.
# NOTE(review): a None name presumably means no pretrained embeddings are
# loaded — confirm in model_helper_functions.
embed_sizes_with_pretrained_model = [
    [100, 'glove-wiki-gigaword-100'],
    [300, 'word2vec-google-news-300'],
    [200, None],
]

batch_sizes = [32, 64]
bidirectional_options = [True, False]
rnn_layers = [layers.GRU, layers.LSTM]

# Unit counts per configuration.
# NOTE(review): each inner list presumably holds one entry per stacked layer — confirm.
rnn_units_configs = [[8], [32], [32, 16]]
dense_units_configs = [[1], [32, 1]]

activations = ['sigmoid']
final_dropouts = [0.5]
optimizers = [RMSprop, Adam]

In [None]:
# Expand the grid into one tuple per parameter combination. The argument order
# fixes the position of each value inside every tuple.
all_combinations = [*product(
    vocab_sizes,
    embed_sizes_with_pretrained_model,
    batch_sizes,
    bidirectional_options,
    rnn_layers,
    rnn_units_configs,
    dense_units_configs,
    activations,
    final_dropouts,
    optimizers,
)]
print(f'There are {len(all_combinations)} parameter combinations to run.')

In [None]:
# Keep track of model fitting in order to resume at a later time if needed.
# The progress table holds one row per entry of `all_combinations` (same order,
# row label == model number) plus a `finished` flag that the fitting loop sets
# after each successful fit.
progress_file = './model_checkpoints/model_building_progress.csv'
if os.path.exists(progress_file):
    # Resume: reuse the flags written by an earlier run.
    progress = pd.read_csv(progress_file)
else:
    # Column order must match the argument order used to build all_combinations.
    progress = pd.DataFrame.from_records(all_combinations, columns=['vocab_size', 'embed_size_with_pretrained_model',
                                                                    'batch_size', 'bidirectional', 'rnn_layer', 'rnn_units',
                                                                    'dense_units', 'activation', 'final_dropout', 'optimizer'])
    progress.insert(0, 'model', range(len(all_combinations)))
    progress['finished'] = False
    # Store the bare class name (e.g. 'GRU', 'Adam') instead of the class object.
    for class_column in ('rnn_layer', 'optimizer'):
        progress[class_column] = progress[class_column].apply(lambda cls: cls.__name__)

    # to_csv does not create missing directories, so make sure the target exists.
    progress_dir = os.path.dirname(progress_file)
    if progress_dir:
        os.makedirs(progress_dir, exist_ok=True)
    progress.to_csv(progress_file, index=False)
    # Read the file back so a fresh run and a resumed run work on the same
    # CSV-parsed representation.
    progress = pd.read_csv(progress_file)

# The fitting loop indexes `progress` by position in `all_combinations`; a file
# written for a different grid would attach the flags to the wrong models.
if len(progress) != len(all_combinations):
    raise ValueError(
        f'{progress_file} has {len(progress)} rows but the current grid has '
        f'{len(all_combinations)} combinations; delete or move the file to start over.'
    )

---

## Fit Models

In [None]:
# Epoch count passed to create_fit_and_save_model for every configuration.
EPOCHS = 15

In [None]:
# Fit every configuration that is not yet marked as finished. The progress file
# is rewritten after each successful fit so an interrupted run can be resumed.
for i, params in enumerate(all_combinations):
    if progress.at[i, 'finished']:
        print(f'Model {i} has already been fitted.')
        continue

    try:
        f1_score = create_fit_and_save_model(f'model{i}', train_df, val_df, test_df, EPOCHS, params)
        progress.at[i, 'finished'] = True
        progress.to_csv(progress_file, index=False)
        print(f'Model {i} finished with test f1_score of {f1_score:.4f}')
    except Exception as err:
        # Catch Exception rather than using a bare `except:` so that interrupting
        # the kernel (KeyboardInterrupt) stops the loop instead of skipping to
        # the next model. The model stays unfinished and is retried on the next run.
        print(f'ERROR fitting model {i}: {err!r}')

---