In [61]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [62]:
import os
# Set before the project imports below so the value is already in the
# environment when they run.
# NOTE(review): "-1" presumably hides all CUDA devices so everything runs on
# CPU — confirm this is still intended for training runs.
os.environ["CUDA_VISIBLE_DEVICES"]="-1"
import matplotlib.pyplot as plt

from KAGGLE_NLP_with_disaster_tweets.model import base_model, constants, utils
from KAGGLE_NLP_with_disaster_tweets.data_preparation.utils import DataPipeline, BatchPipeline

In [63]:
# CSV file names handed to the data pipeline.
train_file_name, test_file_name = 'train.csv', 'test.csv'
sample_submission_file_name = 'sample_submission.csv'

# Also passed to the model constructor further down, so keep it in one place.
vocabulary_size = 5000

# Build the pipeline; sequences are produced with a fixed length of 50.
data_pipeline = DataPipeline(
    train_file_name,
    test_file_name,
    sample_submission_file_name,
    output_sequence_length=50,
    vocabulary_size=vocabulary_size,
)

# Labelled dataset plus the dataset used for the submission predictions,
# with the "keyword" column included alongside the text.
dataset, submission_test_dataset = data_pipeline.prepare_datasets(
    include_cols=["keyword"],
)

Getting the file: ../data/train.csv
-----------------------------------------------------------------------------------------
Dataset 
Size: 7613
Dataset examples:
Input: [  32  428    1    1 1909 4375    1 1254    7  962  102   66   24 1955
    5  314   97    1    1  868  514 1593  314    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0]
Target: [1. 0.]
Input: [   2  111   12  897    2    1    6   33    1    9    2  111   37    1
 1921 4897    1   33    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0]
Target: [1. 0.]
Input: [  26  289 2353   14 4020    6 3876 2285 4310   11    1   18 4547   59
   11   26    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0]
Target: [0. 1.]
--

In [64]:
dataframe = data_pipeline.dataframe

# Count the rows carrying each label value in the "target" column.
target = dataframe["target"]
isTrueCount = (target == 1).sum()
isFalseCount = (target == 0).sum()

# Percentages are relative to the rows labelled 0 or 1 only.
total = isTrueCount + isFalseCount
print(f"True instances: {isTrueCount} | {100 * isTrueCount / total}% of data")
print(f"False instances: {isFalseCount} | {100 * isFalseCount / total}% of data")

True instances: 3271 | 42.965979246026535% of data
False instances: 4342 | 57.034020753973465% of data


In [65]:
# Class weights computed from `dataframe` by a project helper; they are passed
# to BaseModel as `class_weights` in the model cell below.
# NOTE(review): the exact weighting scheme lives in utils.get_balanced_class_weights
# — confirm it matches the 43% / 57% split printed above.
balanced_class_weights = utils.get_balanced_class_weights(dataframe)

In [66]:
# Run configuration. `batch_size` feeds both the logged hyperparameters and
# the BatchPipeline handed to the model.
batch_size = 32
# Single source of truth for the epoch count: written to the shared constants
# module here and read back for the model constructor below, so the two can
# no longer drift apart.
constants.parameters["epochs"] = 10
log_directory = "./logs/hyperparameter_tuning_BaseModel/"

# Hyperparameter values for this run, keyed by the project's hyperparameter
# objects.
# NOTE(review): "class_weights" is recorded as None here (and the run name
# printed by fit_and_evaluate says class_weights=None), yet
# `balanced_class_weights` is passed to the constructor below — confirm which
# of the two BaseModel actually applies during training.
hparams = {
    constants.hyperparameters["optimizer"]: "adamw",
    constants.hyperparameters["batch_size"]: batch_size,
    constants.hyperparameters["learning_rate"]: 1e-3,
    constants.hyperparameters["class_weights"]: None,
    constants.hyperparameters["dropout"]: 0.1
}

model = base_model.BaseModel(
    vocabulary_size=vocabulary_size,
    embedding_dim=64,
    hidden_dim=128,
    lstm_dim=64,
    n_labels=2,
    epochs=constants.parameters["epochs"],
    batch_pipeline=BatchPipeline(dataset, submission_test_dataset, batch_size),
    hyperparameters=constants.hyperparameters,
    hparams=hparams,
    class_weights=balanced_class_weights)

# Smoke test on a tiny slice: two examples served as a single batch of two.
# NOTE(review): the meaning of the second argument (10) is defined by
# BaseModel.test_model — TODO confirm and name it.
debugging_dataset = dataset.take(2).batch(2)
model.test_model(debugging_dataset, 10)

Model: "sequential_15"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_16 (Embedding)    (None, None, 64)          320000    
                                                                 
 bidirectional_14 (Bidirecti  (None, None, 128)        66048     
 onal)                                                           
                                                                 
 dense_30 (Dense)            (None, None, 128)         16512     
                                                                 
 global_average_pooling1d_15  (None, 128)              0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dropout_15 (Dropout)        (None, 128)               0         
                                                                 
 dense_31 (Dense)            (None, 2)               

In [67]:
# Train the model (logs go to `log_directory`) and collect the evaluation
# metrics together with the predictions it returns.
accuracy, precision, recall, f1, predictions1 = model.fit_and_evaluate(
    log_directory=log_directory,
)

# One metric per line.
print(
    f"Accuracy: {accuracy}\n"
    f"Precision: {precision}\n"
    f"Recall: {recall}\n"
    f"F1-Score: {f1}"
)

run -> __lr=0.001__batch_size=32__optimizer=adamw__class_weights=None__dropout=0.1 starting...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10

KeyboardInterrupt: 

In [None]:
# Metric lists keyed by name, read from the model's training history object.
history_dict = model.history.history

acc = history_dict['accuracy']
val_acc = history_dict['val_accuracy']
loss = history_dict['loss']
val_loss = history_dict['val_loss']

# 1-based epoch numbers for the x-axis; `epochs`, `acc` and `val_acc` are
# reused by the accuracy plot in the next cell.
epochs = range(1, len(acc) + 1)

# Draw on an explicit Axes rather than the implicit pyplot "current figure",
# so the plot does not depend on whatever figure state the kernel holds.
loss_fig, loss_ax = plt.subplots()
# "bo" is for "blue dot"
loss_ax.plot(epochs, loss, 'bo', label='Training loss')
# b is for "solid blue line"
loss_ax.plot(epochs, val_loss, 'b', label='Validation loss')
loss_ax.set(title='Training and validation loss', xlabel='Epochs', ylabel='Loss')
loss_ax.legend()

plt.show()

In [None]:
# Accuracy curves; relies on `epochs`, `acc` and `val_acc` defined in the
# loss-plot cell above. An explicit Axes is used instead of the implicit
# pyplot "current figure" so the plot does not depend on leftover figure state.
acc_fig, acc_ax = plt.subplots()
acc_ax.plot(epochs, acc, 'bo', label='Training acc')
acc_ax.plot(epochs, val_acc, 'b', label='Validation acc')
acc_ax.set(title='Training and validation accuracy', xlabel='Epochs', ylabel='Accuracy')
acc_ax.legend(loc='lower right')

plt.show()

In [None]:
# batch_pipeline = BatchPipeline(dataset, submission_test_dataset, batch_size)
# predictions = base_model.predict_for_kaggle(batch_pipeline.submission_test_dataset)
# print(f"Predictions: {predictions}")

In [None]:
# submission_test_dataframe = data_pipeline.get_dataframe_from_csv(sample_submission_file_name).fillna(" ")
#
# utils.create_submission_for_kaggle("kaggle_predictions_BaseModel.txt", submission_test_dataframe["id"].values, predictions)

In [None]:
# %load_ext tensorboard

In [None]:
# tensorboard --logdir logs/hyperparameter_tuning_BaseModel --port 5000