In [None]:
!pip install -q transformers ekphrasis keras-tuner

# Imports & Preamble

In [None]:
import math
import pprint
import statistics
import urllib
import urllib.parse

import numpy as np
import pandas as pd
import sklearn
import sklearn.feature_extraction.text
import sklearn.metrics
import sklearn.model_selection
import sklearn.utils.class_weight
from sklearn.linear_model import LogisticRegression
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.layers import (
    Input,
    Dense,
    Embedding,
    Flatten,
    Dropout,
    GlobalMaxPooling1D,
    GRU,
    concatenate,
)
from tensorflow.keras.callbacks import EarlyStopping
from transformers import (
    DistilBertTokenizerFast,
    TFDistilBertModel,
    DistilBertConfig,
)

from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import Tokenizer
from ekphrasis.dicts.emoticons import emoticons
from ekphrasis.dicts.noslang.slangdict import slangdict

import kerastuner

## Helper Function for Assessing Keras Models

In [None]:
def print_metrics(model, x_train, y_train, x_val, y_val):
    """Print accuracy on both splits and the validation F1 score of a Keras model.

    Args:
        model: compiled Keras model that reports a metric named "accuracy"
            and whose predictions are probabilities thresholded at 0.5.
        x_train, y_train: training inputs and labels passed to ``model.evaluate``.
        x_val, y_val: validation inputs and labels passed to ``model.evaluate``
            and ``model.predict``.
    """
    # return_dict=True already yields a {metric_name: value} mapping.
    train_acc = model.evaluate(x_train, y_train, verbose=0, return_dict=True)[
        "accuracy"
    ]
    val_acc = model.evaluate(x_val, y_val, verbose=0, return_dict=True)["accuracy"]

    val_preds = model.predict(x_val)
    val_preds_bool = val_preds >= 0.5

    # f1_score's signature is (y_true, y_pred): ground truth goes first,
    # matching the call in print_metrics_sk.
    val_f1 = sklearn.metrics.f1_score(y_val, val_preds_bool)

    print("")
    print(f"Training Accuracy:   {train_acc:.2%}")
    print(f"Validation Accuracy: {val_acc:.2%}")
    print("")
    print(f"Validation f1 score: {val_f1:.2%}")

# Instantiate Pretrained Bert Model & Tokenizer

In [None]:
# Using DistilBERT:
model_class = TFDistilBertModel
tokenizer_class = DistilBertTokenizerFast
pretrained_weights = 'distilbert-base-uncased'

pretrained_bert_tokenizer = tokenizer_class.from_pretrained(pretrained_weights)

def get_pretrained_bert_model(config=pretrained_weights):
    """Load a fresh copy of the pretrained model named by ``pretrained_weights``.

    Args:
        config: forwarded as ``config=`` to ``from_pretrained``. Defaults to the
            checkpoint name; a falsy value is replaced by
            ``DistilBertConfig(num_labels=2)``.

    Returns:
        The model returned by ``model_class.from_pretrained``.
    """
    effective_config = config if config else DistilBertConfig(num_labels=2)
    return model_class.from_pretrained(pretrained_weights, config=effective_config)



# Load and Examine Data 

In [None]:
# Read the competition's train and test CSV files from the Kaggle kernel's input directory.
train_df = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test_df = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would only append a stray "None" line.
train_df.info()

print("")
print("train rows:", len(train_df.index))
print("test rows:", len(test_df.index))

In [None]:
# Show how many examples carry each target label (last expression is displayed)
print("label counts:")
train_df["target"].value_counts()

In [None]:
print("train precentage of nulls:")
# Divide by the total number of rows: DataFrame.count() only counts non-null
# cells, so nulls / count() is the null-to-non-null ratio, not the share of nulls.
print(round(train_df.isnull().sum() / len(train_df) * 100, 2))

In [None]:
print("test precentage of nulls:")
# Divide by the total number of rows: DataFrame.count() only counts non-null
# cells, so nulls / count() is the null-to-non-null ratio, not the share of nulls.
print(round(test_df.isnull().sum() / len(test_df) * 100, 2))

In [None]:
# Sanity check: every keyword should occur in both the train and the test set
train_keywords = set(train_df["keyword"].dropna())
test_keywords = set(test_df["keyword"].dropna())

all_keywords = train_keywords | test_keywords

# keywords missing from the train set / missing from the test set, respectively
unique_test_keywords = all_keywords - train_keywords
unique_train_keywords = all_keywords - test_keywords

print(f"unique_test_keywords: {unique_test_keywords}")
print(f"unique_train_keywords: {unique_train_keywords}")

# Preprocessing

In [None]:
# "Balanced" per-class weights, used later on to compensate for the slightly imbalanced dataset
classes = np.unique(train_df["target"])
class_weights = sklearn.utils.class_weight.compute_class_weight(
    "balanced", classes=classes, y=train_df["target"]
)

# Keras and scikit-learn both take the weights as a {class_label: weight} mapping
class_weights = dict(zip(classes, class_weights))

## Drop Duplicates

In [None]:
# The graceful handling of duplicates below is commented out because the Kaggle kernel's version of
# statistics.mode() won't handle multimodal results.

# Duplicates aren't consistently labeled, so the idea was to keep one example carrying the most
# frequently occurring label:
# train_df["duplicated"] = train_df.duplicated(subset="text")
# duplicated_tweets = train_df.loc[lambda df: df["duplicated"] == True, :]
# aggregated_duplicates = duplicated_tweets.groupby("text", as_index=False).aggregate(
#     statistics.mode
# )

# train_df.drop_duplicates(subset="text", inplace=True, keep=False)
# train_df = train_df.append(aggregated_duplicates, ignore_index=True)

# Fallback actually used: keep=False discards *every* copy of a tweet whose text occurs more than once.
# NOTE(review): class_weights was computed from train_df["target"] in the preceding cell, i.e. before
# these rows are removed — confirm that is acceptable.
train_df.drop_duplicates(subset="text", inplace=True, keep=False)
print("train rows:", len(train_df.index))
print("test rows:", len(test_df.index))

## Clean Tweets

In [None]:
class TweetPreProcessor:
    """
    Cleans and normalizes raw tweets with ekphrasis before they are handed
    to the subword tokenizer.

    Calling the instance returns the processed tweet as a list of tokens;
    ``preprocess_tweet`` returns the same tokens joined by single spaces.
    """

    def __init__(self):
        # tokenizer handed to ekphrasis (constructed with lowercase=True)
        lowercasing_tokenizer = Tokenizer(lowercase=True).tokenize

        self.text_processor = TextPreProcessor(
            # term types that get normalized
            normalize=["url", "email", "phone", "user", "time", "date"],
            # term types that get annotated
            annotate={"repeated", "elongated"},
            # corpus whose word statistics drive word segmentation
            segmenter="twitter",
            # spell correction, with the corpus whose word statistics drive it
            spell_correction=True,
            corrector="twitter",
            unpack_hashtags=False,  # word segmentation on hashtags is switched off
            unpack_contractions=False,  # contractions are not unpacked (can't stays can't)
            spell_correct_elong=True,  # spell correction for elongated words
            fix_bad_unicode=True,
            tokenizer=lowercasing_tokenizer,
            # dictionaries used to replace tokens extracted from the text
            # with other expressions; more than one may be given
            dicts=[emoticons, slangdict],
        )

    def __call__(self, tweet):
        """Return the processed tweet as a list of tokens."""
        return self.text_processor.pre_process_doc(tweet)

    def preprocess_tweet(self, tweet):
        """Return the processed tweet as a single space-separated string."""
        return " ".join(self(tweet))
    
# Single shared instance, reused by the tweet-cleaning cells and as the TF-IDF tokenizer further down.
tweet_preprocessor = TweetPreProcessor()

In [None]:
# Spot-check the TweetPreProcessor on twenty training tweets (row positions 100-119)
for raw_tweet in train_df["text"].iloc[100:120]:
    print("original:  ", raw_tweet)
    print("processed: ", tweet_preprocessor.preprocess_tweet(raw_tweet))
    print("")

In [None]:
# Replace the raw tweet text with its cleaned form in both data frames
for frame in (train_df, test_df):
    frame["text"] = frame["text"].apply(tweet_preprocessor.preprocess_tweet)

## Clean Keywords

In [None]:
# Fill NA with empty strings.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the chained in-place form is deprecated in pandas and leaves the
# frame untouched when Copy-on-Write is enabled.
train_df["keyword"] = train_df["keyword"].fillna("")
test_df["keyword"] = test_df["keyword"].fillna("")

# Keywords are URL-encoded; decode them (e.g. %20 becomes a space)
train_df["keyword"] = train_df["keyword"].apply(urllib.parse.unquote)
test_df["keyword"] = test_df["keyword"].apply(urllib.parse.unquote)

## Train-Test Split

In [None]:
# Stratified 70/30 split of the labeled data into training and validation sets
x_train, x_val, y_train, y_val = sklearn.model_selection.train_test_split(
    train_df[["text", "keyword"]],
    train_df["target"],
    test_size=0.3,
    random_state=42,
    stratify=train_df["target"],
)

## Tokenize and Encode

In [None]:
def tokenize_encode(tweets, max_length=None):
    """Run ``pretrained_bert_tokenizer`` over a list of strings.

    Special tokens are added and every sequence is truncated / padded to
    ``max_length``; the result is requested as TensorFlow tensors.

    Args:
        tweets: list of strings to encode.
        max_length: target sequence length passed to the tokenizer.

    Returns:
        Whatever the tokenizer call returns for these options.
    """
    tokenizer_options = dict(
        add_special_tokens=True,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="tf",
    )
    return pretrained_bert_tokenizer(tweets, **tokenizer_options)


# The lengths are fixed explicitly (instead of just specifying padding=True in the tokenizer);
# otherwise train tweets end up being 71 long and validation tweets 70, which causes problems/warnings
max_length_tweet = 72
max_length_keyword = 8

# tweets
train_tweets_encoded = tokenize_encode(x_train["text"].to_list(), max_length_tweet)
validation_tweets_encoded = tokenize_encode(x_val["text"].to_list(), max_length_tweet)

# keywords
train_keywords_encoded = tokenize_encode(x_train["keyword"].to_list(), max_length_keyword)
validation_keywords_encoded = tokenize_encode(x_val["keyword"].to_list(), max_length_keyword)

# multi-input variants: the tweet encodings plus the keyword token ids under "keywords"
train_inputs_encoded = dict(
    train_tweets_encoded, keywords=train_keywords_encoded["input_ids"]
)
validation_inputs_encoded = dict(
    validation_tweets_encoded, keywords=validation_keywords_encoded["input_ids"]
)


## Create TF Dataset

In [None]:
# Tweet-only datasets: each element is (dict of tweet encodings, label).
train_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(train_tweets_encoded), y_train)
)

val_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(validation_tweets_encoded), y_val)
)

# Multi-input datasets: the feature dict additionally carries the "keywords" token ids.
train_multi_input_dataset = tf.data.Dataset.from_tensor_slices(
    (train_inputs_encoded, y_train)
)

val_multi_input_dataset = tf.data.Dataset.from_tensor_slices(
    (validation_inputs_encoded, y_val)
)


# Baseline with Logistic Regression on a TF-IDF Bag of Words

## Create TF-IDF Vectors

In [None]:
# Unigram TF-IDF with L2-normalized rows; the shared TweetPreProcessor instance is the tokenizer
# (calling it returns a token list).
# NOTE(review): x_train["text"] / x_val["text"] were already passed through
# tweet_preprocessor.preprocess_tweet in the "Clean Tweets" section, so the text is processed a
# second time here — confirm this is intended.
tfidf_vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(
    tokenizer=tweet_preprocessor, min_df=1, ngram_range=(1, 1), norm="l2"
)

# Fit the vocabulary on the training split only, then reuse it for validation.
# .toarray() converts the vectorizer output into dense arrays.
train_vectors = tfidf_vectorizer.fit_transform(raw_documents=x_train["text"]).toarray()
validation_vectors = tfidf_vectorizer.transform(x_val["text"]).toarray()

## Run Classifier

In [None]:
# The value of C came from experimenting with LogisticRegressionCV; that search is left out for brevity
logisticRegressionClf = LogisticRegression(n_jobs=-1, C=2.78)
logisticRegressionClf.fit(train_vectors, y_train)

def print_metrics_sk(clf, x_train, y_train, x_val, y_val):
    """Print train/validation accuracy and validation F1 for a fitted scikit-learn classifier.

    Args:
        clf: fitted estimator exposing ``score`` and ``predict``.
        x_train, y_train: training features and labels.
        x_val, y_val: validation features and labels.
    """
    train_accuracy = clf.score(x_train, y_train)
    val_accuracy = clf.score(x_val, y_val)
    val_f1 = sklearn.metrics.f1_score(y_val, clf.predict(x_val))

    print(f"Train Accuracy:         {train_accuracy:.2%}")
    print(f"Validation Accuracy:    {val_accuracy:.2%}")
    print("")
    print(f"f1 score:               {val_f1:.2%}")

print_metrics_sk(logisticRegressionClf, train_vectors, y_train, validation_vectors, y_val)

# Feature Extraction with BERT

## Extract Sentence Vectors and Attention Embeddings

In [None]:
# Pretrained model used purely as a fixed feature extractor (it is only ever run via predict() here).
feature_extractor = get_pretrained_bert_model()

# Run a forward pass on the tokenized inputs.
# Earlier direct-call variant, kept for reference:
# model_outputs = feature_extractor(
#     train_tweets_encoded["input_ids"], train_tweets_encoded["attention_mask"]
# )
# predict() is fed the dataset in batches of 32.
model_outputs = feature_extractor.predict(
    train_dataset.batch(32)
)
# BERT's sentence representation can be retrieved from the hidden vector at index 0 in the sequence
# (where the special token CLS was prepended by the tokenizer)
train_sentence_vectors = model_outputs.last_hidden_state[:, 0, :]

# The rest of the sequence contains the embeddings
# (modified by successive layers of self-attention) for each token
train_word_vectors = model_outputs.last_hidden_state[:, 1:, :]

# And the same again for the validation set
# model_outputs = feature_extractor(
#     validation_tweets_encoded["input_ids"], validation_tweets_encoded["attention_mask"]
# )
model_outputs = feature_extractor.predict(
    val_dataset.batch(32)
)
validation_sentence_vectors = model_outputs.last_hidden_state[:, 0, :]
validation_word_vectors = model_outputs.last_hidden_state[:, 1:, :]

## Logistic Regression with BERT Sentence Vectors

In [None]:
# Logistic regression on the sentence vectors at sequence index 0, weighted by the class weights
# computed during preprocessing. This rebinds logisticRegressionClf, replacing the TF-IDF baseline.
logisticRegressionClf = LogisticRegression(n_jobs=-1, class_weight=class_weights)
logisticRegressionClf.fit(train_sentence_vectors, y_train)

print_metrics_sk(
    logisticRegressionClf,
    train_sentence_vectors,
    y_train,
    validation_sentence_vectors,
    y_val,
)

## RNN with BERT Attention Embeddings 

In [None]:
def create_gru_model() -> keras.Model:
    """Build and compile a GRU classifier over the precomputed token embeddings.

    The input shape is taken from ``train_word_vectors`` (everything but the
    batch axis). The sequence goes through a 32-unit GRU, is max-pooled over
    the sequence axis and ends in a single sigmoid unit.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    model = keras.Sequential()
    model.add(keras.layers.InputLayer(input_shape=train_word_vectors.shape[1:]))
    model.add(GRU(32, return_sequences=True))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(1, activation="sigmoid"))

    model.compile(
        optimizer=keras.optimizers.Adam(),
        loss="binary_crossentropy",
        # compile() documents `metrics` as a list; a bare metric object is
        # rejected by newer Keras releases.
        metrics=[keras.metrics.BinaryAccuracy(name="accuracy")],
    )
    return model

model = create_gru_model()

# Train on the fixed embeddings; stop once validation accuracy has not improved
# by at least 0.001 for 5 epochs and roll back to the best weights.
history = model.fit(
    train_word_vectors,
    y_train,
    validation_data=(validation_word_vectors, y_val),
    class_weight=class_weights,
    epochs=20,
    verbose=0,
    callbacks=[
        EarlyStopping(
            monitor="val_accuracy",
            min_delta=0.001,
            patience=5,
            restore_best_weights=True,
        )
    ],
)

print_metrics(model, train_word_vectors, y_train, validation_word_vectors, y_val)

## Multi-Input Classifier with Sentence Vectors & Keywords

In [None]:
def create_multi_input_model() -> keras.Model:
    """Build and compile a two-input classifier: keyword token ids + sentence vectors.

    Inputs (by name):
        "keywords": token ids of length ``max_length_keyword``; embedded
            (16 dims), flattened and reduced to one linear unit.
        "tweets": sentence vectors with the width of ``train_sentence_vectors``;
            reduced to one ReLU unit.

    Both single-unit features are concatenated and fed to a sigmoid output.

    Returns:
        The compiled ``keras.Model``.
    """
    # Use the same constant the keywords were tokenized with, not a literal 8.
    keyword_ids = keras.Input((max_length_keyword,), name="keywords")
    keyword_features = Embedding(
        input_dim=feature_extractor.config.vocab_size,
        output_dim=16,
        input_length=max_length_keyword,
        mask_zero=True,
    )(keyword_ids)
    keyword_features = Flatten()(keyword_features)
    keyword_features = Dense(1)(keyword_features)

    tweet_classification_vectors = keras.Input((train_sentence_vectors.shape[1],), name="tweets")
    tweet_features = Dense(1, activation='relu')(tweet_classification_vectors)

    combined_features = concatenate([keyword_features, tweet_features])
    combined_prediction = Dense(1, activation="sigmoid")(combined_features)

    model = keras.Model(inputs = [keyword_ids, tweet_classification_vectors], outputs=combined_prediction)

    model.compile(
        optimizer=keras.optimizers.Adam(),
        loss="binary_crossentropy",
        # compile() documents `metrics` as a list; a bare metric object is
        # rejected by newer Keras releases.
        metrics=[keras.metrics.BinaryAccuracy(name="accuracy")],
    )
    return model


model = create_multi_input_model()

# Dict keys must match the Input layer names above.
train_inputs = {"keywords" : train_keywords_encoded["input_ids"], "tweets" : train_sentence_vectors}
validation_inputs = {"keywords" : validation_keywords_encoded["input_ids"], "tweets" : validation_sentence_vectors}

history = model.fit(
    train_inputs,
    y_train,
    validation_data=(validation_inputs, y_val),
    class_weight=class_weights,
    epochs=20,
    verbose=0,
    callbacks=[
        EarlyStopping(
            monitor="val_accuracy",
            min_delta=0.001,
            patience=5,
            restore_best_weights=True,
        )
    ],
)


print_metrics(model, train_inputs, y_train, validation_inputs, y_val)

## RNN with Attention Embeddings & Keywords

In [None]:
def create_multi_input_rnn_model() -> keras.Model:
    """Build and compile a two-input classifier: keyword token ids + token embeddings.

    Inputs (by name):
        "keywords": token ids of length ``max_length_keyword``; embedded
            (16 dims), flattened and reduced to one linear unit.
        "tweets": token embeddings shaped like ``train_word_vectors`` (without
            the batch axis); passed through a 32-unit GRU, max-pooled over the
            sequence axis and reduced to one ReLU unit.

    Both single-unit features are concatenated and fed to a sigmoid output.

    Returns:
        The compiled ``keras.Model``.
    """
    # Use the same constant the keywords were tokenized with, not a literal 8.
    keyword_ids = keras.Input((max_length_keyword,), name="keywords")
    keyword_features = Embedding(
        input_dim=feature_extractor.config.vocab_size,
        output_dim=16,
        input_length=max_length_keyword,
        mask_zero=True,
    )(keyword_ids)
    keyword_features = Flatten()(keyword_features)
    keyword_features = Dense(1)(keyword_features)

    tweet_token_embeddings = Input(train_word_vectors.shape[1:], name="tweets")
    tweet_features = GRU(32, return_sequences=True)(tweet_token_embeddings)
    tweet_features = GlobalMaxPooling1D()(tweet_features)
    tweet_features = Dense(1, activation='relu')(tweet_features)

    combined_features = concatenate([keyword_features, tweet_features])
    combined_prediction = Dense(1, activation="sigmoid")(combined_features)

    model = keras.Model(inputs = [keyword_ids, tweet_token_embeddings], outputs=combined_prediction)

    model.compile(
        optimizer=keras.optimizers.Adam(),
        loss="binary_crossentropy",
        # compile() documents `metrics` as a list; a bare metric object is
        # rejected by newer Keras releases.
        metrics=[keras.metrics.BinaryAccuracy(name="accuracy")],
    )
    return model


model = create_multi_input_rnn_model()

# Dict keys must match the Input layer names above.
train_inputs = {"keywords" : train_keywords_encoded["input_ids"], "tweets" : train_word_vectors}
validation_inputs = {"keywords" : validation_keywords_encoded["input_ids"], "tweets" : validation_word_vectors}

history = model.fit(
    train_inputs,
    y_train,
    validation_data=(validation_inputs, y_val),
    class_weight=class_weights,
    epochs=20,
    verbose=0,
    callbacks=[
        EarlyStopping(
            monitor="val_accuracy",
            min_delta=0.001,
            patience=5,
            restore_best_weights=True,
        )
    ],
)

print_metrics(model, train_inputs, y_train, validation_inputs, y_val)

# Architecture Search For Best Classification Head

In [None]:
def create_candidate_model_with_fx(hp: kerastuner.HyperParameters) -> keras.Model:
    """Model-building function for the tuner: classification head on fixed features.

    Hyperparameters sampled from ``hp``:
        keyword_units: width of the dense layer after the keyword embedding.
        GRU_units / GRU_dropout: GRU width and the dropout applied to its output.
        num_layers: number (0-3) of extra dense blocks after pooling, each with
            its own ``layer_<i>_units`` and ``layer_<i>_dropout``.

    Args:
        hp: hyperparameter container supplied by the tuner.

    Returns:
        The compiled ``keras.Model`` with inputs named "keywords" and "tweets".
    """
    # Use the same constant the keywords were tokenized with, not a literal 8.
    keyword_ids = keras.Input((max_length_keyword,), name="keywords")
    keyword_features = Embedding(
        input_dim=feature_extractor.config.vocab_size,
        output_dim=16,
        input_length=max_length_keyword,
        mask_zero=True,
    )(keyword_ids)
    keyword_features = Flatten()(keyword_features)
    keyword_features = Dense(hp.Choice("keyword_units", values=[1, 8, 16, 32], default=1))(keyword_features)

    tweet_token_embeddings = Input(train_word_vectors.shape[1:], name="tweets")

    tweet_features = GRU(hp.Choice("GRU_units", values=[8, 16, 32, 64, 128], default=32), return_sequences=True)(tweet_token_embeddings)
    tweet_features = Dropout(hp.Float("GRU_dropout", min_value=0.0, max_value=0.5, step=0.1))(tweet_features)
    tweet_features = GlobalMaxPooling1D()(tweet_features)

    # Optional stack of dense + dropout blocks; both are tuned per layer.
    for i in range(hp.Int("num_layers", min_value=0, max_value=3, step=1)):
        tweet_features = Dense(hp.Choice("layer_" + str(i) + "_units", values=[2, 8, 16, 32, 64, 128, 256]), activation="relu")(tweet_features)
        tweet_features = Dropout(hp.Float("layer_" + str(i) + "_dropout", min_value=0.0, max_value=0.5, step=0.1))(tweet_features)

    combined_features = concatenate([keyword_features, tweet_features])
    combined_prediction = Dense(1, activation="sigmoid")(combined_features)

    model = keras.Model(inputs = [keyword_ids, tweet_token_embeddings], outputs=combined_prediction)

    model.compile(
        optimizer=keras.optimizers.Adam(),
        loss="binary_crossentropy",
        # compile() documents `metrics` as a list; a bare metric object is
        # rejected by newer Keras releases.
        metrics=[keras.metrics.BinaryAccuracy(name="accuracy")],
    )
    return model

# Inputs for the search; dict keys must match the Input layer names above.
train_inputs = {"keywords" : train_keywords_encoded["input_ids"], "tweets" : train_word_vectors}
validation_inputs = {"keywords" : validation_keywords_encoded["input_ids"], "tweets" : validation_word_vectors}


In [None]:
# Hyperband Tuning
MAX_EPOCHS = 10
FACTOR = 3
ITERATIONS = 3

# log of MAX_EPOCHS in base FACTOR, shared by both estimates below
log_epochs = math.log(MAX_EPOCHS, FACTOR)
models_per_bracket = math.ceil(1 + log_epochs)
epochs_over_all_trials = round(ITERATIONS * (MAX_EPOCHS * (log_epochs ** 2)))

print(f"Number of models in each bracket: {models_per_bracket}")
print(f"Number of epochs over all trials: {epochs_over_all_trials}")

In [None]:
# Hyperband search over create_candidate_model_with_fx, maximizing validation accuracy.
tuner = kerastuner.Hyperband(
    create_candidate_model_with_fx,
    max_epochs=MAX_EPOCHS,
    hyperband_iterations=ITERATIONS,
    factor=FACTOR,
    objective="val_accuracy",
    directory="hyperparam-search",
    project_name="architecture-hyperband",
)

tuner.search(
    train_inputs,
    y_train,
    validation_data=(validation_inputs, y_val),
    class_weight=class_weights,
    # Reuse the constant given to the tuner so the two limits cannot drift apart.
    epochs=MAX_EPOCHS,
    verbose=1,
    callbacks=[
        EarlyStopping(
            monitor="val_accuracy",
            min_delta=0.001,
            patience=3,
            restore_best_weights=True,
        )
    ],
)


In [None]:
# tuner.results_summary()

In [None]:
# First entry of the tuner's best-models list.
best_model = tuner.get_best_models()[0]
# best_model.summary()
print("")
# First entry of the tuner's best-hyperparameters list; reused later when
# building the fine-tuning candidate model.
best_arch_hp = tuner.get_best_hyperparameters()[0]
pprint.pprint(best_arch_hp.values, indent=4)
print("")

print_metrics(best_model, train_inputs, y_train, validation_inputs, y_val)

# BERT Fine Tuning

N.B. Typically one might freeze the base model, train the added classifier for a bit, unfreeze the base model, lower the learning rate and train the whole model again. 

However, Hugging Face recommends training an unfrozen model right from the beginning (with a low learning rate), as this works better with transformers. 

I tried both and there seemed to be no advantage to freeze-unfreeze. Sometimes it even reported an inferior score. However, it's hard to be certain given the large random fluctuations between training runs with such a small dataset. I didn't test this with k-fold validation, which may have yielded more conclusive results. 

I have read in some papers that gradual unfreezing of the blocks in the base model can lead to better results.

## Fine-Tune BERT with Simple Head on Sentence Vector

In [None]:
# Baseline: the simplest possible fine-tuned BERT
def create_bert_simple_for_ft():
    """Build (without compiling) BERT plus a single sigmoid unit on the sentence vector.

    The model takes "input_ids" and "attention_mask" inputs of length
    ``max_length_tweet`` and classifies from the hidden state at sequence index 0.

    Returns:
        The uncompiled ``keras.Model``.
    """
    token_shape = (max_length_tweet,)
    input_ids = Input(shape=token_shape, dtype="int32", name="input_ids")
    attention_mask = Input(shape=token_shape, dtype="int32", name="attention_mask")

    bert = get_pretrained_bert_model()
    bert_outputs = bert(input_ids, attention_mask)

    prediction = Dense(1, activation="sigmoid")(bert_outputs.last_hidden_state[:, 0, :])
    return keras.Model(inputs=[input_ids, attention_mask], outputs=prediction)

model = create_bert_simple_for_ft()

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# Stop once validation accuracy stalls for 5 epochs and roll back to the best weights
early_stopping = EarlyStopping(
    monitor="val_accuracy",
    min_delta=0.001,
    patience=5,
    restore_best_weights=True,
)

model.fit(
    train_dataset.batch(32),
    validation_data=val_dataset.batch(32),
    class_weight=class_weights,
    epochs=20,
    callbacks=[early_stopping],
)

print_metrics(
    model, dict(train_tweets_encoded), y_train, dict(validation_tweets_encoded), y_val
)


## Fine-Tune BERT with RNN on Attention Embeddings and Keywords

In [None]:
def create_bert_rnn_for_ft():
    """Build and compile BERT with a GRU head plus a keyword branch, for fine-tuning.

    Inputs (by name):
        "keywords": token ids of length ``max_length_keyword``; embedded
            (16 dims), flattened and reduced to one linear unit.
        "input_ids" / "attention_mask": int32 tensors of length
            ``max_length_tweet`` fed to a freshly loaded pretrained model.

    The hidden states from sequence index 1 onward go through a 32-unit GRU and
    max-pooling; the result is concatenated with the keyword feature and fed to
    a sigmoid output. Compiled with Adam at learning rate 5e-5.

    Returns:
        The compiled ``keras.Model``.
    """
    pretrained_bert_model = get_pretrained_bert_model()

    # Use the same constant the keywords were tokenized with, not a literal 8.
    keyword_ids = keras.Input((max_length_keyword,), name="keywords")
    keyword_features = Embedding(
        input_dim=pretrained_bert_model.config.vocab_size,
        output_dim=16,
        input_length=max_length_keyword,
        mask_zero=True,
    )(keyword_ids)
    keyword_features = Flatten()(keyword_features)
    keyword_features = Dense(1)(keyword_features)

    input_ids = Input(shape=(max_length_tweet,), dtype="int32", name="input_ids")
    attention_mask = Input(shape=(max_length_tweet,), dtype="int32", name="attention_mask")
    bert_outputs = pretrained_bert_model(input_ids, attention_mask)

    # Skip index 0 (the sentence-level vector) and keep the per-token states.
    bert_token_embeddings = bert_outputs.last_hidden_state[:, 1:, :]
    tweet_features = GRU(32, return_sequences=True)(bert_token_embeddings)
    tweet_features = GlobalMaxPooling1D()(tweet_features)

    combined_features = concatenate([keyword_features, tweet_features])
    combined_prediction = Dense(1, activation="sigmoid")(combined_features)

    model = keras.Model(inputs = [keyword_ids, input_ids, attention_mask], outputs=combined_prediction)

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=5e-5),
        loss="binary_crossentropy",
        # compile() documents `metrics` as a list; a bare metric object is
        # rejected by newer Keras releases.
        metrics=[keras.metrics.BinaryAccuracy(name="accuracy")],
    )
    return model

model = create_bert_rnn_for_ft()

model.fit(
    train_multi_input_dataset.batch(32),
    validation_data=val_multi_input_dataset.batch(32),
    epochs=20,
    class_weight=class_weights,
    callbacks=[
        EarlyStopping(
            monitor="val_accuracy",
            min_delta=0.001,
            patience=3,
            restore_best_weights=True,
        )
    ],
)

print_metrics(
    model, train_inputs_encoded, y_train, validation_inputs_encoded, y_val
)

## Fine-Tune BERT with Best Classification Head

In [None]:
def create_model_candidate() -> keras.Model:
    """Build and compile BERT with the head selected by the architecture search.

    Layer sizes and dropout rates are read from ``best_arch_hp``
    (``keyword_units``, ``GRU_units``, ``GRU_dropout``, ``num_layers`` and the
    per-layer ``layer_<i>_units`` / ``layer_<i>_dropout``). The model takes
    inputs named "keywords", "input_ids" and "attention_mask" and is compiled
    with Adam at learning rate 5e-5.

    Returns:
        The compiled ``keras.Model``.
    """
    pretrained_bert_model = get_pretrained_bert_model()

    # Use the same constant the keywords were tokenized with, not a literal 8.
    keyword_ids = keras.Input((max_length_keyword,), name="keywords")
    keyword_features = Embedding(
        input_dim=pretrained_bert_model.config.vocab_size,
        output_dim=16,
        input_length=max_length_keyword,
        mask_zero=True,
    )(keyword_ids)
    keyword_features = Flatten()(keyword_features)
    keyword_features = Dense(best_arch_hp.get("keyword_units"))(keyword_features)

    input_ids = Input(shape=(max_length_tweet,), dtype="int32", name="input_ids")
    attention_mask = Input(shape=(max_length_tweet,), dtype="int32", name="attention_mask")
    bert_outputs = pretrained_bert_model(input_ids, attention_mask)
    # Skip index 0 (the sentence-level vector) and keep the per-token states.
    bert_token_embeddings = bert_outputs.last_hidden_state[:, 1:, :]
    tweet_features = GRU(best_arch_hp.get("GRU_units"), return_sequences=True)(bert_token_embeddings)
    tweet_features = Dropout(best_arch_hp.get("GRU_dropout"))(tweet_features)
    tweet_features = GlobalMaxPooling1D()(tweet_features)

    # Rebuild the dense + dropout stack chosen by the search.
    for i in range(best_arch_hp.get("num_layers")):
        tweet_features = Dense(best_arch_hp.get("layer_" + str(i) + "_units"), activation="relu")(tweet_features)
        tweet_features = Dropout(best_arch_hp.get("layer_" + str(i) + "_dropout"))(tweet_features)

    combined_features = concatenate([keyword_features, tweet_features])
    combined_prediction = Dense(1, activation="sigmoid")(combined_features)

    model = keras.Model(inputs = [keyword_ids, input_ids, attention_mask], outputs=combined_prediction)

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=5e-5),
        loss="binary_crossentropy",
        # compile() documents `metrics` as a list; a bare metric object is
        # rejected by newer Keras releases.
        metrics=[keras.metrics.BinaryAccuracy(name="accuracy")],
    )
    return model


In [None]:
model = create_model_candidate()

# Default EarlyStopping patience (0): training halts at the first epoch whose
# validation accuracy fails to improve, and the best weights are restored.
history = model.fit(
    train_multi_input_dataset.batch(32),
    validation_data=val_multi_input_dataset.batch(32),
    epochs=6,
    class_weight=class_weights,
    callbacks=[
        keras.callbacks.EarlyStopping(
            monitor="val_accuracy", restore_best_weights=True
        )
    ],
)

# Number of epochs that produced the best validation accuracy (1-based).
# len(history) - 1 is only right when early stopping fired; if all epochs ran
# and kept improving it would undercount by one. argmax picks the first best epoch.
best_epoch = int(np.argmax(history.history["val_accuracy"])) + 1

print_metrics(
    model, train_inputs_encoded, y_train, validation_inputs_encoded, y_val
)

# Create Submission

## Tokenize and Encode Test Set

In [None]:
# Tweet-only test inputs: the dataset is built before the "keywords" entry is added to the dict below.
test_tweets_encoded = tokenize_encode(test_df["text"].to_list(), max_length_tweet)
test_inputs_encoded = dict(test_tweets_encoded)
test_dataset = tf.data.Dataset.from_tensor_slices(test_inputs_encoded)

# Multi-input test inputs: the same dict, extended with the keyword token ids.
test_keywords_encoded = tokenize_encode(test_df["keyword"].to_list(), max_length_keyword)
test_inputs_encoded["keywords"] = test_keywords_encoded["input_ids"]
test_multi_input_dataset = tf.data.Dataset.from_tensor_slices(test_inputs_encoded)

## Train Model

In [None]:
# Retrain a fresh candidate on all labeled data (training examples followed by validation examples)
# for the number of epochs stored in best_epoch; there is no validation set left to monitor.
# NOTE(review): the concatenated dataset is batched without shuffling, so every epoch sees the
# examples in the same order — confirm this is intended.
full_train_dataset = train_multi_input_dataset.concatenate(val_multi_input_dataset)
model = create_model_candidate()

model.fit(
    full_train_dataset.batch(32),
    epochs=best_epoch,
    class_weight=class_weights,
)

## Save Predictions

In [None]:
# Predict on the test set, threshold at 0.5 and write the submission file
test_batches = test_multi_input_dataset.batch(32)
preds = np.squeeze(model.predict(test_batches))
preds = (preds >= 0.5).astype(int)

submission = pd.DataFrame({"id": test_df.id, "target": preds})
submission.to_csv("submission.csv", index=False)