# ALBERT Model applied to the Spooky Classification Dataset

- The goal of this notebook is to apply an ALBERT model to the spooky classification dataset. I could have applied it to a simpler dataset such as the Yelp comments, the IMDB reviews or the 20 Newsgroups dataset; however, as a machine-learning exercise and because I want to improve my skills with the TensorFlow framework, I decided to use this dataset.
- The key reference for this notebook is available in the official TF Documentation **https://www.tensorflow.org/tutorials/text/classify_text_with_bert**
<br><br>
**Note:** English is not my primary language; my apologies in advance for any grammar mistakes or typos.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
df = pd.read_csv('/kaggle/input/spooky-author-identification/train.zip')

In [None]:
# The `id` column is not used by any model below, so it is removed.
# Rebinding `df` with errors='ignore' (instead of an in-place drop) keeps this
# cell safe to re-run after the column has already been removed.
df = df.drop(columns='id', errors='ignore')

In [None]:
df.head(2)

In [None]:
 df['text'] = df['text'].str.lower()

In [None]:
df.head(2)

In [None]:
df.author.value_counts()

In [None]:
# Integer class id assigned to each author's initials.
authors_dict = dict(EAP=0, MWS=1, HPL=2)

In [None]:
# Replace the author initials with their integer class ids.
df = df.assign(author=df['author'].map(authors_dict))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.preprocessing import MaxAbsScaler
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.metrics import log_loss, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

In [None]:
# Stratified 70/30 hold-out split of the text and its encoded author label.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['author'],
    test_size=0.30,
    stratify=df['author'],
    random_state=1234,
)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

In [None]:
# Baseline components: unigram TF-IDF features and a Bernoulli Naive Bayes classifier.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),
    min_df=2,
    max_features=5000,
    lowercase=False,  # the text column was already lower-cased above
    stop_words=None,
)
scaler = MaxAbsScaler()  # instantiated here but not part of the pipeline built below
classifier = BernoulliNB(binarize=0, alpha=1)

In [None]:
pl = make_pipeline(vectorizer, classifier)

In [None]:
pl.fit(X_train, y_train)

In [None]:
preds_prob = pl.predict_proba(X_test)
preds_class = pl.predict(X_test)

In [None]:
nb_loss = log_loss(y_test, preds_prob)
print(nb_loss)

In [None]:
print(confusion_matrix(y_test, preds_class))
print('\n')
print(classification_report(y_test, preds_class))

These metrics can be used as a baseline: a pretty simple Naive Bayes model was fitted and achieved an accuracy of around **81%**, which is not bad for such a simple model.

In [None]:
# Key step: pin the version so that pip does not upgrade TensorFlow (and other dependencies) in the Kaggle
# kernel; the upgraded install generates an error when TensorFlow tries to recognize the GPU.
!pip install tensorflow_text==2.3

In [None]:
import tensorflow
import tensorflow_hub as hub
import tensorflow_text as text

In [None]:
import random

# One shared seed for TensorFlow, NumPy and Python's `random` module.
SEED = 1234
tensorflow.random.set_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

In [None]:
# Model handles taken from TensorFlow Hub; both use https so the preprocessing
# model is requested over TLS, consistently with the encoder.
URL_PREPROCESSOR = "https://tfhub.dev/tensorflow/albert_en_preprocess/2"
URL_ENCODER = "https://tfhub.dev/tensorflow/albert_en_base/2"

In [None]:
#Adjusted model based on the example explained in the official documentation available in the following link: 
#https://www.tensorflow.org/tutorials/text/classify_text_with_bert
def build_classifier_model(num_classes=3, dropout_rate=0.50):
    """Build an ALBERT-based text classifier that takes raw strings as input.

    The graph is: string input -> TF Hub preprocessing layer -> trainable ALBERT
    encoder -> dropout on the pooled output -> dense softmax layer.

    Args:
        num_classes: Number of units in the softmax output layer. Defaults to 3,
            the number of authors in `authors_dict`.
        dropout_rate: Dropout rate applied to the encoder's pooled output.
            Defaults to 0.50.

    Returns:
        An uncompiled `tensorflow.keras.Model` mapping the `text` string input
        to the softmax layer's output.
    """
    text_input = tensorflow.keras.layers.Input(shape=(), dtype=tensorflow.string, name='text')

    # The Hub preprocessing layer turns the raw strings into the encoder's inputs.
    preprocessor = hub.KerasLayer(URL_PREPROCESSOR, name='preprocessing')
    # trainable=True: the encoder weights are updated during training.
    encoder = hub.KerasLayer(URL_ENCODER, trainable=True, name='ALBERT_encoder')
    pooled = encoder(preprocessor(text_input))['pooled_output']

    regularized = tensorflow.keras.layers.Dropout(dropout_rate)(pooled)
    probabilities = tensorflow.keras.layers.Dense(
        num_classes, activation='softmax', name='classifier_b')(regularized)
    return tensorflow.keras.Model(text_input, probabilities)

In [None]:
classifier_model = build_classifier_model()

In [None]:
!pip install -q tf-models-official

In [None]:
from official.nlp import optimization  # to create the AdamW optimizer

In [None]:
# Same split parameters (size, stratification, seed) as the Naive Bayes split above,
# but text and label stay together in one dataframe; test_tf is therefore expected
# to hold the same rows, in the same order, as X_test / y_test.
train_tf, test_tf = train_test_split(
    df,
    test_size=0.30,
    stratify=df['author'],
    random_state=1234,
)
print(train_tf.shape, test_tf.shape)

In [None]:
#Utility referenced in the TF documentation to "transform" the pandas dataframe to a TF Tensor
#https://www.tensorflow.org/tutorials/structured_data/preprocessing_layers#create_an_input_pipeline_using_tfdata
def df_to_dataset(dataframe, batch_size=32):
    """Turn a pandas dataframe into a batched `tf.data.Dataset` of (features, label).

    Args:
        dataframe: Frame containing an `author` label column; every other column
            becomes an entry of the features dict. The frame is not modified.
        batch_size: Number of rows per batch.

    Returns:
        A dataset built with `from_tensor_slices` and batched, in the row order
        of `dataframe`.
    """
    features = dataframe.drop(columns='author')
    targets = dataframe['author']
    slices = tensorflow.data.Dataset.from_tensor_slices((dict(features), targets))
    return slices.batch(batch_size)

In [None]:
train_ds = df_to_dataset(train_tf, batch_size=32)

In [None]:
test_ds = df_to_dataset(test_tf, batch_size=32)

In [None]:
# AdamW optimizer with warm-up, as referenced in the documentation example.
# The schedule length is derived from the epoch count, so it has to match the number
# of epochs actually passed to `fit` (3); sizing it for 5 epochs would leave the
# learning-rate decay unfinished when training stops.
EPOCHS = 3
epochs = EPOCHS  # original variable name, kept so existing references still work
steps_per_epoch = tensorflow.data.experimental.cardinality(train_ds).numpy()
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1 * num_train_steps)  # warm up over the first 10% of the steps

init_lr = 3e-5
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')

In [None]:
# Labels are integer class ids and the model ends in a softmax layer,
# hence the sparse categorical cross-entropy loss.
classifier_model.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
#to check the TF version and the GPU used in the training
import tensorflow
print(tensorflow.__version__)
tensorflow.test.gpu_device_name()

In [None]:
EPOCHS = 3
# `train_ds` and `test_ds` are already batched by df_to_dataset, so `batch_size` is
# not passed: the Keras docs say not to specify it for tf.data.Dataset inputs.
with tensorflow.device('/device:GPU:0'):
    history = classifier_model.fit(x=train_ds, validation_data=test_ds, epochs=EPOCHS)

In [None]:
# train_ds is already batched; batch_size should not be specified for dataset inputs.
classifier_model.evaluate(train_ds)

In [None]:
# test_ds is already batched; batch_size should not be specified for dataset inputs.
classifier_model.evaluate(test_ds)

In [None]:
tf_preds = classifier_model.predict(test_ds)

In [None]:
# Sanity check on raw strings: only the first five rows are scored, instead of
# running inference over the whole test split just to display five predictions.
classifier_model.predict(X_test.iloc[0:5])

In [None]:
y_test[0:5]

In [None]:
albert_loss = log_loss(y_test, tf_preds)
print(albert_loss)

In [None]:
# Multiclass loss (sklearn's log_loss) is the metric defined by Kaggle for this competition.
diff_loss = nb_loss - albert_loss
if albert_loss < nb_loss:
    message = "ALBERT improved the loss metric by {}".format(diff_loss)
else:
    message = "loss metric was not improved by ALBERT compared to the base NB model"
print(message)

In [None]:
tf_preds[0:5], y_test.iloc[0:5]

In [None]:
import numpy as np
# Predicted class id = column index of the highest probability in each row.
tf_preds_class = tf_preds.argmax(axis=1)

In [None]:
print(confusion_matrix(y_test, tf_preds_class))
print('\n')
print(classification_report(y_test, tf_preds_class))

In [None]:
dep = pd.read_csv('/kaggle/input/spooky-author-identification/test.zip')

In [None]:
dep.head(2)

In [None]:
# Apply the same lower-casing that was applied to the training text before scoring,
# so the model sees the competition set preprocessed the way it was trained.
dep_preds = classifier_model.predict(dep['text'].str.lower().values)
# Output column i corresponds to class id i, so the author names are ordered by their
# encoded id in authors_dict (EAP, MWS, HPL) rather than being hard-coded a second time.
dep_df = pd.DataFrame(data=dep_preds, columns=sorted(authors_dict, key=authors_dict.get))

In [None]:
# Put the per-author probabilities next to the competition rows, drop the raw text
# and write the submission file.
submit_df = pd.concat([dep, dep_df], axis='columns').drop(columns='text')
submit_df.to_csv('submit.csv', index=False, index_label=False)

In [None]:
submit_df.head(2)

### Final Notes:
An ALBERT (A Lite BERT) model was applied to this dataset, based on an example provided in the official TF documentation. The same optimizer was used; however, the base model was changed and an additional utility was used to transform the pandas dataframe into a TF dataset.

This is an example of how the different transformers available in TF Hub can be used in a text classification task.