In [None]:
from transformers import (
   AutoConfig,
   AutoModel,
   AutoTokenizer,
   TFAutoModelForSequenceClassification,
   AdamW
#    glue_convert_examples_to_features
)
from sklearn.model_selection import train_test_split
import tensorflow as tf
import tensorflow_datasets as tfds
import json
import pandas as pd
import sklearn
import numpy as np

In [None]:
# Read the labelled tab-separated file and hold out 20% of the rows for
# evaluation; the fixed random_state keeps the split the same between runs.
df = pd.read_csv('labeled/labeled.tsv', sep='\t')
texts = df['text'].to_list()
labels = df['label']
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Model selection
# @markdown >The default model is <i><b>COVID-Twitter-BERT</b></i>. You can also choose <i><b>BERT Base</b></i> or <i><b>BERT Large</b></i> to compare them with <i><b>COVID-Twitter-BERT</b></i>. All three models are initialised with a random classification layer, so if you go directly to the Predict cell after compiling the model it will still run the prediction, but the output will be random. The training steps below finetune the model for the specific task. <br /><br />
model_name = 'digitalepidemiologylab/covid-twitter-bert' #@param ["digitalepidemiologylab/covid-twitter-bert", "bert-large-uncased", "bert-base-uncased"]

# Load the tokenizer that belongs to the selected checkpoint
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

In [None]:
def encode_fn(text_list, max_length=None):
    """Tokenize a list of texts with the notebook-level ``tokenizer``.

    Sequences are padded to the longest sequence in ``text_list`` and
    truncated to ``max_length`` tokens; the result is requested as
    TensorFlow tensors.

    Args:
        text_list: The texts to encode.
        max_length: Maximum number of tokens per sequence. When ``None``
            (the default) the notebook-level ``max_seq_length`` setting is
            used, so existing ``encode_fn(texts)`` calls behave as before.

    Returns:
        A tuple ``(input_ids, token_type_ids, attention_mask)``. Note that
        this order is NOT the positional order Hugging Face TF models expect
        for list inputs (``input_ids, attention_mask, token_type_ids``), so
        pass the tensors to the model by name.
    """
    if max_length is None:
        # Resolved at call time so the setting may be defined in a later cell.
        max_length = max_seq_length
    encoded = tokenizer(
        text_list,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='tf',
    )
    return encoded['input_ids'], encoded['token_type_ids'], encoded['attention_mask']

In [None]:
# Parameters
#@markdown >Batch size and sequence length need to be set to prepare the data. The size of the batches depends on available memory. For a Colab GPU, limit the batch size to 8 and the sequence length to 96. By reducing the length of the input (max_seq_length) you can also increase the batch size, which will likely benefit training on datasets with lots of short texts.
max_seq_length = 96 #@param {type: "integer"}
train_batch_size = 8 #@param {type: "integer"}
eval_batch_size = 8 #@param {type: "integer"}


#@markdown >You may not need every example to train a decent model. To cut down training time, reduce this to a percentage of the entire set.
use_percentage_of_data = 100 #@param {type: "slider", min: 1, max: 100}

# Sizes of the two splits and the number of target classes
num_train_examples, num_dev_examples = len(X_train), len(X_test)
num_labels = 2

# Label name -> class index, used when formatting predictions
label_mapping = {str(class_id): class_id for class_id in range(num_labels)}

print(f'\n\nThe dataset is downloaded. The entire dataset has {num_train_examples + num_dev_examples} examples of which you are using {use_percentage_of_data}%. This will result in a train dataset with {int(num_train_examples * (use_percentage_of_data/100))} examples and a validation dataset with {int(num_dev_examples * (use_percentage_of_data/100))} examples.')

In [None]:
# Tokenize both splits; each call returns (input_ids, token_type_ids, attention_mask).
train_encoded = encode_fn(X_train)
dev_encoded = encode_fn(X_test)
train_input_ids, train_token_type_ids, train_attention_mask = train_encoded
dev_input_ids, dev_token_type_ids, dev_attention_mask = dev_encoded

In [None]:
#@markdown >The default learning rate of 2e5 will be fine in most cases
learning_rate = 2e-5 #@param {type: "number"}

#@markdown > Typically these type of models are finetuned for 3 epochs. This can be increased for small datasets and decreased for large datasets.
num_epochs = 1  #@param {type: "integer"}

# Initialise a Model for Sequence Classification with 2 labels
config = AutoConfig.from_pretrained(model_name, num_labels=num_labels)
model = TFAutoModelForSequenceClassification.from_pretrained(model_name, config=config)

# Optimizer and loss
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Metrics and callbacks
metrics = [tf.keras.metrics.SparseCategoricalAccuracy('accuracy', dtype=tf.float32)]
checkpoint_path = './checkpoints/checkpoint.{epoch:02d}'
callbacks = [tf.keras.callbacks.ModelCheckpoint(checkpoint_path, save_weights_only=True)]

# Compute some variables
train_steps_per_epoch = int(num_train_examples * (use_percentage_of_data/100) / train_batch_size)
dev_steps_per_epoch = int(num_dev_examples * (use_percentage_of_data/100) / eval_batch_size)


# Compile model
model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

# Train the model
history = model.fit(
  [train_input_ids, train_token_type_ids, train_attention_mask], tf.constant(y_train.to_list()),
  epochs=num_epochs,
  validation_data=([dev_input_ids, dev_token_type_ids, dev_attention_mask], tf.constant(y_test.to_list())),
  callbacks=callbacks)

# Print some information about the training
print(f'\nThe training has finished training after {num_epochs} epochs.')
print('\nThe history contains the accuracy and loss at every epoch:')
print(json.dumps(history.history, indent=4))

print('\nThe checkpoint callback has generated a checkpoint after every epoch (loss being the training loss, val_loss is the validation loss):')
!ls -lha ./checkpoints/

print('\nWe will now save the finetuned model and the corresponding config file on your Colab disk.')
model.save_pretrained('./huggingface_model/')

print('\nTensorflow model and config-file is saved in ./huggingface_model/')
!ls -lha ./huggingface_model/

In [None]:
# Helper that turns raw model output into readable per-example predictions
def format_prediction(preds, label_mapping, label_name):
    """Convert model output into a list of labelled probability dicts.

    ``preds[0]`` is passed through a softmax over axis 1. For every row the
    probabilities are paired, in order, with ``label_mapping.values()`` and
    sorted from most to least probable.

    Returns a list with one dict per row holding the most probable label under
    ``label_name`` and the sorted label -> probability dict under
    ``f'{label_name}_probabilities'``.
    """
    probabilities = tf.nn.softmax(preds[0], axis=1).numpy()
    results = []
    for row in probabilities:
        # Plain Python floats keep the result printable / JSON-friendly.
        by_label = dict(zip(label_mapping.values(), map(float, row)))
        ranked = dict(sorted(by_label.items(), key=lambda pair: pair[1], reverse=True))
        results.append({
            label_name: list(ranked)[0],
            f'{label_name}_probabilities': ranked,
        })
    return results

### Train Data Evaluation

In [None]:
# Score the training split in mini-batches: a single forward pass over the
# whole split can exhaust accelerator memory. The attention mask and token type
# ids are passed by name together with the ids so padding positions are ignored.
train_logits = tf.concat(
    [
        model({
            'input_ids': train_input_ids[start:start + eval_batch_size],
            'token_type_ids': train_token_type_ids[start:start + eval_batch_size],
            'attention_mask': train_attention_mask[start:start + eval_batch_size],
        })[0]
        for start in range(0, train_input_ids.shape[0], eval_batch_size)
    ],
    axis=0,
)
# Keep the tuple shape `format_prediction` indexes into (element 0 = logits).
train_pred = (train_logits,)
formatted_preds = format_prediction(train_pred, label_mapping, 'antivax')
train_p = [p['antivax'] for p in formatted_preds]

In [None]:
# Confusion matrix on the training split (rows: true label, columns: predicted label)
sklearn.metrics.confusion_matrix(y_train, train_p)

In [None]:
# Fraction of training examples whose predicted label matches the true label
sklearn.metrics.accuracy_score(y_train, train_p)

In [None]:
# Per-class precision, recall, F-score and support on the training split
sklearn.metrics.precision_recall_fscore_support(y_train, train_p)

### Test Data Evaluation

In [None]:
# Score the held-out split in mini-batches: a single forward pass over the
# whole split can exhaust accelerator memory. The attention mask and token type
# ids are passed by name together with the ids so padding positions are ignored.
test_logits = tf.concat(
    [
        model({
            'input_ids': dev_input_ids[start:start + eval_batch_size],
            'token_type_ids': dev_token_type_ids[start:start + eval_batch_size],
            'attention_mask': dev_attention_mask[start:start + eval_batch_size],
        })[0]
        for start in range(0, dev_input_ids.shape[0], eval_batch_size)
    ],
    axis=0,
)
# Keep the tuple shape `format_prediction` indexes into (element 0 = logits).
test_pred = (test_logits,)
formatted_preds = format_prediction(test_pred, label_mapping, 'antivax')
test_p = [p['antivax'] for p in formatted_preds]

In [None]:
# Confusion matrix on the held-out split (rows: true label, columns: predicted label)
sklearn.metrics.confusion_matrix(y_test, test_p)

In [None]:
# Fraction of held-out examples whose predicted label matches the true label
sklearn.metrics.accuracy_score(y_test, test_p)

In [None]:
# Per-class precision, recall, F-score and support on the held-out split
sklearn.metrics.precision_recall_fscore_support(y_test, test_p)

### Conservative Posts Evaluation

In [None]:
# Load the labelled conservative posts and drop rows whose `antivax` label is
# one of the excluded codes.
con_df = pd.read_csv('labeled/con_post_label.csv', encoding= 'unicode_escape')
con_df = con_df[~con_df['antivax'].isin([-1, 2, 3, 4])]
# Keep the attention mask and token type ids: without the mask the model also
# attends to padding tokens.
input_ids, token_type_ids, attention_mask = encode_fn(con_df['Message'].to_list())
# Score in mini-batches so the whole file never goes through one forward pass.
con_logits = tf.concat(
    [
        model({
            'input_ids': input_ids[start:start + eval_batch_size],
            'token_type_ids': token_type_ids[start:start + eval_batch_size],
            'attention_mask': attention_mask[start:start + eval_batch_size],
        })[0]
        for start in range(0, input_ids.shape[0], eval_batch_size)
    ],
    axis=0,
)
# `format_prediction` reads the logits from element 0 of its first argument.
formatted_preds = format_prediction((con_logits,), label_mapping, 'antivax')
con_preds = [p['antivax'] for p in formatted_preds]

In [None]:
# Confusion matrix for the conservative posts (rows: true label, columns: predicted label)
sklearn.metrics.confusion_matrix(con_df['antivax'], con_preds)

In [None]:
# Fraction of conservative posts whose predicted label matches the true label
sklearn.metrics.accuracy_score(con_df['antivax'], con_preds)

In [None]:
# Per-class precision, recall, F-score and support for the conservative posts
sklearn.metrics.precision_recall_fscore_support(con_df['antivax'], con_preds)

### Liberal Posts Evaluation

In [None]:
# Load the labelled liberal posts and drop rows whose `antivax` label is one of
# the excluded codes.
lib_df = pd.read_csv('labeled/lib_post_label.csv', encoding= 'unicode_escape')
lib_df = lib_df[~lib_df['antivax'].isin([-1, 2])]
# Keep the attention mask and token type ids: without the mask the model also
# attends to padding tokens.
input_ids, token_type_ids, attention_mask = encode_fn(lib_df['Message'].to_list())
# Score in mini-batches so the whole file never goes through one forward pass.
lib_logits = tf.concat(
    [
        model({
            'input_ids': input_ids[start:start + eval_batch_size],
            'token_type_ids': token_type_ids[start:start + eval_batch_size],
            'attention_mask': attention_mask[start:start + eval_batch_size],
        })[0]
        for start in range(0, input_ids.shape[0], eval_batch_size)
    ],
    axis=0,
)
# `format_prediction` reads the logits from element 0 of its first argument.
formatted_preds = format_prediction((lib_logits,), label_mapping, 'antivax')
lib_preds = [p['antivax'] for p in formatted_preds]

In [None]:
# Confusion matrix for the liberal posts (rows: true label, columns: predicted label)
sklearn.metrics.confusion_matrix(lib_df['antivax'], lib_preds)

In [None]:
# Fraction of liberal posts whose predicted label matches the true label
sklearn.metrics.accuracy_score(lib_df['antivax'], lib_preds)

In [None]:
# Per-class precision, recall, F-score and support for the liberal posts
sklearn.metrics.precision_recall_fscore_support(lib_df['antivax'], lib_preds)

### Mom Posts Evaluation

In [None]:
# Load the labelled mom posts and drop rows whose `antivax` label is one of the
# excluded codes.
mom_df = pd.read_csv('labeled/mom_post_label.csv', encoding= 'unicode_escape')
mom_df = mom_df[~mom_df['antivax'].isin([-1, 2])]
# Keep the attention mask and token type ids: without the mask the model also
# attends to padding tokens.
input_ids, token_type_ids, attention_mask = encode_fn(mom_df['Message'].to_list())
# Score in mini-batches so the whole file never goes through one forward pass.
mom_logits = tf.concat(
    [
        model({
            'input_ids': input_ids[start:start + eval_batch_size],
            'token_type_ids': token_type_ids[start:start + eval_batch_size],
            'attention_mask': attention_mask[start:start + eval_batch_size],
        })[0]
        for start in range(0, input_ids.shape[0], eval_batch_size)
    ],
    axis=0,
)
# `format_prediction` reads the logits from element 0 of its first argument.
formatted_preds = format_prediction((mom_logits,), label_mapping, 'antivax')
mom_preds = [p['antivax'] for p in formatted_preds]

In [None]:
# Confusion matrix for the mom posts (rows: true label, columns: predicted label)
sklearn.metrics.confusion_matrix(mom_df['antivax'], mom_preds)

In [None]:
# Fraction of mom posts whose predicted label matches the true label
sklearn.metrics.accuracy_score(mom_df['antivax'], mom_preds)

In [None]:
# Per-class precision, recall, F-score and support for the mom posts
sklearn.metrics.precision_recall_fscore_support(mom_df['antivax'], mom_preds)