Side note: I will delete this notebook once I have figured out how to fine-tune BERT-base-cased for classification with PyTorch by training only the last Linear and pooling layers.

### PyTorch installation

In [None]:
import subprocess

# Ask the command-line tools for the driver and toolkit details. Only the
# first 980 characters of the nvidia-smi report are printed.
driver_report = subprocess.getoutput("nvidia-smi")[:980]
print("NVIDIA Graphics Card Driver: ", driver_report)

cuda_report = subprocess.getoutput("nvcc --version")
print("CUDA version: ", cuda_report, "\n")

In [None]:
# Check version here:
# https://pytorch.org/get-started/previous-versions/

# !pip install torch==1.7.1+cu110 torchvision==0.8.2+cu110 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html
!pip show torch

### Get device

In [None]:
import torch

# Pick the compute device: the CPU when CUDA is unavailable, otherwise the GPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # Tensors and the model are later moved to this device.
    device = torch.device("cuda")

    gpu_count = torch.cuda.device_count()
    print('There are %d GPU(s) available.' % gpu_count)

    # The name reported is that of the GPU with index 0.
    print('We will use the GPU:', torch.cuda.get_device_name(0))

### Preprocess data

In [None]:
import pandas as pd
from typing import List, Dict
import re

def prepare_data(raw: pd.DataFrame) -> pd.DataFrame:
    """Build a cleaned ``text`` / ``label`` frame from a raw tweet table.

    The ``id``, ``time``, ``lang`` and ``smth`` columns are dropped, ``tweet``
    is renamed to ``text`` and ``sent`` to ``label``. Hyperlinks, @mentions and
    #hashtags are then removed from ``text``, and ``label`` becomes ``0`` for
    ``'Neutral'`` and ``1`` for every other value.

    Args:
        raw: Frame holding at least the columns ``id``, ``time``, ``lang``,
            ``smth``, ``tweet`` and ``sent``. It is not modified.

    Returns:
        A new DataFrame with the cleaned ``text`` column and the integer
        ``label`` column (plus any other columns ``raw`` carried).

    Raises:
        KeyError: If one of the columns to drop is missing from ``raw``.
    """
    # drop/rename return new frames, so the caller's frame stays untouched.
    prepared = (
        raw
        .drop(['id', 'time', 'lang', 'smth'], axis=1)
        .rename(columns={'tweet': 'text', 'sent': 'label'})
    )

    # The substitutions are applied in this order: hyperlinks, mentions,
    # hashtags. Hyperlink pattern source:
    # https://stackoverflow.com/questions/11331982/how-to-remove-any-url-within-a-string-in-python/11332580
    text = prepared['text']
    for pattern in (r'\S*https?:\S*', r'@\w*', r'#\w*'):
        text = text.str.replace(pattern, "", regex=True)

    # Binary target: 'Neutral' -> 0, anything else -> 1.
    label = (prepared['label'] != 'Neutral').astype(int)

    # Assign the results explicitly. Mutating rows inside DataFrame.apply and
    # discarding its return value is not guaranteed to change the frame.
    return prepared.assign(text=text, label=label)

# Read both comma-separated splits first, then run each through prepare_data.
train_raw, test_raw = [
    pd.read_csv(csv_path, sep=',')
    for csv_path in ('../data/train.csv', '../data/test.csv')
]

train, test = [prepare_data(frame) for frame in (train_raw, test_raw)]

# Quick look at the first five prepared training rows.
print(train.head(5))

In [None]:
# Pull the training texts and their labels out of the prepared frame;
# the labels are cast to int.
sentences_train = train['text'].values
labels_train = train['label'].values.astype(int)

# Show the first training sentence as a sanity check.
print(sentences_train[0])

### Tokenizing data

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer published under the "bert-base-cased" checkpoint name;
# the model loaded later in this notebook uses the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

Some of the code below is adapted from https://mccormickml.com/2019/07/22/BERT-fine-tuning/

In [None]:
# Find the longest training sentence, measured in tokens including the
# `[CLS]` and `[SEP]` special tokens.
max_len = 0

for sent in sentences_train:
    # Only the token count is needed here, so the attention mask is skipped.
    encoding = tokenizer(
        sent,
        add_special_tokens=True,
        return_attention_mask=False,
        return_length=True,
    )
    length = encoding['length'][0]

    # Keep the running maximum.
    if length > max_len:
        max_len = length

print('Max sentence length: ', max_len)

In [None]:
# Every sequence is padded or truncated to exactly this many tokens.
MAX_SEQ_LEN = 128

# Encode all sentences in one batched tokenizer call instead of calling the
# tokenizer once per sentence and concatenating the pieces afterwards. With
# return_tensors='pt' the batched call returns the stacked tensors directly.
encoded_train = tokenizer(
    list(sentences_train),         # Sentences to encode.
    truncation=True,               # Cut sequences longer than max_length.
    add_special_tokens=True,       # Add '[CLS]' and '[SEP]', True by default.
    max_length=MAX_SEQ_LEN,
    padding='max_length',          # Pad shorter sequences up to max_length.
    return_attention_mask=True,    # Construct attention masks.
    return_tensors='pt',           # Return PyTorch tensors.
)

input_ids_train = encoded_train['input_ids']
attention_masks_train = encoded_train['attention_mask']

# as_tensor accepts an array as well as an existing tensor, so re-running this
# cell (where labels_train is already a tensor) does not trigger the
# copy-construct warning that torch.tensor(tensor) emits.
labels_train = torch.as_tensor(labels_train, dtype=torch.long)

print(input_ids_train[0])
print(attention_masks_train[0])

### Training & Validation Split

In [None]:
from torch.utils.data import TensorDataset, random_split

# Seed for the train/validation split so that a fresh kernel run produces the
# same partition.
SPLIT_SEED = 42

# Bundle the encoded inputs, attention masks and labels into one dataset.
dataset = TensorDataset(input_ids_train, attention_masks_train, labels_train)

# Create a 90-10 train-validation split.

# Calculate the number of samples to include in each set; the validation set
# takes the remainder so the two sizes always add up to len(dataset).
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# Divide the dataset by randomly selecting samples, using a dedicated seeded
# generator so the split is reproducible and independent of global RNG state.
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=split_generator
)

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Batch size shared by both loaders. For fine-tuning BERT on a specific task,
# the authors recommend a batch size of 16 or 32.
batch_size = 32

# Training batches are drawn in random order.
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    sampler=train_sampler,
)

# Validation order does not matter, so samples are read sequentially.
val_sampler = SequentialSampler(val_dataset)
validation_dataloader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    sampler=val_sampler,
)

### Train Our Classification Model

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained "bert-base-cased" checkpoint with a sequence
# classification head configured for two output labels, and place it on the
# selected device. Attention weights and hidden states are not returned.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
).to(device)

In [None]:
# # Get all of the model's parameters as a list of tuples.
# params = list(model.named_parameters())

# print('The BERT model has {:} different named parameters.\n'.format(len(params)))

# print('==== Embedding Layer ====\n')

# for p in params[0:5]:
#     print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

# print('\n==== First Transformer ====\n')

# for p in params[5:21]:
#     print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

# print('\n==== Output Layer ====\n')

# for p in params[-4:]:
#     print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

### Optimizer & Learning Rate Scheduler

In [None]:
# from transformers import AutoConfig

# config = AutoConfig.from_pretrained('bert-base-cased')

# config

In [None]:
from torch.optim import AdamW

# The AdamW class that shipped with the huggingface library is deprecated in
# favour of the PyTorch implementation, so the PyTorch one is used here:
# https://huggingface.co/docs/transformers/main_classes/optimizer_schedules#transformers.AdamW
# eps and weight_decay are passed explicitly with the values the huggingface
# class used by default (1e-6 and 0.0); PyTorch's own defaults differ.
# NOTE(review): every model parameter is handed to the optimizer, while the
# note at the top of the notebook talks about training only the last Linear
# and pooling layers -- confirm whether the other parameters should be frozen.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

In [None]:
from transformers import get_scheduler

# Number of passes over the training data. The BERT authors recommend
# between 2 and 4.
num_epochs = 4

# One optimizer step is taken per batch, so the total step count is
# [number of batches] x [number of epochs] (not the number of samples).
steps_per_epoch = len(train_dataloader)
num_training_steps = num_epochs * steps_per_epoch

# Linear learning-rate schedule over the whole run, with no warmup steps.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

### Training Preparation

In [None]:
from datasets import load_metric
import numpy as np

def compute_metrics(eval_pred):
    """Score predictions with the module-level ``metric``.

    ``eval_pred`` is unpacked into ``(predictions, labels)``. The predicted
    class of each row is the argmax of ``predictions`` along axis 1; these
    class ids are compared with ``labels`` by ``metric.compute``, whose result
    is returned unchanged.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_classes, references=references)

# Metric object used by compute_metrics, loaded by the name "f1".
# NOTE(review): load_metric appears to be deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package -- confirm against the
# installed version.
metric = load_metric("f1")

In [None]:
import datetime

def format_time(elapsed):
    '''
    Render a duration given in seconds as a human-readable string.

    The value is rounded to a whole number of seconds and formatted the way
    ``datetime.timedelta`` prints itself, e.g. ``1:01:01``.
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

### Training Loop

In [None]:
from tqdm.auto import tqdm

# One tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))

try:
    model.train()
    for epoch in range(num_epochs):
        for batch in train_dataloader:
            # Each batch holds (input ids, attention mask, labels), in the
            # order the tensors were given to the TensorDataset.
            batch_input_ids, batch_attention_mask, batch_labels = (
                tensor.to(device) for tensor in batch
            )

            # Passing labels makes the model return the loss with its outputs.
            outputs = model(
                input_ids=batch_input_ids,
                attention_mask=batch_attention_mask,
                labels=batch_labels,
            )
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            # The optimizer was built from model.parameters(), so clearing its
            # gradients here covers the whole model; a separate
            # model.zero_grad() at the top of the loop is not needed.
            optimizer.zero_grad()
            progress_bar.update(1)

except RuntimeError as e:
    # The error is reported by message only rather than re-raised, so later
    # cells still run after a failure.
    print(e)

finally:
    # Release the progress bar whether training finished or failed.
    progress_bar.close()

### Training Curve

In [None]:
# import matplotlib.pyplot as plt
# % matplotlib inline

# import seaborn as sns

# # Use plot styling from seaborn.
# sns.set(style='darkgrid')

# # Increase the plot size and font size.
# sns.set(font_scale=1.5)
# plt.rcParams["figure.figsize"] = (12,6)

# # Plot the learning curve.
# plt.plot(df_stats['Training Loss'], 'b-o', label="Training")
# plt.plot(df_stats['Valid. Loss'], 'g-o', label="Validation")

# # Label the plot.
# plt.title("Training & Validation Loss")
# plt.xlabel("Epoch")
# plt.ylabel("Loss")
# plt.legend()
# plt.xticks([1, 2, 3, 4])

# plt.show()