# Install HuggingFace Transformers package with the pretrained BERT models



In [None]:
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [None]:
!pip install --upgrade transformers emoji datasets gensim
!pip install wget

# Import necessary packages

In [None]:
import sys
import os
import pandas as pd
import numpy as np
import random
import torch 
import torch.nn as nn
from transformers import AutoTokenizer, BertModel

# Runtime environment setup

In [None]:
# Single seed shared by every random number generator used in this notebook.
SEED = 42

# Seed Python's hash/random machinery, NumPy and PyTorch (CPU and CUDA).
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# cuDNN flags: request deterministic behaviour and disable benchmark mode.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Run on the GPU when one is visible to PyTorch, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cpu


# Utility functions

# Loading Twitch dataset

In [None]:
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from gensim.models import Word2Vec

def train_valid_split_w2v(input_ids, input_w2v, attention_masks, labels, batch_size=32):
    """
    Build train/validation DataLoaders from token ids, word2vec features,
    attention masks and labels.

    The data is split 80/20, stratified on `labels`, using the module-level
    SEED. Each batch is ordered (input_ids, w2v, attention_mask, label).
    Only the training loader shuffles.

    Returns:
        (train_dataloader, validation_dataloader)
    """
    # One call splits all four inputs with the same row selection.
    (
        ids_tr, ids_va,
        w2v_tr, w2v_va,
        mask_tr, mask_va,
        y_tr, y_va,
    ) = train_test_split(
        input_ids,
        input_w2v,
        attention_masks,
        labels,
        test_size=0.2,
        random_state=SEED,
        stratify=labels,
    )

    print('example train_input:    ', ids_tr[0])
    print('example attention_mask: ', mask_tr[0])

    # Labels are converted to tensors here; the other inputs are passed
    # to TensorDataset as returned by the split.
    y_tr = torch.tensor(y_tr)
    y_va = torch.tensor(y_va)

    train_set = TensorDataset(ids_tr, w2v_tr, mask_tr, y_tr)
    valid_set = TensorDataset(ids_va, w2v_va, mask_va, y_va)

    return (
        DataLoader(train_set, shuffle=True, batch_size=batch_size),
        DataLoader(valid_set, shuffle=False, batch_size=batch_size),
    )

def train_valid_split_w2v_ext(input_ids, input_w2v, ext_w2v, attention_masks, labels, batch_size=32):
    """
    Build train/validation DataLoaders from token ids, two sets of extra
    features (`input_w2v`, `ext_w2v`), attention masks and labels.

    The data is split 80/20, stratified on `labels`, using the module-level
    SEED. Each batch is ordered (input_ids, w2v, ext, attention_mask, label).
    Only the training loader shuffles.

    Returns:
        (train_dataloader, validation_dataloader)
    """
    # One call splits all five inputs with the same row selection.
    (
        ids_tr, ids_va,
        w2v_tr, w2v_va,
        ext_tr, ext_va,
        mask_tr, mask_va,
        y_tr, y_va,
    ) = train_test_split(
        input_ids,
        input_w2v,
        ext_w2v,
        attention_masks,
        labels,
        test_size=0.2,
        random_state=SEED,
        stratify=labels,
    )

    print('example train_input:    ', ids_tr[0])
    print('example attention_mask: ', mask_tr[0])

    y_tr = torch.tensor(y_tr)
    y_va = torch.tensor(y_va)

    train_set = TensorDataset(ids_tr, w2v_tr, ext_tr, mask_tr, y_tr)
    valid_set = TensorDataset(ids_va, w2v_va, ext_va, mask_va, y_va)

    return (
        DataLoader(train_set, shuffle=True, batch_size=batch_size),
        DataLoader(valid_set, shuffle=False, batch_size=batch_size),
    )

def train_valid_split_ext(input_ids, input_ext, attention_masks, labels, seed, batch_size=32):
    """
    Build train/validation DataLoaders from token ids, one set of additional
    embeddings, attention masks and labels.

    input_ext: any additional embeddings
    seed: random_state handed to the 80/20 stratified split

    Each batch is ordered (input_ids, ext, attention_mask, label). Only the
    training loader shuffles.

    Returns:
        (train_dataloader, validation_dataloader)
    """
    # One call splits all four inputs with the same row selection.
    (
        ids_tr, ids_va,
        ext_tr, ext_va,
        mask_tr, mask_va,
        y_tr, y_va,
    ) = train_test_split(
        input_ids,
        input_ext,
        attention_masks,
        labels,
        test_size=0.2,
        random_state=seed,
        stratify=labels,
    )

    print('example train_input:    ', ids_tr[0])
    print('example attention_mask: ', mask_tr[0])

    y_tr = torch.tensor(y_tr)
    y_va = torch.tensor(y_va)

    train_set = TensorDataset(ids_tr, ext_tr, mask_tr, y_tr)
    valid_set = TensorDataset(ids_va, ext_va, mask_va, y_va)

    return (
        DataLoader(train_set, shuffle=True, batch_size=batch_size),
        DataLoader(valid_set, shuffle=False, batch_size=batch_size),
    )


def train_valid_split(input_ids, attention_masks, labels, batch_size=32):
    """
    Build train/validation DataLoaders from token ids, attention masks and
    labels (no external embeddings).

    The data is split 80/20, stratified on `labels`, using the module-level
    SEED. Each batch is ordered (input_ids, attention_mask, label). Only the
    training loader shuffles.

    Returns:
        (train_dataloader, validation_dataloader)
    """
    # One call splits all three inputs with the same row selection.
    (
        ids_tr, ids_va,
        mask_tr, mask_va,
        y_tr, y_va,
    ) = train_test_split(
        input_ids,
        attention_masks,
        labels,
        test_size=0.2,
        random_state=SEED,
        stratify=labels,
    )

    print('example train_input:    ', ids_tr[0])
    print('example attention_mask: ', mask_tr[0])

    y_tr = torch.tensor(y_tr)
    y_va = torch.tensor(y_va)

    train_set = TensorDataset(ids_tr, mask_tr, y_tr)
    valid_set = TensorDataset(ids_va, mask_va, y_va)

    return (
        DataLoader(train_set, shuffle=True, batch_size=batch_size),
        DataLoader(valid_set, shuffle=False, batch_size=batch_size),
    )

# Training functions

In [None]:
from torch.optim import AdamW
import time
import datetime
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm
from sklearn.metrics import f1_score, recall_score

def collect_flat_outputs(preds, labels):
    """Return (predicted class per row, labels), both flattened to 1-D.

    preds:  2-D array of per-class scores; the class is the argmax on axis 1.
    labels: array of true labels, flattened as-is.
    """
    predicted_classes = np.argmax(preds, axis=1)
    return predicted_classes.flatten(), labels.flatten()
    
def flat_accuracy(preds, labels):
    """Fraction of rows whose argmax prediction equals the true label."""
    predicted, expected = collect_flat_outputs(preds, labels)
    n_correct = np.sum(predicted == expected)
    return n_correct / len(expected)

def format_time(elapsed):
    """Format a duration in seconds as an ``h:mm:ss`` string.

    The value is rounded to whole seconds before formatting.
    """
    whole_seconds = int(round(elapsed))
    delta = datetime.timedelta(seconds=whole_seconds)
    return str(delta)

def get_optimizer_and_scheduler(model, total_steps, lr=2e-5, weight_decay=0.01):
    """Create the AdamW optimizer and a linear warmup/decay LR schedule.

    Parameters whose name contains 'bias' or 'LayerNorm.weight' get no weight
    decay; all other parameters use `weight_decay`.

    Args:
        model: module whose `named_parameters()` are optimized.
        total_steps: total number of optimizer steps in the training run.
        lr: peak learning rate.
        weight_decay: decay applied to the non-exempt parameters.

    Returns:
        (optimizer, scheduler)
    """
    # Apply weight decay to all parameters beside the biases or LayerNorm weights
    no_decay = ('bias', 'LayerNorm.weight')
    decay_params, no_decay_params = [], []
    for name, param in model.named_parameters():
        if any(marker in name for marker in no_decay):
            no_decay_params.append(param)
        else:
            decay_params.append(param)

    optimizer_grouped_parameters = [
        {'params': decay_params, 'weight_decay': weight_decay},
        {'params': no_decay_params, 'weight_decay': 0.0},
    ]
    # The groups must be handed to the optimizer: passing model.parameters()
    # instead silently drops both `weight_decay` and the no-decay exemption.
    optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        # Warmup learning rate for first 10% of training steps
        num_warmup_steps=int(0.10 * total_steps),
        num_training_steps=total_steps,
    )
    return optimizer, scheduler

def _forward_batch(model, batch):
    """Run `model` on one batch whose tensors are already on the target device.

    Batch layout (as produced by the DataLoaders in this notebook): token ids
    first, attention mask second-to-last, labels last. When the model blends
    in external embeddings ('GRU' or 'concat' mode) they are read from
    index 1. Returns the model output unchanged.
    """
    extra_inputs = {}
    if model.ext_embed_mode == 'GRU':
        extra_inputs['ext_embed_add'] = batch[1]
    elif model.ext_embed_mode == 'concat':
        extra_inputs['ext_embed_concat'] = batch[1]
    # Any other mode: no external embedding is blended in.
    return model(
        input_ids      = batch[0],
        attention_mask = batch[-2],
        labels         = batch[-1],
        **extra_inputs
    )


def train_model(model,
                epochs,
                train_dataloader,
                validation_dataloader,
                early_stopping_patience = 25):
    """Fine-tune `model`, validating after every epoch.

    After each epoch the model is evaluated on `validation_dataloader`.
    Whenever validation accuracy is at least as high as the best so far, the
    weights are saved to ``./weights/curr``.

    Args:
        model: classifier exposing `ext_embed_mode`, returning
            ``(logits, loss)`` when called with labels, and providing
            `save_pretrained`.
        epochs: number of passes over `train_dataloader`.
        train_dataloader: batches used for optimization.
        validation_dataloader: batches used for evaluation.
        early_stopping_patience: accepted for interface compatibility only;
            this function does not implement early stopping and always runs
            all `epochs`.

    Returns:
        dict with per-epoch lists 'train_loss', 'valid_loss', 'valid_accs',
        'f1', 'recall', and the best validation accuracy under 'curr_max'.
    """
    # `device` is the module-level device (GPU when available, else CPU).
    model = model.to(device)

    # Setup optimizer and LR scheduler
    total_steps = len(train_dataloader) * epochs
    optimizer, scheduler = get_optimizer_and_scheduler(
        model, total_steps, lr = 2e-5, weight_decay = 0.1
    )

    eval_metrics = {
            'train_loss': [],
            'valid_loss': [],
            'valid_accs': [],
            'f1': [],
            'recall': [],
            'curr_max' : 0
        }

    curr_max = 0 # current max eval acc

    for epoch in range(epochs):
        t0 = time.time()

        total_train_loss = 0
        model.train()

        # ================== ONE EPOCH TRAINING STARTS =========================
        with tqdm(train_dataloader, unit="batch") as train_pbar:
            for batch in train_pbar:
                train_pbar.set_description(f"Training (epoch {epoch + 1})")
                batch = tuple(t.to(device) for t in batch)

                model.zero_grad()

                # The model returns (logits, loss) because labels are provided.
                _, loss = _forward_batch(model, batch)

                # `.item()` extracts the Python float from the 0-d loss tensor.
                total_train_loss += loss.item()

                loss.backward()

                # Clip the gradient norm to 1.0 to guard against exploding
                # gradients.
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

                optimizer.step()
                # The LR schedule advances once per optimizer step.
                scheduler.step()
        # =================== ONE EPOCH TRAINING ENDS ==========================

        # Average loss over the training batches, stored for the learning curve.
        avg_train_loss = total_train_loss / len(train_dataloader)
        eval_metrics["train_loss"].append(avg_train_loss)

        print("  * Average training loss: {0:.2f}".format(avg_train_loss))
        print("  * Training epoch took: {:}".format(format_time(time.time() - t0)))

        print("Running Validation...")

        t0 = time.time()
        model.eval()

        eval_loss = 0
        nb_eval_steps = 0
        epoch_preds, epoch_labels = [], []

        # ================== ONE EPOCH VALIDATING STARTS =======================
        for batch in validation_dataloader:
            batch = tuple(t.to(device) for t in batch)

            with torch.no_grad():
                logits, loss = _forward_batch(model, batch)

            # Move logits and labels to CPU
            logits = logits.detach().cpu().numpy()
            label_ids = batch[-1].to('cpu').numpy()
            # Collect predictions and labels from each validation batch
            batch_preds, batch_labels = collect_flat_outputs(logits, label_ids)
            epoch_preds.append(batch_preds)
            epoch_labels.append(batch_labels)
            # Accumulate the total loss.
            eval_loss += loss.item()
            # Track the number of batches
            nb_eval_steps += 1
        # =================== ONE EPOCH VALIDATING ENDS ========================
        epoch_preds  = np.concatenate(epoch_preds)
        epoch_labels = np.concatenate(epoch_labels)
        # scikit-learn metrics take (y_true, y_pred). With the arguments
        # swapped, the value stored as "recall" is actually macro precision.
        f1 = float(f1_score(epoch_labels, epoch_preds, average = 'macro'))
        recall = float(recall_score(epoch_labels, epoch_preds, average = 'macro'))
        # Accuracy over all validation examples. Averaging per-batch
        # accuracies would over-weight a short final batch.
        avg_eval_acc  = float(np.mean(epoch_preds == epoch_labels))
        avg_eval_loss = eval_loss / nb_eval_steps ## average valid. loss over batches
        print("  * Accuracy: {0:.2f}".format(avg_eval_acc))
        print("  * Averaged validation loss: {0:.2f}".format(avg_eval_loss))
        print("  * F1 score: {0:.2f}".format(f1))
        print("  * Validation took: {:}".format(format_time(time.time() - t0)))
        eval_metrics["valid_accs"].append(avg_eval_acc)
        eval_metrics["valid_loss"].append(avg_eval_loss)
        eval_metrics['f1'].append(f1)
        eval_metrics['recall'].append(recall)

        print(f"PREVIOUS BEST EVAL:{curr_max}")
        # Save the weights whenever this epoch matches or beats the best accuracy.
        if avg_eval_acc >= curr_max:
            curr_max = avg_eval_acc
            eval_metrics["curr_max"] = curr_max
            # exist_ok replaces a bare `except`, which also hid real failures
            # such as permission errors.
            os.makedirs("./weights/", exist_ok=True)
            print("Saving Weights...")
            model.save_pretrained("./weights/curr")

    print("Training complete!")
    return eval_metrics

## Download Twitch data

In [None]:
!pip install gitpython
import git

In [None]:
# GitHub repository that holds the labelled Twitch data.
url = 'https://github.com/konstantinkobs/emote-controlled.git'

# Clone only when there is no local checkout yet.
if not os.path.exists('./emote-controlled'):
    git.Git("./").clone(url)
    print('Done downloading training data')
else:
    print('Already downloaded training data')


Already downloaded training data


## Load labeled Twitch messages

In [None]:
# Column names applied to the CSV; its own header row (row 0) is replaced.
message_columns = [
    "sentiment",
    "date",
    "channel",
    "game",
    "user",
    "mod",
    "subscriber",
    "message",
]
labeled_msgs = pd.read_csv(
    "./emote-controlled/data/labeled_dataset.csv",
    header=0,
    names=message_columns,
)

# Shift sentiment by +1 and store it as a categorical column.
labeled_msgs['sentiment_relevel'] = (labeled_msgs['sentiment'] + 1).astype('category')
# Lower-cased copy of every message.
labeled_msgs['message_uncased'] = labeled_msgs['message'].str.lower()

print("Number of labeled messages: ", len(labeled_msgs))
# Display 10 random rows from the data.
labeled_msgs.sample(10)

Number of labeled messages:  1922


Unnamed: 0,sentiment,date,channel,game,user,mod,subscriber,message,sentiment_relevel,message_uncased
1259,0,2018-05-22T03:21:17.481Z,xqcow,Overwatch,2475329,False,False,POG,1,pog
1623,1,2018-05-16T19:32:55.000Z,forsen,The Council,2342940,False,True,ALLO ZULUL,2,allo zulul
611,-1,2018-05-17T16:56:40.391Z,moonmoon_ow,Dark Souls III,226947,False,True,I WAS DANCING TO THAT DansGame,0,i was dancing to that dansgame
514,1,2018-05-03T16:36:44.324Z,forsen,IRL,9000,False,True,gachiBASS Clap gachiBASS Clap gachiBASS Clap g...,2,gachibass clap gachibass clap gachibass clap g...
413,1,2018-05-09T04:59:17.521Z,xqcow,Fortnite,295331,False,False,gachiBASS,2,gachibass
1075,1,2018-05-28T19:40:05.478Z,sodapoppin,Bless Online,322720,False,True,"""Premium Membership (180 Days)"" gives 8 in-gam...",2,"""premium membership (180 days)"" gives 8 in-gam..."
538,1,2018-05-23T21:40:22.218Z,forsen,Raft,304456,False,True,forsen1,2,forsen1
482,1,2018-05-30T19:02:31.082Z,forsen,Dark Souls,971480,False,False,CURSED OMEGALUL CURSED OMEGALUL CURSED OMEGALUL,2,cursed omegalul cursed omegalul cursed omegalul
398,-1,2018-05-21T16:26:58.802Z,moonmoon_ow,Dark Souls III,381748,False,True,kill moon moon2N,0,kill moon moon2n
677,0,2018-05-23T21:23:01.103Z,forsen,Raft,261172,False,False,2$ TOP DONATOR OF THE DAY OMEGALUL 2$ TOP DONA...,1,2$ top donator of the day omegalul 2$ top dona...


# Load labeled Twitch emotes

In [None]:
# Tab-separated emote lexicon; its own header row (row 0) is replaced by
# the two column names given here.
emote_columns = ["word", "sentiment"]
labeled_emotes = pd.read_csv(
    "./emote-controlled/lexica/emote_average.tsv",
    sep='\t',
    header=0,
    names=emote_columns,
)

# Lower-cased copy of every emote.
labeled_emotes["word_uncased"] = labeled_emotes["word"].str.lower()

print("Number of labeled emotes: ", len(labeled_emotes))
# Display 10 random rows from the data.
labeled_emotes.sample(10)

Number of labeled emotes:  100


Unnamed: 0,word,sentiment,word_uncased
91,gachiBASS,0.38,gachibass
54,OMEGALUL,0.43,omegalul
58,POGGERS,0.8,poggers
6,:o,0.06,:o
82,TwitchUnity,0.6,twitchunity
53,NotLikeThis,-0.49,notlikethis
23,DoritosChip,0.01,doritoschip
24,EZ,0.58,ez
3,:D,0.87,:d
45,Kreygasm,0.81,kreygasm


# BERT Classifier

In [None]:
from transformers import BertModel
import torch.nn as nn

class BertForSentenceClassification(BertModel):
    '''
    BERT encoder with a small MLP classification head on the pooler output.

    An external embedding (e.g. a word2vec message vector) can optionally be
    blended into the pooler output before the head, either by concatenation
    or through a layer-normalised GRU-style gate.
    '''

    def __init__(self, config,
                 use_dropout = True,
                 ext_embed_mode = None,
                 concat_embed_size = 0,
                 dropout = 0.5):
        '''
        use_dropout: apply dropout to the representation fed to the head
        ext_embed_mode: ['concat', 'GRU', None]
            'concat' passes the external embedding through a dense block and
                     concatenates it to the BERT pooler output
            'GRU'    feeds the BERT pooler output and the external embedding
                     to a GRU cell
            None     uses the BERT pooler output alone
        concat_embed_size: width added to the input of the head's first
            layer (and of the GRU input projections)
        dropout: dropout probability
        '''
        super().__init__(config)

        self.ext_embed_mode = ext_embed_mode

        self.dropout = nn.Dropout(p = dropout)

        if ext_embed_mode == "GRU":
            # A GRU cell with layer norm

            # External-embedding input linear layers
            self.Wiz = nn.Linear(config.hidden_size + concat_embed_size, config.hidden_size)
            self.Wir = nn.Linear(config.hidden_size + concat_embed_size, config.hidden_size)
            self.Wih = nn.Linear(config.hidden_size + concat_embed_size, config.hidden_size)

            self.LN = nn.LayerNorm(config.hidden_size)

            # BERT output linear layers
            self.Whz = nn.Linear(config.hidden_size, config.hidden_size)
            self.Whr = nn.Linear(config.hidden_size, config.hidden_size)
            self.Whh = nn.Linear(config.hidden_size, config.hidden_size)
        if ext_embed_mode == "concat" or ext_embed_mode == "both":
            self.concat_dense = nn.Sequential(
                nn.Linear(in_features  = concat_embed_size,
                          out_features = concat_embed_size),
                nn.LayerNorm(concat_embed_size),
                nn.GELU()
            )

        # Classification head: Linear -> LayerNorm -> SiLU, twice, then logits.
        self.l1 = nn.Linear(config.hidden_size + concat_embed_size, 256)

        self.layer_norm1 = nn.LayerNorm(256)

        self.a1      = nn.SiLU()
        self.l2      = nn.Linear(256, 128)

        self.layer_norm2 = nn.LayerNorm(128)

        self.a2      = nn.SiLU()
        self.l3      = nn.Linear(128, config.num_labels)

        self.loss = torch.nn.CrossEntropyLoss()
        self.use_dropout = use_dropout

    def forward(self,
                labels = None,
                ext_embed_add = None,
                ext_embed_concat = None,
                **kwargs):
        '''
        labels: optional class indices; when given, the loss is returned too
        ext_embed_add: embeddings gated into the pooler output ('GRU' mode)
        ext_embed_concat: embeddings concatenated to the pooler output
                          ('concat' mode)
        kwargs: forwarded to BertModel.forward (input_ids, attention_mask, ...)

        Returns (logits, loss) when labels are given, otherwise (logits,).
        Raises ValueError for an unsupported ext_embed_mode or when the
        embedding required by the active mode is missing.
        '''
        # Fail early, before running the encoder, on inputs that cannot work.
        if self.ext_embed_mode not in (None, "concat", "GRU"):
            raise ValueError(
                f"Unsupported ext_embed_mode {self.ext_embed_mode!r}; "
                "expected None, 'concat' or 'GRU'"
            )
        if self.ext_embed_mode == "concat" and ext_embed_concat is None:
            raise ValueError("ext_embed_concat is required when ext_embed_mode == 'concat'")
        if self.ext_embed_mode == "GRU" and ext_embed_add is None:
            raise ValueError("ext_embed_add is required when ext_embed_mode == 'GRU'")

        outputs = super().forward(**kwargs)

        cls_token_repr = outputs.pooler_output

        # `h` is what the head consumes. It must be assigned in every mode:
        # previously only the GRU branch set it, so the other modes crashed.
        if self.ext_embed_mode == "concat":
            out_concat = self.concat_dense(ext_embed_concat)
            h = torch.cat((cls_token_repr, out_concat), dim = -1)
        elif self.ext_embed_mode == "GRU":
            # GRU cell: the pooler output plays the role of the hidden state,
            # the external embedding is the input.
            z = torch.sigmoid(self.LN(self.Wiz(ext_embed_add)) + self.LN(self.Whz(cls_token_repr)))
            r = torch.sigmoid(self.LN(self.Wir(ext_embed_add)) + self.LN(self.Whr(cls_token_repr)))
            g = torch.tanh(self.LN(self.Wih(ext_embed_add)) + self.LN(self.Whh(r * cls_token_repr)))
            h = (1 - z) * cls_token_repr + z * g
        else:
            h = cls_token_repr

        # apply dropout
        if self.use_dropout:
            dropouts = self.dropout(h)
        else:
            dropouts = h

        linear     = self.layer_norm1(self.l1(dropouts))
        activation = self.a1(linear)
        linear     = self.layer_norm2(self.l2(activation))
        activation = self.a2(linear)
        logits     = self.l3(activation)
        if labels is not None:
            outputs = (logits, self.loss(logits, labels))
        else:
            outputs = (logits,)
        return outputs

# BERT Tokenisation

Each pretrained model ships with its own tokenizer, so we load the one that matches the `bert-base-cased` checkpoint and repeat the data loading process with it

In [None]:
from transformers import AutoTokenizer
# Checkpoint name; reused later when the classifier weights are loaded.
pretrained = "bert-base-cased"
# Tokenizer for that checkpoint.
bert_tokenizer = AutoTokenizer.from_pretrained(pretrained)

## Prepare inputs

In [None]:
inputs = labeled_msgs.message.values.tolist()
# Length, in BERT tokens including special tokens, of the longest message.
# One batched tokenizer call replaces one call per message.
MAX_LEN = max(len(ids) for ids in bert_tokenizer(inputs)['input_ids'])

print(inputs)
print(len(inputs))

labels = labeled_msgs.sentiment_relevel.values

# Tokenize every message, padded to MAX_LEN so the result is one tensor.
tokenized_inputs = bert_tokenizer(
    inputs,
    add_special_tokens = True,
    padding = 'max_length',
    max_length = MAX_LEN,
    return_tensors = 'pt',
)

input_ids = tokenized_inputs['input_ids']
attention_masks = tokenized_inputs['attention_mask']

# Whitespace-split messages are the word2vec training "sentences".
messages = [message.split(' ') for message in labeled_msgs.message]
# NOTE(review): gensim documents that runs are only fully reproducible with
# a single worker; with workers = 4 the vectors may differ between runs
# despite `seed`. Confirm whether that matters here.
w2vmodel = Word2Vec(sentences = messages,
                    min_count = 1,
                    vector_size = 768,
                    workers = 4,
                    window = 5,
                    sg = 1,
                    seed = SEED) # train further when adding new prediction data
# One vector per message: the sum of its word vectors. Stack into a single
# ndarray first; building a tensor from a list of ndarrays is very slow and
# makes PyTorch emit a warning.
input_w2v = torch.tensor(
    np.stack([w2vmodel.wv[message].sum(axis = 0) for message in messages]),
    dtype = torch.float32
)

# Print sentence 0, now as a list of IDs.
#print('Original: ', tokenized_inputs['input_ids'][0])
#print('Original Content: ', labeled_msgs.message[0])
#print(bertweet_tokenizer.convert_tokens_to_ids(bertweet_tokenizer.tokenize(labeled_msgs.message[0])))
#print('* Token IDs:', tokenized_inputs['attention_mask'][0])
#print('* Tokenized:', bertweet_tokenizer.decode(tokenized_inputs['input_ids'][0]))
#print('* Attention_mask', tokenized_inputs['attention_mask'][0])


## Split dataset

In [None]:
# 80/20 stratified split into DataLoaders that also carry the word2vec
# message vectors. `input_ids` and `attention_masks` are the tensors taken
# from `tokenized_inputs` above.
bert_train_dataloader, bert_validation_dataloader = train_valid_split_w2v(
    input_ids=input_ids,
    input_w2v=input_w2v,
    attention_masks=attention_masks,
    labels=labels,
    batch_size=64,
)

example train_input:     tensor([  101,  1851,   152, 17145, 12412,   102,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,    

# Load BERT-base

In [None]:
# "GRU" is the mode in which the classifier gates the word2vec message vector
# into BERT's pooler output. It is also the mode name `train_model` checks
# for when deciding whether to pass the vectors to the model. The previous
# value "add" is not implemented by the class, so the vectors were never
# fed in and the forward pass failed.
model = BertForSentenceClassification.from_pretrained(
    pretrained,
    num_labels = 3,
    use_dropout = True,
    ext_embed_mode = "GRU",
#    concat_embed_size = w2v_ext.shape[-1]
)

In [None]:
# Model parameters visualization
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')

# The first 5 entries are the embedding tables and their LayerNorm.
for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer Layer ====\n')

# The next 16 entries belong to encoder layer 0.
for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')

# Select the layers added on top of BERT by name rather than by a fixed tail
# count: how many there are depends on `ext_embed_mode`, and the hard-coded
# counts did not match every mode.
backbone_prefixes = ('embeddings.', 'encoder.', 'pooler.')
for p in params:
    if not p[0].startswith(backbone_prefixes):
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

# Freeze Layers

In [None]:
# Freeze the embeddings plus encoder layers 0-4 (the slice `[:5]` selects
# five layers). NOTE(review): the old comment and the results file name say
# "4" -- confirm which count was intended.
freeze_layers = [model.embeddings, *model.encoder.layer[:5]] # embeddings + first 5 transformer layers
for layer in freeze_layers:
    for param in layer.parameters():
        # Frozen parameters receive no gradient updates.
        param.requires_grad = False
## Check if layers have been frozen
#for param in model.embeddings.parameters():
#    print(param.requires_grad)
#for layer in model.encoder.layer[:5]:
#    for param in layer.parameters():
#        print(param.requires_grad)
#for layer in model.encoder.layer[5:]:
#    for param in layer.parameters():
#        print(param.requires_grad)
#print(list(model.encoder.named_children()))

# Reinitialise Layers

In [None]:
## Reinitialise the last 6 encoder layers (the slice `[-6:]`); `apply` runs
## the model's weight initialiser on every submodule of those layers.
model.encoder.layer[-6:].apply(model._init_weights)

# Training

In [None]:
# About 2-3 seconds per epoch using GPU
# Fine-tune for 50 epochs; `model_eval` holds the per-epoch metrics
# dictionary returned by `train_model`.
model_eval = train_model(
    model  = model,
    epochs = 50,
    train_dataloader = bert_train_dataloader,
    validation_dataloader = bert_validation_dataloader
)

In [None]:
import json

# Persist the training/validation metrics of this run as JSON; the file name
# encodes the experiment configuration.
metrics_path = 'w2v_GRU_SILU_freeze4_reinit6_epoch50_lr2e-5_wd01_batch64.json'
with open(metrics_path, 'w') as f:
    f.write(json.dumps(model_eval))

In [None]:
# Create the output directory if needed. `exist_ok=True` tolerates an
# existing directory, while real failures (e.g. permission errors) are still
# raised instead of being hidden by a bare `except`.
os.makedirs("./weights/", exist_ok=True)

# Save the model's current (end-of-training) weights.
model.save_pretrained("./weights/curr")

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

def plot_loss_and_acc(loss_vals, eval_accs):
    """Plot training loss (left axis) and validation accuracy (right axis).

    loss_vals: per-epoch training loss values.
    eval_accs: per-epoch validation accuracy values.
    """
    sns.set(style='darkgrid')
    sns.set(font_scale=1.5)
    plt.rcParams["figure.figsize"] = (12,6)
    fig, ax1 = plt.subplots(1,1)
    ax1.plot(loss_vals, 'b-o', label = 'training loss')
    # Second y-axis sharing the same x-axis.
    ax2 = ax1.twinx()
    ax2.plot(eval_accs, 'y-o', label = 'validation accuracy')
    # Title and x label go on the primary axes: the axes returned by twinx()
    # has an invisible x-axis, so an x label set there is never drawn.
    ax1.set_title("Training loss and validation accuracy")
    ax1.set_xlabel("Epoch")
    ax1.set_ylabel("Loss", color='b')
    ax2.set_ylabel("Accuracy", color='y')
    ax1.tick_params(axis='y', rotation=0, labelcolor='b' )
    ax2.tick_params(axis='y', rotation=0, labelcolor='y' )
    plt.show()

def plot_loss(train_loss, valid_loss):
    """Plot per-epoch training and validation loss on one set of axes."""
    # Use plot styling from seaborn.
    sns.set(style='darkgrid', font_scale=1.5)
    plt.rcParams["figure.figsize"] = (12,6)

    curves = (
        (train_loss, 'b-o', "Training"),
        (valid_loss, 'g-o', "Validation"),
    )
    for values, line_format, curve_label in curves:
        plt.plot(values, line_format, label=curve_label)

    plt.title("Training & Validation Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()
    plt.show()

def plot_acc_and_f1(eval_accs, f1):
    """Plot validation F1 (left axis) and validation accuracy (right axis).

    eval_accs: per-epoch validation accuracy values.
    f1: per-epoch validation F1 scores.
    """
    sns.set(style='darkgrid')
    sns.set(font_scale=1.5)
    plt.rcParams["figure.figsize"] = (12,6)
    fig, ax1 = plt.subplots(1,1)
    # This curve is the F1 score; it was mislabelled 'training loss'.
    ax1.plot(f1, 'g-o', label = 'F1 score')
    # Second y-axis sharing the same x-axis.
    ax2 = ax1.twinx()
    ax2.plot(eval_accs, 'k-o', label = 'validation accuracy')
    # Title and x label go on the primary axes: the axes returned by twinx()
    # has an invisible x-axis, so an x label set there is never drawn.
    ax1.set_title("F1 score and Validation accuracy")
    ax1.set_xlabel("Epoch")
    ax1.set_ylabel("F1 score", color='g')
    ax2.set_ylabel("Accuracy", color='k')
    ax1.tick_params(axis='y', rotation=0, labelcolor='g' )
    ax2.tick_params(axis='y', rotation=0, labelcolor='k' )
    plt.show()


In [None]:
# Training loss against validation accuracy, per epoch.
plot_loss_and_acc(model_eval["train_loss"], model_eval["valid_accs"])

In [None]:
# Training loss against validation loss, per epoch.
plot_loss(model_eval["train_loss"], model_eval["valid_loss"])

In [None]:
# Validation accuracy against validation F1, per epoch.
plot_acc_and_f1(model_eval["valid_accs"], model_eval["f1"])

In [None]:
def predict(inputs, tokenizer, model):
    """Predict a class index for every text in `inputs`.

    Args:
        inputs: texts to classify; a list or an array-like with `.tolist()`.
        tokenizer: callable that accepts a list of texts plus padding options
            and returns a mapping with 'input_ids' and 'attention_mask'
            tensors.
        model: torch module called with `input_ids` and `attention_mask`
            whose first output is the logits.
            NOTE(review): only those two inputs are passed, so a model that
            requires external embeddings ('GRU' / 'concat' mode) cannot be
            used here as-is.

    Returns:
        1-D numpy array of predicted class indices.
    """
    # Accept numpy/pandas inputs as well as plain Python sequences.
    texts = inputs.tolist() if hasattr(inputs, "tolist") else list(inputs)

    # Pad to the longest text in the batch. This matches the previous
    # "max_length = longest tokenized input" padding.
    encoded = tokenizer(
        texts,
        add_special_tokens=True,  # add '[CLS]' and '[SEP]'
        padding=True,
        return_tensors='pt',      # return PyTorch tensors
    )

    # Run on whichever device the model lives on.
    model_device = next(model.parameters()).device
    was_training = model.training
    model.eval()  # disable dropout so predictions are deterministic
    try:
        with torch.no_grad():
            logits = model(
                input_ids=encoded['input_ids'].to(model_device),
                # Without the mask the model would attend to padding tokens.
                attention_mask=encoded['attention_mask'].to(model_device),
            )[0]
    finally:
        # Leave the model in the mode the caller had it in.
        model.train(was_training)

    # Softmax does not change the ordering of scores, so the argmax of the
    # logits is the predicted class; `.cpu()` makes this work for GPU models.
    return logits.argmax(dim=-1).cpu().numpy()
