# Model Prototyping

## Imports

In [40]:
# Data manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.model_selection import train_test_split

# PyTorch
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn

## Prototyping using huggingface transformers

In [30]:
from transformers import AutoModel, AutoTokenizer

In [67]:
bertweet = AutoModel.from_pretrained("vinai/bertweet-base")

tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False, normalization=True)

# INPUT TWEET IS ALREADY NORMALIZED!
line = "SC has first two presumptive cases of coronavirus , DHEC confirms HTTPURL via @USER :cry:"

input_ids = torch.tensor([tokenizer(line, padding="max_length", max_length=256, truncation=True).input_ids])

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [68]:
# print the number of tokens in the input
print(input_ids.shape)
print(input_ids)


torch.Size([1, 256])
tensor([[    0,  4040,    90,   160,   255, 35006, 26940,  2612,    15,  1456,
             7,   429,  6814,   499, 12952,    10,   156,     5,    22,   866,
            22,     2,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,  

In [69]:
print(bertweet(input_ids).shape)

RuntimeError: The expanded size of the tensor (256) must match the existing size (130) at non-singleton dimension 1.  Target sizes: [1, 256].  Tensor sizes: [1, 130]

In [61]:
tokenizer.decode(input_ids)

'<s> SC has first two presumptive cases of coronavirus, DHEC confirms HTTPURL via @USER : cry : </s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>

In [35]:
# Pretrained architecture
print(repr(bertweet))

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(64001, 768, padding_idx=1)
    (position_embeddings): Embedding(130, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Drop

In [63]:
# Create a new model from the pretrained architecture with module
class BERTweetClassifier(nn.Module):
    def __init__(self, freeze_bert=False):
        super(BERTweetClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained("vinai/bertweet-base")
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 2)

        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, input_ids):
        features = self.bert(input_ids)[0]
        print(f"Shape of features : {features.shape}")
        output = self.dropout(features)
        output = self.classifier(output)
        print(f"Shape of output : {output.shape}")
        return output

In [65]:
bert_classifier = BERTweetClassifier(freeze_bert=False)
print(repr(bert_classifier))

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTweetClassifier(
  (bert): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(130, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((

In [None]:
output = bert_classifier(input_ids)

## Try with data from tweets

In [13]:
train_df = pd.read_csv('./train.csv')
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [14]:
print(train_df.loc[0, "text"])
input_ids = torch.tensor([tokenizer.encode(train_df.loc[0, "text"])])
text_decoded = tokenizer.decode(input_ids[0])
print(text_decoded)

Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all
<s> Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all </s>


In [15]:
# Generate a tensor of all input ids with padding from tokenizer
encoded_input = tokenizer(list(train_df.text), add_special_tokens=True, padding=True, truncation=True)
input_ids = torch.tensor(encoded_input['input_ids'])
input_ids.shape

torch.Size([7613, 64])

### Train, Val, Test split

In [16]:
# Split data into train validaion and test
train_df, test_df = train_test_split(train_df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

In [17]:
# Create train and validation dataset and dataloader
class DisasterTweetsDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer
        self.encoded_input = tokenizer(list(df.text), add_special_tokens=True, padding=True, truncation=True)
        self.input_ids = torch.tensor(self.encoded_input['input_ids'])
        self.labels = torch.tensor(df.target.values)
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        return self.input_ids[idx], self.labels[idx]

train_dataset = DisasterTweetsDataset(train_df, tokenizer)
val_dataset = DisasterTweetsDataset(val_df, tokenizer)
test_datset = DisasterTweetsDataset(test_df, tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_datset, batch_size=32, shuffle=True)

In [24]:
# train loop for the model
def train(model, train_dataloader, val_dataloader, epochs=10, lr=1e-5, device='cpu'):
    # Define loss function
    loss_fn = torch.nn.CrossEntropyLoss()
    
    # Define optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    
    # Define scheduler
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, verbose=True)
    
    # Define metrics
    metrics = {
        'train_loss': [],
        'val_loss': [],
        'train_acc': [],
        'val_acc': []
    }
    
    for epoch in range(epochs):
        print(f"Epoch {epoch+1}/{epochs}")
        print("-"*10)
        
        # Training
        model.train()
        total_loss = 0
        total_correct = 0
        for batch in train_dataloader:
            # Get data
            input_ids, labels = batch
            input_ids = input_ids.to(device)
            labels = labels.to(device)
            
            # Forward pass
            outputs = model(input_ids)
            loss = loss_fn(outputs[0], labels) # outputs[0] is the logits
            total_loss += loss.detach().item()

            # Backward pass
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            # Calculate accuracy
            _, preds = torch.max(outputs[0], dim=1)
            total_correct += torch.sum(preds == labels)

        # Calculate average loss and accuracy
        avg_train_loss = total_loss / len(train_dataloader)
        avg_train_acc = total_correct.double() / len(train_dataloader)
        metrics['train_loss'].append(avg_train_loss)
        metrics['train_acc'].append(avg_train_acc)
        print(f"Train loss {avg_train_loss} accuracy {avg_train_acc}")

        # Validation
        model.eval()
        total_loss = 0
        total_correct = 0
        for batch in val_dataloader:
            # Get data
            input_ids, labels = batch
            input_ids = input_ids.to(device)
            labels = labels.to(device)
            
            # Forward pass
            outputs = model(input_ids)
            loss = loss_fn(outputs[0], labels) # outputs[0] is the logits
            total_loss += loss.item()

            # Calculate accuracy
            _, preds = torch.max(outputs[0], dim=1)
            total_correct += torch.sum(preds == labels)

        # Calculate average loss and accuracy
        avg_val_loss = total_loss / len(val_dataloader)
        avg_val_acc = total_correct.double() / len(val_dataset)
        metrics['val_loss'].append(avg_val_loss)
        metrics['val_acc'].append(avg_val_acc)
        print(f"Val loss {avg_val_loss} accuracy {avg_val_acc}")

    # Plot loss and accuracy
    plt.figure(figsize=(10, 5))
    plt.subplot(1, 2, 1)
    plt.plot(metrics['train_loss'], label='train')
    plt.plot(metrics['val_loss'], label='val')
    plt.legend()
    plt.title('Loss')
    plt.subplot(1, 2, 2)
    plt.plot(metrics['train_acc'], label='train')
    plt.plot(metrics['val_acc'], label='val')
    plt.legend()
    plt.title('Accuracy')
    plt.show()

    return metrics, model

In [21]:
# test loop for the model
def test(model, test_dataloader, device='cpu'):

    # Define loss function
    loss_fn = torch.nn.CrossEntropyLoss()

    # Define metrics
    metrics = {
        'test_loss': [],
        'test_acc': []
    }

    # Test
    model.eval()
    total_loss = 0
    total_correct = 0
    for batch in test_dataloader:
        # Get data
        input_ids, labels = batch
        input_ids = input_ids.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(input_ids)
        loss = loss_fn(outputs[0], labels) # outputs[0] is the logits
        total_loss += loss.item()

        # Calculate accuracy
        _, preds = torch.max(outputs[0], dim=1)
        total_correct += torch.sum(preds == labels)

    # Calculate average loss and accuracy
    avg_test_loss = total_loss / len(test_dataloader)
    avg_test_acc = total_correct.double() / len(test_dataloader)
    metrics['test_loss'].append(avg_test_loss)
    metrics['test_acc'].append(avg_test_acc)
    print(f"Test loss {avg_test_loss} accuracy {avg_test_acc}")

    return metrics

In [25]:
# Select device based on availability
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
bertweet.to(device)

# train the model
metrics, model = train(bertweet, train_dataloader, val_dataloader, epochs=10, lr=1e-5, device=device)

cuda
Epoch 1/10
----------


RuntimeError: Expected target size [32, 768], got [32]