# Model Prototyping

## Imports

In [None]:
# Data manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Utilities
import os
from tqdm import tqdm

# Machine learning
from sklearn.model_selection import train_test_split

# PyTorch
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn

## Prototyping using Hugging Face Transformers

In [None]:
from transformers import AutoModel, AutoTokenizer

In [None]:
# Load the pretrained "vinai/bertweet-base" checkpoint as a bare encoder (no task head).
bertweet = AutoModel.from_pretrained("vinai/bertweet-base")

# Matching tokenizer for the same checkpoint.
# NOTE(review): `normalization=True` is requested here although the sample tweet below is
# described as already normalized — confirm that applying it twice is intended/harmless.
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False, normalization=True)

# The sample tweet is already in normalized form (HTTPURL / @USER placeholders, emoji written as :cry:).
line = "SC has first two presumptive cases of coronavirus , DHEC confirms HTTPURL via @USER :cry:"

# Encode the tweet padded/truncated to exactly 64 token ids; wrapping the id list in another
# list gives the tensor a leading batch dimension of size 1.
input_ids = torch.tensor([tokenizer(line, padding="max_length", max_length=64, truncation=True).input_ids])

In [None]:
# Print the shape of the encoded example tensor, then the token ids themselves
print(input_ids.shape)
print(input_ids)


In [None]:
# Run the encoder on the example and print the shape of the first element of its output
# (presumably the per-token hidden states, (batch, seq_len, hidden) — TODO confirm)
print(bertweet(input_ids)[0].shape)

In [None]:
# Print the module tree of the pretrained encoder to inspect its layers
print(repr(bertweet))

In [None]:
# Create a new model from the pretrained architecture with module
class BERTweetClassifier(nn.Module):
    """Tweet classifier: pretrained BERTweet encoder + dropout + one linear layer.

    The per-token outputs of the encoder are flattened into a single vector of
    ``max_length * hidden_size`` features per example, so every input batch
    must be padded/truncated to exactly ``max_length`` tokens.

    Args:
        freeze_bert: When True, the encoder weights are excluded from gradient
            updates and only the linear head is trainable.
        max_length: Number of tokens per input sequence the head is sized for.
            Must match the ``max_length`` used when tokenizing.
        num_classes: Number of output logits.
        dropout: Dropout probability applied to the flattened features.
    """

    def __init__(self, freeze_bert=False, max_length=64, num_classes=2, dropout=0.3):
        super().__init__()
        self.bert = AutoModel.from_pretrained("vinai/bertweet-base")
        self.max_length = max_length
        self.dropout = torch.nn.Dropout(dropout)
        # Size the head from the loaded encoder's configuration rather than a
        # hard-coded 768, so it stays correct if the checkpoint is swapped.
        hidden_size = self.bert.config.hidden_size
        self.classifier = torch.nn.Linear(max_length * hidden_size, num_classes)

        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask=None):
        """Return raw (unnormalized) class logits for a batch of token ids.

        Args:
            input_ids: Integer tensor of token ids with ``max_length`` columns.
            attention_mask: Optional mask forwarded to the encoder. Leaving it
                as None keeps the encoder's default masking behaviour.

        Returns:
            Tensor with one row per example and ``num_classes`` columns.
        """
        # First element of the encoder output: one vector per input token.
        features = self.bert(input_ids, attention_mask=attention_mask)[0]
        # Collapse the token and feature axes into one vector per example.
        features = torch.flatten(features, start_dim=1)
        output = self.dropout(features)
        output = self.classifier(output)
        return output

In [None]:
# Build the classifier with a trainable (non-frozen) encoder and print its module tree
bert_classifier = BERTweetClassifier(freeze_bert=False)
print(repr(bert_classifier))

In [None]:
# Smoke test: push the encoded example tweet through the classifier and display the result
output = bert_classifier(input_ids)
output

## Try with data from tweets

In [None]:
# Load the tweets from train.csv in the working directory and preview the first rows
# (later cells read its "text" and "target" columns)
train_df = pd.read_csv('./train.csv')
train_df.head()

In [None]:
# Round-trip the tweet at index label 0 through the tokenizer to see what encoding does to it:
# print the raw text, encode it to ids (as a batch of one), decode the ids back and print that.
# Note: this overwrites `input_ids` from the earlier example cell.
print(train_df.loc[0, "text"])
input_ids = torch.tensor([tokenizer.encode(train_df.loc[0, "text"])])
text_decoded = tokenizer.decode(input_ids[0])
print(text_decoded)

In [None]:
# Tokenize every tweet in one call and stack the padded id lists into a single tensor.
# `padding=True` presumably pads to the longest tweet in the batch — confirm against the tokenizer docs.
encoded_input = tokenizer(list(train_df.text), add_special_tokens=True, padding=True, truncation=True)
input_ids = torch.tensor(encoded_input['input_ids'])
# Displayed as the cell output
input_ids.shape

### Train, Val, Test split

In [None]:
# Split data into train, validation and test sets.
# First hold out 20% of all rows for testing, then hold out 20% of the remainder for
# validation: overall roughly 64% train / 16% validation / 20% test.
# `train_df` is reassigned in place, so re-running this cell shrinks it again.
train_df, test_df = train_test_split(train_df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

In [None]:
# Create train and validation dataset and dataloader
class DisasterTweetsDataset(Dataset):
    """Map-style dataset of pre-tokenized tweets and their integer labels.

    All tweets are tokenized once, up front, and padded/truncated to a fixed
    length so that every item has the same number of token ids.

    Args:
        df: DataFrame with a ``text`` column (tweet strings) and a ``target``
            column (integer class labels).
        tokenizer: Callable taking a list of strings plus the keyword
            arguments ``add_special_tokens``, ``padding``, ``max_length`` and
            ``truncation``, and returning a mapping with an ``'input_ids'``
            entry (one list of ids per string).
        max_length: Fixed number of token ids per tweet. Defaults to 64, the
            value previously hard-coded here.
    """

    def __init__(self, df, tokenizer, max_length=64):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.encoded_input = tokenizer(
            list(df["text"]),
            add_special_tokens=True,
            padding="max_length",
            max_length=max_length,
            truncation=True,
        )
        self.input_ids = torch.tensor(self.encoded_input['input_ids'])
        # Class-index targets must be int64 for torch.nn.CrossEntropyLoss;
        # `.values` is positional, so a shuffled DataFrame index is harmless.
        self.labels = torch.tensor(df["target"].values, dtype=torch.long)

    def __len__(self):
        """Return the number of tweets in the dataset."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the ``(input_ids, label)`` pair at position ``idx``."""
        return self.input_ids[idx], self.labels[idx]

# Tokenize each split once and wrap it in a Dataset.
# NOTE: the `test_datset` spelling is kept so existing references keep working.
train_dataset = DisasterTweetsDataset(train_df, tokenizer)
val_dataset = DisasterTweetsDataset(val_df, tokenizer)
test_datset = DisasterTweetsDataset(test_df, tokenizer)

# Only the training loader is shuffled; evaluation order does not need to be
# random, and a fixed order keeps validation/test runs reproducible.
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_dataloader = DataLoader(test_datset, batch_size=32, shuffle=False)

In [None]:

# Sanity check: pull a single batch from the training loader, print the shapes of its
# inputs and labels, then stop without iterating the rest of the data.
for batch in train_dataloader:
    inputs, labels = batch
    print(inputs.shape)
    print(labels.shape)
    break

In [None]:
# train loop for the model
def train(model, train_dataloader, val_dataloader, epochs=10, lr=1e-5, device='cpu'):
    """Train ``model`` with Adam and cross-entropy, validating after every epoch.

    After the last epoch the loss and accuracy curves are plotted.

    Args:
        model: Module mapping a batch of input ids to class logits, one row
            per example. It is expected to already live on ``device``.
        train_dataloader: Iterable of ``(input_ids, labels)`` training batches;
            ``labels`` must be integer class indices.
        val_dataloader: Iterable of ``(input_ids, labels)`` validation batches.
        epochs: Number of passes over the training data.
        lr: Initial learning rate for Adam.
        device: Device each batch is moved to before the forward pass.

    Returns:
        ``(metrics, model)`` where ``metrics`` maps ``'train_loss'``,
        ``'val_loss'``, ``'train_acc'`` and ``'val_acc'`` to lists holding one
        Python float per epoch. Losses are per-sample means and accuracies are
        fractions in ``[0, 1]``.

    Raises:
        ValueError: If a dataloader yields no samples.
    """
    loss_fn = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # Cut the learning rate by 10x when the validation loss has not improved
    # for more than `patience` epochs. The deprecated `verbose` flag is not
    # used; the current learning rate is shown on the progress bar instead.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2)

    metrics = {
        'train_loss': [],
        'val_loss': [],
        'train_acc': [],
        'val_acc': []
    }

    pbar = tqdm(range(epochs), desc="Epochs", position=0, leave=True)

    for _ in pbar:
        avg_train_loss, avg_train_acc = _run_epoch(model, train_dataloader, loss_fn, device, optimizer)
        avg_val_loss, avg_val_acc = _run_epoch(model, val_dataloader, loss_fn, device)

        metrics['train_loss'].append(avg_train_loss)
        metrics['train_acc'].append(avg_train_acc)
        metrics['val_loss'].append(avg_val_loss)
        metrics['val_acc'].append(avg_val_acc)

        # Update learning rate based on the validation loss
        scheduler.step(avg_val_loss)

        # Update progress bar with accuracies and the current learning rate
        pbar.set_postfix({
            'train_acc': avg_train_acc,
            'val_acc': avg_val_acc,
            'lr': optimizer.param_groups[0]['lr'],
        })

    # Plot loss and accuracy
    plt.figure(figsize=(10, 5))
    plt.subplot(1, 2, 1)
    plt.plot(metrics['train_loss'], label='train')
    plt.plot(metrics['val_loss'], label='val')
    plt.legend()
    plt.title('Loss')
    plt.subplot(1, 2, 2)
    plt.plot(metrics['train_acc'], label='train')
    plt.plot(metrics['val_acc'], label='val')
    plt.legend()
    plt.title('Accuracy')
    plt.show()

    return metrics, model


def _run_epoch(model, dataloader, loss_fn, device, optimizer=None):
    """Run one pass over ``dataloader``: train if ``optimizer`` is given, else evaluate.

    Returns:
        ``(mean_loss, accuracy)`` as Python floats, both averaged over the
        number of samples seen (not the number of batches).
    """
    is_training = optimizer is not None
    model.train(is_training)  # train(False) is equivalent to eval()

    total_loss = 0.0
    total_correct = 0
    total_samples = 0

    # Autograd is only needed when weights are going to be updated.
    with torch.set_grad_enabled(is_training):
        for input_ids, labels in dataloader:
            input_ids = input_ids.to(device)
            labels = labels.to(device)

            # The loss must be computed on the raw logits: argmax is not
            # differentiable, so a loss built from predicted class ids cannot
            # send any gradient back to the model.
            logits = model(input_ids)
            loss = loss_fn(logits, labels)

            if is_training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            batch_size = labels.size(0)
            # loss is a batch mean; weight it so a short last batch is not over-counted.
            total_loss += loss.item() * batch_size
            total_correct += (logits.argmax(dim=1) == labels).sum().item()
            total_samples += batch_size

    if total_samples == 0:
        raise ValueError("dataloader yielded no samples")

    return total_loss / total_samples, total_correct / total_samples

In [None]:
# test loop for the model
def test(model, test_dataloader, device='cpu'):
    """Evaluate ``model`` on ``test_dataloader`` and report loss and accuracy.

    Args:
        model: Module mapping a batch of input ids to class logits, one row
            per example. It is expected to already live on ``device``.
        test_dataloader: Iterable of ``(input_ids, labels)`` batches; ``labels``
            must be integer class indices.
        device: Device each batch is moved to before the forward pass.

    Returns:
        Dict with ``'test_loss'`` and ``'test_acc'``, each a one-element list
        holding a Python float. The loss is the per-sample mean and the
        accuracy is a fraction in ``[0, 1]``.

    Raises:
        ValueError: If the dataloader yields no samples.
    """
    # Define loss function
    loss_fn = torch.nn.CrossEntropyLoss()

    # Define metrics
    metrics = {
        'test_loss': [],
        'test_acc': []
    }

    # Test
    model.eval()
    total_loss = 0.0
    total_correct = 0
    total_samples = 0

    # No weights are updated here, so skip building the autograd graph.
    with torch.no_grad():
        # Iterate the loader itself; range() cannot be applied to a DataLoader.
        for input_ids, labels in tqdm(test_dataloader, desc="Testing", position=0, leave=True):
            input_ids = input_ids.to(device)
            labels = labels.to(device)

            # Forward pass: cross-entropy is computed on the raw logits
            logits = model(input_ids)
            loss = loss_fn(logits, labels)

            batch_size = labels.size(0)
            # loss is a batch mean; weight it so a short last batch is not over-counted.
            total_loss += loss.item() * batch_size

            # Predicted class = index of the largest logit in each row
            preds = torch.argmax(logits, dim=1)
            total_correct += (preds == labels).sum().item()
            total_samples += batch_size

    if total_samples == 0:
        raise ValueError("test_dataloader yielded no samples")

    # Calculate average loss and accuracy over samples (not over batches)
    avg_test_loss = total_loss / total_samples
    avg_test_acc = total_correct / total_samples
    metrics['test_loss'].append(avg_test_loss)
    metrics['test_acc'].append(avg_test_acc)
    print(f"Test loss {avg_test_loss} accuracy {avg_test_acc}")

    return metrics

In [None]:
# Create a fresh classifier with a trainable (non-frozen) encoder
bert_classifier = BERTweetClassifier(freeze_bert=False)

# Use the GPU when one is available, otherwise fall back to the CPU,
# and move the model's parameters there
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Device used : {device}")
bert_classifier.to(device)

# Train for 10 epochs; `train` returns the per-epoch metrics together with the
# same model object that was passed in
metrics, model = train(bert_classifier, train_dataloader, val_dataloader, epochs=10, lr=1e-5, device=device)

In [None]:
# Evaluate the trained model on the held-out test loader, on the same device used for training
test_metrics = test(model, test_dataloader, device=device)