## Disaster or not: Text Classification using BERT

In [None]:
!wget https://github.com/ravi-ilango/acm-dec-2020-nlp/blob/main/lab3/disaster_data.zip?raw=true -O disaster_data.zip

!unzip disaster_data.zip


In [None]:
!pip install transformers

In [None]:
# imports
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification, BertTokenizer, BertConfig, AdamW

from tqdm import trange

import numpy as np
import pandas as pd
import os

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# File the fine-tuned model's state_dict is saved to and reloaded from
model_path = './bert_disaster_detection_state_dict.pth'

### Load data

In [None]:
#
# tweet texts are stored in the variable query_text
# their 0/1 targets are stored in the variable labels
#
# Read the CSV once and take both columns from the same frame
train_df = pd.read_csv('./disaster_data/train.csv')
query_text = train_df.text.values
labels = train_df.target.values

print(query_text.shape)

In [None]:
# Histogram of the target values, with one x tick per distinct class
fig, ax = plt.subplots()
ax.hist(labels)
ax.set_xlabel('target')
ax.set_ylabel('count')
ax.set_title('target distribution')
ax.set_xticks(np.arange(len(np.unique(labels))));

### Prepare Data: Convert to tokens and add special tokens [CLS] and [SEP]

In [None]:
# add special tokens for BERT to work properly
sentences = ["[CLS] " + query + " [SEP]" for query in query_text]
# Index 1 is the second sentence of the data set; it is used as the running example
print(sentences[1])

# Tokenize with BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
print ("Tokenize the second sentence:")
print (tokenized_texts[1])

### Prepare Data: Pad tokens to create sequences of constant length

In [None]:
# Maximum sequence length.
MAX_LEN = 100

# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

# Post-truncation alone would cut the closing [SEP] off any sequence longer than
# MAX_LEN, so shorten those sequences by hand and put [SEP] back as the last token.
input_ids = [
    ids if len(ids) <= MAX_LEN else ids[:MAX_LEN - 1] + [tokenizer.sep_token_id]
    for ids in input_ids
]

# Pad the shorter sequences at the end (with pad_sequences' default value, 0) up to MAX_LEN
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")


### Prepare Data: Create attention masks

In [None]:
# Attention masks: 1.0 wherever the token id is positive, 0.0 on the zero padding.
# tolist() keeps the result a list of lists of Python floats, one inner list per sequence.
attention_masks = (input_ids > 0).astype(float).tolist()

### Split into train and validation datasets

In [None]:
# A single train_test_split call applies the same shuffled indices to the ids,
# the labels and the masks, so the three stay aligned row by row.
(train_inputs, validation_inputs,
 train_labels, validation_labels,
 train_masks, validation_masks) = train_test_split(
    input_ids, labels, attention_masks,
    random_state=2018, test_size=0.1)


In [None]:
# Sanity check on the training split: (number of training sequences, MAX_LEN)
train_inputs.shape

### Convert data into torch tensors

In [None]:
# Convert data into torch tensors
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

### Create a data generator (iterator) for the train and validation data

In [None]:
batch_size = 32

# Bundle ids, masks and labels so that every batch yields the three together
train_data = TensorDataset(train_inputs, train_masks, train_labels)
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)

# Training examples are drawn in random order, validation examples in stored order
train_sampler = RandomSampler(train_data)
validation_sampler = SequentialSampler(validation_data)

# Iterators over the two datasets
train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size, sampler=train_sampler)
validation_dataloader = DataLoader(dataset=validation_data, batch_size=batch_size, sampler=validation_sampler)

### Load pretrained BERT model


In [None]:
# Use the GPU when torch can see one, otherwise stay on the CPU
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# Report which GPU was picked up
if use_cuda:
    print (torch.cuda.get_device_name(0))

In [None]:
# Load BertForSequenceClassification, the pretrained BERT model with a single linear classification layer on top. 

num_labels = 2

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", 
                                                      num_labels=num_labels)
model.to(device)

# BERT fine-tuning parameters
param_optimizer = list(model.named_parameters())

# Biases and LayerNorm weights are excluded from weight decay. The name fragments
# must match the model's parameter names ('...LayerNorm.weight', '...bias').
no_decay = ['bias', 'LayerNorm.weight']

# The per-group key has to be 'weight_decay': the optimizer ignores any other
# spelling and silently falls back to its default decay for every group.
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters,
                     lr=2e-5)


In [None]:
# Resume from a previously saved state_dict when one is present on disk
checkpoint_found = os.path.exists(model_path)
if checkpoint_found:
    print ("Loading weights from saved model...")
    saved_state = torch.load(model_path, map_location=device)
    model.load_state_dict(saved_state)


### Model Train function

In [None]:
def train(model, iterator, optimizer):
    """Run one training epoch and return the mean loss per batch.

    Args:
        model: Module called as ``model(input_ids, token_type_ids=None,
            attention_mask=..., labels=...)`` whose output exposes the loss
            under the key ``'loss'``.
        iterator: Iterable of ``(input_ids, attention_mask, labels)`` tensor
            batches. It does not need to support ``len()``.
        optimizer: Optimizer that updates the model's parameters.

    Returns:
        Sum of the batch losses divided by the number of batches.

    Raises:
        ValueError: If ``iterator`` yields no batches.
    """
    model.train()

    # Follow the model's own device instead of a notebook-level global, so the
    # function keeps working after the model has been moved to another device.
    model_device = next(model.parameters()).device

    epoch_loss = 0.0
    n_batches = 0

    for step, batch in enumerate(iterator):

        # Move the batch to wherever the model lives
        b_input_ids, b_input_mask, b_labels = (t.to(model_device) for t in batch)

        # Clear the gradients left over from the previous batch
        optimizer.zero_grad()

        # Forward pass; passing the labels makes the model return the loss
        output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = output['loss']

        # Backward pass
        loss.backward()

        # Update parameters using the computed gradients
        optimizer.step()

        epoch_loss += loss.item()
        n_batches += 1

        if step % 50 == 0:
            print (f"step: {step}")

    if n_batches == 0:
        raise ValueError("train() received an iterator with no batches")

    return epoch_loss / n_batches

### Model Evaluate function

In [None]:
# Share of rows whose highest-scoring class equals the given label
def flat_accuracy(preds, labels):
    """Return the fraction of predictions that match the labels.

    Args:
        preds: Per-row class scores; the predicted class of a row is the
            index of its largest score along axis 1.
        labels: Array of true class ids; it is flattened before comparing.

    Returns:
        Number of matching rows divided by the number of labels.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    hits = predicted_classes == true_classes
    return np.sum(hits) / len(true_classes)

# Evaluate
def evaluate(model, iterator):
    """Return the model's accuracy over all examples in ``iterator``.

    Correct predictions and examples are counted across batches, so a smaller
    final batch carries no more weight per example than the others.

    Args:
        model: Module called as ``model(input_ids, token_type_ids=None,
            attention_mask=...)`` whose output exposes class scores under the
            key ``'logits'``.
        iterator: Iterable of ``(input_ids, attention_mask, labels)`` tensor
            batches.

    Returns:
        Number of correctly classified examples divided by the number of
        examples seen.

    Raises:
        ValueError: If ``iterator`` yields no examples.
    """
    # Deactivate dropout layers
    model.eval()

    # Follow the model's own device instead of a notebook-level global
    model_device = next(model.parameters()).device

    n_correct = 0
    n_examples = 0

    for batch in iterator:

        # Move the batch to wherever the model lives
        b_input_ids, b_input_mask, b_labels = (t.to(model_device) for t in batch)

        # No gradients are needed for evaluation
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            logits = output['logits']

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy().flatten()

        # Predicted class = index of the largest logit in each row
        predicted = np.argmax(logits, axis=1).flatten()
        n_correct += int(np.sum(predicted == label_ids))
        n_examples += len(label_ids)

    if n_examples == 0:
        raise ValueError("evaluate() received an iterator with no examples")

    return n_correct / n_examples

### Train and Validate
This step takes ~3min


In [None]:
N_EPOCHS = 1
best_valid_acc = 0

# Fine-tune for N_EPOCHS epochs and keep the weights of the best-scoring epoch on disk
for epoch in trange(N_EPOCHS, desc="Epoch"):

    # one pass over the training data, then score on the validation data
    train_loss = train(model, train_dataloader, optimizer)
    valid_acc = evaluate(model, validation_dataloader)

    # write a checkpoint only when validation accuracy beats the best so far
    improved = valid_acc > best_valid_acc
    if improved:
        print ("saving model ...")
        torch.save(model.state_dict(), model_path)
        best_valid_acc = valid_acc

    print(f'\t Train Loss: {train_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')

### Predict

In [None]:
#load weights
model.to('cpu')
# map_location keeps the load working when the checkpoint was saved from GPU
# tensors but this machine has no GPU available
model.load_state_dict(torch.load(model_path, map_location='cpu'))
# switch to inference mode (deactivates dropout)
model.eval()

#inference
def prepare_text(sentence, MAX_LEN = 150):
    """Turn one raw sentence into the tensors the model is called with.

    Args:
        sentence: Text without special tokens; [CLS] and [SEP] are added here.
        MAX_LEN: Length the token-id sequence is padded or truncated to.

    Returns:
        Tuple ``(input_ids, attention_masks)``, each with a single row of
        ``MAX_LEN`` entries: the token ids, and a float mask that is 1.0
        where the id is positive and 0.0 on the zero padding.
    """
    sentence = "[CLS] " + sentence + " [SEP]"

    # Tokenize with BERT tokenizer
    tokenized_text = tokenizer.tokenize(sentence)

    # Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
    input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Post-truncation alone would cut off the closing [SEP] of an over-long
    # sentence, so shorten it by hand and put [SEP] back as the last token.
    if len(input_ids) > MAX_LEN:
        input_ids = input_ids[:MAX_LEN - 1] + [tokenizer.sep_token_id]

    input_ids = pad_sequences([input_ids], maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
    input_ids = torch.tensor(input_ids)

    # Mask of 1s for each token followed by 0s for padding
    attention_masks = (input_ids > 0).float()

    return input_ids, attention_masks

def predict(model, sentence):
    """Return the model's logits for a single sentence as a numpy array.

    Args:
        model: Module called as ``model(input_ids, token_type_ids=None,
            attention_mask=...)`` whose output exposes class scores under the
            key ``'logits'``.
        sentence: Raw text; it is tokenized and padded by ``prepare_text``.

    Returns:
        The logits moved to the CPU and converted to a numpy array, with one
        row for the given sentence.
    """
    input_ids, attention_masks = prepare_text(sentence)

    # Run on whatever device the model currently lives on
    model_device = next(model.parameters()).device
    input_ids = input_ids.to(model_device)
    attention_masks = attention_masks.to(model_device)

    # Inference only: skip building the autograd graph
    with torch.no_grad():
        output = model(input_ids, token_type_ids=None, attention_mask=attention_masks)
    logits = output['logits']

    return logits.detach().cpu().numpy()

def disaster_or_not(logits):
    """Map the logits of the first row to a human-readable verdict.

    Args:
        logits: Two-dimensional class scores; only the first row is used.

    Returns:
        'Disaster' when class 1 has the largest score in the first row,
        otherwise 'Not a disaster'.
    """
    first_row_class = np.argmax(logits, axis=1)[0]
    if first_row_class == 1:
        return 'Disaster'
    return 'Not a disaster'

In [None]:
# Classify a sentence that reports a forest fire
sentence = "Forest fire near La Ronge Sask. Canada"
logits = predict(model, sentence)
disaster_or_not(logits)

In [None]:
# Classify an everyday sentence about the weather
sentence = "The weather is awesome"
logits = predict(model, sentence)
disaster_or_not(logits)