### Big thanks to Venelin of Curiousily whose BERT tutorial I used and repurposed to fit this competition. You can find his tutorial [here](https://curiousily.com/posts/sentiment-analysis-with-bert-and-hugging-face-using-pytorch-and-python/)

In [None]:
!pip install -qq transformers

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap

import os
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import tensorflow as tf

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

%matplotlib inline
%config InlineBackend.figure_format='retina'

sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 12, 8

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
# Show which files the competition's input directory contains.
os.listdir('../input/nlp-getting-started')

In [None]:
# Load the labelled training tweets and the unlabelled test tweets.
train = pd.read_csv('../input/nlp-getting-started/train.csv')
test = pd.read_csv('../input/nlp-getting-started/test.csv')

# Stack both splits and remove the label column so the combined frame can be
# inspected as a whole (the row index is not reset, so index values repeat).
combined = pd.concat([train, test], axis=0).drop(columns='target')
combined.info()

id - a unique identifier for each tweet  
text - the text of the tweet  
location - the location the tweet was sent from (may be blank)  
keyword - a particular keyword from the tweet (may be blank)  
target - in train.csv only, this denotes whether a tweet is about a real disaster (1) or not (0)  

No missing values in the most important feature, text, but...  
87 missing values in keyword  
3638 missing values in location

In [None]:
# Separate the feature columns (X) from an independent copy of the labels (y).
X = train.drop(columns='target')
y = train['target'].copy()

## Class Imbalance?

In [None]:
# Pass the labels by keyword: seaborn deprecated positional variables in 0.11 and
# from 0.12 accepts only `data` positionally, so `sns.countplot(y)` no longer
# plots the class counts as intended.
sns.countplot(x=y)
plt.show()

Negatives outnumber positives by ~1000

# Data Preprocessing

## Tokenizing our data

In [None]:
# BERT also ships an uncased checkpoint ("cased"/"uncased" refers to letter case).
# The cased one is used here on the assumption that upper-case words in tweets
# probably carry more disaster sentiment.
pre_trainer = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(pre_trainer)

In [None]:
# Worked example on a single tweet: split the text into tokens, then map each
# token to its vocabulary id.
sample_txt = combined.text.iloc[0]
tokens = tokenizer.tokenize(sample_txt)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# Print the raw text, its tokens and the matching ids side by side.
print(f' Sentence: {sample_txt}')
print(f'   Tokens: {tokens}')
print(f'Token IDs: {token_ids}')

In [None]:
# Special tokens, printed one per line as "<token> <id>":
#   SEP - marker for ending a sentence
#   CLS - must be added to the start of each sentence so BERT knows we're doing a classification task
#   PAD - for padding sentences to equal length
#   UNK - unknown token
for name in ('sep', 'cls', 'pad', 'unk'):
    print(getattr(tokenizer, f'{name}_token'), getattr(tokenizer, f'{name}_token_id'))

In [None]:
# encode_plus adds the special tokens shown above, pads/truncates to a fixed
# length and builds the attention mask in a single call.
encoding = tokenizer.encode_plus(
    sample_txt,
    add_special_tokens=True,       # add '[CLS]' and '[SEP]'
    max_length=32,
    truncation=True,
    padding='max_length',
    return_attention_mask=True,
    return_token_type_ids=False,
    return_tensors='pt',           # PyTorch tensors; 'tf' or 'np' return TensorFlow tensors / NumPy arrays
)
encoding.keys()
# dict_keys(['input_ids', 'attention_mask'])

Our sample tweet has been tokenized and padded. We can convert the IDs back to tokens to see exactly what the encoding contains.

In [None]:
# Map the encoded ids of the sample back to token strings to inspect the result.
tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])

# Choosing Sequence Length
How to choose the max padding length

In [None]:
def _token_count(tweet):
    """Return how many ids `tokenizer.encode` produces for one tweet (capped at 512)."""
    return len(tokenizer.encode(tweet, truncation=True, max_length=512))


# Token length of every text in the combined train + test frame.
token_lens = combined['text'].apply(_token_count)

In [None]:
# Distribution of token counts; the x-axis is limited to 0-256 tokens.
ax = sns.distplot(token_lens)
ax.set_xlim([0, 256])
ax.set_xlabel('Token count');

It seems all of the texts contain fewer than 100 tokens, so we'll use 100 as our max length.

In [None]:
max_len = 100  # length every tweet is padded/truncated to when it is encoded
batch_size = 16  # number of tweets per batch; the dataset is pretty small

# Dataset Creation

In [None]:
class DataGenerator(Dataset):
    """Dataset that tokenizes one text per item on access.

    Args:
        text: indexable collection of texts.
        targets: indexable collection of labels, aligned with `text`.
        tokenizer: object exposing `encode_plus`.
        max_len: length each encoded text is padded/truncated to.
    """

    def __init__(self, text, targets, tokenizer, max_len):
        self.text, self.targets = text, targets
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.text)

    def __getitem__(self, item):
        """Return a dict with the raw text, flattened input ids, flattened
        attention mask and the label as a `torch.long` tensor."""
        raw_text = str(self.text[item])
        label = self.targets[item]

        encoded = self.tokenizer.encode_plus(
            raw_text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            add_special_tokens=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt',
        )

        sample = {'text': raw_text}
        # The encoded values are flattened so each item is a 1-D tensor.
        for key in ('input_ids', 'attention_mask'):
            sample[key] = encoded[key].flatten()
        sample['targets'] = torch.tensor(label, dtype=torch.long)
        return sample

def data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=4):
    """Build a DataLoader over the `text` and `target` columns of `df`.

    Args:
        df: DataFrame with `text` and `target` columns.
        tokenizer: tokenizer handed to `DataGenerator`.
        max_len: length each encoded text is padded/truncated to.
        batch_size: number of samples per batch.
        shuffle: whether the loader reshuffles the samples every epoch.
            Defaults to False, which keeps the original row order; pass True
            for training data.
        num_workers: number of worker processes used for loading (default 4).

    Returns:
        A `DataLoader` yielding the dict batches produced by `DataGenerator`.
    """
    ds = DataGenerator(
        text=df.text.to_numpy(),
        targets=df.target.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )

    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers)

In [None]:
# Batched loader over the labelled training tweets.
# NOTE(review): no shuffling is requested here, so batches follow the row order of `train`.
train_data_loader = data_loader(train, tokenizer, max_len, batch_size)

In [None]:
# Pull the first batch from the loader and list the fields it carries.
data = next(iter(train_data_loader))
data.keys()

In [None]:
# Shape of every tensor field in the sample batch.
for field in ('input_ids', 'attention_mask', 'targets'):
    print(data[field].shape)

# Model Creation

In [None]:
# First, a small scale example using the encoding that we created for our sample text
bert = BertModel.from_pretrained(pre_trainer)

# Index the result instead of tuple-unpacking it. Recent transformers releases
# return a dict-like ModelOutput, and unpacking that yields its key strings
# rather than tensors. Positional indexing works for both a plain tuple and a
# ModelOutput.
bert_outputs = bert(
    input_ids=encoding['input_ids'],
    attention_mask=encoding['attention_mask']
)
# last_hidden_state is a sequence of hidden states of the last layer of the model
last_hidden_state = bert_outputs[0]
pooled_output = bert_outputs[1]


In [None]:
# Shape of the last-layer hidden states for the encoded sample.
last_hidden_state.shape

This is the hidden state of each of our 32 tokens. 768 is the hidden size of BERT-base, i.e. the length of the vector that represents each token. Our pooled output also has 768 dimensions.

In [None]:
# Shape of the pooled output for the encoded sample.
pooled_output.shape

Using this info, we can now create a classifier that uses the BERT model

In [None]:
class SentimentClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        n_classes: number of output classes (size of the linear layer's output).
    """

    def __init__(self, n_classes):
        super(SentimentClassifier, self).__init__()
        # Pretrained encoder; the checkpoint name comes from the notebook-level `pre_trainer`.
        self.bert = BertModel.from_pretrained(pre_trainer)
        self.drop = nn.Dropout(p=0.5)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the raw (un-normalised) class scores for a batch.

        Args:
            input_ids: token id tensor passed to BERT.
            attention_mask: attention mask tensor passed to BERT.
        """
        bert_outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # Take the pooled output by position rather than by tuple-unpacking.
        # Recent transformers releases return a dict-like ModelOutput, and
        # unpacking that yields key strings instead of tensors. Index 1 is the
        # pooled output for both return types.
        pooled_output = bert_outputs[1]
        output = self.drop(pooled_output)
        return self.out(output)

In [None]:
# Two output classes: 1 for disaster and 0 for not. The model is moved to the selected device.
model = SentimentClassifier(n_classes=2).to(device)

In [None]:
# Move the sample batch's model inputs onto the same device as the model.
input_ids, attention_mask = (
    data[field].to(device) for field in ('input_ids', 'attention_mask')
)

# Both tensors are batch size x seq length.
for tensor in (input_ids, attention_mask):
    print(tensor.shape)

In [None]:
# Run the sample batch through the model and normalise the scores across the class dimension.
logits = model(input_ids, attention_mask)
torch.softmax(logits, dim=1)

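Each row of the output holds the predicted probabilities that a tweet is a 0 (not a disaster) or a 1 (disaster). The classification layer has not been trained yet, so these values are not meaningful at this point.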

# Training

In [None]:
EPOCHS = 10
# One scheduler step is taken per batch, so the schedule spans every batch of every epoch.
total_steps = EPOCHS * len(train_data_loader)

optimizer = AdamW(params=model.parameters(), lr=2e-5, correct_bias=False)
# Linear learning-rate schedule over `total_steps` with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_training_steps=total_steps,
    num_warmup_steps=0,
)
loss_fn = nn.CrossEntropyLoss().to(device)

Some recommendations for fine tuning from the BERT paper  
Batch size: 16, 32  
Learning rate (Adam): 5e-5, 3e-5, 2e-5  
Number of epochs: 2, 3, 4

In [None]:
def train_epoch(
    model,
    data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    n_examples
):
    """Run one training pass over `data_loader`.

    Args:
        model: module called with `input_ids` and `attention_mask` keywords.
        data_loader: iterable of dict batches with `input_ids`,
            `attention_mask` and `targets` tensors.
        loss_fn: callable taking (outputs, targets) and returning the loss.
        optimizer: optimizer stepped once per batch.
        device: device the batch tensors are moved to.
        scheduler: learning-rate scheduler stepped once per batch.
        n_examples: divisor used to turn the correct count into an accuracy.

    Returns:
        Tuple (accuracy, mean batch loss); accuracy is a double tensor.
    """
    model = model.train()
    batch_losses = []
    n_correct = 0

    for batch in data_loader:
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        labels = batch["targets"].to(device)

        logits = model(input_ids=ids, attention_mask=mask)
        predicted = torch.max(logits, dim=1)[1]
        loss = loss_fn(logits, labels)

        n_correct += (predicted == labels).sum()
        batch_losses.append(loss.item())

        loss.backward()
        # Clip the gradient norm to 1.0 to prevent exploding gradients.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    accuracy = n_correct.double() / n_examples
    return accuracy, np.mean(batch_losses)

In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Score `model` on `data_loader` without updating any weights.

    Args:
        model: module called with `input_ids` and `attention_mask` keywords.
        data_loader: iterable of dict batches with `input_ids`,
            `attention_mask` and `targets` tensors.
        loss_fn: callable taking (outputs, targets) and returning the loss.
        device: device the batch tensors are moved to.
        n_examples: divisor used to turn the correct count into an accuracy.

    Returns:
        Tuple (accuracy, mean batch loss); accuracy is a double tensor.
    """
    model = model.eval()
    batch_losses = []
    n_correct = 0

    # Gradient tracking is disabled for the whole pass.
    with torch.no_grad():
        for batch in data_loader:
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            labels = batch["targets"].to(device)

            logits = model(input_ids=ids, attention_mask=mask)
            predicted = torch.max(logits, dim=1)[1]
            loss = loss_fn(logits, labels)

            n_correct += (predicted == labels).sum()
            batch_losses.append(loss.item())

    accuracy = n_correct.double() / n_examples
    return accuracy, np.mean(batch_losses)

In [None]:
%%time
history = defaultdict(list)
best_accuracy = 0
for epoch in range(EPOCHS):
  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)
  train_acc, train_loss = train_epoch(
    model,
    train_data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    len(train)
  )
  print(f'Train loss {train_loss} accuracy {train_acc}')
  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)
    
  if train_acc > best_accuracy:
    torch.save(model.state_dict(), 'best_model_state.bin')
    best_accuracy = train_acc

In [None]:
# Training accuracy per epoch, with the y-axis fixed to the full 0-1 range.
fig, ax = plt.subplots()
ax.plot(history['train_acc'], label='train accuracy')
ax.set_title('Training history')
ax.set_ylabel('Accuracy')
ax.set_xlabel('Epoch')
ax.legend()
ax.set_ylim([0, 1]);

# Test Predictions

In [None]:
def _encode_tweet(tweet):
    """Encode one test tweet with the same settings the training dataset uses."""
    return tokenizer.encode_plus(
        tweet,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        add_special_tokens=True,
        return_attention_mask=True,
        return_token_type_ids=False,
        return_tensors='pt',
    )


# Encode every test tweet, then split the encodings into parallel lists of
# input-id tensors and attention-mask tensors.
encodes = test['text'].apply(_encode_tweet)
input_ids = [enc['input_ids'] for enc in encodes]
attention_mask = [enc['attention_mask'] for enc in encodes]

In [None]:
# Inference has to run in eval mode. The last thing the training loop did was
# call model.train(), which leaves the dropout layers active and would make
# the predictions random. no_grad() also skips building autograd graphs that
# are never used here.
model = model.eval()

predictions = []
with torch.no_grad():
    for ids, mask in zip(input_ids, attention_mask):
        # Each element holds a single encoded tweet; move it to the model's device.
        logits = model(ids.to(device), mask.to(device))
        # The index of the highest score is the predicted class.
        _, prediction = torch.max(logits, dim=1)
        predictions.append(prediction.item())

In [None]:
# Pair every test id with its predicted label and write the submission file without the index column.
submission = pd.DataFrame({'id': test.id, 'target': predictions})
submission.to_csv('submission.csv', index=False)