In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap

import os
import transformers
from transformers import AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup
import tensorflow as tf

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

%matplotlib inline
%config InlineBackend.figure_format='retina'

# --- Plot styling ---------------------------------------------------------
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
# Override the 'muted' palette chosen above with the custom colour cycle.
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = (12, 8)

# --- Reproducibility ------------------------------------------------------
# Seeds NumPy's global RNG and PyTorch; Python's `random` module is not seeded here.
RANDOM_SEED = 63
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# --- Compute device -------------------------------------------------------
# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(device)

In [None]:
# Read the competition files.
train = pd.read_csv('../input/nlp-getting-started/train.csv')
test = pd.read_csv('../input/nlp-getting-started/test.csv')

# Stack train and test rows for a joint overview of the feature columns;
# the label column is dropped so only the shared features are summarised.
combined = pd.concat([train, test], axis=0).drop(columns='target')
combined.info()

* id - a unique identifier for each tweet
* text - the text of the tweet
* location - the location the tweet was sent from (may be blank)
* keyword - a particular keyword from the tweet (may be blank)
* target - in train.csv only, this denotes whether a tweet is about a real disaster (1) or not (0)

Observations from the `combined.info()` output above:

* No missing values in the most important feature, text, but...
* 87 missing values in keyword
* 3638 missing values in location

In [None]:
# Separate the labelled frame into features (X) and labels (y).
X = train.drop(columns='target')
y = train['target'].copy()

# Directory handed to AutoModel.from_pretrained when the classifier is built.
path = '../input/roberta-transformers-pytorch/roberta-base'

# Class Imbalance?

In [None]:
# Pass the labels by keyword. Recent seaborn releases take `data` as the first
# positional parameter (x/y are keyword-only), so `sns.countplot(y)` no longer
# means "count the values of y"; `x=y` works on both old and new versions.
sns.countplot(x=y)
plt.show()



Negatives outnumber positives by ~1000

# Sorting by Sequence Length


First, we have to sort our data by sequence length. We'll tokenize our data and plot the length of these tokens first.

In [None]:
# Labels as a plain Python list, aligned row-for-row with train.text.
train_targets = train['target'].tolist()

# Upper bound on tokens kept per tweet; longer encodings are truncated.
max_len = 90

tokenizer = AutoTokenizer.from_pretrained('../input/roberta-transformers-pytorch/roberta-base')

# No padding at this stage: every encoding keeps its natural length so the
# samples can be sorted by length afterwards.
train_encode_kwargs = dict(
    add_special_tokens=True,
    max_length=max_len,
    truncation=True,
    padding=False,
)
input_ids = [tokenizer.encode(text=tweet, **train_encode_kwargs) for tweet in train.text]

In [None]:
# Token count of every training sample, in original dataset order.
unsorted_lengths = [len(token_ids) for token_ids in input_ids]

# seaborn styling plus a larger font; figure size is set before the figure is created.
sns.set(style='darkgrid', font_scale=1.5)
plt.rcParams["figure.figsize"] = (12,6)

fig, ax = plt.subplots()
ax.scatter(range(len(unsorted_lengths)), unsorted_lengths, marker="|")
ax.set_xlabel('Sample Number')
ax.set_ylabel('Sequence Length')
ax.set_title('Samples BEFORE Sorting')

plt.show()



What a mess. Now we'll sort the tokens by length

In [None]:
# Pair each token sequence with its label, then order the pairs from the
# shortest sequence to the longest (list.sort is stable, like sorted()).
sorted_input_ids = list(zip(input_ids, train_targets))
sorted_input_ids.sort(key=lambda pair: len(pair[0]))

sorted_lengths = [len(tokens) for tokens, _ in sorted_input_ids]
print('Shortest sample:', sorted_lengths[0])
print('Longest sample:', sorted_lengths[-1])

In [None]:
# seaborn styling plus a larger font; figure size is set before the figure is created.
sns.set(style='darkgrid', font_scale=1.5)
plt.rcParams["figure.figsize"] = (12,6)

# Line plot of sequence length against position in the length-sorted list.
fig, ax = plt.subplots()
ax.plot(range(len(sorted_lengths)), sorted_lengths)
ax.set_xlabel('Sample Number')
ax.set_ylabel('Sequence Length')
ax.set_title('Samples after Sorting')

plt.show()

# Random Batch Selection

In [None]:
batch_size = 32
import random

# Seed Python's RNG so the random batch selection below is reproducible.
random.seed(RANDOM_SEED)

# Consume a copy of the sorted samples: the loop removes every batch it picks,
# and `sorted_input_ids` must stay intact so this cell can be re-run.
remaining_samples = list(sorted_input_ids)

batch_ordered_sentences = []
batch_ordered_labels = []

print('Creating training batches of size {:}'.format(batch_size))

while remaining_samples:
    # Progress report every 50 batches.
    if len(batch_ordered_sentences) % 50 == 0:
        print('  Selected {:,} batches.'.format(len(batch_ordered_sentences)))

    # The final batch may be smaller than batch_size.
    to_take = min(batch_size, len(remaining_samples))

    # Pick a random start position; the slice is contiguous in the
    # length-sorted list, so the samples in a batch have similar lengths.
    select = random.randint(0, len(remaining_samples) - to_take)
    batch = remaining_samples[select:select + to_take]

    batch_ordered_sentences.append([sample[0] for sample in batch])
    batch_ordered_labels.append([sample[1] for sample in batch])

    # Remove the chosen samples so they cannot be selected again.
    del remaining_samples[select:select + to_take]

print('\n  DONE - {:,} batches.'.format(len(batch_ordered_sentences)))



Remember: the samples inside each batch have similar lengths, but the batches themselves are in random order, not sorted by length.


# Padding

In [None]:
# One tensor per batch for token ids, attention masks and labels.
inputs, attn_masks, targets = [], [], []
pad_id = tokenizer.pad_token_id

for batch_inputs, batch_labels in zip(batch_ordered_sentences, batch_ordered_labels):
    # Pad only up to the longest sequence in *this* batch.
    max_size = max(len(sen) for sen in batch_inputs)

    batch_padded_inputs = [sen + [pad_id] * (max_size - len(sen)) for sen in batch_inputs]
    # 1 marks a real token, 0 marks padding.
    batch_attn_masks = [[1] * len(sen) + [0] * (max_size - len(sen)) for sen in batch_inputs]

    inputs.append(torch.tensor(batch_padded_inputs))
    attn_masks.append(torch.tensor(batch_attn_masks))
    targets.append(torch.tensor(batch_labels))

# Comparison

In [None]:
train_text = train['text'].tolist()

# Padded length of every row across all smart batches.
padded_lengths = []
for batch in inputs:
    padded_lengths.extend(len(row) for row in batch)

# Tokens fed to the model with per-batch padding vs. padding everything to max_len.
smart_token_count = np.sum(padded_lengths)
fixed_token_count = len(train_text) * max_len

prcnt_reduced = (fixed_token_count - smart_token_count) / float(fixed_token_count)

print('Total tokens:')
print('   Fixed Padding: {:,}'.format(fixed_token_count))
print('  Smart Batching: {:,}  ({:.2%} less)'.format(smart_token_count, prcnt_reduced))

# Putting it all together


SmartBatchingDataset stores samples by tokenizing text and converting to sequences


In [None]:
# Essential Imports
import random
import numpy as np
import multiprocessing
import more_itertools

import torch
import torch.nn as nn
from torch.utils.data import Sampler, Dataset, DataLoader

In [None]:
class SmartBatchingDataset(Dataset):
    """Dataset of token-id sequences (and optional labels) built from a DataFrame.

    Each row's ``text`` is wrapped as ``"<bos> text <eos>"`` using the
    tokenizer's own bos/eos strings, tokenized, and converted to ids. No
    padding or truncation happens here; that is left to the collate function.

    Args:
        df: DataFrame with a ``text`` column and, optionally, a ``target`` column.
        tokenizer: object exposing ``bos_token``, ``eos_token``, ``tokenize``
            and ``convert_tokens_to_ids``.
    """

    def __init__(self, df, tokenizer):
        super().__init__()
        wrapped_text = f"{tokenizer.bos_token} " + df.text + f" {tokenizer.eos_token}"
        self._data = [
            tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
            for text in wrapped_text
        ]
        # Labels are optional so the same class serves unlabelled data.
        self._targets = df.target.tolist() if 'target' in df.columns else None
        # Populated by get_dataloader().
        self.sampler = None

    def __len__(self):
        return len(self._data)

    def __getitem__(self, item):
        """Return ``(token_ids, target)`` when labels exist, else ``token_ids``."""
        token_ids = self._data[item]
        if self._targets is None:
            return token_ids
        return token_ids, self._targets[item]

    def get_dataloader(self, batch_size, max_len, pad_id):
        """Build a DataLoader that serves length-grouped, per-batch-padded batches.

        Args:
            batch_size: number of samples per batch.
            max_len: cap on the padded sequence length (longer samples are truncated
                by the collate function).
            pad_id: token id used for padding.

        Returns:
            A ``DataLoader`` over this dataset; the sampler it uses is also
            stored on ``self.sampler``.
        """
        self.sampler = SmartBatchingSampler(data_source=self._data, batch_size=batch_size)
        collate_fn = SmartBatchingCollate(
            targets=self._targets,
            max_length=max_len,
            pad_token_id=pad_id,
        )
        # Leave one CPU core free for the main process.
        worker_count = multiprocessing.cpu_count() - 1
        return DataLoader(
            dataset=self,
            batch_size=batch_size,
            sampler=self.sampler,
            collate_fn=collate_fn,
            num_workers=worker_count,
            pin_memory=True,
        )



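SmartBatchingSampler sorts the sequences by length, groups them into batches of the specified size, shuffles the order of the batches (keeping the final, possibly smaller batch last), and then returns the sample indices.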


In [None]:
class SmartBatchingSampler(Sampler):
    """Yield sample indices so that consecutive groups of ``batch_size`` have similar length.

    Indices are sorted by sequence length, cut into chunks of ``batch_size``,
    and the chunks are shuffled with NumPy's global RNG. The last chunk (which
    may be shorter) is always kept at the end. The order is drawn once in
    ``__init__``; every iteration replays the same index order.

    Args:
        data_source: sequence of token-id sequences; only their lengths are used.
        batch_size: chunk size used when grouping the length-sorted indices.
    """

    def __init__(self, data_source, batch_size):
        super().__init__(data_source)
        length_order = np.argsort([len(seq) for seq in data_source])
        batches = list(more_itertools.chunked(length_order, n=batch_size))
        if len(batches) > 0:
            # Shuffle all chunks except the final one, then put the final one back last.
            leading, tail = batches[:-1], batches[-1]
            np.random.shuffle(leading)
            batches = leading + [tail]
        self._inds = [index for chunk in batches for index in chunk]
        # Computed lazily by the backsort_inds property.
        self._backsort_inds = None

    def __iter__(self):
        return iter(self._inds)

    def __len__(self):
        return len(self._inds)

    @property
    def backsort_inds(self):
        """Permutation that maps results in sampler order back to dataset order (cached)."""
        if self._backsort_inds is None:
            self._backsort_inds = np.argsort(self._inds)
        return self._backsort_inds



SmartBatchingCollate pads each batch to the length of its longest sequence (capped at max_length, truncating anything longer), and builds the attention masks and the target tensor for the batch.


In [None]:
class SmartBatchingCollate:
    """Collate function that pads a batch to its own longest sequence.

    Args:
        targets: the dataset's label list, or ``None``. Only its presence is
            checked here, to decide whether batch items are
            ``(sequence, target)`` pairs or bare sequences.
        max_length: upper bound on the padded width; longer sequences are truncated.
        pad_token_id: id written into the padded positions.
    """

    def __init__(self, targets, max_length, pad_token_id):
        self._targets = targets
        self._max_length = max_length
        self._pad_token_id = pad_token_id

    def __call__(self, batch):
        """Return ``(input_ids, attention_mask[, targets])`` tensors for ``batch``."""
        has_targets = self._targets is not None
        if has_targets:
            sequences, targets = zip(*batch)
        else:
            sequences = list(batch)

        input_ids, attention_mask = self.pad_sequence(
            sequences,
            max_sequence_length=self._max_length,
            pad_token_id=self._pad_token_id,
        )

        if has_targets:
            return input_ids, attention_mask, torch.tensor(targets)
        return input_ids, attention_mask

    def pad_sequence(self, sequence_batch, max_sequence_length, pad_token_id):
        """Truncate/pad every sequence to a common width and build the matching masks.

        The width is the longest sequence in the batch, capped at
        ``max_sequence_length``. Truncation keeps the leading tokens, so any
        trailing tokens of an over-long sequence are dropped.

        Returns:
            ``(padded_sequences, attention_masks)`` as two tensors of equal
            shape; mask entries are 1 for real tokens and 0 for padding.
        """
        width = min(max(len(seq) for seq in sequence_batch), max_sequence_length)
        padded_rows, mask_rows = [], []
        for seq in sequence_batch:
            kept = list(seq[:width])
            n_pad = width - len(kept)
            padded_rows.append(kept + [pad_token_id] * n_pad)
            mask_rows.append([1] * len(kept) + [0] * n_pad)
        return torch.tensor(padded_rows), torch.tensor(mask_rows)

In [None]:
# Smart-batching loader over the complete training frame.
dataset = SmartBatchingDataset(df=train, tokenizer=tokenizer)
train_data_loader = dataset.get_dataloader(
    batch_size=24,
    max_len=max_len,
    pad_id=tokenizer.pad_token_id,
)

In [None]:
# Collect the padded width of every row the DataLoader produces.
padded_lengths = []
for batch_idx, (input_ids, attention_mask, targets) in enumerate(train_data_loader):
    padded_lengths.extend(len(row) for row in input_ids)

# Tokens fed to the model with per-batch padding vs. padding everything to max_len.
smart_token_count = np.sum(padded_lengths)
fixed_token_count = len(train_text) * max_len

prcnt_reduced = (fixed_token_count - smart_token_count) / float(fixed_token_count)

print('Total tokens:')
print('   Fixed Padding: {:,}'.format(fixed_token_count))
print('  Smart Batching: {:,}  ({:.2%} less)'.format(smart_token_count, prcnt_reduced))



The PyTorch DataLoader implementation reduced the padded token count (and therefore memory use) a little further.


# Model Creation


In [None]:
class SentimentClassifier(nn.Module):
    """Pretrained encoder followed by LayerNorm, dropout and a linear classification head.

    The encoder is loaded with ``AutoModel.from_pretrained(path)``, where
    ``path`` is the module-level checkpoint directory.

    Args:
        n_classes: number of output logits.
    """

    def __init__(self, n_classes):
        super().__init__()
        self.roberta = AutoModel.from_pretrained(path)
        self.config = self.roberta.config
        hidden_size = self.config.hidden_size
        self.layer_norm = nn.LayerNorm(hidden_size)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the raw class logits for a batch (no softmax applied)."""
        encoder_outputs = self.roberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        # Element 1 of the encoder output feeds the head.
        # NOTE(review): for Hugging Face RoBERTa models this is the pooled
        # output - confirm if a different backbone is loaded.
        pooled = encoder_outputs[1]
        return self.out(self.drop(self.layer_norm(pooled)))

In [None]:
# Two output classes: 1 = disaster, 0 = not a disaster.
model = SentimentClassifier(n_classes=2).to(device)

In [None]:
# torch.nn.functional.softmax(model(input_ids, attention_mask), dim=1)

# Training

In [None]:
EPOCHS = 10

# NOTE(review): betas=(0.99, 0.98) differs from the usual Adam default of
# (0.9, 0.999) - confirm that a first-moment decay of 0.99 is intentional.
optimizer = AdamW(model.parameters(), lr=2e-5, betas=(0.99, 0.98))

# The scheduler is stepped once per batch, so the schedule length is
# batches-per-epoch times epochs. It is sized from whatever loader
# `train_data_loader` refers to when this cell runs.
total_steps = EPOCHS * len(train_data_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

loss_fn = nn.CrossEntropyLoss().to(device)

* Some recommendations for fine tuning from the BERT paper
* Batch size: 16, 32
* Learning rate (Adam): 5e-5, 3e-5, 2e-5
* Number of epochs: 2, 3, 4



train_data_loader orders its data like this: input_ids, attention_mask, targets


In [None]:
def train_epoch(
    model,
    data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    n_examples
):
    """Run one training pass over ``data_loader`` and update the model in place.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: iterable of batches indexed as
            ``(input_ids, attention_mask, targets)``.
        loss_fn: callable taking ``(outputs, targets)``.
        optimizer: optimizer stepped once per batch.
        device: device each batch tensor is moved to.
        scheduler: LR scheduler stepped once per batch.
        n_examples: denominator used for the accuracy.

    Returns:
        ``(accuracy, mean_loss)`` - accuracy as a float64 tensor, mean loss
        as the NumPy mean of the per-batch loss values.
    """
    model.train()
    batch_losses = []
    n_correct = 0
    for batch in data_loader:
        input_ids, attention_mask, targets = (part.to(device) for part in batch[:3])

        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        predicted = torch.max(logits, dim=1).indices
        loss = loss_fn(logits, targets)

        n_correct += (predicted == targets).sum()
        batch_losses.append(loss.item())

        loss.backward()
        # Clip the global gradient norm to guard against exploding gradients.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return n_correct.double() / n_examples, np.mean(batch_losses)

In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without tracking gradients.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``;
            it is switched to eval mode.
        data_loader: iterable of batches indexed as
            ``(input_ids, attention_mask, targets)``.
        loss_fn: callable taking ``(outputs, targets)``.
        device: device each batch tensor is moved to.
        n_examples: denominator used for the accuracy.

    Returns:
        ``(accuracy, mean_loss)`` - accuracy as a float64 tensor, mean loss
        as the NumPy mean of the per-batch loss values.
    """
    model.eval()
    batch_losses = []
    n_correct = 0
    with torch.no_grad():
        for batch in data_loader:
            input_ids, attention_mask, targets = (part.to(device) for part in batch[:3])

            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            predicted = torch.max(logits, dim=1).indices

            n_correct += (predicted == targets).sum()
            batch_losses.append(loss_fn(logits, targets).item())

    return n_correct.double() / n_examples, np.mean(batch_losses)

In [None]:
# Hold out 10% of the labelled data for validation.
X_train, X_val = train_test_split(train, test_size=0.10, random_state=RANDOM_SEED)

train_dataset = SmartBatchingDataset(X_train, tokenizer)
train_data_loader = train_dataset.get_dataloader(batch_size=24, max_len=max_len, pad_id=tokenizer.pad_token_id)

val_dataset = SmartBatchingDataset(X_val, tokenizer)
val_data_loader = val_dataset.get_dataloader(batch_size=24, max_len=max_len, pad_id=tokenizer.pad_token_id)

# Rebuild the LR schedule for the loader that is actually trained on. The
# scheduler is stepped once per batch, so its length must be the number of
# batches in *this* loader times EPOCHS; a schedule sized from a larger loader
# would never decay the learning rate all the way to zero.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_data_loader) * EPOCHS
)

# Per-epoch metrics, stored as plain Python numbers so they can be plotted
# directly (the accuracies come back as tensors that may live on the GPU).
history = defaultdict(list)

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(
        model,
        train_data_loader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        len(X_train)
    )
    print(f'Train loss {train_loss} accuracy {train_acc}')
    history['train_acc'].append(train_acc.item())
    history['train_loss'].append(train_loss)

    val_acc, val_loss = eval_model(
        model,
        val_data_loader,
        loss_fn,
        device,
        len(X_val)
    )
    print(f'Val loss {val_loss} accuracy {val_acc}')
    history['val_acc'].append(val_acc.item())
    history['val_loss'].append(val_loss)

In [None]:
# plt.plot(history['train_acc'], label='train accuracy')
# plt.plot(history['val_acc'], label = 'val accurracy')
# plt.title('Training history')
# plt.ylabel('Accuracy')
# plt.xlabel('Epoch')
# plt.legend()
# plt.ylim([0, 1]);

# Testing


In [None]:
# Every test tweet is encoded on its own and padded to the full max_len, so
# each entry is a single-row tensor pair.
test_encode_kwargs = dict(
    add_special_tokens=True,
    max_length=max_len,
    truncation=True,
    padding='max_length',
    return_token_type_ids=False,
    return_attention_mask=True,
    return_tensors='pt',
)
encodes = test.text.apply(lambda tweet: tokenizer.encode_plus(tweet, **test_encode_kwargs))

input_ids = [encoding['input_ids'] for encoding in encodes]
attention_mask = [encoding['attention_mask'] for encoding in encodes]

In [None]:
# Inference only: switch off dropout and skip gradient tracking so no autograd
# graph is built for each test sample. The loop runs once; running it a second
# time only recomputed the same predictions.
model.eval()
predictions = []
with torch.no_grad():
    for ids, mask in zip(input_ids, attention_mask):
        logits = model(ids.to(device), mask.to(device))
        # Each entry is a single-row batch, so the arg-max index is one scalar.
        _, prediction = torch.max(logits, dim=1)
        predictions.append(prediction.item())

In [None]:
# Two-column frame (id, target) in test-set row order, written without the index.
predicted_target = pd.Series(predictions).rename('target')
submission = pd.concat([test.id, predicted_target], axis=1)
submission.to_csv('submission.csv', index=False)