# Disaster tweet classification with HF transformers

## Goal

Predict which Tweets are about real disasters and which ones are not

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Import libraries
import torch
import os
import torch.nn as nn

# AutoTokenizer and AutoModelForSequenceClassification make it possible to try different model architectures with minimal changes in code
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler

In [None]:
import random

seed = 142
np.random.seed(seed)
random.seed(seed)
torch.manual_seed(seed)

In [None]:
DATA_DIR = os.path.realpath('/kaggle/input/nlp-getting-started')

In [None]:
# Load dataset into dataframe
df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'), encoding='utf-8')

In [None]:
df.head()

In [None]:
# Distribution of target

df['target'].value_counts() / df.shape[0]

## Split data into train, validation sets


In [None]:
train_indices, val_indices = train_test_split(df.index, stratify=df['target'], test_size=0.15,random_state=42)

## Text preprocessing

In [None]:
import re
import string

# Characters that survive preprocessing; anything outside string.printable is dropped.
printable_chars = set(string.printable)

def preprocess_text(str_txt: str) -> str:
    """Clean a raw tweet before tokenization.

    Steps, applied in this order:

    1. remove URLs,
    2. remove @mentions,
    3. strip the leading ``#`` from hashtags (the tag text itself is kept),
    4. replace every number with the literal token ``NUMBER``,
    5. drop every character that is not in ``string.printable``.

    Args:
        str_txt: Raw tweet text.

    Returns:
        The cleaned text. Whitespace left behind by removed tokens is not
        collapsed.
    """
    # Remove urls from tweet.
    # NOTE(review): the scheme is optional in this pattern, so any lowercase
    # "word.word" token (e.g. a missing space after a full stop) is removed too.
    str_txt = re.sub(r'(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w\.-]*)', '', str_txt)
    # Remove mentions. The class is spelled A-Za-z on purpose: the range A-z
    # also covers the punctuation [ \ ] ^ ` located between Z and a in ASCII.
    str_txt = re.sub(r'@([A-Za-z0-9_]+)', '', str_txt)
    # Remove # from hashtags. The replacement is a raw string because '\g' is
    # not a valid escape sequence in a normal string literal.
    str_txt = re.sub(r'#([A-Za-z0-9_]+)', r'\g<1>', str_txt)
    # Convert numbers into NUMBER. A ',' or '.' is consumed only when digits
    # follow it, so sentence punctuation after a number ("3.") is preserved.
    str_txt = re.sub(r'\d+(?:[,.]\d+)?', 'NUMBER', str_txt)
    # Remove non printable characters
    str_txt = ''.join(ch for ch in str_txt if ch in printable_chars)

    return str_txt

In [None]:
# DataFrame before preprocessing
df.head()

In [None]:
def preprocess_df(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the ``text`` column of ``df`` with ``preprocess_text``.

    The column is overwritten on the frame that was passed in (no copy is
    made), and that same frame is returned.
    """
    cleaned_text = df['text'].apply(preprocess_text)
    df['text'] = cleaned_text
    return df

In [None]:
df = preprocess_df(df)

In [None]:
df.head()

### Dataset and Dataloader

In [None]:
from typing import Union

class DisasterTweetsDataset(Dataset):
    """Dataset view over a DataFrame of tweets.

    Every item is a dict with a 'text' entry and, when a label column was
    given, a 'label' entry. Rows are looked up with ``.loc``, i.e. by index
    label rather than by position.
    """

    def __init__(self, tweets_df: pd.DataFrame, text_column: str, label_column: str = None) -> None:
        """Store the frame and the names of the columns to read.

        Args:
            tweets_df: Frame holding the tweets; it is referenced, not copied.
            text_column: Name of the column with the tweet text.
            label_column: Name of the column with the target, or None for
                unlabelled data.
        """
        super().__init__()
        self.tweets_df = tweets_df
        self.text_column = text_column
        self.label_column = label_column

    def __len__(self) -> int:
        """Return the number of rows in the underlying frame."""
        return len(self.tweets_df)

    def __getitem__(self, idx: int):
        """Return the sample stored under index label ``idx``."""
        # Samplers may hand over tensor indices; .loc needs plain Python values.
        key = idx.tolist() if torch.is_tensor(idx) else idx

        wanted_columns = [self.text_column]
        if self.label_column:
            wanted_columns.append(self.label_column)

        row = self.tweets_df.loc[key, wanted_columns]
        item = {'text': row[self.text_column]}
        if self.label_column:
            item['label'] = row[self.label_column]
        return item

In [None]:
# SubsetRandomSampler samples elements randomly from a given list of indices, without replacement.

train_sampler = SubsetRandomSampler(train_indices)
val_sampler = SubsetRandomSampler(val_indices)

In [None]:
tweet_dataset = DisasterTweetsDataset(df, 'text', 'target')
print("Dataset Size:", len(tweet_dataset))

In [None]:
# Example of positive class
print("Text:", tweet_dataset[2]['text'])
print("Label:", tweet_dataset[2]['label'])

In [None]:
# Example of negative class
print("Text:", tweet_dataset[18]['text'])
print("Label:", tweet_dataset[18]['label'])

I am going to use the RoBERTa large model with pretrained weights from HuggingFace transformers. More details can be found [here](https://huggingface.co/roberta-large)

In [None]:
hf_weights_name = 'roberta-large'
# Create tokenizer from pretrained weights
hf_tokenizer = AutoTokenizer.from_pretrained(hf_weights_name)

In [None]:
# For our case collate_fn is called with a list of data samples at each time. 
# It is expected to collate the input samples into a batch for yielding from the data loader iterator. 

def collate_fn(batch):
    """Turn a list of dataset samples into one tokenized batch.

    Each sample is a dict with a 'text' entry and, for labelled data, a
    'label' entry. Whether the batch is labelled is decided by looking at the
    first sample only.

    Returns a dict with the tokenizer output for the texts (padded to the
    longest text of the batch) and, for labelled batches, a 'labels' tuple
    placed before the tokenizer entries.
    """
    texts = [sample['text'] for sample in batch]
    result = {}
    if 'label' in batch[0]:
        result['labels'] = tuple(sample['label'] for sample in batch)

    hf_example_ids = hf_tokenizer.batch_encode_plus(
        texts,
        add_special_tokens=True,
        return_attention_mask=True,
        padding='longest',
    )
    return dict(**result, **hf_example_ids)

In [None]:
import multiprocessing


num_workers = multiprocessing.cpu_count()
batch_size = 8

In [None]:
# Create data loaders for train and validation sets
train_loader = DataLoader(tweet_dataset, batch_size=batch_size, num_workers=num_workers, collate_fn=collate_fn, sampler=train_sampler)
val_loader = DataLoader(tweet_dataset, batch_size=batch_size, num_workers=num_workers, collate_fn=collate_fn, sampler=val_sampler)

In [None]:
print(len(train_loader))
print(len(val_loader))

In [None]:
data_loaders = {'train': train_loader, 'val': val_loader}
progress_bars = {}
epoch_stats = {}

### FineTuning procedure

In [None]:
# Check if cuda is available
gpu_available = torch.cuda.is_available()
print("GPU is available:", gpu_available)

In [None]:
# Select the device: GPU when available, otherwise CPU
device = torch.device('cuda' if gpu_available else 'cpu')
print(device)

In [None]:
if gpu_available:
    
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    print("Cuda Device Name:",torch.cuda.get_device_name())

In [None]:
# Create model from pretrained weights
model = AutoModelForSequenceClassification.from_pretrained(hf_weights_name, num_labels=2)
model.to(device);

In [None]:
num_epochs = 4
verbose = True

In [None]:
# The AdamW class shipped with transformers is deprecated in favour of the
# PyTorch implementation, so the optimizer is taken from torch.optim.
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup


# weight_decay=0.0 is passed explicitly: torch.optim.AdamW defaults to 0.01,
# whereas the transformers implementation used previously defaulted to 0.0.
optimizer = AdamW(model.parameters(), lr=1.5e-6, eps=1e-8, weight_decay=0.0)

# No warmup; the schedule spans one step per training batch per epoch,
# matching the scheduler.step() call made after every optimizer step.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(train_loader)*num_epochs)

In [None]:
best_acc = 0.0
best_loss = float('inf')

In [None]:
import copy

# Weights of best model so far
best_model_weights = copy.deepcopy(model.state_dict())
epoch_bar = tqdm(desc='training routine', total=num_epochs,
                 initial=0, position=0, disable=(verbose is not True))
# One progress bar per split. epoch_stats gets an empty slot per split as
# well; this loop does not fill it in.
for split, data_loader in data_loaders.items():
    progress_bars[split] = tqdm(desc=f'split={split}',
                                total=len(data_loader),
                                position=1,
                                disable=(verbose is not True),
                                leave=True)
    epoch_stats[split] = {'loss': [], 'accuracy': []}

# (validation loss, validation accuracy) of every finished epoch
training_data = []
try:
    for epoch in range(1, num_epochs + 1):

        # Each epoch runs every split in data_loaders: gradients and
        # optimizer steps only for 'train', plain evaluation otherwise.
        for split, data_loader in data_loaders.items():
            epoch_loss = torch.FloatTensor([0.0]).to(device)
            num_correct = torch.LongTensor([0]).to(device)
            total_samples = 0
            is_training = (split == 'train')
            model.train(is_training)
            for batch in data_loader:
                with torch.set_grad_enabled(is_training):
                    # collate_fn does not ask the tokenizer for tensors, so
                    # the batch entries are converted here.
                    input_ids = torch.LongTensor(batch['input_ids']).to(device)
                    labels = torch.LongTensor(batch['labels']).to(device)
                    masks = torch.LongTensor(batch['attention_mask']).to(device)

                    optimizer.zero_grad()

                    # Passing labels makes the model return the loss as well.
                    outputs = model(input_ids, masks, labels=labels)
                    loss = outputs.loss

                    if is_training:
                        loss.backward()
                    # Accumulate a detached copy: adding the live loss tensor
                    # would chain the autograd graph of every batch onto
                    # epoch_loss and keep it alive for the whole epoch.
                    epoch_loss += loss.detach()
                    _, predictions = torch.max(outputs.logits, 1)
                    num_correct += torch.eq(predictions, labels).sum()
                    total_samples += labels.size(0)

                    if is_training:
                        optimizer.step()
                        scheduler.step()
                    progress_bars[split].update()
            # Mean of the per-batch losses and overall accuracy of the split
            epoch_loss /= len(data_loader)
            epoch_accuracy = num_correct / total_samples
            epoch_bar.set_postfix({f"{split}_loss": epoch_loss.item(), f"{split}_acc": round(epoch_accuracy.item(), 3)})
            if not is_training:
                training_data.append((epoch_loss.item(), round(epoch_accuracy.item(), 3)))
                # Validation accuracy decides which weights are kept.
                if epoch_accuracy.item() > best_acc:
                    best_model_weights = copy.deepcopy(model.state_dict())
                    best_acc = epoch_accuracy.item()

        # Rewind the per-split bars so they can be reused by the next epoch.
        for bar in progress_bars.values():
            bar.n = 0
            bar.reset()
        epoch_bar.update()
except KeyboardInterrupt:
    # Deliberate: interrupting the cell stops training early but keeps the
    # results collected so far.
    pass
finally:
    print(training_data)

In [None]:
# [(0.46483272314071655, 0.792), (0.42263394594192505, 0.813), (0.40105751156806946, 0.83), (0.40114283561706543, 0.83)]

### Training results visualization

In [None]:
import matplotlib.pyplot as plt

In [None]:
training_data

In [None]:
val_losses, val_acc = zip(*training_data)

In [None]:
# One x value per recorded validation result. training_data holds fewer than
# num_epochs entries when training was interrupted, and plot() needs x and y
# of equal length.
epochs = list(range(1, len(val_losses) + 1))
# Two stacked panels sharing the epoch axis: loss on top, accuracy below.
fig, axes = plt.subplots(2, 1, figsize=(10, 8), sharex=True)
axes[0].plot(epochs, val_losses, 'o-')
axes[0].set_ylabel('Val. loss')
axes[1].plot(epochs, val_acc, 'o-', color="orange")
axes[1].set_ylabel('Val. accuracy')

plt.xlabel("epochs")
plt.show()

In [None]:
# Load best model weights
model.load_state_dict(best_model_weights)

In [None]:
# Check best model performance on validation set:
# score the current `model` on every batch of val_loader and print the accuracy.
model.eval()
# Running count of correct predictions, kept on the same device as the model outputs
num_correct = torch.LongTensor([0]).to(device)
with torch.no_grad():
    for batch in val_loader:
        # collate_fn does not ask the tokenizer for tensors, so the batch
        # entries are converted to LongTensor here.
        input_ids = torch.LongTensor(batch['input_ids']).to(device)
        masks = torch.LongTensor(batch['attention_mask']).to(device)
        labels = torch.LongTensor(batch['labels']).to(device)
        
        # No labels are passed: only the logits are needed here.
        outputs = model(input_ids, masks)
        # Index of the largest logit along dim 1 is the predicted class
        _, predictions = torch.max(outputs.logits, 1)
        num_correct += torch.eq(predictions, labels).sum()
# val_sampler draws from val_indices without replacement, so len(val_indices)
# is the number of samples scored above.
print("Val. accuracy of best model:", round(num_correct.item()/len(val_indices), 3))

## Make predictions on test data

In [None]:
test_df = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'), encoding='utf-8')
test_df = preprocess_df(test_df)

In [None]:
# Dataset for test data
tweet_dataset_test = DisasterTweetsDataset(test_df, 'text', None)

In [None]:
# Run the model over the test set and collect the predicted class of every tweet.
# shuffle=False: batches are produced in dataset order, which the later
# position-based pairing of predictions with test rows relies on.
test_loader = DataLoader(tweet_dataset_test, batch_size=batch_size, num_workers=num_workers, collate_fn=collate_fn, shuffle=False)
# One tensor of predicted class indices per batch
predictions_all = []
model.eval()
with torch.no_grad():
    for batch in test_loader:
        # The test dataset has no label column, so the batch carries only
        # tokenizer output.
        input_ids = torch.LongTensor(batch['input_ids']).to(device)
        masks = torch.LongTensor(batch['attention_mask']).to(device)
        
        outputs = model(input_ids, masks)
        # Index of the largest logit along dim 1 is the predicted class
        _, predictions = torch.max(outputs.logits, 1)

        predictions_all.append(predictions)

In [None]:
predictions_all_tensor = torch.cat(predictions_all)

In [None]:
# Predictions for first 10 samples
predictions_all_tensor[:10]

In [None]:
import csv
import uuid


def prepare_submission(test_data: pd.DataFrame, predicted: np.ndarray) -> str:
    """Write a submission CSV with the columns ``id`` and ``target``.

    The file is created in the current working directory under a fresh
    ``submissions_<uuid4>.csv`` name, which is also printed.

    Args:
        test_data: Frame with an ``id`` column, one row per prediction.
        predicted: Predicted targets, in the same row order as ``test_data``.

    Returns:
        The name of the file that was written.

    Raises:
        ValueError: If the number of predictions differs from the number of
            rows in ``test_data``.
    """
    if len(predicted) != len(test_data):
        raise ValueError(
            f"got {len(predicted)} predictions for {len(test_data)} rows"
        )

    f_name = f"submissions_{uuid.uuid4()}.csv"
    print(f_name)
    # newline="" lets the csv module control line endings itself; without it
    # extra blank lines can appear on platforms that translate "\n".
    with open(f_name, mode="w", newline="", encoding="utf-8") as f:
        csv_writer = csv.DictWriter(f, fieldnames=['id', "target"])
        csv_writer.writeheader()
        # Pair ids and predictions by position, not by index label: the
        # predictions are a plain array and know nothing about the frame's index.
        csv_writer.writerows(
            {"id": row_id, "target": target}
            for row_id, target in zip(test_data['id'], predicted)
        )
    return f_name

In [None]:
prepare_submission(test_df, predictions_all_tensor.cpu().numpy())