# Introduction

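This notebook fine-tunes a pretrained transformer encoder to classify disaster tweets. The encoder checkpoint can be any one of `sentence-transformers/bert-base-nli-mean-tokens`, `cardiffnlp/twitter-roberta-base`, or `roberta-base`.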

## References
1. [roBERTA-base + PyTorch for Sent. Classification](https://www.kaggle.com/bumjunkoo/roberta-for-sentiment-classification)
2. [NLP with Disaster Tweet](https://www.kaggle.com/theblackmamba31/nlp-with-disaster-tweet)
3. [Balanced Sampling between classes with torchvision DataLoader](https://discuss.pytorch.org/t/balanced-sampling-between-classes-with-torchvision-dataloader/2703/3)

# Imports

In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
# The sub-directory list is bound to a named throwaway so IPython's `_` is not overwritten.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import multiprocessing
import torch
import torch.nn as nn
import torch.optim as optim
import emoji
import re
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup

# Data Loading

We will now load our data into different DataFrames.

In [None]:
# Read the competition's training and test CSV files into two DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path, sep=",")
    for csv_path in (
        "../input/nlp-getting-started/train.csv",
        "../input/nlp-getting-started/test.csv",
    )
)

In [None]:
# Show the first five training rows (rendered as the cell's output).
train_df.head(n=5)

In [None]:
# Show the first five test rows (rendered as the cell's output).
test_df.head(n=5)

In [None]:
# Report how many rows each split contains, one line per split.
print(
    f"Length of training data: {len(train_df)}",
    f"Length of testing data: {len(test_df)}",
    sep="\n",
)

In [None]:
# Per-column count of missing values for both splits, separated by a dashed rule.
separator = "-" * 20
print(
    f"Missing data in training:\n{train_df.isnull().sum()}",
    separator,
    f"Missing data in testing:\n{test_df.isnull().sum()}",
    sep="\n",
)

Cleaning the data.

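We clean the `text` column of every row by removing mentions, hashtag symbols, retweet markers, newlines, emojis, URLs, and HTML tags, and then lowercasing the result.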

In [None]:
def clean_tweet(txt):
    """Normalize a raw tweet string for tokenization.

    The following are applied in order:

    1. URLs (``http(s)://...`` and ``www....``) are removed.
    2. ``@mentions`` are removed.
    3. ``#`` characters are removed (the hashtag word itself is kept).
    4. The ``"RT : "`` residue left after stripping the mention from a
       retweet prefix is removed.
    5. Newlines are replaced by a single space.
    6. Emojis are removed.
    7. HTML-like ``<...>`` tags are removed.
    8. The text is lowercased.

    Args:
        txt: The raw tweet text (``str``).

    Returns:
        The cleaned, lowercased text.
    """
    # URLs must go first, while whitespace (including newlines) still delimits
    # them. Removing newlines beforehand fuses a URL with the next word, and the
    # greedy \S+ then deletes that word too. A single permissive pattern is used
    # so no query-string fragments ("?x=1") are left behind.
    txt = re.sub(r"https?://\S+|www\.\S+", "", txt)
    txt = re.sub(r'@[A-Za-z0-9_]+', '', txt)
    txt = re.sub(r'#', '', txt)
    txt = re.sub(r'RT : ', '', txt)
    # Replace (rather than delete) newlines so adjacent words are not glued together.
    txt = re.sub(r'\n', ' ', txt)
    # emoji >= 2.0 removed get_emoji_regexp(); prefer replace_emoji() when it
    # exists and fall back to the legacy regexp on older releases.
    if hasattr(emoji, "replace_emoji"):
        txt = emoji.replace_emoji(txt, replace="")
    else:
        txt = re.sub(emoji.get_emoji_regexp(), r"", txt)
    txt = re.sub(r"<.*?>", "", txt)
    return txt.lower()

In [None]:
# Register tqdm's `progress_apply` on pandas objects.
tqdm.pandas()

# Print the first row's second-to-last column before and after cleaning
# as a quick sanity check of `clean_tweet`.
print(train_df.iloc[0, -2])
train_df["text"] = train_df["text"].progress_apply(clean_tweet)
print(train_df.iloc[0, -2])

In [None]:
# Same before/after sanity check for the test split, using its last column.
print(test_df.iloc[0, -1])
test_df["text"] = test_df["text"].progress_apply(clean_tweet)
print(test_df.iloc[0, -1])

# Create Dataset

In [None]:
# Seed NumPy and PyTorch before any model is created so that classifier-head
# initialisation, dropout masks and DataLoader shuffling are repeatable.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

# Encoder checkpoint to fine-tune; uncomment exactly one of the alternatives.
# model_name = "sentence-transformers/bert-base-nli-mean-tokens"
model_name = "cardiffnlp/twitter-roberta-base"
# model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def encode_sentences(sentences):
    """Tokenize `sentences` with the module-level `tokenizer`.

    The tokenizer is asked to pad the batch, to include the attention mask,
    and to return PyTorch tensors; its result is returned unchanged.
    """
    return tokenizer(
        sentences,
        padding=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

In [None]:
def collate_batch(batch):
    """Collate a list of ``(sentence, target)`` pairs into one model-ready batch.

    Returns:
        A tuple ``(encoded, targets)`` where ``encoded`` is whatever
        `encode_sentences` returns for the batch's sentences and ``targets``
        is a tensor built from the batch's targets.
    """
    # Transpose the list of pairs into one tuple of sentences and one of targets.
    texts, labels = zip(*batch)
    return encode_sentences(list(texts)), torch.tensor(labels)

class DisasterDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding ``(sentence, target)`` pairs from a DataFrame.

    The ``text`` and ``target`` columns are copied into plain Python lists at
    construction time, so later changes to the frame are not reflected.
    """

    def __init__(self, df):
        # NOTE: despite its name, `self.df` holds the list of sentences, not a DataFrame.
        self.df = df["text"].to_list()
        self.targets = df["target"].to_list()

    def __getitem__(self, idx):
        """Return the ``(sentence, target)`` pair at position `idx`."""
        return self.df[idx], self.targets[idx]

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.df)

In [None]:
# Wrap the full (cleaned) training frame for a quick inspection below.
dataset = DisasterDataset(df=train_df)

In [None]:
# Show the first (text, label) pair produced by the dataset.
first_sample = next(iter(dataset))
print('\nFirst iteration of data set: ', first_sample, '\n')
# Show how many items the dataset holds.
n_items = len(dataset)
print('Length of data set: ', n_items, '\n')

# Create Model

In [None]:
class Model(nn.Module):
    """Pretrained transformer encoder followed by a small classification head.

    Args:
        model_name: Checkpoint name passed to ``AutoModel.from_pretrained``.
        num_classes: Number of output logits.
        pooling: How the sentence embedding is obtained from the encoder
            output. ``"pooler"`` (default, the original behaviour) uses the
            encoder's second output; ``"mean"`` uses attention-masked mean
            pooling over the token embeddings (see `mean_pooling`).

    Raises:
        ValueError: If `pooling` is not ``"pooler"`` or ``"mean"``.
    """

    _POOLING_MODES = ("pooler", "mean")

    def __init__(self, model_name, num_classes, pooling="pooler"):
        super().__init__()
        if pooling not in self._POOLING_MODES:
            raise ValueError(
                f"pooling must be one of {self._POOLING_MODES}, got {pooling!r}"
            )
        self.pooling = pooling
        self.encoder = AutoModel.from_pretrained(model_name)
        hidden_size = self.encoder.config.hidden_size
        self.classify = nn.Sequential(
            nn.LayerNorm(hidden_size),
            nn.Dropout(0.3),
            nn.Linear(hidden_size, num_classes)
        )

    def mean_pooling(self, outputs, attention_mask):
        """Average the token embeddings, ignoring positions masked out by `attention_mask`.

        Args:
            outputs: Encoder output whose first element holds the token embeddings.
            attention_mask: Mask with one entry per token; it is broadcast
                over the embedding dimension before weighting.

        Returns:
            The masked sum over dimension 1 divided by the mask count
            (clamped to at least 1e-9 to avoid division by zero).
        """
        token_embeddings = outputs[0]  # First element of the output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        summed = torch.sum(token_embeddings * input_mask_expanded, 1)
        counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return summed / counts

    def forward(self, inputs):
        """Encode `inputs` and return the classification logits.

        Args:
            inputs: Mapping of keyword arguments for the encoder; must contain
                ``"attention_mask"`` when ``pooling == "mean"``.
        """
        outputs = self.encoder(**inputs)
        if self.pooling == "mean":
            sentence_embedding = self.mean_pooling(outputs, inputs["attention_mask"])
        else:
            # Second element of the encoder output (the pooled representation).
            sentence_embedding = outputs[1]
        return self.classify(sentence_embedding)

In [None]:
# Build the two-class classifier and move it onto the selected device.
model = Model(model_name, 2).to(device)

In [None]:
# Smoke-test the untrained model on one sentence.
# A freshly built nn.Module is in training mode, so switch to eval mode first;
# otherwise the dropout in the classification head makes the printed logits random.
model.eval()
with torch.no_grad():
    example_sentence = "On the plus side LOOK AT THE SKY LAST NIGHT IT WAS ABLAZE"
    example_enc = encode_sentences([example_sentence]).to(device)
    example_output = model(example_enc)
    print(example_output)

In [None]:
# Bar chart of how many tweets fall into each target class.
# `x` is passed by keyword: since seaborn 0.12 the first positional argument of
# countplot is `data`, so a positional Series combined with `data=` raises a TypeError.
fig, ax = plt.subplots()
sns.countplot(x="target", data=train_df, ax=ax)
ax.set(title="Target class distribution (training set)", xlabel="target", ylabel="count")
plt.show()

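The plot above shows the class distribution. To compensate for any imbalance, we define a helper that computes per-sample weights for class-balanced sampling.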

In [None]:
def make_weights_for_balanced_classes(targets, n_classes):
    """Compute one sampling weight per sample, inversely proportional to class frequency.

    Each sample of class ``c`` receives the weight ``N / count[c]`` where ``N``
    is the total number of samples, so every class that occurs contributes the
    same total weight.

    Args:
        targets: Iterable of integer class labels in ``range(n_classes)``.
        n_classes: Number of classes.

    Returns:
        A list of floats with one weight per entry of `targets`, in the same
        order. An empty `targets` yields an empty list.

    Raises:
        IndexError: If a label is not a valid index into ``range(n_classes)``.
    """
    # Materialise once so that any iterable (list, Series, generator) can be traversed twice.
    targets = list(targets)
    count = [0] * n_classes
    for t in targets:
        count[t] += 1
    total = float(sum(count))
    # A class without samples gets weight 0.0 instead of raising ZeroDivisionError;
    # that weight is never assigned to any sample anyway.
    weight_per_class = [total / c if c else 0.0 for c in count]
    return [weight_per_class[t] for t in targets]

In [None]:
def prepare_dataloaders(train_df, batch_size=64, val_size=0.10, random_state=0, balanced=False):
    """Split `train_df` into train/validation sets and wrap each in a DataLoader.

    Args:
        train_df: DataFrame with ``text`` and ``target`` columns.
        batch_size: Number of samples per batch for both loaders.
        val_size: Fraction of rows held out for validation.
        random_state: Seed for the stratified split.
        balanced: When True, the training loader draws samples with a
            ``WeightedRandomSampler`` weighted by
            `make_weights_for_balanced_classes`; otherwise the training data
            is reshuffled every epoch.

    Returns:
        ``(train_dataloader, val_dataloader)``.
    """
    X_train, X_val = train_test_split(
        train_df,
        test_size=val_size,
        random_state=random_state,
        stratify=train_df.target,
    )

    loader_kwargs = dict(
        batch_size=batch_size,
        collate_fn=collate_batch,
        num_workers=max(multiprocessing.cpu_count() - 1, 0),
        # Pinned memory only helps host-to-GPU copies; requesting it without CUDA just warns.
        pin_memory=torch.cuda.is_available(),
    )

    sampler = None
    if balanced:
        n_classes = int(train_df.target.max()) + 1
        weights = make_weights_for_balanced_classes(X_train.target, n_classes)
        sampler = torch.utils.data.WeightedRandomSampler(
            torch.as_tensor(weights, dtype=torch.double),
            num_samples=len(weights),
            replacement=True,
        )

    train_dataset = DisasterDataset(X_train)
    # `shuffle` and `sampler` are mutually exclusive in DataLoader, so shuffle
    # only when no sampler is supplied. Without either, every epoch would see
    # the batches in exactly the same order.
    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        shuffle=(sampler is None),
        sampler=sampler,
        **loader_kwargs,
    )

    val_dataset = DisasterDataset(X_val)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, **loader_kwargs)

    return train_dataloader, val_dataloader

In [None]:
# Build the training and validation loaders from the cleaned training frame.
train_dataloader, val_dataloader = prepare_dataloaders(train_df=train_df)

# Training

In [None]:
# Number of full passes over the training data.
EPOCHS = 5

# Classification loss.
loss_fn = nn.CrossEntropyLoss().to(device)

# NOTE(review): betas=(0.99, 0.98) is an unusual choice for AdamW — confirm it is intentional.
optimizer = optim.AdamW(
    model.parameters(),
    betas=(0.99, 0.98),
    lr=2e-5,
)

# The scheduler is stepped once per batch, so the schedule spans batches-per-epoch * epochs.
total_steps = len(train_dataloader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
def train(model, dataloader, loss_fn, optimizer, scheduler):
    """Run one training epoch and return ``(accuracy, mean_loss)``.

    Args:
        model: Module called with a dict holding ``input_ids`` and
            ``attention_mask``; it must return per-class logits.
        dataloader: Iterable of ``(sentences, targets)`` batches, where
            ``sentences`` provides ``"input_ids"`` and ``"attention_mask"``.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar loss.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.

    Returns:
        A tuple of the accuracy (0-dim double tensor) over the samples
        processed in this epoch and the mean of the per-batch losses. For an
        empty dataloader the accuracy is 0 and the loss is NaN.
    """
    model = model.train()
    losses = []
    # Tensor accumulator: stays valid for `.double()` even if the loader yields nothing.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    # Count the samples actually seen so the accuracy stays correct when the
    # loader uses a sampler or drops batches.
    samples_seen = 0
    for sentences, targets in tqdm(dataloader):
        input_ids = sentences["input_ids"].to(device)
        attention_mask = sentences["attention_mask"].to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        outputs = model(dict(
            input_ids=input_ids,
            attention_mask=attention_mask
        ))

        train_loss = loss_fn(outputs, targets)
        train_loss.backward()
        losses.append(train_loss.item())

        preds = outputs.argmax(dim=1)
        correct_predictions += (preds == targets).sum()
        samples_seen += targets.size(0)

        # Clip the gradient norm before the update to limit the size of a single step.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

    accuracy = correct_predictions.double() / max(samples_seen, 1)
    mean_loss = np.mean(losses) if losses else float("nan")
    return accuracy, mean_loss

In [None]:
def validate(model, dataloader, loss_fn):
    """Evaluate `model` on `dataloader` without gradients and return ``(accuracy, mean_loss)``.

    Args:
        model: Module called with a dict holding ``input_ids`` and
            ``attention_mask``; it must return per-class logits.
        dataloader: Iterable of ``(sentences, targets)`` batches, where
            ``sentences`` provides ``"input_ids"`` and ``"attention_mask"``.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar loss.

    Returns:
        A tuple of the accuracy (0-dim double tensor) over the samples
        processed and the mean of the per-batch losses. For an empty
        dataloader the accuracy is 0 and the loss is NaN.
    """
    model = model.eval()
    losses = []
    # Tensor accumulator: stays valid for `.double()` even if the loader yields nothing.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    # Count the samples actually seen instead of assuming the whole dataset was visited.
    samples_seen = 0
    with torch.no_grad():
        for sentences, targets in tqdm(dataloader):
            input_ids = sentences["input_ids"].to(device)
            attention_mask = sentences["attention_mask"].to(device)
            targets = targets.to(device)

            outputs = model(dict(
                input_ids=input_ids,
                attention_mask=attention_mask
            ))

            val_loss = loss_fn(outputs, targets)
            losses.append(val_loss.item())

            preds = outputs.argmax(dim=1)
            correct_predictions += (preds == targets).sum()
            samples_seen += targets.size(0)

    accuracy = correct_predictions.double() / max(samples_seen, 1)
    mean_loss = np.mean(losses) if losses else float("nan")
    return accuracy, mean_loss

In [None]:
# One training pass followed by one validation pass per epoch, reporting both.
for epoch in range(EPOCHS):
    print(f"Epoch: {epoch + 1} / {EPOCHS}")

    train_accuracy, train_loss = train(
        model, train_dataloader, loss_fn, optimizer, scheduler
    )
    val_accuracy, val_loss = validate(model, val_dataloader, loss_fn)

    print(
        f"Training Loss: {train_loss} | Training Accuracy: {train_accuracy}",
        f"Validation Loss: {val_loss} | Validation Accuracy: {val_accuracy}",
        sep="\n",
    )

# Submission

In [None]:
# Predict logits for the test set in mini-batches.
# Encoding and forwarding the whole test set as one batch can exhaust device
# memory; each batch's logits are moved to the CPU as soon as they are computed.
INFERENCE_BATCH_SIZE = 64

model.eval()
test_sentences = test_df.text.to_list()
batch_logits = []
with torch.no_grad():
    for start in range(0, len(test_sentences), INFERENCE_BATCH_SIZE):
        test_encoded = encode_sentences(test_sentences[start:start + INFERENCE_BATCH_SIZE])
        input_ids = test_encoded["input_ids"].to(device)
        attention_mask = test_encoded["attention_mask"].to(device)
        batch_logits.append(
            model(dict(
                input_ids=input_ids,
                attention_mask=attention_mask
            )).cpu()
        )

# One row of logits per test sentence, in the original order.
predictions = torch.cat(batch_logits)

In [None]:
# Turn per-class logits into class indices.
# Take the argmax in torch and convert explicitly, so the result is a plain
# NumPy array rather than whatever NumPy's fallback dispatch returns for a tensor.
predictions = predictions.argmax(dim=1).cpu().numpy()

In [None]:
# Pair every test id with its predicted label and write the submission file.
target_column = pd.Series(predictions, name='target')
submission = pd.concat([test_df.id, target_column], axis=1)
submission.to_csv('submission.csv',index=False)