In [None]:
!pip install transformers

# Imports

In [None]:
import torch 
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.notebook import tqdm
import json
from sklearn.metrics import precision_recall_fscore_support

# nltk imports for preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import random

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Hugging face Transformers Import
import transformers
from transformers import BertModel, BertTokenizerFast
from transformers import DistilBertModel, DistilBertTokenizerFast, AdamW

# Setting Seed

In [None]:
def set_seed(seed=None, seed_torch=True):
    """Seed Python's `random`, NumPy and (optionally) PyTorch, then print the seed.

    Args:
        seed: Integer seed. When None, a seed in [0, 2**32) is drawn from NumPy's
            global RNG.
        seed_torch: When True, also seed torch (CPU and every CUDA device) and put
            cuDNN into deterministic, non-benchmark mode.
    """
    if seed is None:
        seed = np.random.randint(0, 2 ** 32, dtype=np.int64)

    # Normalise to a built-in int: `random.seed` only accepts None, int, float,
    # str, bytes and bytearray on recent Python versions, so a NumPy integer
    # (as produced by the branch above) would be rejected.
    seed = int(seed)

    random.seed(seed)
    np.random.seed(seed)
    if seed_torch:
        torch.manual_seed(seed)
        # Seeds every visible CUDA device, which makes a separate call for the
        # current device unnecessary.
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    print(f'Random seed {seed} has been set.')
    
# Fix the seed once so that repeated runs of the notebook are comparable
set_seed(seed=101)

# device: use the GPU when torch can see one, otherwise stay on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

# Preprocessing Data

In [None]:
# Read the labelled (train) and unlabelled (test) tweets from the competition input folder
train_df, test_df = map(
    pd.read_csv,
    (
        '../input/nlp-getting-started/train.csv',
        '../input/nlp-getting-started/test.csv',
    ),
)

In [None]:
# Preview the first rows of the training frame
train_df.head()

In [None]:
# Preview the first rows of the test frame
test_df.head()

In [None]:
# Stemmer instance (created here; independent of the stopword download below)
porter = PorterStemmer()

# Download the NLTK stopword corpus, then keep the English list for preprocessing
nltk.download("stopwords")
STOPWORDS = stopwords.words("english")
print(f"stopwords: {STOPWORDS[:5]}")

In [None]:
# Text Preprocessing Function
def preprocess(text, stopwords=None):
    """Clean one piece of text: lowercase, drop stopwords and parentheticals, keep alphanumerics.

    Args:
        text: Input string.
        stopwords: Iterable of words to delete (matched as whole words against the
            lowercased text). Defaults to the module-level ``STOPWORDS`` list, looked
            up when the function is called. An empty iterable disables stopword removal.

    Returns:
        The cleaned string: alphanumeric words separated by single spaces, with no
        leading or trailing whitespace.
    """
    if stopwords is None:
        stopwords = STOPWORDS

    # Lower
    text = text.lower()

    # Remove stopwords. Each word is escaped so it is matched literally, and the
    # step is skipped when there is nothing to remove: an empty alternation
    # would otherwise match at every word boundary and swallow the whitespace
    # between words.
    escaped_words = [re.escape(word) for word in stopwords if word]
    if escaped_words:
        pattern = re.compile(r"\b(" + r"|".join(escaped_words) + r")\b\s*")
        text = pattern.sub("", text)

    # Remove words in parentheses
    text = re.sub(r"\([^)]*\)", "", text)

    # Spacing and filters
    text = re.sub(r"([-;.,!?<=>])", r" \1 ", text)
    text = re.sub("[^A-Za-z0-9]+", " ", text)  # remove non alphanumeric chars
    text = re.sub(" +", " ", text)  # remove multiple spaces
    text = text.strip()

    return text

In [None]:
# Run the same cleaning routine over the text column of both splits
train_df['text'] = train_df['text'].apply(preprocess)
test_df['text'] = test_df['text'].apply(preprocess)

In [None]:
# Spot-check the cleaned text of the row with index label 0
train_df['text'][0]

# BERT Tokenizer

In [None]:
# Downloading the pretrained tokenizer.
# Alternative checkpoint kept for reference (cased BERT instead of DistilBERT):
# model_name = 'bert-base-cased'
# tokenizer = BertTokenizerFast.from_pretrained(model_name)

# `model_name` is a notebook-level name; the Model class reads it as well when it
# loads the encoder weights, so tokenizer and encoder come from the same checkpoint.
model_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

In [None]:
# Number of whitespace-separated words in every cleaned training tweet
# (word counts, not tokenizer token counts)
seq_len = train_df["text"].map(lambda tweet: len(tweet.split())).tolist()

pd.Series(seq_len).hist(bins = 30)

# Dataset Class

In [None]:
class TweetDataset(Dataset):
    """Dataset of raw tweet strings (plus labels when training) tokenized per batch.

    Items are plain Python objects; tokenization happens in `collate_fn`, so the
    dataset must be wrapped with `create_dataloader` (or a DataLoader that uses
    `collate_fn`).

    Args:
        df: DataFrame with a "text" column and, when `istrain` is True, a "target" column.
        tokenizer: Callable invoked as
            ``tokenizer(list_of_str, return_tensors='pt', max_length=..., padding='max_length', truncation=True)``.
        istrain: Whether every item carries a label.
        max_length: Length every sequence of a batch is padded/truncated to.
    """

    def __init__(self, df, tokenizer, istrain=True, max_length=25):
        self.text = df["text"].tolist()
        self.tokenizer = tokenizer
        self.istrain = istrain
        self.max_length = max_length

        if self.istrain:
            self.labels = df['target'].tolist()

    def __len__(self):
        """Number of tweets in the dataset."""
        return len(self.text)

    def __getitem__(self, index):
        """Return ``(text, label)`` when `istrain` is set, otherwise just ``text``."""
        text = self.text[index]

        if self.istrain:
            return text, self.labels[index]

        return text

    def collate_fn(self, batch):
        """Tokenize a list of items into one padded batch.

        Args:
            batch: List of items as produced by `__getitem__`.

        Returns:
            ``(tokenized_text, target)`` when `istrain` is set, where `target` is a
            LongTensor with one label per item; otherwise only ``tokenized_text``
            (whatever the tokenizer returns).
        """
        # Texts and labels are split directly instead of going through a NumPy
        # array, which would coerce the integer labels to strings.
        if self.istrain:
            texts = [item[0] for item in batch]
            labels = [int(item[1]) for item in batch]
        else:
            texts = list(batch)

        # tokenizing text inputs
        tokenized_text = self.tokenizer(
            texts,
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )

        if self.istrain:
            target = torch.tensor(labels, dtype=torch.long)
            return tokenized_text, target

        return tokenized_text

    def create_dataloader(self, batch_size, shuffle=False, drop_last=False):
        """Build a DataLoader over this dataset that batches through `collate_fn`."""
        dataloader = DataLoader(
            dataset=self,
            batch_size=batch_size,
            collate_fn=self.collate_fn,
            shuffle=shuffle,
            drop_last=drop_last,
            # Pinned host memory only helps host-to-GPU copies; requesting it
            # without CUDA just produces a warning.
            pin_memory=torch.cuda.is_available(),
        )
        return dataloader

In [None]:
# Number of labelled rows available before the train/validation split
len(train_df)

In [None]:
# Hold out 30% of the labelled rows for validation; random_state pins the split
new_train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)

In [None]:
# creating Training Dataset and DataLoader (reshuffled on every pass)
train_dataset = TweetDataset(new_train_df, tokenizer=tokenizer, istrain=True)
train_dataloader = train_dataset.create_dataloader(batch_size=32, shuffle=True)

# creating Validation Dataset and DataLoader
# Evaluation gains nothing from shuffling; a fixed order keeps the validation
# batches identical from epoch to epoch and does not consume random numbers.
val_dataset = TweetDataset(val_df, tokenizer=tokenizer, istrain=True)
val_dataloader = val_dataset.create_dataloader(batch_size=32, shuffle=False)

# creating Test Dataset (unlabelled, order must match test_df for the submission)
test_dataset = TweetDataset(test_df, tokenizer=tokenizer, istrain=False)
test_dataloader = test_dataset.create_dataloader(batch_size=32, shuffle=False)

test_dataset[0]

In [None]:
# Number of batches the test dataloader yields
len(test_dataloader)

In [None]:
# batch = next(iter(train_dataloader))
# tokenized_text = batch[:-1]

# tokenized_text['input_ids'] = tokenized_text['input_ids'].to(device)
# tokenized_text['attention_mask'] = tokenized_text['attention_mask'].to(device)
# tokenized_text

# BERT Model

In [None]:
class Model(nn.Module):
    """Pretrained DistilBERT encoder followed by a BiLSTM and a two-layer classifier head.

    The encoder checkpoint is taken from the notebook-level ``model_name``.

    Args:
        num_classes: Number of output logits.
        freeze_bert: When True the encoder parameters get ``requires_grad=False``;
            otherwise they are trainable.
    """

    def __init__(self, num_classes, freeze_bert=True):
        super().__init__()

        self.bert_model = DistilBertModel.from_pretrained(model_name)
        # nn.LSTM applies its `dropout` only between stacked layers, so with a
        # single layer a non-zero value has no effect and only triggers a
        # warning; it is therefore left at the default of 0.
        self.bi_lstm = nn.LSTM(768, 256, batch_first=True, num_layers=1, bidirectional=True)
        self.fc_1 = nn.Linear(2*256, 128)
        self.fc_final = nn.Linear(128, num_classes)
        self.dropout = nn.Dropout(0.5)

        # Encoder weights are trainable exactly when freezing is not requested
        for param in self.bert_model.parameters():
            param.requires_grad = not freeze_bert

    def forward(self, text_seq):
        """Compute class logits for one tokenized batch.

        Args:
            text_seq: Mapping that is unpacked as keyword arguments of the encoder
                (the training loop passes 'input_ids' and 'attention_mask').

        Returns:
            Tensor of unnormalised logits with ``num_classes`` entries per example.
        """
        bert_output = self.bert_model(**text_seq)
        x = bert_output['last_hidden_state']
        x, _ = self.bi_lstm(x)
        # Keep the BiLSTM output at the first time step only.
        # NOTE(review): at this position the forward direction has seen just the
        # first token, while the backward direction has seen the whole (padded)
        # sequence - confirm this pooling is intended.
        x = x[:, 0]
        x = F.relu(self.fc_1(x))
        x = self.dropout(x)
        x = self.fc_final(x)
        return x

In [None]:
# Two-class classifier with a trainable encoder, placed on the selected device
model = Model(num_classes=2, freeze_bert=False).to(device)

In [None]:
# Display the module hierarchy of the model
model

# Training Model

In [None]:
# Mark every parameter of the model (encoder included) as trainable
for param in model.parameters():
    param.requires_grad_(True)

In [None]:
LEARNING_RATE = 1e-5
patience = 8  # epochs without a better validation loss tolerated by the early stopping below
NUM_EPOCHS = 50

# Define Loss
loss_fn = nn.CrossEntropyLoss().to(device)

# optimizer
# torch.optim.AdamW is used instead of the deprecated `transformers.AdamW`.
# eps and weight_decay are spelled out to keep the values the deprecated class
# applied by default (torch's own defaults differ).
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-6, weight_decay=0.0)

# learning rate scheduler: cut the LR by 10x after 3 epochs without a lower monitored value
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3)

In [None]:
# Model training
save_path = 'Bert-Model-1.pt'


def _run_epoch(model, dataloader, loss_fn, device, optimizer=None):
    """Run one pass over `dataloader`: a training pass when `optimizer` is given, else evaluation.

    Args:
        model: Module called with a dict holding 'input_ids' and 'attention_mask'.
        dataloader: Yields ``(text_seq, labels)`` batches.
        loss_fn: Loss taking ``(logits, labels)``; assumed to average over the batch
            (the default reduction of nn.CrossEntropyLoss).
        device: Device the batch tensors are moved to.
        optimizer: Optimizer to step after every batch; None disables training.

    Returns:
        ``(loss, accuracy)`` averaged over all samples seen in the pass.
    """
    is_training = optimizer is not None
    model.train(is_training)

    total_loss, total_correct, total_seen = 0.0, 0, 0

    # Autograd is switched off for evaluation passes so no graph is built or kept
    with torch.set_grad_enabled(is_training):
        for text_seq, labels in tqdm(dataloader, total=len(dataloader)):
            inputs = {
                'input_ids': text_seq['input_ids'].to(device),
                'attention_mask': text_seq['attention_mask'].to(device),
            }
            labels = labels.to(device)

            if is_training:
                optimizer.zero_grad()

            output = model(inputs)
            loss = loss_fn(output, labels)

            if is_training:
                loss.backward()
                optimizer.step()

            # Weight every batch by its size so a short last batch does not
            # count as much as a full one
            batch_size = labels.size(0)
            total_loss += loss.item() * batch_size
            total_correct += (output.argmax(dim=1) == labels).sum().item()
            total_seen += batch_size

    total_seen = max(total_seen, 1)  # guard against an empty dataloader
    return total_loss / total_seen, total_correct / total_seen


results = {
    'train_losses': [],
    'valid_losses': [],
    'train_acc': [],
    'valid_acc': [],
}

best_val_loss = np.inf
# Set before the loop so the early-stopping branch can never hit an undefined
# counter (e.g. when the very first validation loss is NaN)
_patience = patience

for epoch in range(NUM_EPOCHS):
    print(f'<----- Epoch: {epoch+1} ----->')

    # training model on training dataset, then evaluating on validation dataset
    train_loss, train_acc = _run_epoch(model, train_dataloader, loss_fn, device, optimizer=optimizer)
    valid_loss, valid_acc = _run_epoch(model, val_dataloader, loss_fn, device)

    scheduler.step(valid_loss)

    # storing losses and accuracy (done before the early-stopping decision so
    # the last epoch is part of the history as well)
    results['train_losses'].append(train_loss)
    results['valid_losses'].append(valid_loss)
    results['train_acc'].append(train_acc)
    results['valid_acc'].append(valid_acc)

    # Checkpoint on improvement, otherwise use up one unit of patience
    if valid_loss < best_val_loss:
        best_val_loss = valid_loss
        print('Saving Model!!')
        torch.save(model.state_dict(), save_path)
        _patience = patience  # reset _patience
    else:
        _patience -= 1

    print('Training Loss: {:.6f} \tTraining Accuracy: {:.2f}'.format(train_loss, train_acc))
    print('Validation Loss: {:.6f} \tValidation Accuracy: {:.2f}'.format(valid_loss, valid_acc))
    print(f"learning_rate: {optimizer.param_groups[0]['lr']:.2E} \tpatience: {_patience}\n")

    # Early stopping
    if _patience <= 0:
        print("Stopping early!")
        break

# Plotting Loss and Accuracy Graphs

In [None]:
# code to plot loss and accuracy
def plot_loss_accuracy(train_loss, train_acc, validation_loss, validation_acc):
    """Plot training vs. validation curves: loss on the left panel, accuracy on the right.

    All four arguments are per-epoch sequences; the x axis is the epoch index
    starting at 0 and its length is taken from `train_loss`.
    """
    epoch_axis = list(range(len(train_loss)))
    fig, axes = plt.subplots(1, 2)

    # (axis, y label, title, (training values, label), (validation values, label))
    panels = (
        (axes[0], 'Loss', 'Epoch vs Loss',
         (train_loss, 'Training Loss'), (validation_loss, 'Validation Loss')),
        (axes[1], 'Accuracy', 'Epoch vs Accuracy',
         (train_acc, 'Training Accuracy'), (validation_acc, 'Validation Accuracy')),
    )
    for ax, y_label, title, train_curve, validation_curve in panels:
        for values, label in (train_curve, validation_curve):
            ax.plot(epoch_axis, values, label=label)
        ax.set_xlabel('Epochs')
        ax.set_ylabel(y_label)
        ax.set_title(title)
        ax.legend()

    fig.set_size_inches(15.5, 5.5)
    
# Plot the per-epoch history collected in `results` by the training loop
plot_loss_accuracy(results['train_losses'], results['train_acc'], results['valid_losses'], results['valid_acc'])

# Prediction on test Data

In [None]:
def pred_step(model, dataloader, device):
    """Run inference over an unlabelled dataloader and return class probabilities.

    Args:
        model: Module called with a dict holding 'input_ids' and 'attention_mask';
            assumed to return logits of shape (batch, num_classes).
        dataloader: Sized iterable whose batches support ``['input_ids']`` and
            ``['attention_mask']`` (no labels).
        device: Device the batch tensors are moved to.

    Returns:
        NumPy array with one row per sample and one column per class; every row
        sums to 1. An empty ``(0, 0)`` array is returned when the dataloader
        yields no batches.
    """
    # Set model to eval mode
    model.eval()
    batch_probs = []

    # Iterate over batches without tracking gradients
    with torch.inference_mode():
        for text_seq in tqdm(dataloader, total=len(dataloader)):
            inputs = {
                'input_ids': text_seq['input_ids'].to(device),
                'attention_mask': text_seq['attention_mask'].to(device),
            }

            z = model(inputs)  # Forward pass

            # Softmax over the class axis turns the logits into probabilities;
            # the per-row argmax is unaffected by this normalisation.
            batch_probs.append(torch.softmax(z, dim=1).cpu().numpy())

    if not batch_probs:
        return np.empty((0, 0), dtype=np.float32)
    return np.concatenate(batch_probs, axis=0)

In [None]:
# Restore the best checkpoint written by the training loop. map_location remaps
# the stored tensors onto the current device, so a checkpoint saved on a GPU
# also loads when only the CPU is available.
model.load_state_dict(torch.load('./Bert-Model-1.pt', map_location=device))

In [None]:
# Per-class scores for every test tweet, in test_dataloader order
preds = pred_step(model, test_dataloader, device)

In [None]:
# Predicted class = column index with the highest score in each row
test_pred = preds.argmax(axis=1)

In [None]:
# Assemble the submission: one row per test id together with its predicted class
df = pd.DataFrame({
    'id': test_df['id'],
    'target': test_pred.astype(int),
})

df.to_csv('submission.csv', index=False)
df