# Install

In [None]:
!pip install transformers
!pip install nlpaug

# Imports

In [2]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
# from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

from tabulate import tabulate
from tqdm import trange
import random
import re

import nlpaug 
import nlpaug.augmenter.word as naw
from nltk.stem import PorterStemmer
from nltk.tag import pos_tag
from nltk.corpus import stopwords
import nltk


In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
import gc
torch.cuda.empty_cache()
gc.collect()

3

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Data read

In [6]:
# Loading the database
url_train = "/content/drive/MyDrive/F1_Claim_Detection_train.csv"
# url_train = "/content/drive/MyDrive/augmented.csv"
df_train = pd.read_csv(url_train)

url_test = "/content/drive/MyDrive/F1_Claim_Detection_test.csv"
df_test = pd.read_csv(url_test)

# Data augmentation

In [7]:
aug = naw.SynonymAug(aug_src='wordnet',aug_max=1)
id = 6985
for i in range(len(df_train)):
  if(df_train["label"].iloc[i] == 0):
    # id += 1
    # tweet_augmented_1 = aug.augment(df_train["tweet"].iloc[i],n=1)
    # df2 = {'tweet': tweet_augmented_1[0], 'label': 0, 'id':id}
    # df_train = df_train.append(df2, ignore_index = True)
    id += 1
    tweet_augmented_2 = aug.augment(df_train["tweet"].iloc[i],n=1)
    df2 = {'tweet': tweet_augmented_2[0], 'label': 0, 'id':id}
    df_train = df_train.append(df2, ignore_index = True)

# Preprocessing

In [8]:
# preprocessing the data

def preprocess(text):
  text = re.sub(r"@\w+\b", " ", text)
  text = re.sub(r"https?:\/\/\w*|\w+\.com\w*", " ", text)
  text = re.sub("<\w*>", " ", text)
  text = re.sub(r"\\n", " ", text)
  text = re.sub("\s+", " ", text)
  return text
df_train["tweet"] = df_train["tweet"].apply(lambda x: preprocess(x))
df_test["tweet"] = df_test["tweet"].apply(lambda x: preprocess(x))

In [9]:
df_train.head()

Unnamed: 0,tweet,label,id
0,rt phin coffe club cure protect coronaviru the...,1,0
1,look like corona viru antiblack iâm okay co cm...,0,1
2,gonna monitor possit ncov viru know gonna say ...,1,2
3,safe distanc cure covid19 part stay away face ...,1,3
4,dose mimosa champagn cocain help get right cov...,0,4


# Model

## Bert Tokenizer

In [10]:
# Initializing the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)

In [11]:
text_train = df_train["tweet"]
text_test = df_test["tweet"]

In [12]:
token_ids = []
attention_masking = []

# encoding
for sent in text_train:
  ed = tokenizer.encode_plus(sent, add_special_tokens = True, max_length = 150, pad_to_max_length = True, return_attention_mask = True, return_tensors = 'pt')
  token_ids.append(ed["input_ids"])
  attention_masking.append(ed["attention_mask"])

token_ids = torch.cat(token_ids, dim = 0)
attention_masking = torch.cat(attention_masking, dim = 0)
labels = torch.tensor(df_train["label"])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


## Setting hyperparameters

In [13]:
batch_size = 100

# train_idc, test_idc = train_test_split(np.arange(len(labels)), test_size = 0.2, shuffle = True, stratify = labels)
train_set = TensorDataset(token_ids[np.arange(len(labels))], attention_masking[np.arange(len(labels))], labels[np.arange(len(labels))])

# test_set = TensorDataset(token_ids[test_idc], attention_masking[test_idc], labels[test_idc])

train_dataloader = DataLoader(train_set, sampler = RandomSampler(train_set), batch_size = batch_size)

# test_dataloader = DataLoader(test_set, sampler = SequentialSampler(test_set), batch_size = batch_size)

In [14]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels = 2, output_attentions = False, output_hidden_states = False,)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
optimizer = torch.optim.AdamW(model.parameters(), lr = 5e-5, eps = 1e-08)
model.cuda()

In [16]:
device = torch.device('cuda')
epochs = 3

## Evaluation Metrics

In [17]:

def b_tp(preds, labels):
  '''Returns True Positives (TP): count of correct predictions of actual class 1'''
  return sum([preds == labels and preds == 1 for preds, labels in zip(preds, labels)])

def b_fp(preds, labels):
  '''Returns False Positives (FP): count of wrong predictions of actual class 1'''
  return sum([preds != labels and preds == 1 for preds, labels in zip(preds, labels)])

def b_tn(preds, labels):
  '''Returns True Negatives (TN): count of correct predictions of actual class 0'''
  return sum([preds == labels and preds == 0 for preds, labels in zip(preds, labels)])

def b_fn(preds, labels):
  '''Returns False Negatives (FN): count of wrong predictions of actual class 0'''
  return sum([preds != labels and preds == 0 for preds, labels in zip(preds, labels)])

def b_metrics(preds, labels):
  '''
  Returns the following metrics:
    - accuracy    = (TP + TN) / N
    - precision   = TP / (TP + FP)
    - recall      = TP / (TP + FN)
    - specificity = TN / (TN + FP)
  '''
  preds = np.argmax(preds, axis = 1).flatten()
  labels = labels.flatten()
  tp = b_tp(preds, labels)
  tn = b_tn(preds, labels)
  fp = b_fp(preds, labels)
  fn = b_fn(preds, labels)
  b_accuracy = (tp + tn) / len(labels)
  b_precision = tp / (tp + fp) if (tp + fp) > 0 else 'nan'
  b_recall = tp / (tp + fn) if (tp + fn) > 0 else 'nan'
  b_specificity = tn / (tn + fp) if (tn + fp) > 0 else 'nan'
  return b_accuracy, b_precision, b_recall, b_specificity

## Training

In [18]:
for _ in trange(epochs, desc = 'Epoch'):
    
    # ========== Training ==========
    
    # Set model to training mode
    model.train()
    
    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        # Forward pass
        train_output = model(b_input_ids,
                             token_type_ids = None, 
                             attention_mask = b_input_mask, 
                             labels = b_labels)
        # Backward pass
        train_output.loss.backward()
        optimizer.step()
        # Update tracking variables
        tr_loss += train_output.loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    # ========== Validation ==========

    # Set model to evaluation mode
    # model.eval()

    # # Tracking variables 
    # val_accuracy = []
    # val_precision = []
    # val_recall = []
    # val_specificity = []

    # for batch in test_dataloader:
    #     batch = tuple(t.to(device) for t in batch)
    #     b_input_ids, b_input_mask, b_labels = batch
    #     with torch.no_grad():
    #       # Forward pass
    #       eval_output = model(b_input_ids, 
    #                           token_type_ids = None,
    #                           attention_mask = b_input_mask)
    #     logits = eval_output.logits.detach().cpu().numpy()
    #     label_ids = b_labels.to('cpu').numpy()
    #     # Calculate validation metrics
    #     b_accuracy, b_precision, b_recall, b_specificity = b_metrics(logits, label_ids)
    #     val_accuracy.append(b_accuracy)
    #     # Update precision only when (tp + fp) !=0; ignore nan
    #     if b_precision != 'nan': val_precision.append(b_precision)
    #     # Update recall only when (tp + fn) !=0; ignore nan
    #     if b_recall != 'nan': val_recall.append(b_recall)
    #     # Update specificity only when (tn + fp) !=0; ignore nan
    #     if b_specificity != 'nan': val_specificity.append(b_specificity)

    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
    # print('\t - Validation Accuracy: {:.4f}'.format(sum(val_accuracy)/len(val_accuracy)))
    # print('\t - Validation Precision: {:.4f}'.format(sum(val_precision)/len(val_precision)) if len(val_precision)>0 else '\t - Validation Precision: NaN')
    # print('\t - Validation Recall: {:.4f}'.format(sum(val_recall)/len(val_recall)) if len(val_recall)>0 else '\t - Validation Recall: NaN')
    # print('\t - Validation Specificity: {:.4f}\n'.format(sum(val_specificity)/len(val_specificity)) if len(val_specificity)>0 else '\t - Validation Specificity: NaN')


Epoch:  33%|███▎      | 1/3 [02:38<05:17, 158.64s/it]


	 - Train loss: 0.3892


Epoch:  67%|██████▋   | 2/3 [05:15<02:37, 157.71s/it]


	 - Train loss: 0.3292


Epoch: 100%|██████████| 3/3 [07:52<00:00, 157.50s/it]


	 - Train loss: 0.2683





## Making predictions

In [19]:
def getPred(sent):


  # We need Token IDs and Attention Mask for inference on the new sentence
  test_ids = []
  test_attention_mask = []

  # Apply the tokenizer
  encoding = tokenizer.encode_plus(sent, add_special_tokens = True, max_length = 150, pad_to_max_length = True, return_attention_mask = True, return_tensors = 'pt')

  # Extract IDs and Attention Mask
  test_ids.append(encoding['input_ids'])
  test_attention_mask.append(encoding['attention_mask'])
  test_ids = torch.cat(test_ids, dim = 0)
  test_attention_mask = torch.cat(test_attention_mask, dim = 0)

  # Forward pass, calculate logit predictions
  with torch.no_grad():
    output = model(test_ids.to(device), attention_mask = test_attention_mask.to(device))

  return np.argmax(output.logits.cpu().numpy()).flatten().item()


In [20]:
preds = []
for sent in df_test["tweet"]:
  preds.append(getPred(sent))

In [21]:
print(preds.count(0))

151


# Generating output on test set

In [22]:
df_out = pd.DataFrame()
df_out["label"] =  preds

df_out['id'] = df_out.index

In [23]:
df_out.to_csv('output.csv')

# Reference

In [None]:
# The blog followed was https://towardsdatascience.com/fine-tuning-bert-for-text-classification-54e7df642894