In [None]:
# Colab-only setup: mount Google Drive at /content/drive so that the dataset and
# checkpoint paths under /content/drive/MyDrive used later in this notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# !pip install lightning

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from torch.nn.utils.rnn import pad_sequence
from transformers import RobertaTokenizerFast
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import transformers
from tqdm.autonotebook import tqdm
import torch.nn as nn

In [None]:
# Training split (train-en.json) as a DataFrame; later cells read its
# 'claims' and 'text_tokens' columns.
train_en = pd.read_json('/content/drive/MyDrive/ICPR24-CSI/data/train-en.json')

In [None]:
# Validation split (val-en.json) as a DataFrame, loaded the same way as the training split.
val_en = pd.read_json('/content/drive/MyDrive/ICPR24-CSI/data/val-en.json')

In [None]:
# Inspect the claim annotations of example 9: a list of dicts with
# 'index', 'start', 'end' and 'terms' keys (see the output below).
train_en['claims'][9]

[{'index': 0,
  'start': 26,
  'end': 33,
  'terms': 'imprisonment to prove living in abject fear'},
 {'index': 1, 'start': 41, 'end': 44, 'terms': 'most expensive vaccine'}]

In [None]:
# Word-level tokens of the same example. Claim 'start'/'end' index into this list
# with an exclusive end: tokens 26..32 spell the first claim's 'terms' (start=26, end=33).
train_en['text_tokens'][9]

['COVID',
 '19',
 'mortality',
 'Stats',
 'have',
 'been',
 'hammered',
 'into',
 'our',
 'heads',
 '24',
 '/',
 '7',
 'by',
 'MSM',
 'Every',
 'day',
 'they',
 'report',
 'hard',
 '"',
 'facts',
 '"',
 'to',
 'justify',
 'our',
 'imprisonment',
 'to',
 'prove',
 'living',
 'in',
 'abject',
 'fear',
 'is',
 'the',
 'only',
 'sensible',
 'reaction',
 'and',
 'only',
 'the',
 'most',
 'expensive',
 'vaccine',
 'ever',
 'devised',
 'can',
 'possibly',
 'save',
 'us',
 'Please',
 'Wake',
 'Up']

In [None]:
# Fast RoBERTa tokenizer. add_prefix_space=True is passed because the inputs are
# already split into words (the tokenizer is later called with is_split_into_words=True).
# NOTE(review): this loads the roberta-base tokenizer while TweetModel2 loads
# roberta-large weights — presumably both checkpoints share one vocabulary; confirm.
from transformers import RobertaTokenizerFast
tokenizer = RobertaTokenizerFast.from_pretrained("FacebookAI/roberta-base", add_prefix_space=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
def assign_labels(example):
  """Build the word-level 0/1 claim mask for one example.

  Args:
    example: mapping-like row exposing ``example['claims']`` (iterable of dicts
      with integer 'start' and 'end') and ``example['text_tokens']`` (word list).

  Returns:
    Float numpy array with one entry per word: 1 for every word index in
    ``range(start, end)`` of any claim (``end`` is exclusive), 0 elsewhere.

  Raises:
    IndexError: if a claim span reaches past the end of the token list.
  """
  claim_spans = example['claims']
  word_count = len(example['text_tokens'])
  mask = np.zeros(word_count)
  for span in claim_spans:
    # Integer index array covering [start, end); empty when start >= end.
    covered = np.arange(span['start'], span['end'])
    mask[covered] = 1
  return mask

In [None]:
def allign_labels_with_tokens(labels,word_ids):
  """Expand word-level labels to sub-token level.

  Args:
    labels: indexable sequence of per-word labels.
    word_ids: for each sub-token, the index of the word it came from, or None
      for positions that belong to no word.

  Returns:
    List with one label per sub-token: ``labels[word_id]`` for real words and
    -100 wherever ``word_id`` is None.
  """
  return [
    -100 if source_word is None else labels[source_word]
    for source_word in word_ids
  ]

In [None]:
def pad_labels(Labels, padding_value = -100):
  """Right-pad every label sequence to the length of the longest one.

  Args:
    Labels: iterable of label sequences (lists, tuples, ...).
    padding_value: value appended to the shorter sequences (default -100).

  Returns:
    A new list of lists, all of equal length. An empty ``Labels`` yields ``[]``.
  """
  # default=0 keeps max() from raising ValueError when there are no sequences.
  max_length = max((len(label_at_idx) for label_at_idx in Labels), default=0)

  # list(...) copies each row, so non-list sequences are padded by concatenation
  # rather than failing (tuple) or being added element-wise (numpy array).
  return [
    list(label_at_idx) + [padding_value] * (max_length - len(label_at_idx))
    for label_at_idx in Labels
  ]

In [None]:
def preprocess(train_set):
  """Tokenise a split and attach sub-token level claim labels.

  Args:
    train_set: DataFrame with a 'text_tokens' column (pre-split words) and the
      columns read by ``assign_labels``.

  Returns:
    The tokenizer's batch encoding (padded, PyTorch tensors) with an extra
    'targets' entry: one list of labels per example, -100 at positions that
    map to no word.
  """
  encoding = tokenizer(train_set['text_tokens'].tolist(), is_split_into_words = True, padding = True, return_tensors ='pt',)
  new_labels = []
  # iterrows() yields index *labels*; encoding[...] needs the row's *position*
  # in the tokenised batch. The two only coincide for a 0..n-1 index, so the
  # position is taken from enumerate() instead.
  for position, (_, example) in enumerate(train_set.iterrows()):
    labels = assign_labels(example)
    word_ids = encoding[position].word_ids
    new_labels.append(allign_labels_with_tokens(labels,word_ids))
  padded_labels = pad_labels(new_labels)
  encoding['targets'] = padded_labels
  return encoding

In [None]:
### After preprocess(), the encoding holds {input_ids: ..., attention_mask: ..., targets: ...}, where targets are the per-token labels for each example.

In [None]:
class Train_dataset(Dataset):
    """Map-style dataset over a split that is tokenised and labelled at construction time.

    Each item is the triple ``(input_ids, attention_mask, labels)`` for one
    example, where ``labels`` is a fresh tensor built from that example's
    target list.
    """

    def __init__(self, training_dataset):
        # The whole split is preprocessed once, up front.
        encoded = preprocess(training_dataset)
        self.encodings = encoded
        self.input_ids = encoded['input_ids']
        self.attention_mask = encoded['attention_mask']
        self.targets = encoded['targets']

    def __len__(self):
        # One item per row of the input_ids batch.
        return len(self.input_ids)

    def __getitem__(self, idx):
        label_tensor = torch.tensor(self.targets[idx])
        return self.input_ids[idx], self.attention_mask[idx], label_tensor

In [None]:
# Tokenise and label both splits up front: each Train_dataset runs preprocess()
# on the DataFrame it is given.
train_dataset = Train_dataset(train_en)
val_dataset = Train_dataset(val_en)

In [None]:
# Sanity check: one item is the (input_ids, attention_mask, labels) triple for example 9.
train_dataset[9]

(tensor([    0,  6247, 43814,   753, 15812, 29232,    33,    57, 22355,    88,
            84,  3885,   706,  1589,   262,    30, 43596,  4337,   183,    51,
           266,   543,    22,  4905,    22,     7, 11071,    84, 14804,     7,
          3364,  1207,    11,  4091, 21517,  2490,    16,     5,   129, 19653,
          4289,     8,   129,     5,   144,  3214,  9937,   655, 28921,    64,
          3544,  1871,   201,  3401, 11601,  3105,     2,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,  

In [None]:
# Training batches are reshuffled every epoch so that updates do not always see
# the examples in file order. The validation loader keeps a fixed order.
train_dataloader = DataLoader(train_dataset, batch_size = 8, shuffle = True)
val_dataloader = DataLoader(val_dataset, batch_size = 16, shuffle = False)

In [None]:
class config:
  """Namespace of experiment constants, read as ``config.NAME``."""

  # --- Reproducibility / cross-validation ---
  SEED = 25
  N_FOLDS = 5

  # --- Optimisation ---
  EPOCHS = 4
  LEARNING_RATE = 3e-5
  WARMUP_RATIO = 0.25
  WEIGHT_DECAY = 0.001

  # --- Early stopping (both unset) ---
  PATIENCE = None
  EARLY_STOPPING_DELTA = None

  # --- Batching / sequence length ---
  TRAIN_BATCH_SIZE = 32
  VALID_BATCH_SIZE = 32
  MAX_LEN = 96  # actually = 86

  # --- Model ---
  HIDDEN_SIZE = 1024
  N_LAST_HIDDEN = 24
  HIGH_DROPOUT = 0.6
  SOFT_ALPHA = 0.4

  # --- Stochastic weight averaging ---
  USE_SWA = False
  SWA_RATIO = 0.9
  SWA_FREQ = 30

In [None]:
## Remember: when defining the (binary) cross-entropy loss function, we have to set ignore_index = -100.

In [None]:
# --> Prospects of Using Question_answering Pipeline.
# --> Explore RobertaForTokenClassification (with custom labels,config)
# --> (wrap in pytorch_lightning)
# --> Token-Level VS Character-Level.
# --> Validation: Jaccard and Individual token-level metrics
# --> Callbacks: Early Stopping & Checkpoints
# -->

In [None]:
class TweetModel2(transformers.BertPreTrainedModel):
    """Token classifier: pretrained RoBERTa-large encoder, dropout, 2-way linear head.

    ``forward`` returns one pair of logits per input token, shape
    (batch_size, num_tokens, 2).

    Args:
        conf: configuration object handed to the ``PreTrainedModel`` base class.
            The encoder itself is always built from the
            'FacebookAI/roberta-large' checkpoint, independently of ``conf``.
    """

    def __init__(self, conf):
        super(TweetModel2, self).__init__(conf)
        # Resolved through the already-imported `transformers` module, so the class
        # does not depend on a later cell importing the bare name `RobertaModel`.
        # Only the last hidden state is consumed in forward(), so the encoder is
        # not asked to return every layer's hidden states.
        self.roberta = transformers.RobertaModel.from_pretrained('FacebookAI/roberta-large', add_pooling_layer=False)
        # Dropout probability is hard-coded; config.HIGH_DROPOUT is not used here.
        self.high_dropout = torch.nn.Dropout(0.3) #config.HIGH_DROPOUT
        # Size the head from the encoder that was actually loaded.
        self.classifier = torch.nn.Linear(self.roberta.config.hidden_size, 2)

    def forward(self, ids, attention_mask):
        # last_hidden_state: (batch_size, num_tokens, hidden_size)
        hidden = self.roberta(ids, attention_mask=attention_mask).last_hidden_state
        # Per-token logits: (batch_size, num_tokens, 2)
        return self.classifier(self.high_dropout(hidden))

In [None]:
# sample = train_en['text_tokens'][5:9].tolist()
# encoding = tokenizer(sample ,return_tensors='pt', padding = True,is_split_into_words=True)

# output = model(encoding['input_ids'],encoding['attention_mask'])

In [None]:
# del model

In [None]:
# output.transpose(1,2)[:,0,:].shape

In [None]:
from sklearn.metrics import jaccard_score

def get_pad_idxs(labels):
  """Return the positions in ``labels`` whose value equals -100 (ignored positions)."""
  return [position for position, value in enumerate(labels) if value == -100]

def padded_prediction(pad_idx,preds):
    """Overwrite ``preds`` with -100 at every position listed in ``pad_idx``.

    The sequence is modified in place and the same object is returned.
    """
    masked = preds  # same object: the caller's sequence is mutated
    for position in pad_idx:
        masked[position] = -100
    return masked

def calculate_jaccard_score(labels, probs, threshold):
    """Jaccard score of thresholded predictions against the labels, ignoring -100 positions.

    Args:
        labels: sequence of per-example label sequences; -100 marks positions to skip.
        probs: per-example positive-class probabilities, indexed like ``labels``.
        threshold: a probability >= threshold becomes prediction 1, otherwise 0.

    Returns:
        ``jaccard_score`` computed once over the concatenation of all kept
        positions of all examples (``zero_division=1.0``).
    """
    gold_flat = []
    pred_flat = []

    for row, gold_row in enumerate(labels):
        # Binarise the probabilities of this example.
        hard_preds = [int(prob >= threshold) for prob in probs[row]]

        # Stamp -100 onto predictions wherever the label is -100 ...
        masked_preds = padded_prediction(get_pad_idxs(gold_row), hard_preds)

        # ... then drop those positions from both sides, since jaccard_score is
        # fed binary labels only.
        gold_flat.extend(value for value in gold_row if value != -100)
        pred_flat.extend(value for value in masked_preds if value != -100)

    return jaccard_score(gold_flat, pred_flat, zero_division=1.0)

def get_best_threshold(labels, probs, thresholds=np.linspace(0, 1, 101)):
    """Score predictions at a single fixed decision threshold.

    Despite the name, no threshold search is performed: the Jaccard score at a
    fixed threshold of 0.32 is returned.

    Args:
        labels: per-example label sequences, passed to ``calculate_jaccard_score``.
        probs: per-example positive-class probabilities.
        thresholds: accepted for interface compatibility only; currently ignored.

    Returns:
        The Jaccard score (a single number), not a (threshold, score) pair.
    """
    fixed_threshold = 0.32
    return calculate_jaccard_score(labels, probs, fixed_threshold)

In [None]:
def loss_function(true_labels, pred_logits):
  """Mean token-level cross-entropy, skipping positions labelled -100.

  Args:
    true_labels: integer class ids, one per token; -100 marks ignored positions.
    pred_logits: logits with the class scores on the last dimension.

  Returns:
    Scalar loss tensor averaged over the non-ignored tokens.
  """
  # Flatten to (all tokens, num_classes); the class count comes from the logits
  # themselves instead of being fixed at 2.
  num_classes = pred_logits.size(-1)
  loss_fn = nn.CrossEntropyLoss(ignore_index = -100, reduction='mean')
  return loss_fn(pred_logits.view(-1, num_classes), true_labels.view(-1))

In [None]:
def evaluate(model, val_dataloader,device):
  """Per-batch Jaccard scores (and thresholds) of ``model`` over a dataloader.

  NOTE: a later cell in this notebook defines ``evaluate`` again; that later
  definition is the one in effect after a top-to-bottom run.

  Args:
    model: callable as ``model(input_ids, attention_mask)`` returning per-token logits.
    val_dataloader: yields ``(input_ids, attention_mask, labels)`` batches.
    device: device the batch tensors are moved to.

  Returns:
    ``(jaccard_scores, thresholds)``: two float arrays with one entry per batch.
    A threshold entry is NaN when ``get_best_threshold`` reports only a score.
  """
  jaccard_scores = []
  thresholds = []

  # Evaluate with dropout off and without building autograd graphs; every
  # module's previous train/eval flag is restored afterwards.
  previous_modes = [(module, module.training) for module in model.modules()]
  model.eval()
  try:
    with torch.no_grad():
      for batch in tqdm(val_dataloader, total = len(val_dataloader)):

        input_ids = batch[0].to(device , dtype = torch.int)

        attention_mask = batch[1].to(device, dtype = torch.float16)

        labels = batch[2].to(device, dtype = torch.float16)

        logits = model(input_ids,attention_mask)

        # Probability of the positive class for every token.
        probs = F.softmax(logits,dim=2)[:,:,1]

        result = get_best_threshold(labels.detach().cpu().numpy(),probs.detach().cpu().numpy())

        # get_best_threshold may return (threshold, score) or just the score.
        if isinstance(result, tuple):
          best_threshold, best_score = result
        else:
          best_threshold, best_score = np.nan, result

        jaccard_scores.append(best_score)
        thresholds.append(best_threshold)
  finally:
    for module, was_training in previous_modes:
      module.training = was_training

  return np.array(jaccard_scores, dtype=float), np.array(thresholds, dtype=float)

In [None]:
def evaluate(model, val_dataloader,device):
  """Per-batch Jaccard scores of ``model`` over a dataloader.

  Args:
    model: callable as ``model(input_ids, attention_mask)`` returning per-token logits.
    val_dataloader: yields ``(input_ids, attention_mask, labels)`` batches.
    device: device the batch tensors are moved to.

  Returns:
    Float array with one Jaccard score per batch, as returned by
    ``get_best_threshold`` for that batch.
  """
  jaccard_scores = []

  # Evaluate with dropout off and without building autograd graphs; every
  # module's previous train/eval flag is restored afterwards.
  previous_modes = [(module, module.training) for module in model.modules()]
  model.eval()
  try:
    with torch.no_grad():
      for batch in tqdm(val_dataloader, total = len(val_dataloader)):

        input_ids = batch[0].to(device , dtype = torch.int)

        attention_mask = batch[1].to(device, dtype = torch.float16)

        labels = batch[2].to(device, dtype = torch.float16)

        logits = model(input_ids,attention_mask)

        # Probability of the positive class for every token.
        probs = F.softmax(logits,dim=2)[:,:,1]

        best_score = get_best_threshold(labels.detach().cpu().numpy(),probs.detach().cpu().numpy())

        jaccard_scores.append(best_score)
  finally:
    for module, was_training in previous_modes:
      module.training = was_training

  return np.array(jaccard_scores, dtype=float)

In [None]:
def training_loop_for_model_2(model, train_dataloader, val_dataloader,loss_function, optimizer,epochs,early_stopping = None):
  """Fine-tune ``model`` for ``epochs`` passes over ``train_dataloader``.

  Args:
    model: callable as ``model(input_ids, attention_mask)`` returning per-token
      logits; expected to already live on the device selected below.
    train_dataloader: yields ``(input_ids, attention_mask, labels)`` batches.
    val_dataloader: unused by this definition (no per-epoch evaluation is run).
    loss_function: called as ``loss_function(labels, logits)``.
    optimizer: optimizer over the model's parameters.
    epochs: number of passes over the training data.
    early_stopping: unused.

  Returns:
    Float array with the loss of every optimisation step, in order.
  """
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
  losses = []

  model.train()

  for epoch in range(epochs):
    tk0 = tqdm(train_dataloader, total = len(train_dataloader))
    for batch in tk0:

      input_ids = batch[0].to(device, dtype = torch.int)

      attention_mask = batch[1].to(device)

      # The loss expects integer class ids.
      labels = batch[2].to(device).long()

      logits = model(input_ids,attention_mask).float()
      loss = loss_function(labels, logits)

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      losses.append(loss.item())
      # Drop the batch tensors before the next forward pass.
      del input_ids,attention_mask,labels,logits

  return np.array(losses, dtype=float)

In [None]:
from transformers import RobertaConfig, RobertaModel
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Default RoBERTa configuration, needed only because TweetModel2's base class
# takes a config object in its constructor.
configuration = RobertaConfig()

# Build the model, then overwrite its weights with the fine-tuned checkpoint from Drive.
model = TweetModel2(configuration)
# map_location lets a checkpoint that was saved from a GPU load on a CPU-only runtime.
# NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
model.load_state_dict(torch.load('/content/drive/MyDrive/model-1 (2).pth', map_location=device))
model.to(device)

TweetModel2(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNo

In [None]:
# AdamW over all model parameters. The learning rate is hard-coded to 1e-5 here;
# config.LEARNING_RATE (3e-5) and config.WEIGHT_DECAY are not used.
optimizer = torch.optim.AdamW(model.parameters(),lr=1e-5)

In [None]:
# del model

In [None]:
# Fine-tune for 7 epochs with the training-loop definition above.
results = training_loop_for_model_2(model,train_dataloader,val_dataloader,loss_function,optimizer,7)

  0%|          | 0/750 [00:00<?, ?it/s]

  0%|          | 0/750 [00:00<?, ?it/s]

  0%|          | 0/750 [00:00<?, ?it/s]

  0%|          | 0/750 [00:00<?, ?it/s]

  0%|          | 0/750 [00:00<?, ?it/s]

  0%|          | 0/750 [00:00<?, ?it/s]

  0%|          | 0/750 [00:00<?, ?it/s]

  0%|          | 0/750 [00:00<?, ?it/s]

In [None]:
# Continue fine-tuning the same model with the same optimizer for 2 more epochs.
results = training_loop_for_model_2(model,train_dataloader,val_dataloader,loss_function,optimizer,2)

  0%|          | 0/750 [00:00<?, ?it/s]

  0%|          | 0/750 [00:00<?, ?it/s]

In [None]:
# Save the current weights as 'model-1.pth' in the working directory.
# NOTE(review): the load cell reads '/content/drive/MyDrive/model-1 (2).pth', a
# different path — the file presumably has to be copied to Drive by hand; confirm.
torch.save(model.state_dict(),'model-1.pth')

In [None]:
## Jaccard Score on Training_set

In [None]:
def training_loop_for_model_2(model, train_dataloader, val_dataloader,loss_function, optimizer,epochs,early_stopping = None):
  """Evaluation-only redefinition of the training loop: no training is performed.

  This definition replaces the earlier ``training_loop_for_model_2`` and keeps
  its signature so the existing call sites still work.

  Args:
    model: model to evaluate.
    val_dataloader: the loader that is actually scored. Pass the training loader
      here to score the training set.
    train_dataloader, loss_function, optimizer, epochs, early_stopping: unused.

  Returns:
    Whatever ``evaluate(model, val_dataloader, device)`` returns.
  """
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
  return evaluate(model, val_dataloader, device)

In [None]:
# Score the model on the training set: train_dataloader is passed in the
# val_dataloader slot, the only loader the evaluation-only definition above reads.
# NOTE(review): the stored output below ("Best_jaacard_score ..., Threshold ...")
# does not come from the current definition, which prints nothing — presumably it
# was produced by an earlier revision; re-run to refresh.
results = training_loop_for_model_2(model,train_dataloader,train_dataloader,loss_function,optimizer,3)

  0%|          | 0/750 [00:00<?, ?it/s]

Best_jaacard_score: 0.934325452753929, Threshold: 0.4744266666666667


In [None]:
# Score the model on the validation set (val_dataloader is the loader that is evaluated).
eval_results = training_loop_for_model_2(model,train_dataloader,val_dataloader,loss_function,optimizer,3) ## model-1(2).pth

  0%|          | 0/63 [00:00<?, ?it/s]

Best_jaacard_score: 0.6220488943650151, Threshold: 0.3247619047619049


In [None]:
# Training-set score again, with train_dataloader in the val_dataloader slot.
# NOTE(review): same call as the earlier training-set cell but with a different stored
# output — presumably run against a different checkpoint or code revision; confirm.
results = training_loop_for_model_2(model,train_dataloader,train_dataloader,loss_function,optimizer,3)

  0%|          | 0/750 [00:00<?, ?it/s]

Best_jaacard_score: 0.9471604454412902, Threshold: 0.61392


In [None]:
# Validation-set score again.
# NOTE(review): same call as the earlier validation cell but with a different stored
# output — presumably run against a different checkpoint or code revision; confirm.
eval_results = training_loop_for_model_2(model,train_dataloader,val_dataloader,loss_function,optimizer,3)

  0%|          | 0/63 [00:00<?, ?it/s]

Best_jaacard_score: 0.6182127549243078, Threshold: 0.30476190476190473
