In [None]:
!pip install tweet-preprocessor



In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import tensorflow as tf

# Abort early unless TensorFlow reports the first GPU; otherwise announce it.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  raise SystemError('GPU device not found')


Found GPU at: /device:GPU:0


In [None]:
!pip install pytorch-transformers
!pip install transformers



In [None]:
import torch
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import preprocessor as p

from transformers import XLNetTokenizer, XLNetForSequenceClassification, DebertaTokenizer, DebertaForSequenceClassification, RobertaTokenizerFast, RobertaForSequenceClassification, ElectraTokenizer, ElectraForSequenceClassification
from transformers import AdamW

from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline


In [None]:
# Pick the device that batches are moved to in get_model_preds_labels:
# CUDA when available, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Number of visible CUDA devices (not read by any other cell shown in this notebook).
n_gpu = torch.cuda.device_count()
# Display the name of CUDA device 0 as the cell output.
# NOTE(review): this queries device 0 even when the CPU fallback above was chosen;
# presumably it fails on a machine without a GPU - confirm.
torch.cuda.get_device_name(0)


'Tesla P100-PCIE-16GB'

In [None]:
# Load the train / validation / test splits. Paths are relative to the notebook's
# working directory. The train preview shows columns id, tweet, label; the test
# preview shows only id, tweet (no label).
df = pd.read_csv("../data/Constraint_Train.csv")
val_df = pd.read_csv("../data/Constraint_Val.csv")
test_df = pd.read_csv("../data/Constraint_Test.csv")


In [None]:
val_df.shape

(2140, 3)

In [None]:
def preprocess(row):
  """Return the row's ``tweet`` text after running it through ``p.clean``.

  ``row`` is anything indexable by ``'tweet'`` (e.g. a DataFrame row).
  """
  # Disabled steps kept for reference:
  #   text = text.strip('\xa0')
  #   text = re.sub(r'\([0-9]+\)', '', text).strip()
  cleaned_text = p.clean(row['tweet'])
  return cleaned_text


In [None]:
# df['tweet'] = df.apply(lambda x: preprocess(x), 1)
# val_df['tweet'] = val_df.apply(lambda x: preprocess(x), 1)
# test_df['tweet'] = test_df.apply(lambda x: preprocess(x), 1)


In [None]:
df.head()

Unnamed: 0,id,tweet,label
0,1,The CDC currently reports 99031 deaths. In gen...,real
1,2,States reported 1121 deaths a small rise from ...,real
2,3,Politically Correct Woman (Almost) Uses Pandem...,fake
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,real
4,5,Populous states can generate large case counts...,real


In [None]:
df.iloc[1, 1]

'States reported 1121 deaths a small rise from last Tuesday. Southern states reported 640 of those deaths. https://t.co/YASGRTT4ux'

In [None]:
test_df.head(2)

Unnamed: 0,id,tweet
0,1,Our daily update is published. States reported...
1,2,Alfalfa is the only cure for COVID-19.


In [None]:
def map_label(row):
  """Encode a row's ``label`` as an integer: 0 for 'real', 1 for any other value."""
  if row['label'] == 'real':
    return 0
  return 1

# Add the integer target column to the labelled splits (row-wise apply, axis=1).
df['label_encoded'] = df.apply(map_label, axis=1)
val_df['label_encoded'] = val_df.apply(map_label, axis=1)
# The test split has no 'label' column, so it is not encoded:
# test_df['label_encoded'] = test_df.apply(lambda x: map_label(x), 1)


In [None]:
# Pull the raw tweet texts and the per-row `id` column out of each split.
# Despite the name, the *_token_ids arrays hold the dataset's `id` values (one per
# tweet), not tokenizer ids; get_dataloader packs them into every batch so that
# predictions can later be joined back together on 'id'.
train_sentences = df.tweet.values
train_token_ids = df.id.values
val_sentences = val_df.tweet.values
val_token_ids = val_df.id.values
test_sentences = test_df.tweet.values
test_token_ids = test_df.id.values


In [None]:
# Append the literal text " [SEP] [CLS]" to every tweet and grab the encoded labels.
# NOTE(review): the suffix is plain text, not tokenizer-specific special tokens. The
# tokenizer outputs printed below show the XLNet, RoBERTa and XLM-R tokenizers
# splitting it into ordinary sub-word pieces, while the DeBERTa output keeps '[SEP]'
# and '[CLS]' whole. Presumably the saved checkpoints were fine-tuned on exactly this
# input format - confirm before changing it.
# This cell is not idempotent: re-running it appends the suffix a second time.
train_sentences = [sentence + " [SEP] [CLS]" for sentence in train_sentences]
train_labels = df.label_encoded.values
val_sentences = [sentence + " [SEP] [CLS]" for sentence in val_sentences]
val_labels = val_df.label_encoded.values
test_sentences = [sentence + " [SEP] [CLS]" for sentence in test_sentences]


In [None]:
# Globals read inside get_dataloader:
# MAX_LEN    - length every token-id sequence is truncated / padded to.
# batch_size - batch size used for all three DataLoaders.
MAX_LEN = 128
batch_size = 64


In [None]:
def _tokenize_sentences(tokenizer, sentences):
  """Split every sentence into the tokenizer's sub-word tokens."""
  return [tokenizer.tokenize(sent) for sent in sentences]


def _encode_tokens(tokenizer, tokenized_texts):
  """Convert token lists into padded id and attention-mask tensors of width MAX_LEN."""
  ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_texts]
  # Truncate at the end / right-pad with 0 so every row has exactly MAX_LEN ids.
  ids = pad_sequences(ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
  # 1.0 where the id is positive, 0.0 elsewhere. This masks the zero padding and
  # also any real token whose id happens to be 0.
  masks = (ids > 0).astype(float)
  return torch.tensor(ids), torch.tensor(masks)


def _make_dataloader(tensors, shuffle):
  """Wrap tensors in a TensorDataset and return a DataLoader using the global batch_size."""
  dataset = TensorDataset(*tensors)
  sampler = RandomSampler(dataset) if shuffle else SequentialSampler(dataset)
  return DataLoader(dataset, sampler=sampler, batch_size=batch_size)


def get_dataloader(network, train_sentences, val_sentences, train_labels, val_labels, test_sentences, train_token_ids, val_token_ids, test_token_ids):
  """Tokenize the three splits for one network and wrap them in DataLoaders.

  Parameters
  ----------
  network : str
      One of "XLNET", "ROBERT", "XLM-ROBERT", "DeBERTa", "Electra". Any other
      value raises KeyError when the checkpoint name is looked up.
  train_sentences, val_sentences, test_sentences : sequence of str
      Texts to tokenize.
  train_labels, val_labels : array-like
      Integer labels for the train / validation texts.
  train_token_ids, val_token_ids, test_token_ids : array-like
      Per-example identifiers stored as the first tensor of every batch.

  Returns
  -------
  (train_dataloader, validation_dataloader, test_dataloader)
      Train batches are (ids, input_ids, masks, labels) drawn in random order;
      validation batches have the same layout in sequential order; test batches
      are (ids, input_ids, masks) in sequential order.

  Reads the module-level MAX_LEN and batch_size. Prints the tokenization of the
  first training sentence as a sanity check.
  """
  checkpoints = {"XLNET": "xlnet-base-cased", "ROBERT": "roberta-base", "XLM-ROBERT": "xlm-roberta-base", "DeBERTa": "microsoft/deberta-base", "Electra": "google/electra-base-discriminator"}
  # Networks without a dedicated entry use the fast RoBERTa tokenizer.
  tokenizer_classes = {"XLNET": XLNetTokenizer, "DeBERTa": DebertaTokenizer, "Electra": ElectraTokenizer}
  tokenizer_cls = tokenizer_classes.get(network, RobertaTokenizerFast)
  tokenizer = tokenizer_cls.from_pretrained(checkpoints[network], do_lower_case=True)

  tokenized_train_texts = _tokenize_sentences(tokenizer, train_sentences)
  print ("Tokenize the first sentence:")
  print (tokenized_train_texts[0])

  train_inputs, train_masks = _encode_tokens(tokenizer, tokenized_train_texts)
  validation_inputs, validation_masks = _encode_tokens(tokenizer, _tokenize_sentences(tokenizer, val_sentences))
  test_inputs, test_masks = _encode_tokens(tokenizer, _tokenize_sentences(tokenizer, test_sentences))

  train_labels = torch.tensor(train_labels)
  validation_labels = torch.tensor(val_labels)

  # Only the training split is shuffled; validation and test keep their input order.
  train_dataloader = _make_dataloader(
      (torch.tensor(train_token_ids), train_inputs, train_masks, train_labels), shuffle=True)
  validation_dataloader = _make_dataloader(
      (torch.tensor(val_token_ids), validation_inputs, validation_masks, validation_labels), shuffle=False)
  test_dataloader = _make_dataloader(
      (torch.tensor(test_token_ids), test_inputs, test_masks), shuffle=False)

  return train_dataloader, validation_dataloader, test_dataloader


In [None]:
train_dataloader_XLNET, val_dataloader_XLNET, test_dataloader_XLNET = get_dataloader("XLNET", train_sentences, val_sentences, train_labels, val_labels, test_sentences, train_token_ids, val_token_ids, test_token_ids)


Tokenize the first sentence:
['▁the', '▁c', 'd', 'c', '▁currently', '▁reports', '▁9', '90', '31', '▁deaths', '.', '▁in', '▁general', '▁the', '▁discrepancies', '▁in', '▁death', '▁counts', '▁between', '▁different', '▁sources', '▁are', '▁small', '▁and', '▁', 'exp', 'lic', 'able', '.', '▁the', '▁death', '▁toll', '▁stands', '▁at', '▁roughly', '▁100', '000', '▁people', '▁today', '.', '▁[', 's', 'ep', ']', '▁[', 'cl', 's', ']']


In [None]:
train_dataloader_ROBERT, val_dataloader_ROBERT, test_dataloader_ROBERT = get_dataloader("ROBERT", train_sentences, val_sentences, train_labels, val_labels, test_sentences, train_token_ids, val_token_ids, test_token_ids)


Tokenize the first sentence:
['The', 'ĠCDC', 'Ġcurrently', 'Ġreports', 'Ġ99', '031', 'Ġdeaths', '.', 'ĠIn', 'Ġgeneral', 'Ġthe', 'Ġdiscrepancies', 'Ġin', 'Ġdeath', 'Ġcounts', 'Ġbetween', 'Ġdifferent', 'Ġsources', 'Ġare', 'Ġsmall', 'Ġand', 'Ġexpl', 'icable', '.', 'ĠThe', 'Ġdeath', 'Ġtoll', 'Ġstands', 'Ġat', 'Ġroughly', 'Ġ100', '000', 'Ġpeople', 'Ġtoday', '.', 'Ġ[', 'SE', 'P', ']', 'Ġ[', 'CL', 'S', ']']


In [None]:
train_dataloader_XLM_ROBERT, val_dataloader_XLM_ROBERT, test_dataloader_XLM_ROBERT = get_dataloader("XLM-ROBERT", train_sentences, val_sentences, train_labels, val_labels, test_sentences, train_token_ids, val_token_ids, test_token_ids)


Tokenize the first sentence:
['▁The', '▁CD', 'C', '▁currently', '▁reports', '▁', '990', '31', '▁death', 's', '.', '▁In', '▁general', '▁the', '▁disc', 're', 'pa', 'ncies', '▁in', '▁death', '▁count', 's', '▁between', '▁different', '▁sources', '▁are', '▁small', '▁and', '▁explica', 'ble', '.', '▁The', '▁death', '▁toll', '▁stand', 's', '▁at', '▁rough', 'ly', '▁10', '0000', '▁people', '▁today', '.', '▁[', 'S', 'EP', ']', '▁[', 'C', 'LS', ']']


In [None]:
train_dataloader_DeBERTa, val_dataloader_DeBERTa, test_dataloader_DeBERTa = get_dataloader("DeBERTa", train_sentences, val_sentences, train_labels, val_labels, test_sentences, train_token_ids, val_token_ids, test_token_ids)


Tokenize the first sentence:
['1169', '269', '17896', '3058', '3136', '7388', '43637', '7040', '13', '287', '2276', '262', '42420', '287', '1918', '9853', '1022', '1180', '4237', '389', '1402', '290', '1193', '18424', '13', '262', '1918', '13592', '6296', '379', '7323', '1802', '830', '661', '1909', '13', '[SEP]', '[CLS]']


In [None]:
# train_dataloader_Electra, val_dataloader_Electra, test_dataloader_Electra = get_dataloader("Electra", train_sentences, val_sentences, train_labels, val_labels, test_sentences, train_token_ids, val_token_ids, test_token_ids)


In [None]:
# model1: build a 2-label sequence classifier from "xlm-roberta-base", then replace
# its weights with the fine-tuned checkpoint stored on Drive.
# NOTE(review): the XLM-R checkpoint is instantiated through the RoBERTa class; the
# output below shows it loads - confirm this matches how the checkpoint was trained.
model1 = RobertaForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels=2)
directory_path = '/content/drive/MyDrive/Constraint/models/Best models'
# torch.load unpickles the file - only load checkpoints from a trusted source.
model1.load_state_dict(torch.load(directory_path+'/XLM-ROBERTa_base_preprocess_link_v1.ckpt'))
# Inference mode (disables dropout), then move the model to the GPU.
model1.eval()
model1.cuda()


Some weights of the model checkpoint at xlm-roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'c

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (Layer

In [None]:
# model2: build a 2-label sequence classifier from "roberta-base", then replace its
# weights with the fine-tuned checkpoint stored on Drive.
model2 = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
directory_path = '/content/drive/MyDrive/Constraint/models/Best models'
# torch.load unpickles the file - only load checkpoints from a trusted source.
model2.load_state_dict(torch.load(directory_path+'/ROBERTa_base_preprocess_v2.ckpt'))
# Inference mode (disables dropout), then move the model to the GPU.
model2.eval()
model2.cuda()


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [None]:
# model3: build a 2-label sequence classifier from "xlnet-base-cased", then replace
# its weights with the fine-tuned checkpoint stored on Drive.
model3 = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=2)
directory_path = '/content/drive/MyDrive/Constraint/models/Best models'
# torch.load unpickles the file - only load checkpoints from a trusted source.
model3.load_state_dict(torch.load(directory_path+'/XLNet_base_cased_v2.ckpt'))
# Inference mode (disables dropout), then move the model to the GPU.
model3.eval()
model3.cuda()


Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (1): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e

In [None]:
# model4: build a 2-label sequence classifier from "microsoft/deberta-base", then
# replace its weights with the fine-tuned checkpoint stored on Drive.
model4 = DebertaForSequenceClassification.from_pretrained("microsoft/deberta-base", num_labels=2)
directory_path = '/content/drive/MyDrive/Constraint/models/Best models'
# torch.load unpickles the file - only load checkpoints from a trusted source.
model4.load_state_dict(torch.load(directory_path+'/DeBERTa_base_preprocess_link_v1.ckpt'))
# Inference mode (disables dropout), then move the model to the GPU.
model4.eval()
model4.cuda()




Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'config', 'deberta.embeddings.position_embeddings.weight']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base

DebertaForSequenceClassification(
  (deberta): DebertaModel(
    (embeddings): DebertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=0)
      (LayerNorm): DebertaLayerNorm()
      (dropout): StableDropout()
    )
    (encoder): DebertaEncoder(
      (layer): ModuleList(
        (0): DebertaLayer(
          (attention): DebertaAttention(
            (self): DisentangledSelfAttention(
              (in_proj): Linear(in_features=768, out_features=2304, bias=False)
              (pos_dropout): StableDropout()
              (pos_proj): Linear(in_features=768, out_features=768, bias=False)
              (pos_q_proj): Linear(in_features=768, out_features=768, bias=True)
              (dropout): StableDropout()
            )
            (output): DebertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): DebertaLayerNorm()
              (dropout): StableDropout()
            )
          )
          (intermed

In [None]:
# model5 = ElectraForSequenceClassification.from_pretrained("google/electra-base-discriminator", num_labels=2)
# directory_path = '/content/drive/MyDrive/Constraint/models/Best models'
# model5.load_state_dict(torch.load(directory_path+'/Electra_base_0.973414_preprocess_model.ckpt'))
# model5.eval()
# model5.cuda()



In [None]:
def get_model_preds_labels(model, dataloader, mode='train'):
  """Run `model` over `dataloader` and collect per-example class probabilities.

  Parameters
  ----------
  model : callable
      Called as ``model(input_ids, token_type_ids=None, attention_mask=mask)``;
      the result must expose ``.logits``. The function does not switch the model
      to eval mode - the caller is expected to have done so.
  dataloader : iterable of tuples of tensors
      Batches of ``(ids, input_ids, attention_mask)`` when ``mode == 'test'``,
      otherwise ``(ids, input_ids, attention_mask, labels)``.
  mode : str
      ``'test'`` for unlabelled batches; any other value means labelled batches.

  Returns
  -------
  ``(probabilities, ids)`` when ``mode == 'test'``, otherwise
  ``(probabilities, labels, ids)``. All are Python lists in the order the
  dataloader yielded the examples; each probability entry is a list with one
  value per class.

  Batches are moved to the module-level ``device`` before the forward pass.
  """
  is_test = mode == 'test'
  data_vectors = []
  labels = []
  ids = []

  # Inference only: no gradients are needed.
  with torch.no_grad():
    for batch in dataloader:
      batch = tuple(t.to(device) for t in batch)
      if is_test:
        token_ids, b_input_ids, b_input_mask = batch
      else:
        token_ids, b_input_ids, b_input_mask, b_labels = batch
        labels.extend(b_labels.tolist())

      outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
      # Normalise over the class axis explicitly; calling softmax without `dim`
      # is deprecated and relies on an implicit choice of dimension.
      data_vectors.extend(F.softmax(outputs.logits, dim=-1).tolist())
      ids.extend(token_ids.tolist())

  if is_test:
    return data_vectors, ids
  return data_vectors, labels, ids


# Train Data Creation

In [None]:
m1, lb, train_ids1 = get_model_preds_labels(model1, train_dataloader_XLM_ROBERT)



In [None]:
m2, _, train_ids2 = get_model_preds_labels(model2, train_dataloader_ROBERT)



In [None]:
m3, _, train_ids3 = get_model_preds_labels(model3, train_dataloader_XLNET)



In [None]:
m4, _, train_ids4 = get_model_preds_labels(model4, train_dataloader_DeBERTa)



In [None]:
# m5, _, train_ids5 = get_model_preds_labels(model5, train_dataloader_Electra)

In [None]:
# Start the stacked training frame from model1's output: one row per example id,
# with the class-0 and class-1 probabilities as separate columns.
a = [each[0] for each in m1]
b = [each[1] for each in m1]
train_df = pd.DataFrame({'id': train_ids1, 'model1_Class0': a, 'model1_Class1': b})


In [None]:
def create_dataset(df, ids, model_op, col1, col2):
  """Left-join one model's two-class outputs onto `df` by 'id'.

  `model_op` is a sequence of pairs aligned with `ids`; the first element of each
  pair goes into column `col1`, the second into `col2`. Rows of `df` whose id is
  missing from `ids` get NaN in the new columns. Returns the merged frame; `df`
  itself is not modified.
  """
  first_class_probs = []
  second_class_probs = []
  for pair in model_op:
    first_class_probs.append(pair[0])
    second_class_probs.append(pair[1])

  model_frame = pd.DataFrame({'id': ids, col1: first_class_probs, col2: second_class_probs})
  return df.merge(model_frame, on='id', how='left')



In [None]:
# Join the remaining models' probabilities onto the model1 frame by 'id'. The join
# key is needed because the training dataloaders use a RandomSampler, so each
# model's outputs arrive in a different order.
train_df = create_dataset(train_df,train_ids2,m2, 'model2_Class0', 'model2_Class1')
train_df = create_dataset(train_df,train_ids3,m3, 'model3_Class0', 'model3_Class1')
train_df = create_dataset(train_df,train_ids4,m4, 'model4_Class0', 'model4_Class1')
# (model5 / Electra is disabled; this line previously passed model4's ids and outputs.)
# train_df = create_dataset(train_df,train_ids5,m5, 'model5_Class0', 'model5_Class1')
# Attach the ground-truth labels; with no `on=` given, merge joins on the shared 'id' column.
train_df = train_df.merge(pd.DataFrame({'id': train_ids1, 'Label': lb}))


In [None]:
train_df.shape, train_df.columns

((6420, 10), Index(['id', 'model1_Class0', 'model1_Class1', 'model2_Class0',
        'model2_Class1', 'model3_Class0', 'model3_Class1', 'model4_Class0',
        'model4_Class1', 'Label'],
       dtype='object'))

In [None]:
# train_df = pd.DataFrame(train_X, columns = ["model1_Class0", "model1_Class1", "model2_Class0", "model2_Class1", "model3_Class0", "model3_Class1", "model4_Class0", "model4_Class1"])
# train_df['Label'] = train_y
train_df = train_df.sort_values(by='id').reset_index(drop=True)
train_df

Unnamed: 0,id,model1_Class0,model1_Class1,model2_Class0,model2_Class1,model3_Class0,model3_Class1,model4_Class0,model4_Class1,Label
0,1,0.999289,0.000711,0.999076,0.000924,0.999856,0.000144,0.999958,0.000042,0
1,2,0.999703,0.000297,0.998154,0.001846,0.999927,0.000073,0.999892,0.000108,0
2,3,0.000418,0.999582,0.000937,0.999063,0.000026,0.999974,0.000300,0.999700,1
3,4,0.999864,0.000136,0.999035,0.000965,0.999930,0.000070,0.999961,0.000039,0
4,5,0.999843,0.000157,0.999071,0.000929,0.999947,0.000053,0.999956,0.000044,0
...,...,...,...,...,...,...,...,...,...,...
6415,6416,0.001224,0.998776,0.000889,0.999111,0.006213,0.993787,0.000114,0.999886,1
6416,6417,0.000384,0.999616,0.000913,0.999087,0.000037,0.999963,0.000053,0.999947,1
6417,6418,0.000645,0.999355,0.000925,0.999075,0.000098,0.999902,0.000077,0.999923,1
6418,6419,0.000401,0.999600,0.000931,0.999069,0.000041,0.999959,0.000459,0.999541,1


In [None]:
train_df.to_csv('Boosting Data/Train_v2.csv', index=False)


# Val Data Creation

In [None]:
v1, v_lb, v_ids1 = get_model_preds_labels(model1, val_dataloader_XLM_ROBERT)



In [None]:
v2, _, v_ids2 = get_model_preds_labels(model2, val_dataloader_ROBERT)



In [None]:
v3, _, v_ids3 = get_model_preds_labels(model3, val_dataloader_XLNET)



In [None]:
v4, _, v_ids4 = get_model_preds_labels(model4, val_dataloader_DeBERTa)



In [None]:
# v5, _, v_ids5 = get_model_preds_labels(model5, val_dataloader_Electra)

In [None]:
# Start the stacked validation frame from model1's output: one row per example id,
# with the class-0 and class-1 probabilities as separate columns.
a = [each[0] for each in v1]
b = [each[1] for each in v1]
validation_df = pd.DataFrame({'id': v_ids1, 'model1_Class0': a, 'model1_Class1': b})


In [None]:
# Join the remaining models' probabilities onto the model1 frame by 'id'.
validation_df = create_dataset(validation_df, v_ids2, v2, 'model2_Class0', 'model2_Class1')
validation_df = create_dataset(validation_df, v_ids3, v3, 'model3_Class0', 'model3_Class1')
validation_df = create_dataset(validation_df, v_ids4, v4, 'model4_Class0', 'model4_Class1')
# (model5 / Electra is disabled.)
# validation_df = create_dataset(validation_df, v_ids5, v5, 'model5_Class0', 'model5_Class1')
# Attach the ground-truth labels; with no `on=` given, merge joins on the shared 'id' column.
validation_df = validation_df.merge(pd.DataFrame({'id': v_ids1, 'Label': v_lb}))


In [None]:
validation_df.shape, validation_df.columns

((2140, 10), Index(['id', 'model1_Class0', 'model1_Class1', 'model2_Class0',
        'model2_Class1', 'model3_Class0', 'model3_Class1', 'model4_Class0',
        'model4_Class1', 'Label'],
       dtype='object'))

In [None]:
validation_df = validation_df.sort_values(by='id').reset_index(drop=True)
validation_df

Unnamed: 0,id,model1_Class0,model1_Class1,model2_Class0,model2_Class1,model3_Class0,model3_Class1,model4_Class0,model4_Class1,Label
0,1,0.001269,0.998731,0.000938,0.999062,0.000179,0.999821,0.028954,0.971046,1
1,2,0.991674,0.008326,0.000884,0.999116,0.010019,0.989981,0.002718,0.997282,1
2,3,0.000385,0.999615,0.000958,0.999042,0.000084,0.999916,0.000053,0.999947,1
3,4,0.000416,0.999584,0.000936,0.999065,0.000109,0.999891,0.001268,0.998732,1
4,5,0.999851,0.000149,0.999052,0.000948,0.999784,0.000216,0.999951,0.000049,0
...,...,...,...,...,...,...,...,...,...,...
2135,2136,0.000510,0.999490,0.000900,0.999100,0.000543,0.999457,0.001650,0.998350,1
2136,2137,0.999858,0.000142,0.999087,0.000913,0.999942,0.000058,0.999962,0.000037,0
2137,2138,0.004834,0.995166,0.000884,0.999116,0.000149,0.999851,0.048908,0.951092,1
2138,2139,0.018657,0.981343,0.001124,0.998876,0.029504,0.970496,0.920868,0.079132,1


In [None]:
validation_df.to_csv('Boosting Data/Validation_v2.csv', index=False)


# Test Data Creation

In [None]:
t1, t_ids1 = get_model_preds_labels(model1, test_dataloader_XLM_ROBERT, 'test')

  


In [None]:
t2, t_ids2 = get_model_preds_labels(model2, test_dataloader_ROBERT, 'test')

  


In [None]:
t3, t_ids3 = get_model_preds_labels(model3, test_dataloader_XLNET, 'test')

  


In [None]:
t4, t_ids4 = get_model_preds_labels(model4, test_dataloader_DeBERTa, 'test')

  


In [None]:
# t5, t_ids5 = get_model_preds_labels(model5, test_dataloader_Electra, 'test')

In [None]:
# Start the stacked test frame from model1's output: one row per example id, with
# the class-0 and class-1 probabilities as separate columns.
# NOTE: this rebinds `test_df`, which until now held the raw test CSV. Cells that
# read test_df.tweet / test_df.id cannot be re-run after this one without first
# reloading the CSV.
a = [each[0] for each in t1]
b = [each[1] for each in t1]
test_df = pd.DataFrame({'id': t_ids1, 'model1_Class0': a, 'model1_Class1': b})


In [None]:
# Join the remaining models' probabilities onto the model1 frame by 'id'.
# No label column is attached: the test split is unlabelled.
test_df = create_dataset(test_df, t_ids2, t2, 'model2_Class0', 'model2_Class1')
test_df = create_dataset(test_df,  t_ids3, t3, 'model3_Class0', 'model3_Class1')
test_df = create_dataset(test_df, t_ids4, t4, 'model4_Class0', 'model4_Class1')
# (model5 / Electra is disabled.)
# test_df = create_dataset(test_df, t_ids5, t5, 'model5_Class0', 'model5_Class1')


In [None]:
test_df.shape, test_df.columns

((2140, 9), Index(['id', 'model1_Class0', 'model1_Class1', 'model2_Class0',
        'model2_Class1', 'model3_Class0', 'model3_Class1', 'model4_Class0',
        'model4_Class1'],
       dtype='object'))

In [None]:
test_df = test_df.sort_values(by='id').reset_index(drop=True)
test_df

Unnamed: 0,id,model1_Class0,model1_Class1,model2_Class0,model2_Class1,model3_Class0,model3_Class1,model4_Class0,model4_Class1
0,1,0.999851,0.000149,0.999063,0.000937,0.999967,0.000033,0.999963,0.000037
1,2,0.000422,0.999578,0.000902,0.999098,0.000045,0.999955,0.000112,0.999888
2,3,0.000401,0.999600,0.000954,0.999046,0.000033,0.999967,0.000170,0.999830
3,4,0.999858,0.000142,0.999082,0.000918,0.999967,0.000033,0.999956,0.000044
4,5,0.999852,0.000148,0.999076,0.000924,0.999953,0.000047,0.999912,0.000088
...,...,...,...,...,...,...,...,...,...
2135,2136,0.999821,0.000179,0.999020,0.000980,0.999920,0.000080,0.999965,0.000035
2136,2137,0.000393,0.999607,0.000905,0.999095,0.000248,0.999752,0.000039,0.999961
2137,2138,0.999857,0.000143,0.999084,0.000916,0.999925,0.000075,0.999950,0.000050
2138,2139,0.999844,0.000156,0.999072,0.000928,0.999778,0.000222,0.999931,0.000069


In [None]:
test_df.to_csv('Boosting Data/Test_v2.csv', index=False)


In [None]:
# Legend for the modelN_Class0 / modelN_Class1 columns in the exported CSVs.
# Written as a dict so the cell is valid Python and survives "Run All".
# model5 (Electra) is commented out everywhere in this notebook, so no model5
# columns are produced.
MODEL_NAMES = {
    "model1": "xlm-robert",
    "model2": "robert",
    "model3": "XLNET",
    "model4": "DeBERTa",
    "model5": "Electra",
}
MODEL_NAMES