In [39]:
# !pip install datasets==2.14.6
# !pip install transformers
# !pip install evaluate
# !pip install --no-cache-dir transformers sentencepiece
# !pip install accelerate -U
# !pip install protobuf

In [65]:
import torch
from datasets import Dataset, DatasetDict
from datasets import concatenate_datasets
from torch.utils.data import DataLoader
import os
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from dataclasses import dataclass
from transformers import AutoTokenizer, AutoModelForMultipleChoice, get_scheduler, TrainingArguments, Trainer
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import evaluate
from sentence_transformers import SentenceTransformer
import accelerate
from sklearn.metrics import accuracy_score
from collections import Counter
import torch.nn as nn

In [66]:
# Load the SP training file; allow_pickle=True permits object arrays (only use on trusted files).
# NOTE(review): `train_data` is rebound to the 80% split list in the data-split cell before this
# array is read anywhere in the visible notebook, and the same file is loaded again there as
# `data`, so this cell looks redundant — confirm before removing.
train_data = np.load('./data/SP-train.npy', allow_pickle=True)

In [67]:
def get_data_lists(data):
    """Split puzzle records into three parallel lists.

    Args:
        data: iterable of mappings, each indexed with the keys
            'question', 'choice_list' and 'label'.

    Returns:
        A tuple ``(questions, choices, labels)`` of lists, in input order.
    """
    # Materialise once so one-shot iterables can feed all three columns.
    records = list(data)
    questions = [record['question'] for record in records]
    choices = [record['choice_list'] for record in records]
    labels = [record['label'] for record in records]
    return questions, choices, labels

class SentenceModel(nn.Module):
    """Feed-forward classifier: 768 -> 256 -> 32 -> 4.

    Each of the two hidden stages is Linear -> BatchNorm1d -> ReLU -> Dropout
    (dropout 0.2, then 0.3); the final Linear emits four raw scores (no softmax).
    """

    def __init__(self):
        super().__init__()
        # Stage 1
        self.fc_1 = nn.Linear(768, 256)
        self.bn_1 = nn.BatchNorm1d(256)
        self.dropout_1 = nn.Dropout(0.2)
        # Stage 2
        self.fc_2 = nn.Linear(256, 32)
        self.bn_2 = nn.BatchNorm1d(32)
        self.dropout_2 = nn.Dropout(0.3)
        # Output head
        self.fc_3 = nn.Linear(32, 4)

    def forward(self, x):
        """Return four scores per row of ``x`` (last dimension must be 768).

        Presumably ``x`` is (batch, 768), since BatchNorm1d(256) directly follows
        the first linear layer — TODO confirm against the training code.
        """
        hidden = self.dropout_1(nn.functional.relu(self.bn_1(self.fc_1(x))))
        hidden = self.dropout_2(nn.functional.relu(self.bn_2(self.fc_2(hidden))))
        return self.fc_3(hidden)

class GRUModel(nn.Module):
    """Two stacked single-layer GRUs (768 -> 256 -> 64) with a 64 -> 16 -> 4 linear head.

    Both GRUs are batch-first. Only the output at the last sequence position feeds
    the head, and the two linear layers are applied back to back with no activation.
    """

    def __init__(self):
        super().__init__()
        self.gru_1 = nn.GRU(768, 256, num_layers=1, batch_first=True)
        self.gru_2 = nn.GRU(256, 64, num_layers=1, batch_first=True)
        self.fc_1 = nn.Linear(64, 16)
        self.fc_2 = nn.Linear(16, 4)

    def forward(self, x):
        """Map a (batch, seq, 768) tensor to (batch, 4) raw scores."""
        sequence, _ = self.gru_1(x)
        sequence, _ = self.gru_2(sequence)
        final_step = sequence[:, -1, :]
        return self.fc_2(self.fc_1(final_step))
    
class RNNModel(nn.Module):
    """Two stacked single-layer RNNs (768 -> 256 -> 64) with a 64 -> 16 -> 4 linear head.

    Both recurrent layers are batch-first. The head reads only the last sequence
    position and has no activation between its two linear layers.
    """

    def __init__(self):
        super().__init__()
        self.rnn_1 = nn.RNN(768, 256, num_layers=1, batch_first=True)
        self.rnn_2 = nn.RNN(256, 64, num_layers=1, batch_first=True)
        self.fc_1 = nn.Linear(64, 16)
        self.fc_2 = nn.Linear(16, 4)

    def forward(self, x):
        """Map a (batch, seq, 768) tensor to (batch, 4) raw scores."""
        hidden = x
        # Run the recurrent stack in order, discarding each layer's final state.
        for recurrent in (self.rnn_1, self.rnn_2):
            hidden, _ = recurrent(hidden)
        return self.fc_2(self.fc_1(hidden[:, -1, :]))
    
class LSTMModel(nn.Module):
    """Two stacked single-layer LSTMs (768 -> 256 -> 64) with a 64 -> 16 -> 4 linear head.

    Both LSTMs are batch-first. The (hidden, cell) state tuples are discarded; only
    the output at the last sequence position is classified. There is no activation
    between the two linear layers.
    """

    def __init__(self):
        super().__init__()
        self.lstm_1 = nn.LSTM(768, 256, num_layers=1, batch_first=True)
        self.lstm_2 = nn.LSTM(256, 64, num_layers=1, batch_first=True)
        self.fc_1 = nn.Linear(64, 16)
        self.fc_2 = nn.Linear(16, 4)

    def forward(self, x):
        """Map a (batch, seq, 768) tensor to (batch, 4) raw scores."""
        states = self.lstm_1(x)[0]
        states = self.lstm_2(states)[0]
        projected = self.fc_1(states[:, -1, :])
        return self.fc_2(projected)

class Brain_Teaser(torch.utils.data.Dataset):
    """Map-style dataset of raw (question, choices, label) triples.

    The bare name ``Dataset`` in this notebook is ``datasets.Dataset`` (Hugging Face),
    whose constructor was never run by this class and whose inherited hooks expect
    Arrow-backed state. Subclassing the PyTorch map-style base keeps ``len()`` and
    integer indexing unchanged while making the class safe to hand to a DataLoader.

    Args:
        obj: sequence ``(questions, choices, labels)`` of three parallel sequences,
            e.g. the return value of ``get_data_lists``.
    """

    def __init__(self, obj):
        self.questions = obj[0]
        self.choices = obj[1]
        self.labels = obj[2]

    def __len__(self):
        """Number of questions held."""
        return len(self.questions)

    def __getitem__(self, id):
        """Return ``(question, choices, label)`` for position ``id``."""
        return self.questions[id], self.choices[id], self.labels[id]
  
class Brain_Teaser_2(torch.utils.data.Dataset):
    """Map-style dataset of pre-computed sentence embeddings, one per answer option.

    For every question, each of the first ``NUM_OPTIONS`` choices is rendered as
    ``"Question : <question> Option: <k> <choice>"`` and encoded once, at construction
    time, with ``tokenizer.encode(text, convert_to_tensor=True)``.

    The base class is the PyTorch map-style ``Dataset``; the bare name ``Dataset`` in
    this notebook is ``datasets.Dataset`` (Hugging Face), which this class never
    initialised and does not need.

    Args:
        tokenizer: despite the name, an encoder object exposing
            ``encode(text, convert_to_tensor=True)`` that returns a tensor
            (the notebook passes a SentenceTransformer).
        questions: sequence of question strings.
        choices: sequence, parallel to ``questions``, of option-string sequences.
        labels: sequence, parallel to ``questions``, of correct option indices.
        max_len: stored on the instance; not used by this class.
    """

    # Options encoded per question; the notebook's sequential models are evaluated on four.
    NUM_OPTIONS = 4

    def __init__(self, tokenizer, questions, choices, labels, max_len=512):
        self.questions = questions
        self.choices = choices
        self.labels = labels

        self.max_len = max_len
        self.tokenizer = tokenizer
        # `inputs` and `targets` are never filled by this class; kept because they
        # are public attributes callers may already reference.
        self.inputs = []
        self.targets = []
        self.question_options_encoded = []

        self.build_questions()

    def build_questions(self):
        """Encode every (question, option) prompt into ``question_options_encoded``.

        The list is rebuilt from scratch, so calling this again does not duplicate items.
        """
        encoded = []
        for idx, question in enumerate(self.questions):
            options = self.choices[idx]
            per_option = []
            for option_id in range(self.NUM_OPTIONS):
                prompt = "Question : " + question + ' ' + "Option: " + str(option_id) + ' ' + options[option_id]
                per_option.append(self.tokenizer.encode(prompt, convert_to_tensor=True))
            encoded.append(per_option)
        self.question_options_encoded = encoded

    def __len__(self):
        """Number of encoded questions."""
        return len(self.question_options_encoded)

    def __getitem__(self, id):
        """Return ``(stacked option embeddings, label)`` for position ``id``."""
        return torch.stack(self.question_options_encoded[id]), self.labels[id]

In [68]:
# NOTE(review): allow_pickle=True unpickles arbitrary objects — only load trusted files.
data = np.load("data/SP-train.npy", allow_pickle=True)

# Bucket every record by the last two characters of its id:
# 'CR' and 'SR' go to their own lists, everything else to the original list.
o_data, sr_data, cr_data = [], [], []
for _record in data:
    _suffix = _record['id'][-2:]
    if _suffix == 'CR':
        cr_data.append(_record)
    elif _suffix == 'SR':
        sr_data.append(_record)
    else:
        o_data.append(_record)

model_name = "bert-large-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def _split_80_10_10(records):
    """Cut one bucket positionally (no shuffling) into 80% / 10% / 10% slices."""
    first_cut = int(len(records) * 0.8)
    second_cut = int(len(records) * 0.9)
    return records[:first_cut], records[first_cut:second_cut], records[second_cut:]


_o_train, _o_val, o_test_data = _split_80_10_10(o_data)
_c_train, _c_val, c_test_data = _split_80_10_10(cr_data)
_s_train, _s_val, s_test_data = _split_80_10_10(sr_data)

# Combined splits keep the bucket order: original, then CR, then SR.
train_data = _o_train + _c_train + _s_train
val_data = _o_val + _c_val + _s_val
test_data = o_test_data + c_test_data + s_test_data

In [69]:
# Sentence-embedding model handed to Brain_Teaser_2 to encode each "question + option" prompt.
# The evaluation code reshapes every item to (1, 4, 768), so 768-wide embeddings are
# expected — TODO confirm for this checkpoint.
embedding_model = SentenceTransformer('distilbert-base-nli-mean-tokens')

In [70]:

# Per-family test sets for the encoder models: raw (question, choices, label) triples
# taken from the last-10% slice of each puzzle family.
o_test_dataset = Brain_Teaser(get_data_lists(o_test_data))
c_test_dataset = Brain_Teaser(get_data_lists(c_test_data))
s_test_dataset = Brain_Teaser(get_data_lists(s_test_data))

# train_data / val_data / test_data are the 80/10/10 splits built in the data-split cell.
# They used to be recomputed here with identical expressions; the duplicate was dropped
# so the split is defined in exactly one place.
train_questions, train_choices, train_labels = get_data_lists(train_data)
val_questions, val_choices, val_labels = get_data_lists(val_data)
test_questions, test_choices, test_labels = get_data_lists(test_data)

# Embedding datasets for the sequential models. Brain_Teaser_2's first parameter is
# named `tokenizer`, but it is given the sentence-embedding model and encodes every
# prompt eagerly inside the constructor.
train_dataset_2 = Brain_Teaser_2(embedding_model, train_questions, train_choices, train_labels)
val_dataset_2 = Brain_Teaser_2(embedding_model, val_questions, val_choices, val_labels)
test_dataset_2 = Brain_Teaser_2(embedding_model, test_questions, test_choices, test_labels)

In [71]:
# Inspect one item: (stacked option embeddings, label), as returned by Brain_Teaser_2.__getitem__.
test_dataset_2[0]

(tensor([[-0.6462, -0.2440,  0.2941,  ..., -0.0312,  0.3269, -0.2828],
         [-0.4100, -0.1910,  0.3373,  ...,  0.0210,  0.3945, -0.5179],
         [-0.4757, -0.2962,  0.3105,  ...,  0.0132,  0.3523, -0.6067],
         [-0.3288, -0.3289,  0.4456,  ..., -0.0748,  0.2734, -0.7899]],
        device='mps:0'),
 3)

In [72]:
# Device used by the encoder-model cells below: CUDA when available, otherwise CPU.
# NOTE(review): MPS is never selected here, although the embedding output shown above
# lives on mps:0 — confirm CPU is the intended fallback on Apple silicon.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Sentence Transformers + Sequential Models

In [73]:
def test_model(test_dataset, model, batch = 1):
    total = 0
    correct = 0
    # test_dataloader = DataLoader(test_dataset, batch_size=batch, shuffle=True)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    model.to(device)
    with torch.no_grad():
        for id in range(len(test_dataset)):
            inputs = test_dataset[id][0].view(1,4,768)
            targets = test_dataset[id][1]
            inputs = inputs.to(device)
            # targets = targets.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            total += 1
            # print(predicted[0], targets)
            correct += (predicted == targets).sum().item()
    print(f"Test Accuracy: {(100 * correct / total):.2f}%")


In [74]:
# Evaluate the three sequential models on the embedded test split.
# torch.load restores tensors onto the device they were saved from, which fails when that
# device is unavailable; map_location="cpu" keeps the checkpoints loadable anywhere.
# test_model moves each model to its evaluation device afterwards.
print("RNN")
model_sentence_rnn = RNNModel()
model_sentence_rnn.load_state_dict(torch.load('trained_models/sb_rnn.pt', map_location="cpu"))
test_model(test_dataset_2, model_sentence_rnn)

print("LSTM")
model_sentence_lstm = LSTMModel()
model_sentence_lstm.load_state_dict(torch.load('trained_models/sb_lstm.pt', map_location="cpu"))
test_model(test_dataset_2, model_sentence_lstm)

print("GRU")
model_sentence_gru = GRUModel()
model_sentence_gru.load_state_dict(torch.load('trained_models/sb_gru.pt', map_location="cpu"))
test_model(test_dataset_2, model_sentence_gru)

RNN
Test Accuracy: 47.06%
LSTM
Test Accuracy: 31.37%
GRU
Test Accuracy: 27.45%


# Encoder Models

In [75]:
# Tokenizer comes from the named checkpoint; the multiple-choice weights come from the
# trained_models/deberta_model path.
# NOTE(review): ignore_mismatched_sizes=True lets loading succeed when a checkpoint tensor's
# shape differs from the model's; per the Transformers docs such weights are freshly
# initialised — confirm no mismatch is reported when this runs.
model_name = "microsoft/deberta-v3-base"
tokenizer_deberta = AutoTokenizer.from_pretrained(model_name)
model_deberta = AutoModelForMultipleChoice.from_pretrained(
    "trained_models/deberta_model",
    ignore_mismatched_sizes=True,
)
model_deberta = model_deberta.to(device)



In [76]:
# Same tokenizer checkpoint as the plain DeBERTa cell; weights come from the
# trained_models/deberta_ft_model path.
# NOTE(review): ignore_mismatched_sizes=True can leave shape-mismatched weights freshly
# initialised — confirm no mismatch is reported when this runs.
model_name = "microsoft/deberta-v3-base"
tokenizer_deberta_ft = AutoTokenizer.from_pretrained(model_name)
model_deberta_ft = AutoModelForMultipleChoice.from_pretrained(
    "trained_models/deberta_ft_model",
    ignore_mismatched_sizes=True,
)
model_deberta_ft = model_deberta_ft.to(device)

In [77]:
# RoBERTa-large tokenizer from the named checkpoint; multiple-choice weights from the
# trained_models/roberta_model path.
# NOTE(review): ignore_mismatched_sizes=True can leave shape-mismatched weights freshly
# initialised — confirm no mismatch is reported when this runs.
model_name = "FacebookAI/roberta-large"
tokenizer_roberta = AutoTokenizer.from_pretrained(model_name)
model_roberta = AutoModelForMultipleChoice.from_pretrained(
    "trained_models/roberta_model",
    ignore_mismatched_sizes=True,
)
model_roberta = model_roberta.to(device)

In [78]:
# Tokenizer from the Winogrande RoBERTa-large checkpoint; multiple-choice weights from the
# trained_models/roberta_wngrd_model path.
# NOTE(review): ignore_mismatched_sizes=True can leave shape-mismatched weights freshly
# initialised — confirm no mismatch is reported when this runs.
model_name = "DeepPavlov/roberta-large-winogrande"
tokenizer_roberta_w = AutoTokenizer.from_pretrained(model_name)
model_roberta_w = AutoModelForMultipleChoice.from_pretrained(
    "trained_models/roberta_wngrd_model",
    ignore_mismatched_sizes=True,
)
model_roberta_w = model_roberta_w.to(device)

In [79]:
def get_predictions(dataset, model, tokenizer):
    """Predict the answer index for every multiple-choice item in ``dataset``.

    Each item is scored on its own: the question is paired with every choice, the
    pairs are tokenised together with padding, a leading batch axis is added, and
    the index of the highest logit is taken as the prediction.

    Args:
        dataset: object supporting ``len()`` and integer indexing; item ``i`` is
            indexable as ``[0]`` question, ``[1]`` sequence of choices, ``[2]`` label.
            Any number of choices is accepted (previously exactly the first four were used).
        model: multiple-choice model whose output exposes ``.logits``. It is put in
            eval mode; inputs are moved to the device of its parameters
            (CPU if it has none), so no module-level ``device`` is required.
        tokenizer: callable accepting a list of ``[question, choice]`` pairs with
            ``return_tensors`` and ``padding`` and returning an object with
            ``.to(device)`` and ``.items()``.

    Returns:
        ``(predictions, targets)``: two lists aligned with the dataset order.
    """
    model.eval()
    try:
        model_device = next(model.parameters()).device
    except StopIteration:
        model_device = torch.device("cpu")

    predictions = []
    targets = []

    # Index explicitly instead of iterating: only __len__/__getitem__ are guaranteed.
    for sample_idx in range(len(dataset)):
        sample = dataset[sample_idx]
        ques, choices, true_label = sample[0], sample[1], sample[2]

        pairs = [[ques, choice] for choice in choices]
        inputs = tokenizer(pairs, return_tensors="pt", padding=True).to(model_device)
        with torch.no_grad():
            # Tokenizer output is (choices, seq); the model expects (batch, choices, seq).
            outputs = model(**{key: value.unsqueeze(0) for key, value in inputs.items()})

        predictions.append(outputs.logits.argmax(dim=-1).item())
        targets.append(true_label)

    return predictions, targets

def ensemble_predictions(predictions):
    """Combine several models' predictions by per-sample majority vote.

    Args:
        predictions: sequence of prediction sequences, one per model. The first
            entry determines how many samples are voted on.

    Returns:
        A list holding the most common prediction at each sample position. On a
        tie, the tied value that appears first in model order is returned.
    """
    sample_count = len(predictions[0])

    def _vote(position):
        tally = Counter(model_preds[position] for model_preds in predictions)
        winner, _ = tally.most_common(1)[0]
        return winner

    return [_vote(position) for position in range(sample_count)]

## Original Puzzles

In [80]:
# Score each encoder on o_test_dataset (the last-10% slice of the original puzzles).
# get_predictions returns (predictions, targets); `target_o` is rebound by every call,
# and the per-model prediction lists are reused by the overall-accuracy and ensemble cells.
# accuracy_score is called as (targets, predictions).
deberta_prediction_o, target_o = get_predictions(o_test_dataset, model_deberta, tokenizer_deberta)
print("Deberta ")
print(accuracy_score(target_o, deberta_prediction_o ))

deberta_ft_prediction_o, target_o = get_predictions(o_test_dataset, model_deberta_ft, tokenizer_deberta_ft)
print("Deberta FT")
print(accuracy_score(target_o, deberta_ft_prediction_o ))

roberta_prediction_o, target_o = get_predictions(o_test_dataset, model_roberta, tokenizer_roberta)
print("Roberta")
print(accuracy_score(target_o, roberta_prediction_o))

roberta_w_prediction_o, target_o = get_predictions(o_test_dataset, model_roberta_w, tokenizer_roberta_w)
print("Roberta Winogrande")
print(accuracy_score(target_o, roberta_w_prediction_o ))


Deberta 
0.7647058823529411
Deberta FT
0.8235294117647058
Roberta
0.7647058823529411
Roberta Winogrande
0.8235294117647058


## Semantically Reconstructed Puzzles

In [81]:
# Score each encoder on s_test_dataset (the last-10% slice of the 'SR' records).
# `target_s` is rebound by every call; the per-model prediction lists are reused by the
# overall-accuracy and ensemble cells. accuracy_score is called as (targets, predictions).
deberta_prediction_s, target_s = get_predictions(s_test_dataset, model_deberta, tokenizer_deberta)
print("Deberta ")
print(accuracy_score(target_s, deberta_prediction_s ))

deberta_ft_prediction_s, target_s = get_predictions(s_test_dataset, model_deberta_ft, tokenizer_deberta_ft)
print("Deberta FT")
print(accuracy_score(target_s, deberta_ft_prediction_s ))

roberta_prediction_s, target_s = get_predictions(s_test_dataset, model_roberta, tokenizer_roberta)
print("Roberta")
print(accuracy_score(target_s, roberta_prediction_s))

roberta_w_prediction_s, target_s = get_predictions(s_test_dataset, model_roberta_w, tokenizer_roberta_w)
print("Roberta Winogrande")
print(accuracy_score(target_s, roberta_w_prediction_s ))

Deberta 
0.7647058823529411
Deberta FT
0.8235294117647058
Roberta
0.7647058823529411
Roberta Winogrande
0.8235294117647058


## Contextually Reconstructed Puzzles

In [82]:

# Score each encoder on c_test_dataset (the last-10% slice of the 'CR' records).
# `target_c` is rebound by every call; the per-model prediction lists are reused by the
# overall-accuracy and ensemble cells. accuracy_score is called as (targets, predictions).
deberta_prediction_c, target_c = get_predictions(c_test_dataset, model_deberta, tokenizer_deberta)
print("Deberta ")
print(accuracy_score(target_c, deberta_prediction_c ))

deberta_ft_prediction_c, target_c = get_predictions(c_test_dataset, model_deberta_ft, tokenizer_deberta_ft)
print("Deberta FT")
print(accuracy_score(target_c, deberta_ft_prediction_c ))

roberta_prediction_c, target_c = get_predictions(c_test_dataset, model_roberta, tokenizer_roberta)
print("Roberta")
print(accuracy_score(target_c, roberta_prediction_c))

roberta_w_prediction_c, target_c = get_predictions(c_test_dataset, model_roberta_w, tokenizer_roberta_w)
print("Roberta Winogrande")
print(accuracy_score(target_c, roberta_w_prediction_c ))

Deberta 
0.7058823529411765
Deberta FT
0.8235294117647058
Roberta
0.5882352941176471
Roberta Winogrande
0.7058823529411765


## Overall Accuracies

In [83]:


# Pool the three test slices (original, semantic, context — in that order) and report one
# accuracy per model over the pooled targets.
_overall_targets = target_o + target_s + target_c
_overall_predictions = {
    "Deberta ": deberta_prediction_o + deberta_prediction_s + deberta_prediction_c,
    "Deberta FT": deberta_ft_prediction_o + deberta_ft_prediction_s + deberta_ft_prediction_c,
    "Roberta": roberta_prediction_o + roberta_prediction_s + roberta_prediction_c,
    "Roberta Winogrande": roberta_w_prediction_o + roberta_w_prediction_s + roberta_w_prediction_c,
}
for _title, _pooled in _overall_predictions.items():
    print(_title)
    print(accuracy_score(_overall_targets, _pooled))


Deberta 
0.7450980392156863
Deberta FT
0.8235294117647058
Roberta
0.7058823529411765
Roberta Winogrande
0.7843137254901961


## Ensemble 

In [84]:

# Three voters per puzzle family, always in the order DeBERTa, RoBERTa-Winogrande,
# DeBERTa-FT (plain RoBERTa is left out). With three voters a three-way disagreement is
# a tie, which ensemble_predictions resolves in favour of the first list, i.e. DeBERTa.
prediction_o = [deberta_prediction_o, roberta_w_prediction_o, deberta_ft_prediction_o]
prediction_s = [deberta_prediction_s, roberta_w_prediction_s, deberta_ft_prediction_s]
prediction_c = [deberta_prediction_c, roberta_w_prediction_c, deberta_ft_prediction_c]

ensemble_pred_o = ensemble_predictions(prediction_o)
ensemble_pred_s = ensemble_predictions(prediction_s)
ensemble_pred_c = ensemble_predictions(prediction_c)


In [85]:
# Ensemble accuracy per family, printed in the order original, semantic, context.
for _family_targets, _family_votes in (
    (target_o, ensemble_pred_o),
    (target_s, ensemble_pred_s),
    (target_c, ensemble_pred_c),
):
    print(accuracy_score(_family_targets, _family_votes))

0.8823529411764706
0.8235294117647058
0.7058823529411765


## Experiment

In [86]:
# Datasets for the cross-variant experiment. Unlike o_test_dataset / c_test_dataset /
# s_test_dataset, these wrap the complete o_data / cr_data / sr_data lists, not only
# the last-10% slices.
o_test_dataset_2, c_test_dataset_2, s_test_dataset_2 = (
    Brain_Teaser(get_data_lists(_family))
    for _family in (o_data, cr_data, sr_data)
)

In [87]:
# Cross-variant check: load the checkpoint at trained_models/deberta_original and score it
# on the semantic ('SR') and context ('CR') records.
# NOTE(review): `tokenizer` is the module-level one built from "bert-large-uncased" in the
# data-split cell (the only assignment visible in this notebook), not one of the DeBERTa
# tokenizers — confirm it matches the vocabulary this checkpoint was trained with.
# NOTE(review): s_test_dataset_2 / c_test_dataset_2 wrap the full sr_data / cr_data lists,
# so these scores are not restricted to the last-10% test slices.
# These assignments rebind deberta_ft_prediction_s/_c and target_s/_c from the per-family
# scoring cells; re-running the overall or ensemble cells afterwards would pick these up.
model_deberta_ft_o = AutoModelForMultipleChoice.from_pretrained("trained_models/deberta_original", ignore_mismatched_sizes=True).to(device)
deberta_ft_prediction_s, target_s = get_predictions(s_test_dataset_2, model_deberta_ft_o, tokenizer)
print("Semantic Puzzles")
print(accuracy_score(target_s, deberta_ft_prediction_s))

deberta_ft_prediction_c, target_c = get_predictions(c_test_dataset_2, model_deberta_ft_o, tokenizer)
print("Context Puzzles")
print(accuracy_score(target_c, deberta_ft_prediction_c))

Semantic Puzzles
1.0
Context Puzzles
0.893491124260355


In [88]:
# Cross-variant check: load the checkpoint at trained_models/deberta_semantic and score it
# on the original and context ('CR') records.
# NOTE(review): `tokenizer` is the module-level one built from "bert-large-uncased" in the
# data-split cell, not one of the DeBERTa tokenizers — confirm it matches this checkpoint.
# NOTE(review): o_test_dataset_2 / c_test_dataset_2 wrap the full o_data / cr_data lists,
# so these scores are not restricted to the last-10% test slices.
# These assignments rebind deberta_ft_prediction_o/_c and target_o/_c from earlier cells.
model_deberta_ft_s = AutoModelForMultipleChoice.from_pretrained("trained_models/deberta_semantic", ignore_mismatched_sizes=True).to(device)
deberta_ft_prediction_o, target_o = get_predictions(o_test_dataset_2, model_deberta_ft_s, tokenizer)
print("Original Puzzles")
print(accuracy_score(target_o, deberta_ft_prediction_o))

deberta_ft_prediction_c, target_c = get_predictions(c_test_dataset_2, model_deberta_ft_s, tokenizer)
print("Context Puzzles")
print(accuracy_score(target_c, deberta_ft_prediction_c))

Original Puzzles
0.9940828402366864
Context Puzzles
0.8875739644970414


In [89]:
# Cross-variant check: load the checkpoint at trained_models/deberta_context and score it
# on the original and semantic ('SR') records.
# NOTE(review): `tokenizer` is the module-level one built from "bert-large-uncased" in the
# data-split cell, not one of the DeBERTa tokenizers — confirm it matches this checkpoint.
# NOTE(review): o_test_dataset_2 / s_test_dataset_2 wrap the full o_data / sr_data lists,
# so these scores are not restricted to the last-10% test slices.
# These assignments rebind deberta_ft_prediction_o/_s and target_o/_s from earlier cells.
model_deberta_ft_c = AutoModelForMultipleChoice.from_pretrained("trained_models/deberta_context", ignore_mismatched_sizes=True).to(device)
deberta_ft_prediction_o, target_o = get_predictions(o_test_dataset_2, model_deberta_ft_c, tokenizer)
print("Original Puzzles")
print(accuracy_score(target_o, deberta_ft_prediction_o))

deberta_ft_prediction_s, target_s = get_predictions(s_test_dataset_2, model_deberta_ft_c, tokenizer)
print("Semantic Puzzles")
print(accuracy_score(target_s, deberta_ft_prediction_s))

Original Puzzles
0.863905325443787
Semantic Puzzles
0.8579881656804734
