# Error Analysis

In [4]:
import pickle

import nlp
import pandas as pd
import torch
from sklearn import metrics
from tqdm.auto import tqdm
from transformers import T5ForConditionalGeneration, T5Tokenizer
# `is_available` has to be *called*: the bare function object is always truthy,
# which would select 'cuda' even on a CPU-only machine.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

## Models to evaluate

1. **T5 Base**: Fine-tuned on Commonsense QA for 10 epochs.
2. **T5 Base + Social IQA**: Fine-tuned on Social IQA (4 epochs, batch size 8, learning rate 1e-4), then fine-tuned on Commonsense QA (10 epochs, batch size 8, learning rate 1e-4, warm-up (wu) 0).
3. **T5 Base + Cosmos QA**: Fine-tuned on Cosmos QA (3 epochs, batch size 8, learning rate 5e-5), then fine-tuned on Commonsense QA (10 epochs, batch size 8, learning rate 5e-5).
4. **T5 Base + HellaSwag**: (training details to be added)


In [3]:
# Load the validation dataset. Common to all analyses.
# NOTE(review): torch.load deserializes a saved object from disk — only point
# this at files produced by this project.
valid_dataset = torch.load('./data/commonsense_qa/valid_data.pt')
# No shuffle argument is passed. Answers later indexes valid_dataset by the
# position of each prediction, so batches are presumably expected to arrive in
# dataset order — confirm against the DataLoader defaults.
dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size = 32)

In [10]:
def predict_results(model_and_tokenizer_location):
    """Generate answers for every batch of the module-level ``dataloader``.

    Args:
        model_and_tokenizer_location: Path (or identifier) handed to
            ``from_pretrained`` for both the T5 model and its tokenizer.

    Returns:
        A ``(predictions, targets, tokenizer)`` tuple where ``predictions`` and
        ``targets`` are lists of decoded strings, in the order the dataloader
        yields examples, and ``tokenizer`` is the tokenizer that was loaded.
    """
    model = T5ForConditionalGeneration.from_pretrained(model_and_tokenizer_location)
    tokenizer = T5Tokenizer.from_pretrained(model_and_tokenizer_location)

    model.to(device)
    model.eval()

    decoded_predictions = []
    decoded_targets = []

    # Inference only: no gradients are needed while generating.
    with torch.no_grad():
        for batch in tqdm(dataloader):
            generated = model.generate(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                max_length=16,
            )
            decoded_predictions += [tokenizer.decode(ids) for ids in generated]
            decoded_targets += [tokenizer.decode(ids) for ids in batch['target_ids']]

    return decoded_predictions, decoded_targets, tokenizer

In [109]:
class Answers:
    """Collect and inspect the validation examples a model answered incorrectly.

    Attributes:
        accuracy: Value returned by ``metrics.accuracy_score(targets, predictions)``.
        incorrect_idxs: Ascending list of positions where the prediction and
            the target differ.
        qq: Maps each incorrect index to the decoded ``input_ids`` text.
        target_ans: Maps each incorrect index to the decoded ``target_ids`` text.
        predicted_ans: Maps each incorrect index to the predicted string.
        size: Number of incorrect answers.
        error_df: DataFrame with one row per incorrect answer (see
            ``get_error_df``).
    """

    # Markers that introduce each answer choice in the decoded prompt text.
    _OPTION_MARKERS = ("A:", "B:", "C:", "D:", "E:")

    def __init__(self, predictions, targets, tokenizer, dataset=None):
        """Score the predictions and record every incorrect example.

        Args:
            predictions: Decoded model outputs, one string per example.
            targets: Decoded gold answers, aligned with ``predictions``.
            tokenizer: Tokenizer used to decode the dataset's token ids.
            dataset: Indexable collection whose items expose ``'input_ids'``
                and ``'target_ids'``. Defaults to the module-level
                ``valid_dataset`` so existing three-argument calls keep working.
        """
        self.accuracy = metrics.accuracy_score(targets, predictions)

        # Find index of wrong answers
        self.incorrect_idxs = [
            i for i, (prediction, target) in enumerate(zip(predictions, targets))
            if prediction != target
        ]

        # Only fall back to the notebook-level dataset when it is actually needed.
        if dataset is None and self.incorrect_idxs:
            dataset = valid_dataset

        self.qq = {}
        self.target_ans = {}
        self.predicted_ans = {}

        for incorrect_idx in self.incorrect_idxs:
            example = dataset[incorrect_idx]
            self.qq[incorrect_idx] = tokenizer.decode(example['input_ids'])
            self.target_ans[incorrect_idx] = tokenizer.decode(example['target_ids'])
            self.predicted_ans[incorrect_idx] = predictions[incorrect_idx]

        self.size = len(self.incorrect_idxs)

        self.get_error_df()

    @classmethod
    def _break_before_options(cls, text):
        """Return ``text`` with a newline inserted before every option marker."""
        for marker in cls._OPTION_MARKERS:
            text = text.replace(marker, "\n" + marker)
        return text

    @classmethod
    def _question_and_options(cls, text):
        """Split prompt text into exactly six fields: the question, then A-E.

        A marker that also occurs inside the question (e.g. "USA:") produces
        extra pieces; these are glued back onto the question so the last five
        fields stay the options. Text with fewer options than expected is
        padded with ``None`` so later columns never shift.
        """
        n_options = len(cls._OPTION_MARKERS)
        fields = cls._break_before_options(text).split("\n")
        if len(fields) > n_options + 1:
            # Joining with "" undoes the newline inserted before the stray marker.
            fields = ["".join(fields[:-n_options])] + fields[-n_options:]
        elif len(fields) < n_options + 1:
            fields = fields + [None] * (n_options + 1 - len(fields))
        return fields

    def get_example(self, incorrect_idx):
        """Print one incorrect example with its correct and predicted answers.

        Args:
            incorrect_idx: An index present in ``incorrect_idxs``; any other
                value raises ``KeyError``.
        """
        question_text = self.qq[incorrect_idx].replace("options:", "\n\noptions:")
        print(incorrect_idx, "-", self._break_before_options(question_text))
        print("\n==> Correct ans:  ",   self.target_ans[incorrect_idx])
        print("==> Predicted ans:", self.predicted_ans[incorrect_idx])

    def get_error_df(self):
        """Build ``self.error_df``: one row per incorrect answer, indexed by ``idx``.

        Columns are the question text, the five options, the target answer and
        the predicted answer.
        """
        drows = [
            [idx,
             *self._question_and_options(self.qq[idx]),
             self.target_ans[idx],
             self.predicted_ans[idx]]
            for idx in self.incorrect_idxs
        ]

        self.error_df = pd.DataFrame(drows,
                                     columns=["idx", "question",
                                              "A", "B", "C", "D", "E",
                                              "target", "predicted"]).set_index("idx")
        

## T5 Base

TODO (Sonali): run this model, pickle the resulting `Answers` object, and provide the file.

In [None]:
# Placeholder: the checkpoint directory has not been filled in yet.
modeldir = ""
# predict_results returns a (predictions, targets, tokenizer) tuple, while
# Answers takes those as three separate arguments, so the tuple is unpacked.
t5_base_ans = Answers(*predict_results(modeldir))
with open('t5_base_ans', 'wb') as f:
    pickle.dump(t5_base_ans, f)

## T5 Base + Hella SWAG

TODO (Sonali): run this model, pickle the resulting `Answers` object, and provide the file.

In [None]:
# Placeholder: the checkpoint directory has not been filled in yet.
modeldir = ""
# predict_results returns a (predictions, targets, tokenizer) tuple, while
# Answers takes those as three separate arguments, so the tuple is unpacked.
t5_hel_ans = Answers(*predict_results(modeldir))
with open('t5_hel_ans', 'wb') as f:
    pickle.dump(t5_hel_ans, f)

## T5 Base + Cosmos QA

In [11]:
# Checkpoint directory for the Cosmos QA -> Commonsense QA model.
modeldir = "./models/cs_on_cosmos/batch_8_lr_5e-5_wu_0_epochs10"
# Run the checkpoint over the validation dataloader; yields the decoded
# predictions, the decoded targets, and the tokenizer that was loaded.
t5_cos_predictions, t5_cos_targets, t5_cos_tokenizer = predict_results(modeldir)

HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




In [52]:
t5_cos_ans = Answers(t5_cos_predictions, t5_cos_targets, t5_cos_tokenizer)

In [42]:
t5_cos_ans.size


454

In [43]:
t5_cos_ans.incorrect_idxs[0:10]

[0, 2, 4, 9, 10, 16, 17, 19, 22, 23]

In [54]:
t5_cos_ans.get_example(2)

2 - question: Where would you find magazines along side many other printed works? 

options: 
A: doctor 
B: bookstore 
C: market 
D: train station 
E: mortuary

==> Predicted ans: C: market
==> Correct ans:   B: bookstore


In [63]:
# Persist the Answers object to the file 't5_cos_ans' in the working directory
# so it can be reloaded later without re-running inference.
with open('t5_cos_ans', 'wb') as f:
    pickle.dump(t5_cos_ans, f)

## T5 base + Social I QA

In [55]:
# Checkpoint directory for the Social IQA -> Commonsense QA model.
modeldir = "./models/cs_on_social/4_epochs_nonstop_batch8_lr1e-4_cs_10_e_b8_1e-4_wu0_300cp"
# Run the checkpoint over the validation dataloader; yields the decoded
# predictions, the decoded targets, and the tokenizer that was loaded.
t5_so_predictions, t5_so_targets, t5_so_tokenizer = predict_results(modeldir)

HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




In [56]:
t5_so_ans = Answers(t5_so_predictions, t5_so_targets, t5_so_tokenizer)

In [57]:
t5_so_ans.size

465

In [58]:
t5_so_ans.incorrect_idxs[0:10]

[0, 4, 5, 9, 10, 16, 17, 19, 20, 22]

In [61]:
t5_so_ans.get_example(0)

0 - question: A revolving door is convenient for two direction travel, but it also serves as a security measure at a what? 

options: 
A: bank 
B: library 
C: department store 
D: mall 
E: new york

==> Predicted ans: D: mall
==> Correct ans:   A: bank


In [62]:
# Persist the Answers object to the file 't5_so_ans' in the working directory
# so it can be reloaded later without re-running inference.
with open('t5_so_ans', 'wb') as f:
    pickle.dump(t5_so_ans, f)

## Data load

In [1]:
import pickle

In [110]:
# NOTE(review): pickle.load can execute arbitrary code from the file — only
# load pickles that were produced by this notebook.
with open("t5_so_ans", 'rb') as f:
    t5_so_ans = pickle.load(f)
    
# Rebuild error_df on the loaded object (presumably because the pickle may
# predate the get_error_df method — confirm).
t5_so_ans.get_error_df()

In [111]:
t5_so_ans.size

465

In [122]:
from random import sample 

In [113]:
max(t5_so_ans.error_df.index)

1215

In [115]:
t5_so_ans.error_df.shape

(465, 8)

In [116]:
# NOTE(review): pickle.load can execute arbitrary code from the file — only
# load pickles that were produced by this notebook.
with open("t5_cos_ans", 'rb') as f:
    t5_cos_ans = pickle.load(f)
    
# Rebuild error_df on the loaded object (presumably because the pickle may
# predate the get_error_df method — confirm).
t5_cos_ans.get_error_df()

In [117]:

t5_cos_ans.size

454

In [124]:
t5_cos_ans.error_df.shape

(454, 8)

In [126]:
t5_cos_ans.error_df.loc[sample(t5_cos_ans.incorrect_idxs, 100)]

Unnamed: 0_level_0,question,A,B,C,D,E,target,predicted
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
648,question: The pencil sharpener was broken in t...,A: home,B: library,C: stationery store,D: cabinet,E: desk drawer,B: library,E: desk drawer
1010,question: If a small flying animal picks up a ...,A: bird's nest,B: park,C: guitar,D: kite,E: quark,A: bird's nest,D: kite
10,question: What would vinyl be an odd thing to ...,A: pants,B: record albums,C: record store,D: cheese,E: wallpaper,E: wallpaper,B: record albums
212,question: Where do you go on a night out befor...,A: new york city,B: las vegas,C: restaurant,D: nightclub,E: park,C: restaurant,D: nightclub
1074,question: What would it be if they get a surpr...,A: surprise,B: fight,C: annoyance,D: might scare,E: irritated,C: annoyance,A: surprise
...,...,...,...,...,...,...,...,...
275,question: What has a surface with many sides? ...,A: tetrahedron,B: object,C: geometry problem,D: lake,E: triangle,A: tetrahedron,E: triangle
245,question: The fact that Joe was able to memori...,A: awake,B: repeat,C: sleeping,D: concentrate,E: read aloud,A: awake,D: concentrate
1024,question: Where can a bath towel be borrowed? ...,A: cupboard,B: at hotel,C: swimming pool,D: clothes line,E: backpack,B: at hotel,C: swimming pool
875,question: What do geese do every fall in field...,A: guard house,B: fly,C: eat,D: follow ultralight airplane,E: group together,E: group together,B: fly


In [127]:
# Export 100 randomly chosen errors per model for manual review.
# A fixed random_state keeps the exported rows stable across re-runs; the
# previous unseeded random.sample call wrote a different subset every time.
t5_cos_ans.error_df.sample(n=100, random_state=0).to_csv("t5_cos_ans_errors.csv")
t5_so_ans.error_df.sample(n=100, random_state=0).to_csv("t5_so_ans_errors.csv")


In [119]:
# Number of validation questions that both models answered incorrectly:
# every Social IQA error index that also appears among the Cosmos QA errors.
counter = sum(
    soc_idx in t5_cos_ans.incorrect_idxs
    for soc_idx in t5_so_ans.incorrect_idxs
)

counter

368

The Social IQA model and the Cosmos QA model got the same 368 questions wrong (out of 465 and 454 errors, respectively).

In [23]:

# Show the first 10 questions that only the Social IQA model got wrong.
# The earlier `while counter <= 10` wrapper ran a full pass over every index,
# so it never capped the output, repeated the pass when there were fewer than
# 11 matches, and never terminated when there were none.
cos_wrong_idxs = set(t5_cos_ans.incorrect_idxs)
so_only_idxs = [idx for idx in t5_so_ans.incorrect_idxs
                if idx not in cos_wrong_idxs]

for soc_idx in so_only_idxs[:10]:
    # get_example prints directly and returns None, so it is not wrapped in print().
    t5_so_ans.get_example(soc_idx)


5 - question: What island country is ferret popular? 

options: 
A: own home 
B: north carolina 
C: great britain 
D: hutch 
E: outdoors

==> Correct ans:   C: great britain
==> Predicted ans: B: north carolina
None
20 - question: What could go on top of wood? 

options: 
A: lumberyard 
B: synagogue 
C: floor 
D: carpet 
E: hardware store

==> Correct ans:   D: carpet
==> Predicted ans: C: floor
None
48 - question: What is a place that usually does not have an elevator and that sometimes has a telephone book? 

options: 
A: at hotel 
B: kitchen 
C: library 
D: telephone booth 
E: house

==> Correct ans:   E: house
==> Predicted ans: C: library
None
66 - question: He was beginning to regret taking the fight when he saw how what his opponent was? 

options: 
A: fun 
B: joy 
C: satisfaction 
D: confident 
E: pride

==> Correct ans:   D: confident
==> Predicted ans: C: satisfaction
None
75 - question: If you are awaking multiple times throughout the night because a lot is on your mind, wha

In [24]:

# Show the first 10 questions that only the Cosmos QA model got wrong.
# The earlier `while counter <= 10` wrapper ran a full pass over every index,
# so it never capped the output, repeated the pass when there were fewer than
# 11 matches, and never terminated when there were none.
so_wrong_idxs = set(t5_so_ans.incorrect_idxs)
cos_only_idxs = [idx for idx in t5_cos_ans.incorrect_idxs
                 if idx not in so_wrong_idxs]

for idx in cos_only_idxs[:10]:
    # get_example prints directly and returns None, so it is not wrapped in print().
    t5_cos_ans.get_example(idx)


2 - question: Where would you find magazines along side many other printed works? 

options: 
A: doctor 
B: bookstore 
C: market 
D: train station 
E: mortuary

==> Correct ans:   B: bookstore
==> Predicted ans: C: market
None
25 - question: When wildlife reproduce we often refer to what comes out as what? 

options: 
A: raise children 
B: have children 
C: photo copy 
D: offspring 
E: accidently got pregnant somehow

==> Correct ans:   D: offspring
==> Predicted ans: C: photo copy
None
31 - question: James wanted to find an old underground map from the 50s. Where might he look for one? 

options: 
A: library 
B: subway station 
C: county engineer's office 
D: super market 
E: home

==> Correct ans:   A: library
==> Predicted ans: B: subway station
None
32 - question: Sean was in a rush to get home, but the light turned yellow and he was forced to do what? 

options: 
A: take time 
B: dawdle 
C: go slowly 
D: ocean 
E: slow down

==> Correct ans:   E: slow down
==> Predicted ans: A: ta