In [1]:
import os
import logging

import pandas as pd
import numpy as np
from tqdm import tqdm
from datasets import load_dataset, load_metric

In [2]:
os.getcwd()

'/SAN/intelsys/rclearn/when-do-reading-comprehension-models-learn/notebooks/exploration'

In [3]:
os.chdir('../..')

In [4]:
from src.analysis.generate_plot_data import generate_predictions_df
from src.analysis.utils import load_squadv1_dev_as_df

In [5]:
predictions_df = generate_predictions_df('predictions/albert-xlarge-v2-squadv1-adversarialall-wu=100-lr=3e5-bs=32-msl=384-seed=27-dbert', seed=27)
print(predictions_df.shape)
predictions_df.head()

2021-08-03 23:51:25,004 - Loading predictions data


(120000, 4)


Unnamed: 0,id,prediction_text,checkpoint,seed
0,21a3561031f2e909338196601f5c1ac2e08905b6,", twice the figure for the average profession",1,27
1,b6a71a728a35506dd4cd2179c9342acf5e1a047a,empty,1,27
2,6febf2fec8ae1b105a1b955897415fea4b2c0c28,empty,1,27
3,53e4ce951aeab3a2712aa0966d73ec3e2c62ca11,42% of UK teachers experienced occupational st...,1,27
4,a0a308febc5e9edf56fe63556cf20c039c3fdaca,empty,1,27


In [6]:
def load_adversarial_dev_as_df(dmodel):
    logging.info(f"Loading {dmodel} dev data as DataFrame")
    adversarial_val = load_dataset('adversarial_qa', dmodel, split='validation')

    adversarial_val_df = pd.DataFrame(adversarial_val)

    logging.info(adversarial_val_df.shape)
    logging.info(adversarial_val_df.head())

    return adversarial_val_df

In [19]:
%%capture
labels_df = load_adversarial_dev_as_df('droberta')

2021-08-04 00:01:49,856 - Loading droberta dev data as DataFrame
2021-08-04 00:01:50,601 - generating examples from = /home/sgeorge/.cache/huggingface/datasets/downloads/extracted/d105f181804f029a5ff71394c53a0f314901baabcade805bcc4d5e04ccd37a7e/3_droberta/train.json
2021-08-04 00:01:51,799 - generating examples from = /home/sgeorge/.cache/huggingface/datasets/downloads/extracted/d105f181804f029a5ff71394c53a0f314901baabcade805bcc4d5e04ccd37a7e/3_droberta/dev.json
2021-08-04 00:01:51,949 - generating examples from = /home/sgeorge/.cache/huggingface/datasets/downloads/extracted/d105f181804f029a5ff71394c53a0f314901baabcade805bcc4d5e04ccd37a7e/3_droberta/test.json
2021-08-04 00:01:52,230 - (1000, 6)
2021-08-04 00:01:52,231 -                                          answers  \
0  {'text': ['soybeans'], 'answer_start': [386]}   
1    {'text': ['Brazil'], 'answer_start': [338]}   
2   {'text': ['logging'], 'answer_start': [720]}   
3     {'text': ['Water'], 'answer_start': [139]}   
4     {'te

In [8]:
combined = predictions_df.merge(labels_df, on='id', how='inner')
print(combined.shape)
combined.head()

(120000, 9)


Unnamed: 0,id,prediction_text,checkpoint,seed,answers,context,metadata,question,title
0,21a3561031f2e909338196601f5c1ac2e08905b6,", twice the figure for the average profession",1,27,"{'text': ['average profession'], 'answer_start...",A 2000 study found that 42% of UK teachers exp...,"{'split': 'validation', 'model_in_the_loop': '...",What is teaching not considered due to stress?,Teacher
1,21a3561031f2e909338196601f5c1ac2e08905b6,", twice the figure for the average profession",2,27,"{'text': ['average profession'], 'answer_start...",A 2000 study found that 42% of UK teachers exp...,"{'split': 'validation', 'model_in_the_loop': '...",What is teaching not considered due to stress?,Teacher
2,21a3561031f2e909338196601f5c1ac2e08905b6,", twice the figure for the average profession",3,27,"{'text': ['average profession'], 'answer_start...",A 2000 study found that 42% of UK teachers exp...,"{'split': 'validation', 'model_in_the_loop': '...",What is teaching not considered due to stress?,Teacher
3,21a3561031f2e909338196601f5c1ac2e08905b6,", twice the figure for the average profession",4,27,"{'text': ['average profession'], 'answer_start...",A 2000 study found that 42% of UK teachers exp...,"{'split': 'validation', 'model_in_the_loop': '...",What is teaching not considered due to stress?,Teacher
4,21a3561031f2e909338196601f5c1ac2e08905b6,the average profession,5,27,"{'text': ['average profession'], 'answer_start...",A 2000 study found that 42% of UK teachers exp...,"{'split': 'validation', 'model_in_the_loop': '...",What is teaching not considered due to stress?,Teacher


In [9]:
combined = combined.head(100)

In [10]:
from src.analysis.utils import squad2_evaluation, squad1_evaluation

In [11]:
%%time
metric_list = []

for _, row in tqdm(combined.iterrows(), total=combined.shape[0]):
    metrics = squad1_evaluation(
        row[['id']], 
        row[['prediction_text']],
        row[['answers']]
    )
    metrics['id'] = row['id']
    metric_list.append(metrics)

  0%|          | 0/100 [00:00<?, ?it/s]2021-08-03 23:51:28,477 - Removing /home/sgeorge/.cache/huggingface/metrics/squad/default/default_experiment-1-0.arrow
2021-08-03 23:51:28,487 - Removing /home/sgeorge/.cache/huggingface/metrics/squad/default/default_experiment-1-0.arrow
2021-08-03 23:51:28,497 - Removing /home/sgeorge/.cache/huggingface/metrics/squad/default/default_experiment-1-0.arrow
2021-08-03 23:51:28,506 - Removing /home/sgeorge/.cache/huggingface/metrics/squad/default/default_experiment-1-0.arrow
2021-08-03 23:51:28,516 - Removing /home/sgeorge/.cache/huggingface/metrics/squad/default/default_experiment-1-0.arrow
2021-08-03 23:51:28,543 - Removing /home/sgeorge/.cache/huggingface/metrics/squad/default/default_experiment-1-0.arrow
2021-08-03 23:51:28,557 - Removing /home/sgeorge/.cache/huggingface/metrics/squad/default/default_experiment-1-0.arrow
2021-08-03 23:51:28,570 - Removing /home/sgeorge/.cache/huggingface/metrics/squad/default/default_experiment-1-0.arrow
  8%|▊   

CPU times: user 869 ms, sys: 78.5 ms, total: 947 ms
Wall time: 1.69 s





In [12]:
metrics_df = pd.DataFrame(metric_list)
metrics_df.head()

Unnamed: 0,exact_match,f1,id
0,0.0,57.142857,21a3561031f2e909338196601f5c1ac2e08905b6
1,0.0,57.142857,21a3561031f2e909338196601f5c1ac2e08905b6
2,0.0,57.142857,21a3561031f2e909338196601f5c1ac2e08905b6
3,0.0,57.142857,21a3561031f2e909338196601f5c1ac2e08905b6
4,100.0,100.0,21a3561031f2e909338196601f5c1ac2e08905b6


In [13]:
combined = combined.merge(metrics_df, on='id')

In [15]:
combined = combined[['id', 'checkpoint', 'seed', 'exact_match', 'f1']]
combined.head()

Unnamed: 0,id,checkpoint,seed,exact_match,f1
0,21a3561031f2e909338196601f5c1ac2e08905b6,1,27,0.0,57.142857
1,21a3561031f2e909338196601f5c1ac2e08905b6,1,27,0.0,57.142857
2,21a3561031f2e909338196601f5c1ac2e08905b6,1,27,0.0,57.142857
3,21a3561031f2e909338196601f5c1ac2e08905b6,1,27,0.0,57.142857
4,21a3561031f2e909338196601f5c1ac2e08905b6,1,27,100.0,100.0


In [16]:
combined['dataset'] = 'dbert'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [17]:
combined.head()

Unnamed: 0,id,checkpoint,seed,exact_match,f1,dataset
0,21a3561031f2e909338196601f5c1ac2e08905b6,1,27,0.0,57.142857,dbert
1,21a3561031f2e909338196601f5c1ac2e08905b6,1,27,0.0,57.142857,dbert
2,21a3561031f2e909338196601f5c1ac2e08905b6,1,27,0.0,57.142857,dbert
3,21a3561031f2e909338196601f5c1ac2e08905b6,1,27,0.0,57.142857,dbert
4,21a3561031f2e909338196601f5c1ac2e08905b6,1,27,100.0,100.0,dbert


In [None]:
combined.to_csv(ind)

In [17]:
metrics

{'exact_match': 0.0, 'f1': 0.0}

In [19]:
pd.DataFrame([metrics, {'exact_match': 4.0, 'f1': 95.0}])

Unnamed: 0,exact_match,f1
0,0.0,0.0
1,4.0,95.0


In [50]:
ex = combined.sample(n=1)
ex

Unnamed: 0,id,prediction_text,checkpoint,seed,answers,context,metadata,question,title
105942,f3aa1b86ade4042fafbc444714911316c82238df,Warsaw area enlargement,6668,27,"{'text': ['area enlargement'], 'answer_start':...","In 1939, c. 1,300,000 people lived in Warsaw, ...","{'split': 'validation', 'model_in_the_loop': '...","After the war, how did they first try to remed...",Warsaw


In [67]:
squad2_evaluation(
    ex[['id']], 
    ex[['prediction_text']],
    ex[['answers']]
)

KeyError: 'w'

In [58]:
squad1_evaluation(
    combined.iloc[0]['id'], 
    combined.iloc[0]['prediction_text'],
    combined.iloc[0]['answers']
)

KeyError: 't'

In [66]:
combined.iloc[0][['prediction_text']].values

array([', twice the figure for the average profession'], dtype=object)