In [1]:
import os
import re

import pandas as pd
import numpy as np
from datasets import load_dataset

In [2]:
# Step two directories up so the relative 'data/...' paths used below resolve
# (presumably to the repository root — confirm). Not idempotent: re-running
# this cell moves the working directory up by another two levels.
os.chdir('../..')

In [55]:
# Show every column when a DataFrame is rendered (no column truncation).
pd.set_option('display.max_columns', None)

## Check label counts

In [3]:
# Stack the reasoning-type labels of every annotated sample into one long table:
# one row per (dataset, example id, label slot).
label_frames = []

for dataset in ['SQuAD', 'DBERT', 'DBiDAF', 'DRoBERTa']:

    df = pd.read_csv(f'data/external/Reasoning Types/{dataset}.csv')
    # Example id within this file, derived from the row index (starting at 1)
    df['id'] = df.index + 1
    # Wide -> long: the three label columns become rows of (id, variable, value)
    df = pd.melt(df, id_vars=['id'], value_vars=['Reasoning Type', 'RT 2', 'RT 3'])
    df['dataset'] = dataset

    label_frames.append(df)

# Concatenate once at the end: DataFrame.append is deprecated (removed in
# pandas 2.0) and re-copies the accumulated frame on every iteration.
# The per-file index is kept as-is (not reset), exactly as append did.
df_labels = pd.concat(label_frames)

In [4]:
# Dimensions (rows, columns) of the stacked long-format label table.
df_labels.shape

(1200, 4)

In [5]:
# Peek at the long-format rows: example id, label slot, label value, dataset.
df_labels.head()

Unnamed: 0,id,variable,value,dataset
0,1,Reasoning Type,Explicit,SQuAD
1,2,Reasoning Type,Implicit,SQuAD
2,3,Reasoning Type,External Knowledge,SQuAD
3,4,Reasoning Type,Implicit,SQuAD
4,5,Reasoning Type,Implicit,SQuAD


In [6]:
# Count the label slots that were left empty.
df_labels['value'].isna().sum()

699

In [7]:
# Keep only the rows that actually carry a label.
df_labels = df_labels.dropna(subset=['value'])

In [18]:
# Frequency of each reasoning type over all datasets and label slots.
df_labels['value'].value_counts()

Paraphrasing          120
Explicit               77
Co-reference           61
External Knowledge     59
Implicit               52
Multi-hop              41
Filtering              23
Temporal               20
Spatial                11
Comparative             8
Inductive               8
Superlative             7
Numeric (other)         7
Negation                7
Name: value, dtype: int64

In [23]:
# Number of distinct examples carrying each reasoning type, per dataset.
df_labels.groupby(['dataset', 'value'])['id'].nunique()

dataset   value             
DBERT     Co-reference           9
          Explicit               6
          External Knowledge    19
          Filtering              7
          Implicit              18
          Inductive              1
          Multi-hop             12
          Negation               3
          Numeric (other)        5
          Paraphrasing          35
          Spatial                5
          Superlative            1
          Temporal               5
DBiDAF    Co-reference          22
          Comparative            6
          Explicit               8
          External Knowledge    12
          Filtering              6
          Implicit              11
          Inductive              1
          Multi-hop              9
          Negation               2
          Numeric (other)        2
          Paraphrasing          41
          Spatial                3
          Superlative            5
          Temporal               6
DRoBERTa  Co-reference    

## SQuAD

In [24]:
# Reload the SQuAD annotation sheet as-is (without the melt applied earlier),
# then report its shape and preview the first rows.
squad_reasoning = pd.read_csv('data/external/Reasoning Types/SQuAD.csv')
print(squad_reasoning.shape)
squad_reasoning.head()

(100, 7)


Unnamed: 0,Passage,Question,Answer,Reasoning Type,RT 2,RT 3,Comments
0,"The Central Region, consisting of present-day ...",Where was the Central Secretariat based?,Khanbaliq,Explicit,,,
1,Manning finished the game 13 of 23 for 141 yar...,What is th elast name of the player who was th...,Anderson,Implicit,,,
2,"In business, notable alumni include Microsoft ...",What Goldman Sachs CEO is also an alumni of th...,Jon Corzine,External Knowledge,,,
3,Manning finished the game 13 of 23 for 141 yar...,How many intercpetions did Newton have in Supe...,one,Implicit,,,
4,"The collection of Italian, Medieval, Renaissan...",Who designed the largest item from Italy that ...,Giuliano da Sangallo,Implicit,,,


In [25]:
# Reduce every passage to its ASCII letters so it can serve as a join key
# that ignores whitespace, digits and punctuation.
squad_reasoning['Passage_letters_only'] = squad_reasoning['Passage'].map(
    lambda passage: re.sub(r"[^a-zA-Z]+", '', passage)
)

In [26]:
# Load the SQuAD validation split via `datasets.load_dataset` and convert it
# to a DataFrame so it can be merged with the annotation sheet.
squad = load_dataset("squad", split="validation")
squad_df = pd.DataFrame(squad)
print(squad_df.shape)
squad_df.head()

Reusing dataset squad (/Users/stevengeorge/.cache/huggingface/datasets/squad/plain_text/1.0.0/4fffa6cf76083860f85fa83486ec3028e7e32c342c218ff2a620fc6b2868483a)


(10570, 5)


Unnamed: 0,answers,context,id,question,title
0,"{'answer_start': [177, 177, 177], 'text': ['De...",Super Bowl 50 was an American football game to...,56be4db0acb8001400a502ec,Which NFL team represented the AFC at Super Bo...,Super_Bowl_50
1,"{'answer_start': [249, 249, 249], 'text': ['Ca...",Super Bowl 50 was an American football game to...,56be4db0acb8001400a502ed,Which NFL team represented the NFC at Super Bo...,Super_Bowl_50
2,"{'answer_start': [403, 355, 355], 'text': ['Sa...",Super Bowl 50 was an American football game to...,56be4db0acb8001400a502ee,Where did Super Bowl 50 take place?,Super_Bowl_50
3,"{'answer_start': [177, 177, 177], 'text': ['De...",Super Bowl 50 was an American football game to...,56be4db0acb8001400a502ef,Which NFL team won Super Bowl 50?,Super_Bowl_50
4,"{'answer_start': [488, 488, 521], 'text': ['go...",Super Bowl 50 was an American football game to...,56be4db0acb8001400a502f0,What color was used to emphasize the 50th anni...,Super_Bowl_50


In [27]:
# Number of distinct example ids in the validation split.
squad_df['id'].nunique()

10570

In [28]:
# Same letters-only reduction for the SQuAD contexts, giving a key comparable
# with the annotation sheet's passages.
squad_df['context_letters_only'] = squad_df['context'].map(
    lambda context: re.sub(r"[^a-zA-Z]+", '', context)
)

In [29]:
# Left-join the annotations onto SQuAD using the letters-only passage plus the
# exact question text; `_merge` records whether each annotation found a match.
combined = pd.merge(
    squad_reasoning,
    squad_df,
    how='left',
    left_on=['Passage_letters_only', 'Question'],
    right_on=['context_letters_only', 'question'],
    indicator=True,
)
print(combined.shape)
combined.head()

(103, 15)


Unnamed: 0,Passage,Question,Answer,Reasoning Type,RT 2,RT 3,Comments,Passage_letters_only,answers,context,id,question,title,context_letters_only,_merge
0,"The Central Region, consisting of present-day ...",Where was the Central Secretariat based?,Khanbaliq,Explicit,,,,TheCentralRegionconsistingofpresentdayHebeiSha...,"{'answer_start': [306, 306, 306], 'text': ['Kh...","The Central Region, consisting of present-day ...",572885c44b864d1900164a7a,Where was the Central Secretariat based?,Yuan_dynasty,TheCentralRegionconsistingofpresentdayHebeiSha...,both
1,Manning finished the game 13 of 23 for 141 yar...,What is th elast name of the player who was th...,Anderson,Implicit,,,,Manningfinishedthegameofforyardswithoneinterce...,"{'answer_start': [155, 155, 155], 'text': ['An...",Manning finished the game 13 of 23 for 141 yar...,56d9ccacdc89441400fdb843,What is th elast name of the player who was th...,Super_Bowl_50,Manningfinishedthegameofforyardswithoneinterce...,both
2,"In business, notable alumni include Microsoft ...",What Goldman Sachs CEO is also an alumni of th...,Jon Corzine,External Knowledge,,,,InbusinessnotablealumniincludeMicrosoftCEOSaty...,"{'answer_start': [217, 217, 217], 'text': ['Jo...","In business, notable alumni include Microsoft ...",57286951ff5b5019007da211,What Goldman Sachs CEO is also an alumni of th...,University_of_Chicago,InbusinessnotablealumniincludeMicrosoftCEOSaty...,both
3,Manning finished the game 13 of 23 for 141 yar...,How many intercpetions did Newton have in Supe...,one,Implicit,,,,Manningfinishedthegameofforyardswithoneinterce...,"{'answer_start': [54, 612, 612], 'text': ['one...",Manning finished the game 13 of 23 for 141 yar...,56d9ccacdc89441400fdb845,How many intercpetions did Newton have in Supe...,Super_Bowl_50,Manningfinishedthegameofforyardswithoneinterce...,both
4,"The collection of Italian, Medieval, Renaissan...",Who designed the largest item from Italy that ...,Giuliano da Sangallo,Implicit,,,,ThecollectionofItalianMedievalRenaissanceBaroq...,"{'answer_start': [1540, 1540, 1489], 'text': [...","The collection of Italian, Medieval, Renaissan...",5726fc63dd62a815002e9709,Who designed the largest item from Italy that ...,Victoria_and_Albert_Museum,ThecollectionofItalianMedievalRenaissanceBaroq...,both


In [30]:
# Merge indicator tally: 'both' = annotation matched a SQuAD row,
# 'left_only' = annotation without a match.
combined['_merge'].value_counts()

both          103
left_only       0
right_only      0
Name: _merge, dtype: int64

In [31]:
# Rows whose (question, context) pair already appeared earlier in the merged
# frame, e.g. because one annotation matched more than one SQuAD id.
combined[combined[['question', 'context']].duplicated()]

Unnamed: 0,Passage,Question,Answer,Reasoning Type,RT 2,RT 3,Comments,Passage_letters_only,answers,context,id,question,title,context_letters_only,_merge
36,European Union law is a body of treaties and l...,What are the main sources of primary law?,the Treaties establishing the European Union,Explicit,,,,EuropeanUnionlawisabodyoftreatiesandlegislatio...,"{'answer_start': [313, 313, 313, 313], 'text':...",European Union law is a body of treaties and l...,57268b43dd62a815002e88f1,What are the main sources of primary law?,European_Union_law,EuropeanUnionlawisabodyoftreatiesandlegislatio...,both
76,The immune system is a system of many biologic...,What does the immune system protect against?,disease,Explicit,,,,Theimmunesystemisasystemofmanybiologicalstruct...,"{'answer_start': [210, 115, 115, 107], 'text':...",The immune system is a system of many biologic...,5726eb76f1498d1400e8efdb,What does the immune system protect against?,Immune_system,Theimmunesystemisasystemofmanybiologicalstruct...,both
94,Carolina suffered a major setback when Thomas ...,How many years has Thomas Davis played in the ...,11,Implicit,,,,CarolinasufferedamajorsetbackwhenThomasDavisan...,"{'answer_start': [56, 56, 56], 'text': ['11', ...",Carolina suffered a major setback when Thomas ...,56bf3a223aeaaa14008c9577,How many years has Thomas Davis played in the ...,Super_Bowl_50,CarolinasufferedamajorsetbackwhenThomasDavisan...,both


In [32]:
# Inspect one of the duplicated questions directly in the raw SQuAD frame.
squad_df[squad_df['question'] == 'What are the main sources of primary law?']

Unnamed: 0,answers,context,id,question,title,context_letters_only
4065,"{'answer_start': [317, 313, 227, 313], 'text':...",European Union law is a body of treaties and l...,5725b7f389a1e219009abd5e,What are the main sources of primary law?,European_Union_law,EuropeanUnionlawisabodyoftreatiesandlegislatio...
4073,"{'answer_start': [313, 313, 313, 313], 'text':...",European Union law is a body of treaties and l...,57268b43dd62a815002e88f1,What are the main sources of primary law?,European_Union_law,EuropeanUnionlawisabodyoftreatiesandlegislatio...


In [33]:
# Collapse repeated (question, context) pairs, keeping the first SQuAD match.
combined = combined.drop_duplicates(subset=['question', 'context'])
print(combined.shape)
combined.head()

(100, 15)


Unnamed: 0,Passage,Question,Answer,Reasoning Type,RT 2,RT 3,Comments,Passage_letters_only,answers,context,id,question,title,context_letters_only,_merge
0,"The Central Region, consisting of present-day ...",Where was the Central Secretariat based?,Khanbaliq,Explicit,,,,TheCentralRegionconsistingofpresentdayHebeiSha...,"{'answer_start': [306, 306, 306], 'text': ['Kh...","The Central Region, consisting of present-day ...",572885c44b864d1900164a7a,Where was the Central Secretariat based?,Yuan_dynasty,TheCentralRegionconsistingofpresentdayHebeiSha...,both
1,Manning finished the game 13 of 23 for 141 yar...,What is th elast name of the player who was th...,Anderson,Implicit,,,,Manningfinishedthegameofforyardswithoneinterce...,"{'answer_start': [155, 155, 155], 'text': ['An...",Manning finished the game 13 of 23 for 141 yar...,56d9ccacdc89441400fdb843,What is th elast name of the player who was th...,Super_Bowl_50,Manningfinishedthegameofforyardswithoneinterce...,both
2,"In business, notable alumni include Microsoft ...",What Goldman Sachs CEO is also an alumni of th...,Jon Corzine,External Knowledge,,,,InbusinessnotablealumniincludeMicrosoftCEOSaty...,"{'answer_start': [217, 217, 217], 'text': ['Jo...","In business, notable alumni include Microsoft ...",57286951ff5b5019007da211,What Goldman Sachs CEO is also an alumni of th...,University_of_Chicago,InbusinessnotablealumniincludeMicrosoftCEOSaty...,both
3,Manning finished the game 13 of 23 for 141 yar...,How many intercpetions did Newton have in Supe...,one,Implicit,,,,Manningfinishedthegameofforyardswithoneinterce...,"{'answer_start': [54, 612, 612], 'text': ['one...",Manning finished the game 13 of 23 for 141 yar...,56d9ccacdc89441400fdb845,How many intercpetions did Newton have in Supe...,Super_Bowl_50,Manningfinishedthegameofforyardswithoneinterce...,both
4,"The collection of Italian, Medieval, Renaissan...",Who designed the largest item from Italy that ...,Giuliano da Sangallo,Implicit,,,,ThecollectionofItalianMedievalRenaissanceBaroq...,"{'answer_start': [1540, 1540, 1489], 'text': [...","The collection of Italian, Medieval, Renaissan...",5726fc63dd62a815002e9709,Who designed the largest item from Italy that ...,Victoria_and_Albert_Museum,ThecollectionofItalianMedievalRenaissanceBaroq...,both


In [34]:
# Keep the annotation columns plus the SQuAD id that was matched to each row.
kept_columns = [*squad_reasoning.columns, 'id']
combined = combined[kept_columns]
combined.head()

Unnamed: 0,Passage,Question,Answer,Reasoning Type,RT 2,RT 3,Comments,Passage_letters_only,id
0,"The Central Region, consisting of present-day ...",Where was the Central Secretariat based?,Khanbaliq,Explicit,,,,TheCentralRegionconsistingofpresentdayHebeiSha...,572885c44b864d1900164a7a
1,Manning finished the game 13 of 23 for 141 yar...,What is th elast name of the player who was th...,Anderson,Implicit,,,,Manningfinishedthegameofforyardswithoneinterce...,56d9ccacdc89441400fdb843
2,"In business, notable alumni include Microsoft ...",What Goldman Sachs CEO is also an alumni of th...,Jon Corzine,External Knowledge,,,,InbusinessnotablealumniincludeMicrosoftCEOSaty...,57286951ff5b5019007da211
3,Manning finished the game 13 of 23 for 141 yar...,How many intercpetions did Newton have in Supe...,one,Implicit,,,,Manningfinishedthegameofforyardswithoneinterce...,56d9ccacdc89441400fdb845
4,"The collection of Italian, Medieval, Renaissan...",Who designed the largest item from Italy that ...,Giuliano da Sangallo,Implicit,,,,ThecollectionofItalianMedievalRenaissanceBaroq...,5726fc63dd62a815002e9709


In [35]:
# Distribution of the 'Reasoning Type' column only (RT 2 / RT 3 not included).
combined['Reasoning Type'].value_counts()

Explicit              55
Paraphrasing          15
Co-reference          13
Implicit              10
External Knowledge     5
Multi-hop              2
Name: Reasoning Type, dtype: int64

In [36]:
# Wide -> long: one row per (SQuAD id, label slot), ordered by id.
label_columns = ['Reasoning Type', 'RT 2', 'RT 3']
combined_expanded = combined.melt(id_vars='id', value_vars=label_columns).sort_values('id')
print(combined_expanded.shape)
combined_expanded.head()

(300, 3)


Unnamed: 0,id,variable,value
291,56beb7fd3aeaaa14008c92b8,RT 3,
191,56beb7fd3aeaaa14008c92b8,RT 2,
91,56beb7fd3aeaaa14008c92b8,Reasoning Type,Implicit
195,56bebad93aeaaa14008c92fb,RT 2,
295,56bebad93aeaaa14008c92fb,RT 3,


In [37]:
# Drop the slot name, discard empty label slots, renumber the rows and tag
# every remaining label with its source dataset.
combined_expanded = (
    combined_expanded
    .drop('variable', axis=1)
    .dropna(subset=['value'])
    .reset_index(drop=True)
)
combined_expanded['dataset'] = 'squad'
print(combined_expanded.shape)
combined_expanded.head()

(112, 3)


Unnamed: 0,id,value,dataset
0,56beb7fd3aeaaa14008c92b8,Implicit,squad
1,56bebad93aeaaa14008c92fb,Paraphrasing,squad
2,56d723560d65d214001983ac,Explicit,squad
3,56d9943fdc89441400fdb577,Explicit,squad
4,56d9c92bdc89441400fdb811,External Knowledge,squad


In [38]:
# Label frequencies counting every non-empty slot (Reasoning Type, RT 2, RT 3).
combined_expanded['value'].value_counts()

Explicit              57
Co-reference          17
Paraphrasing          17
Implicit              10
External Knowledge     8
Multi-hop              2
Temporal               1
Name: value, dtype: int64

## AdversarialQA

In [3]:
# Load the annotation sheet of each AdversarialQA subset and stack them,
# tagging every row with the subset it came from.
reasoning_frames = []

for dataset in ['DBERT', 'DBiDAF', 'DRoBERTa']:

    df = pd.read_csv(f'data/external/Reasoning Types/{dataset}.csv')
    df['dataset'] = dataset
    print(dataset)
    print(df.shape)
    reasoning_frames.append(df)

# Concatenate once at the end: DataFrame.append is deprecated (removed in
# pandas 2.0) and re-copies the accumulated frame on every iteration.
# Each file's own row index is kept (not reset), exactly as append did.
reasoning_df = pd.concat(reasoning_frames)

print(reasoning_df.shape)
reasoning_df.head()

DBERT
(100, 8)
DBiDAF
(100, 8)
DRoBERTa
(100, 8)
(300, 8)


Unnamed: 0,Passage,Question,Answer,Reasoning Type,RT 2,RT 3,Comments,dataset
0,The simplest valve gears give events of fixed ...,Engines move in a circle called an?,engine cycle,Paraphrasing,,,,DBERT
1,"Natural killer cells, or NK cells, are a compo...",How do NK cells recognize tumor and infected c...,the MHC makeup on the surface of those cells i...,Multi-hop,Co-reference,,,DBERT
2,"In economics, notable Nobel Memorial Prize in ...",Samuelson is from the country of what leader?,Ronald Reagan,Multi-hop,,,,DBERT
3,The flagship stations of each station in the m...,What station is not based in the state of the ...,WBT-FM (99.3 FM),External Knowledge,,,,DBERT
4,The Maroons compete in the NCAA's Division III...,Who was the first winner of the Heisman Trophy?,Chicago Maroons football player,Explicit,,,,DBERT


In [4]:
# Load every split of every AdversarialQA subset into a single DataFrame,
# recording the split and the subset each row came from.
dataset_frames = []

for dataset in ['dbert', 'droberta', 'dbidaf']:

    split_frames = []

    for split in ['train', 'validation', 'test']:
        split_data = load_dataset("adversarial_qa", dataset, split=split)
        split_df = pd.DataFrame(split_data)
        split_df['split'] = split
        split_frames.append(split_df)

    # One concat per subset instead of the deprecated, quadratic
    # DataFrame.append inside the loop (removed in pandas 2.0).
    dataset_df = pd.concat(split_frames)
    dataset_df['dataset'] = dataset
    print(dataset)
    print(dataset_df.shape)
    print('\n')

    dataset_frames.append(dataset_df)

# Row indices restart for every split and are not reset here, so the index of
# the combined frame is not unique (same as with append).
all_datasets_df = pd.concat(dataset_frames)

Reusing dataset adversarial_qa (/Users/stevengeorge/.cache/huggingface/datasets/adversarial_qa/dbert/1.0.0/2f8c1f6e146e203b3fde4b73427f19abf4ab44f315df4ccd7613f03caf3c2971)
Reusing dataset adversarial_qa (/Users/stevengeorge/.cache/huggingface/datasets/adversarial_qa/dbert/1.0.0/2f8c1f6e146e203b3fde4b73427f19abf4ab44f315df4ccd7613f03caf3c2971)
Reusing dataset adversarial_qa (/Users/stevengeorge/.cache/huggingface/datasets/adversarial_qa/dbert/1.0.0/2f8c1f6e146e203b3fde4b73427f19abf4ab44f315df4ccd7613f03caf3c2971)


dbert
(12000, 8)




Reusing dataset adversarial_qa (/Users/stevengeorge/.cache/huggingface/datasets/adversarial_qa/droberta/1.0.0/2f8c1f6e146e203b3fde4b73427f19abf4ab44f315df4ccd7613f03caf3c2971)
Reusing dataset adversarial_qa (/Users/stevengeorge/.cache/huggingface/datasets/adversarial_qa/droberta/1.0.0/2f8c1f6e146e203b3fde4b73427f19abf4ab44f315df4ccd7613f03caf3c2971)
Reusing dataset adversarial_qa (/Users/stevengeorge/.cache/huggingface/datasets/adversarial_qa/droberta/1.0.0/2f8c1f6e146e203b3fde4b73427f19abf4ab44f315df4ccd7613f03caf3c2971)


droberta
(12000, 8)




Reusing dataset adversarial_qa (/Users/stevengeorge/.cache/huggingface/datasets/adversarial_qa/dbidaf/1.0.0/2f8c1f6e146e203b3fde4b73427f19abf4ab44f315df4ccd7613f03caf3c2971)
Reusing dataset adversarial_qa (/Users/stevengeorge/.cache/huggingface/datasets/adversarial_qa/dbidaf/1.0.0/2f8c1f6e146e203b3fde4b73427f19abf4ab44f315df4ccd7613f03caf3c2971)
Reusing dataset adversarial_qa (/Users/stevengeorge/.cache/huggingface/datasets/adversarial_qa/dbidaf/1.0.0/2f8c1f6e146e203b3fde4b73427f19abf4ab44f315df4ccd7613f03caf3c2971)


dbidaf
(12000, 8)




In [5]:
# Rows per split, summed over the three AdversarialQA subsets.
all_datasets_df['split'].value_counts()

train         30000
test           3000
validation     3000
Name: split, dtype: int64

In [6]:
# Naive attempt: inner-join on the raw question text and see how many rows match.
reasoning_df.merge(all_datasets_df, left_on='Question', right_on='question').shape

(98, 16)

In [7]:
# Same naive inner-join, but on the raw passage text; each labelled row can
# match every dataset row that shares its passage.
reasoning_df.merge(all_datasets_df, left_on='Passage', right_on='context').shape

(933, 16)

Format questions and contexts (lowercase, strip surrounding whitespace, then remove all non-letter characters)

In [8]:
# Build normalised join keys: lowercase, strip, then keep ASCII letters only.
# The question key is created first so the column order stays the same.
for raw_col, fmt_col in [('Question', 'Question_fmt'), ('Passage', 'Passage_fmt')]:
    reasoning_df[fmt_col] = reasoning_df[raw_col].apply(
        lambda text: re.sub(r"[^a-zA-Z]+", '', text.lower().strip())
    )

In [9]:
# Build normalised join keys: lowercase, strip, then keep ASCII letters only.
# The question key is created first so the column order stays the same.
for raw_col, fmt_col in [('question', 'question_fmt'), ('context', 'context_fmt')]:
    all_datasets_df[fmt_col] = all_datasets_df[raw_col].apply(
        lambda text: re.sub(r"[^a-zA-Z]+", '', text.lower().strip())
    )

In [10]:
# Shape of the labelled table after adding the two normalised key columns.
reasoning_df.shape

(300, 10)

1. Check which contexts the labelled sample and AdversarialQA have in common

In [11]:
# Distinct normalised passages of the labelled sample, renamed to share the
# key name used on the AdversarialQA side.
reasoning_df_contexts = (
    reasoning_df[['Passage_fmt']]
    .rename(columns={'Passage_fmt': 'context_fmt'})
    .drop_duplicates()
)
print(reasoning_df_contexts.shape)
reasoning_df_contexts.head()

(114, 1)


Unnamed: 0,context_fmt
0,thesimplestvalvegearsgiveeventsoffixedlengthdu...
1,naturalkillercellsornkcellsareacomponentofthei...
2,ineconomicsnotablenobelmemorialprizeineconomic...
3,theflagshipstationsofeachstationinthemarketsof...
4,themaroonscompeteinthencaasdivisioniiiasmember...


In [12]:
# Distinct normalised contexts across all AdversarialQA rows; the shape is
# printed before and after de-duplication.
print(all_datasets_df.shape)
all_datasets_df_contexts = all_datasets_df[['context_fmt']].drop_duplicates()
print(all_datasets_df_contexts.shape)
all_datasets_df_contexts.head()

(36000, 10)
(3473, 1)


Unnamed: 0,context_fmt
0,agenerationlatertheirishanglicanbishopgeorgebe...
5,humemaintainedthatallknowledgeeventhemostbasic...
8,philosophicalempiricistsholdnoknowledgetobepro...
13,inthelatethandearlythcenturyseveralformsofprag...
17,millsempiricismthusheldthatknowledgeofanykindi...


In [13]:
# Left-join the labelled contexts onto the AdversarialQA contexts; `_merge`
# flags the labelled contexts that have no counterpart.
combined = pd.merge(
    reasoning_df_contexts,
    all_datasets_df_contexts,
    how='left',
    on='context_fmt',
    indicator=True,
)
print(combined.shape)
combined.head()

(114, 2)


Unnamed: 0,context_fmt,_merge
0,thesimplestvalvegearsgiveeventsoffixedlengthdu...,both
1,naturalkillercellsornkcellsareacomponentofthei...,both
2,ineconomicsnotablenobelmemorialprizeineconomic...,both
3,theflagshipstationsofeachstationinthemarketsof...,both
4,themaroonscompeteinthencaasdivisioniiiasmember...,both


In [14]:
# Tally of labelled contexts that were found ('both') or not ('left_only').
combined['_merge'].value_counts()

both          112
left_only       2
right_only      0
Name: _merge, dtype: int64

In [15]:
# The normalised contexts that have no counterpart on the AdversarialQA side.
combined[combined['_merge'] == 'left_only'].head()

Unnamed: 0,context_fmt,_merge
78,theinvertedrepeatregionsarehighlyconservedamon...,left_only
88,theroleofteacherisoftenformalandongoingcarried...,left_only


In [18]:
# Labelled rows whose passage was not found in AdversarialQA.
unmatched_contexts = combined.loc[combined['_merge'] == 'left_only', 'context_fmt']
reasoning_df[reasoning_df['Passage_fmt'].isin(unmatched_contexts)]

Unnamed: 0,Passage,Question,Answer,Reasoning Type,RT 2,RT 3,Comments,dataset,Question_fmt,Passage_fmt
13,The inverted repeat regions are highly conserv...,What came prior to the chloroplast?,cyanobacteria,Paraphrasing,,,,DBiDAF,whatcamepriortothechloroplast,theinvertedrepeatregionsarehighlyconservedamon...
39,The role of teacher is often formal and ongoin...,What might be an area of study needed to becom...,pedagogy,Co-reference,,,,DBiDAF,whatmightbeanareaofstudyneededtobecomeateacher,theroleofteacherisoftenformalandongoingcarried...
54,The inverted repeat regions are highly conserv...,Which is changed more?,chloroplast DNAs which have lost some of the i...,Comparative,Paraphrasing,,,DBiDAF,whichischangedmore,theinvertedrepeatregionsarehighlyconservedamon...
90,The role of teacher is often formal and ongoin...,What do teachers have in common with other pro...,professional development,Paraphrasing,,,,DBiDAF,whatdoteachershaveincommonwithotherprofessionals,theroleofteacherisoftenformalandongoingcarried...
48,The inverted repeat regions are highly conserv...,What are the two lineages and genome called?,Similar inverted repeats exist in the genomes ...,,,,Bad question,DRoBERTa,whatarethetwolineagesandgenomecalled,theinvertedrepeatregionsarehighlyconservedamon...
83,The inverted repeat regions are highly conserv...,What is the chloroplast only?,suggesting,,,,Bad question,DRoBERTa,whatisthechloroplastonly,theinvertedrepeatregionsarehighlyconservedamon...


In [19]:
# Raw text of the first passage that was not found in AdversarialQA.
unmatched_contexts = combined.loc[combined['_merge'] == 'left_only', 'context_fmt']
unmatched_rows = reasoning_df[reasoning_df['Passage_fmt'].isin(unmatched_contexts)]
unmatched_rows['Passage'].iloc[0]

'The inverted repeat regions are highly conserved among land plants, and accumulate few mutations. Similar inverted repeats exist in the genomes of cyanobacteria and the other two chloroplast lineages (glaucophyta and rhodophyce√¶), suggesting that they predate the chloroplast, though some chloroplast DNAs have since lost or flipped the inverted repeats (making them direct repeats). It is possible that the inverted repeats help stabilize the rest of the chloroplast genome, as chloroplast DNAs which have lost some of the inverted repeat segments tend to get rearranged more.'

In [20]:
# Normalised key of the first passage that was not found in AdversarialQA.
unmatched_contexts = combined.loc[combined['_merge'] == 'left_only', 'context_fmt']
unmatched_rows = reasoning_df[reasoning_df['Passage_fmt'].isin(unmatched_contexts)]
unmatched_rows['Passage_fmt'].iloc[0]

'theinvertedrepeatregionsarehighlyconservedamonglandplantsandaccumulatefewmutationssimilarinvertedrepeatsexistinthegenomesofcyanobacteriaandtheothertwochloroplastlineagesglaucophytaandrhodophycesuggestingthattheypredatethechloroplastthoughsomechloroplastdnashavesincelostorflippedtheinvertedrepeatsmakingthemdirectrepeatsitispossiblethattheinvertedrepeatshelpstabilizetherestofthechloroplastgenomeaschloroplastdnaswhichhavelostsomeoftheinvertedrepeatsegmentstendtogetrearrangedmore'

In [24]:
# Search the raw AdversarialQA contexts for the opening words of the first
# unmatched passage.
mentions_passage = all_datasets_df['context'].map(
    lambda context: 'The inverted repeat regions' in context
)
all_datasets_df[mentions_passage]

Unnamed: 0,answers,context,id,metadata,question,title,split,dataset,question_fmt,context_fmt


In [25]:
# Search the raw AdversarialQA contexts for the opening words of the second
# unmatched passage.
mentions_passage = all_datasets_df['context'].map(
    lambda context: 'The role of teacher' in context
)
all_datasets_df[mentions_passage]

Unnamed: 0,answers,context,id,metadata,question,title,split,dataset,question_fmt,context_fmt


2. Check which questions the labelled sample and AdversarialQA have in common

In [29]:
# Re-check the labelled table's shape before comparing questions.
reasoning_df.shape

(300, 10)

In [30]:
# Distinct normalised questions of the labelled sample, renamed to share the
# key name used on the AdversarialQA side.
reasoning_df_questions = (
    reasoning_df[['Question_fmt']]
    .rename(columns={'Question_fmt': 'question_fmt'})
    .drop_duplicates()
)
print(reasoning_df_questions.shape)
reasoning_df_questions.head()

(300, 1)


Unnamed: 0,question_fmt
0,enginesmoveinacirclecalledan
1,howdonkcellsrecognizetumorandinfectedcells
2,samuelsonisfromthecountryofwhatleader
3,whatstationisnotbasedinthestateoftheteamwhoseg...
4,whowasthefirstwinneroftheheismantrophy


In [31]:
# Distinct normalised questions across all AdversarialQA rows; the shape is
# printed before and after de-duplication.
print(all_datasets_df.shape)
all_datasets_df_questions = all_datasets_df[['question_fmt']].drop_duplicates()
print(all_datasets_df_questions.shape)
all_datasets_df_questions.head()

(36000, 10)
(35009, 1)


Unnamed: 0,question_fmt
0,whatconceptismentionedlast
1,whatconceptismentionedfirst
2,whatgroupismentionedlast
3,whattimeperiodismentionedsecond
4,whatworkofwritingismentionedlast


In [32]:
# Left-join the labelled questions onto the AdversarialQA questions; `_merge`
# flags the labelled questions that have no counterpart.
combined = pd.merge(
    reasoning_df_questions,
    all_datasets_df_questions,
    how='left',
    on='question_fmt',
    indicator=True,
)
print(combined.shape)
combined.head()

(300, 2)


Unnamed: 0,question_fmt,_merge
0,enginesmoveinacirclecalledan,left_only
1,howdonkcellsrecognizetumorandinfectedcells,left_only
2,samuelsonisfromthecountryofwhatleader,left_only
3,whatstationisnotbasedinthestateoftheteamwhoseg...,left_only
4,whowasthefirstwinneroftheheismantrophy,left_only


In [33]:
# Tally of labelled questions that were found ('both') or not ('left_only').
combined['_merge'].value_counts()

left_only     198
both          102
right_only      0
Name: _merge, dtype: int64

In [34]:
# The normalised questions that have no counterpart on the AdversarialQA side.
combined[combined['_merge'] == 'left_only'].head()

Unnamed: 0,question_fmt,_merge
0,enginesmoveinacirclecalledan,left_only
1,howdonkcellsrecognizetumorandinfectedcells,left_only
2,samuelsonisfromthecountryofwhatleader,left_only
3,whatstationisnotbasedinthestateoftheteamwhoseg...,left_only
4,whowasthefirstwinneroftheheismantrophy,left_only


In [35]:
# Labelled rows whose question was not found in AdversarialQA.
unmatched_questions = combined.loc[combined['_merge'] == 'left_only', 'question_fmt']
reasoning_df[reasoning_df['Question_fmt'].isin(unmatched_questions)]

Unnamed: 0,Passage,Question,Answer,Reasoning Type,RT 2,RT 3,Comments,dataset,Question_fmt,Passage_fmt
0,The simplest valve gears give events of fixed ...,Engines move in a circle called an?,engine cycle,Paraphrasing,,,,DBERT,enginesmoveinacirclecalledan,thesimplestvalvegearsgiveeventsoffixedlengthdu...
1,"Natural killer cells, or NK cells, are a compo...",How do NK cells recognize tumor and infected c...,the MHC makeup on the surface of those cells i...,Multi-hop,Co-reference,,,DBERT,howdonkcellsrecognizetumorandinfectedcells,naturalkillercellsornkcellsareacomponentofthei...
2,"In economics, notable Nobel Memorial Prize in ...",Samuelson is from the country of what leader?,Ronald Reagan,Multi-hop,,,,DBERT,samuelsonisfromthecountryofwhatleader,ineconomicsnotablenobelmemorialprizeineconomic...
3,The flagship stations of each station in the m...,What station is not based in the state of the ...,WBT-FM (99.3 FM),External Knowledge,,,,DBERT,whatstationisnotbasedinthestateoftheteamwhoseg...,theflagshipstationsofeachstationinthemarketsof...
4,The Maroons compete in the NCAA's Division III...,Who was the first winner of the Heisman Trophy?,Chicago Maroons football player,Explicit,,,,DBERT,whowasthefirstwinneroftheheismantrophy,themaroonscompeteinthencaasdivisioniiiasmember...
...,...,...,...,...,...,...,...,...,...,...
89,A method to lessen the magnitude of this heati...,What is unique about the Woolf high-pressure c...,less heat is lost by the steam,Multi-hop,,,,DRoBERTa,whatisuniqueaboutthewoolfhighpressurecompounde...,amethodtolessenthemagnitudeofthisheatingandcoo...
90,East and Central Africa's biggest economy has ...,How has the economy in East and Central Africa...,rapid expansion in telecommunication and finan...,Paraphrasing,,,,DRoBERTa,howhastheeconomyineastandcentralafricagrown,eastandcentralafricasbiggesteconomyhaspostedtr...
93,"Natural killer cells, or NK cells, are a compo...",Host cells can be destroyed by?,Natural killer cells,Multi-hop,,,,DRoBERTa,hostcellscanbedestroyedby,naturalkillercellsornkcellsareacomponentofthei...
94,"Years before his death, Genghis Khan asked to ...",The Genghis Khan Mausoleum does not have a?,body,Multi-hop,Negation,,,DRoBERTa,thegenghiskhanmausoleumdoesnothavea,yearsbeforehisdeathgenghiskhanaskedtobeburiedw...


In [36]:
# Raw text of the first question that was not found in AdversarialQA.
unmatched_questions = combined.loc[combined['_merge'] == 'left_only', 'question_fmt']
unmatched_rows = reasoning_df[reasoning_df['Question_fmt'].isin(unmatched_questions)]
unmatched_rows['Question'].iloc[0]

'Engines move in a circle called an?'

In [37]:
# Normalised key of the first question that was not found in AdversarialQA.
unmatched_questions = combined.loc[combined['_merge'] == 'left_only', 'question_fmt']
unmatched_rows = reasoning_df[reasoning_df['Question_fmt'].isin(unmatched_questions)]
unmatched_rows['Question_fmt'].iloc[0]

'enginesmoveinacirclecalledan'

In [43]:
# Look for the first unmatched question in the raw AdversarialQA questions.
# The comparison is case-insensitive: the labelled question is written
# "Engines move ...", so a case-sensitive search for 'engines move' could
# never find it as written and an empty result would prove nothing.
all_datasets_df[all_datasets_df['question'].apply(lambda x: 'engines move' in x.lower())]

Unnamed: 0,answers,context,id,metadata,question,title,split,dataset,question_fmt,context_fmt


In [49]:
# Case-sensitive substring search of the raw AdversarialQA questions for
# another unmatched labelled question.
all_datasets_df[all_datasets_df['question'].apply(lambda x: 'Genghis Khan Mausoleum' in x)]

Unnamed: 0,answers,context,id,metadata,question,title,split,dataset,question_fmt,context_fmt


3. Combine on context _and_ question

In [51]:
# Reminder of the AdversarialQA columns, including the two normalised keys.
all_datasets_df.head()

Unnamed: 0,answers,context,id,metadata,question,title,split,dataset,question_fmt,context_fmt
0,"{'answer_start': [742], 'text': ['subjective i...","A generation later, the Irish Anglican bishop,...",dab017ed8a1c27c6afa2d8618abc3a477a4edffc,"{'split': 'train', 'model_in_the_loop': 'BERT-...",what concept is mentioned last?,Empiricism,train,dbert,whatconceptismentionedlast,agenerationlatertheirishanglicanbishopgeorgebe...
1,"{'answer_start': [159], 'text': ['atheism']}","A generation later, the Irish Anglican bishop,...",2fb55cf1439a1a50ee47193124d86c03bf5d5128,"{'split': 'train', 'model_in_the_loop': 'BERT-...",what concept is mentioned first?,Empiricism,train,dbert,whatconceptismentionedfirst,agenerationlatertheirishanglicanbishopgeorgebe...
2,"{'answer_start': [614], 'text': ['humans']}","A generation later, the Irish Anglican bishop,...",7b26ca94c04cd61cf1aaed8a5ed1039be4981041,"{'split': 'train', 'model_in_the_loop': 'BERT-...",what group is mentioned last?,Empiricism,train,dbert,whatgroupismentionedlast,agenerationlatertheirishanglicanbishopgeorgebe...
3,"{'answer_start': [265], 'text': ['1710']}","A generation later, the Irish Anglican bishop,...",2e8a1099995928e2cb9910c6f3e9b21ce4f50016,"{'split': 'train', 'model_in_the_loop': 'BERT-...",what time period is mentioned second?,Empiricism,train,dbert,whattimeperiodismentionedsecond,agenerationlatertheirishanglicanbishopgeorgebe...
4,"{'answer_start': [568], 'text': ['Alciphron']}","A generation later, the Irish Anglican bishop,...",50b7401b594883ee57661f2fdda6ac264b64ca56,"{'split': 'train', 'model_in_the_loop': 'BERT-...",what work of writing is mentioned last?,Empiricism,train,dbert,whatworkofwritingismentionedlast,agenerationlatertheirishanglicanbishopgeorgebe...


In [58]:
# Left-join the labelled rows onto AdversarialQA using both normalised keys.
# The suffixes disambiguate the `dataset` column that exists on both sides.
combined = pd.merge(
    reasoning_df,
    all_datasets_df,
    how='left',
    left_on=['Question_fmt', 'Passage_fmt'],
    right_on=['question_fmt', 'context_fmt'],
    suffixes=('_reasoning', '_datasets'),
    indicator=True,
)
print(combined.shape)
combined.head()

(300, 21)


Unnamed: 0,Passage,Question,Answer,Reasoning Type,RT 2,RT 3,Comments,dataset_reasoning,Question_fmt,Passage_fmt,answers,context,id,metadata,question,title,split,dataset_datasets,question_fmt,context_fmt,_merge
0,The simplest valve gears give events of fixed ...,Engines move in a circle called an?,engine cycle,Paraphrasing,,,,DBERT,enginesmoveinacirclecalledan,thesimplestvalvegearsgiveeventsoffixedlengthdu...,,,,,,,,,,,left_only
1,"Natural killer cells, or NK cells, are a compo...",How do NK cells recognize tumor and infected c...,the MHC makeup on the surface of those cells i...,Multi-hop,Co-reference,,,DBERT,howdonkcellsrecognizetumorandinfectedcells,naturalkillercellsornkcellsareacomponentofthei...,,,,,,,,,,,left_only
2,"In economics, notable Nobel Memorial Prize in ...",Samuelson is from the country of what leader?,Ronald Reagan,Multi-hop,,,,DBERT,samuelsonisfromthecountryofwhatleader,ineconomicsnotablenobelmemorialprizeineconomic...,,,,,,,,,,,left_only
3,The flagship stations of each station in the m...,What station is not based in the state of the ...,WBT-FM (99.3 FM),External Knowledge,,,,DBERT,whatstationisnotbasedinthestateoftheteamwhoseg...,theflagshipstationsofeachstationinthemarketsof...,,,,,,,,,,,left_only
4,The Maroons compete in the NCAA's Division III...,Who was the first winner of the Heisman Trophy?,Chicago Maroons football player,Explicit,,,,DBERT,whowasthefirstwinneroftheheismantrophy,themaroonscompeteinthencaasdivisioniiiasmember...,,,,,,,,,,,left_only


In [59]:
# Tally of labelled rows that matched an AdversarialQA example on both keys
# ('both') versus those that did not ('left_only').
combined['_merge'].value_counts()

left_only     198
both          102
right_only      0
Name: _merge, dtype: int64