In [167]:
# Import block
import pandas as pd
import altair as alt
import datasets
from transformers import T5TokenizerFast
from datasets import load_dataset

# Keep Altair's default data transformer but allow up to 50,000 rows per chart.
alt.data_transformers.enable('default', max_rows=50000)

DataTransformerRegistry.enable('default')

In [48]:
# Load the per-example result frames: one pickle per model (baseline / GPP)
# and decoding strategy (beam / nucleus).
df_base_beam = pd.read_pickle('df_baseline256.pkl')
df_base_nucleus = pd.read_pickle('df_baseline256_NS.pkl')
df_GPP_beam = pd.read_pickle('df_GPP256.pkl')
df_GPP_nucleus = pd.read_pickle('df_GPP256_NS.pkl')

# Combine into one wide frame. The shared text columns are taken from the
# baseline beam frame; every run then contributes its prediction and metric
# columns under a '<model>.<decoding>.' prefix. Rows are matched by position.
d = {shared: df_base_beam[shared].to_list() for shared in ('Reference', 'Answer', 'Context')}
d.update({
    f'{run}.{short_name}': run_frame[source_col].to_list()
    for run, run_frame in (
        ('Base.BS', df_base_beam),
        ('Base.NS', df_base_nucleus),
        ('GPP.BS', df_GPP_beam),
        ('GPP.NS', df_GPP_nucleus),
    )
    for short_name, source_col in (
        ('Prediction', 'Prediction'),
        ('BLEU', 'BLEU'),
        ('SacreBLEU', 'SacreBLEU'),
        ('METEOR', 'METEOR'),
        ('ROUGE', 'ROUGE'),
        ('AC', 'Answer_Contamination'),
    )
})
df = pd.DataFrame(d)

# Per-example SacreBLEU differentials (left column minus right column):
# first between models for each decoding strategy, then between decoding
# strategies for each model.
for diff_col, left_col, right_col in (
    ('Base.v.GPP.Beam.SacreBLEU', 'Base.BS.SacreBLEU', 'GPP.BS.SacreBLEU'),
    ('Base.v.GPP.NS.SacreBLEU', 'Base.NS.SacreBLEU', 'GPP.NS.SacreBLEU'),
    ('Base.Beam.vs.NS.SacreBLEU', 'Base.BS.SacreBLEU', 'Base.NS.SacreBLEU'),
    ('GPP.Beam.vs.NS.SacreBLEU', 'GPP.BS.SacreBLEU', 'GPP.NS.SacreBLEU'),
):
    df[diff_col] = df[left_col] - df[right_col]

In [116]:
# Get high level set stats.
# Alias the four per-example SacreBLEU series so the comparisons stay readable.
base_bs_scores = df['Base.BS.SacreBLEU']
base_ns_scores = df['Base.NS.SacreBLEU']
gpp_bs_scores = df['GPP.BS.SacreBLEU']
gpp_ns_scores = df['GPP.NS.SacreBLEU']

# "Perfect": SacreBLEU above 99.9.
base_BS_perfect = int((base_bs_scores > 99.9).sum())
base_NS_perfect = int((base_ns_scores > 99.9).sum())
GPP_BS_perfect = int((gpp_bs_scores > 99.9).sum())
GPP_NS_perfect = int((gpp_ns_scores > 99.9).sum())

# "Best": strictly higher than all three other runs (a tie counts for nobody).
base_BS_best = int((
    (base_bs_scores > base_ns_scores) & (base_bs_scores > gpp_bs_scores) & (base_bs_scores > gpp_ns_scores)
).sum())
base_NS_best = int((
    (base_ns_scores > base_bs_scores) & (base_ns_scores > gpp_bs_scores) & (base_ns_scores > gpp_ns_scores)
).sum())
GPP_BS_best = int((
    (gpp_bs_scores > base_ns_scores) & (gpp_bs_scores > base_bs_scores) & (gpp_bs_scores > gpp_ns_scores)
).sum())
GPP_NS_best = int((
    (gpp_ns_scores > base_ns_scores) & (gpp_ns_scores > gpp_bs_scores) & (gpp_ns_scores > base_bs_scores)
).sum())

# "Worst": strictly lower than all three other runs.
base_BS_worst = int((
    (base_bs_scores < base_ns_scores) & (base_bs_scores < gpp_bs_scores) & (base_bs_scores < gpp_ns_scores)
).sum())
base_NS_worst = int((
    (base_ns_scores < base_bs_scores) & (base_ns_scores < gpp_bs_scores) & (base_ns_scores < gpp_ns_scores)
).sum())
GPP_BS_worst = int((
    (gpp_bs_scores < base_ns_scores) & (gpp_bs_scores < base_bs_scores) & (gpp_bs_scores < gpp_ns_scores)
).sum())
GPP_NS_worst = int((
    (gpp_ns_scores < base_ns_scores) & (gpp_ns_scores < gpp_bs_scores) & (gpp_ns_scores < base_bs_scores)
).sum())

# Rows flagged for answer contamination.
base_BS_AC = int(df['Base.BS.AC'].eq(True).sum())
base_NS_AC = int(df['Base.NS.AC'].eq(True).sum())
GPP_BS_AC = int(df['GPP.BS.AC'].eq(True).sum())
GPP_NS_AC = int(df['GPP.NS.AC'].eq(True).sum())

# Create DataFrame of results. The BLEU / SacreBLEU / ROUGE-L / METEOR values
# are hard-coded here; they are not derived from `df` in this cell.
c = {
    'Model': ['Baseline (BS)', 'Baseline (NS)', 'GPP (BS)', 'GPP (NS)'],
    'BLEU': [0.21127372392801158, 0.18258756628554812, 0.1656587608025032, 0.1364246031118634],
    'SacreBLEU': [21.127372392801167, 18.25875662855481, 16.56587608025032, 13.64246031118634],
    'ROUGE-L': [0.4627507125088821, 0.44259606326080936, 0.4018497255426625, 0.3732356714376023],
    'METEOR': [0.47448171105801523, 0.4390140638504702, 0.4143196855920803, 0.37423245714266146],
    'Perfect Predictions': [base_BS_perfect, base_NS_perfect, GPP_BS_perfect, GPP_NS_perfect],
    'Best Predictions': [base_BS_best, base_NS_best, GPP_BS_best, GPP_NS_best],
    'Worst Predictions': [base_BS_worst, base_NS_worst, GPP_BS_worst, GPP_NS_worst],
    'Answer Contamination': [base_BS_AC, base_NS_AC, GPP_BS_AC, GPP_NS_AC],
}
cf = pd.DataFrame(c)

# Turn the raw counts into fractions of the evaluated rows.
for count_col in ('Perfect Predictions', 'Best Predictions', 'Worst Predictions', 'Answer Contamination'):
    cf[count_col] = cf[count_col] / len(df)

# Display results
cf.set_index('Model')

Unnamed: 0_level_0,BLEU,SacreBLEU,ROUGE-L,METEOR,Perfect Predictions,Best Predictions,Worst Predictions,Answer Contamination
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Baseline (BS),0.211274,21.127372,0.462751,0.474482,0.032167,0.19527,0.112772,0.007569
Baseline (NS),0.182588,18.258757,0.442596,0.439014,0.026112,0.164995,0.14456,0.007947
GPP (BS),0.165659,16.565876,0.40185,0.41432,0.020057,0.156102,0.222517,0.13264
GPP (NS),0.136425,13.64246,0.373236,0.374232,0.011921,0.148723,0.283254,0.083822


In [92]:
# Top 5 rows by (Base - GPP) beam SacreBLEU gap, restricted to rows where the
# baseline beam prediction scored below 80.
(
    df.loc[df['Base.BS.SacreBLEU'] < 80]
    .sort_values(by='Base.v.GPP.Beam.SacreBLEU', ascending=False)
    .loc[:, ['Reference', 'Answer', 'Base.BS.Prediction', 'GPP.BS.Prediction', 'Base.BS.SacreBLEU', 'GPP.BS.SacreBLEU']]
    .head(5)
)

Unnamed: 0,Reference,Answer,Base.BS.Prediction,GPP.BS.Prediction,Base.BS.SacreBLEU,GPP.BS.SacreBLEU
2773,Who has loaned the Raphael Cartoons to the mus...,Queen Elizabeth II,Who loaned the Raphael Cartoons to the museum?,Who was the Queen Elizabeth II?,79.563717,5.630401
2803,What collection does the V&A Theatre & Perform...,material about live performance,What do the V&A Theatre & Performance gallerie...,What is the UK's largest national collection o...,76.043218,4.368584
1211,How many hymns did Luther write based on the T...,two hymns,How many hymns did Luther write on the Ten Com...,What was Luther's main hymn for Christmas?,74.261411,3.349504
3569,What museum specializes in cultural history an...,Peabody Museum of Archaeology and Ethnology,What museum specializes in the cultural histor...,What is the name of the Peabody Museum of Arch...,78.254229,7.431878
1129,What did Luther do at the end of his speech?,raised his arm,What did Luther do at the end of the speech?,Luther raised his arm in the traditional salut...,74.194466,3.716499


In [93]:
# Top 5 rows where GPP beats the baseline by the widest beam SacreBLEU margin
# (smallest Base - GPP), restricted to rows where the GPP beam prediction scored below 80.
(
    df.loc[df['GPP.BS.SacreBLEU'] < 80]
    .sort_values(by='Base.v.GPP.Beam.SacreBLEU', ascending=True)
    .loc[:, ['Reference', 'Answer', 'Base.BS.Prediction', 'GPP.BS.Prediction', 'Base.BS.SacreBLEU', 'GPP.BS.SacreBLEU']]
    .head(5)
)

Unnamed: 0,Reference,Answer,Base.BS.Prediction,GPP.BS.Prediction,Base.BS.SacreBLEU,GPP.BS.SacreBLEU
3432,What compiles and reports on data about the si...,ENR,What is a trade magazine for the construction ...,What magazine compiles and reports data about ...,3.599276,73.769925
507,Who does the statue of Little Insurgent commem...,children,Who served as messengers and frontline troops ...,What did the statue of Little Insurgent commem...,4.456883,72.597953
490,How high is the highest point in Warsaw?,452.8 ft,How tall is Szczliwice?,What is the highest point in Warsaw?,6.316906,74.208848
2634,What year did Newcastle first develop its cycl...,1998,When was Newcastle's cycling strategy first de...,When did Newcastle first develop its cycling s...,10.229197,77.255059
4201,When did President Uhuru Kenyatta sign a Secur...,December 2014,When was the Security Laws Amendment Bill signed?,When did President Uhuru Kenyatta sign a Secur...,8.591317,73.488892


In [94]:
# Top 5 rows by (Base - GPP) nucleus SacreBLEU gap, restricted to rows where the
# baseline nucleus prediction scored below 80.
(
    df.loc[df['Base.NS.SacreBLEU'] < 80]
    .sort_values(by='Base.v.GPP.NS.SacreBLEU', ascending=False)
    .loc[:, ['Reference', 'Answer', 'Base.NS.Prediction', 'GPP.NS.Prediction', 'Base.NS.SacreBLEU', 'GPP.NS.SacreBLEU']]
    .head(5)
)

Unnamed: 0,Reference,Answer,Base.NS.Prediction,GPP.NS.Prediction,Base.NS.SacreBLEU,GPP.NS.SacreBLEU
2080,Which courts have a duty to interpret domestic...,national courts,Who has a duty to interpret domestic law as fa...,What does Francovich v Italy say they can do?,76.321115,2.81274
2687,Which British monarch appears above the frame ...,Queen Victoria,Who appears above the frame around the arches ...,What is the Queen Victoria's title?,74.466974,3.314288
3569,What museum specializes in cultural history an...,Peabody Museum of Archaeology and Ethnology,What museum specializes in the cultural histor...,What museum does the Harvard Museum of Natural...,78.254229,7.858254
4530,Which conjecture holds that every even integer...,Goldbach's conjecture,What claims that every even integer n greater ...,What does Goldbach's conjecture assert?,71.183293,0.79232
1211,How many hymns did Luther write based on the T...,two hymns,How many hymns did Luther write on the Ten Com...,What was the name of Luther's main hymn?,74.261411,4.062583


In [95]:
# Top 5 rows where GPP beats the baseline by the widest nucleus SacreBLEU margin
# (smallest Base - GPP), restricted to rows where the GPP nucleus prediction scored below 80.
(
    df.loc[df['GPP.NS.SacreBLEU'] < 80]
    .sort_values(by='Base.v.GPP.NS.SacreBLEU', ascending=True)
    .loc[:, ['Reference', 'Answer', 'Base.NS.Prediction', 'GPP.NS.Prediction', 'Base.NS.SacreBLEU', 'GPP.NS.SacreBLEU']]
    .head(5)
)

Unnamed: 0,Reference,Answer,Base.NS.Prediction,GPP.NS.Prediction,Base.NS.SacreBLEU,GPP.NS.SacreBLEU
3433,In what year did ENR compile data in nine mark...,2014,When was ENR compiled?,In what year did ENR compile data on nine mark...,3.1326,73.488892
3432,What compiles and reports on data about the si...,ENR,What is the trade magazine for the constructio...,What magazine compiles and reports data about ...,3.599276,73.769925
62,What is the name of the stadium in Miami that ...,Sun Life Stadium,What stadium was Miami's?,What is the name of the stadium in Miami?,3.050026,66.940483
2880,When did ABC's New York flagship stations chan...,"May 1, 1953","When did ABC change their callsigns to WABC, W...",When did ABC's New York flagship stations chan...,9.23843,72.925717
4639,What city has the largest inland port in Europe?,Duisburg,Where is the Rhine a hub?,What is the largest inland port in Europe?,5.087641,67.168774


In [96]:
# Top 5 rows by (beam - nucleus) SacreBLEU gap for the baseline model, restricted
# to rows where the baseline beam prediction scored below 80.
(
    df.loc[df['Base.BS.SacreBLEU'] < 80]
    .sort_values(by='Base.Beam.vs.NS.SacreBLEU', ascending=False)
    .loc[:, ['Reference', 'Answer', 'Base.BS.Prediction', 'Base.NS.Prediction', 'Base.BS.SacreBLEU', 'Base.NS.SacreBLEU']]
    .head(5)
)

Unnamed: 0,Reference,Answer,Base.BS.Prediction,Base.NS.Prediction,Base.BS.SacreBLEU,Base.NS.SacreBLEU
4821,What did Olivier Roy state underwent a remarka...,Sunni pan-Islamism,What religion underwent a remarkable shift in ...,What movement was eclipsed by the Salafi movem...,70.954611,2.308317
5000,What was the capital of the Ottoman empire?,Istanbul,What was the capital of the Ottoman Empire?,What city was Ottoman Empire's capital?,75.062385,7.379782
4960,What happened to the East India Trading Compan...,exploitation,What happened to the East India Company in 1767?,What was the result of political activity?,71.086679,4.995139
4221,What is grown in the fertile highlands?,"Tea, coffee, sisal, pyrethrum, corn, and wheat",What are grown in the fertile highlands?,What are the most successful agricultural prod...,70.710678,5.300157
1096,What was Luther's Disputation of Martin Luther...,The Ninety-Five Theses,"What was Luther's ""Disputation of Martin Luthe...",What was the name of the document that Luther ...,70.858764,6.291586


In [97]:
# Top 5 rows where baseline nucleus beats baseline beam by the widest SacreBLEU
# margin (smallest beam - nucleus), restricted to rows where the nucleus prediction scored below 80.
(
    df.loc[df['Base.NS.SacreBLEU'] < 80]
    .sort_values(by='Base.Beam.vs.NS.SacreBLEU', ascending=True)
    .loc[:, ['Reference', 'Answer', 'Base.BS.Prediction', 'Base.NS.Prediction', 'Base.BS.SacreBLEU', 'Base.NS.SacreBLEU']]
    .head(5)
)

Unnamed: 0,Reference,Answer,Base.BS.Prediction,Base.NS.Prediction,Base.BS.SacreBLEU,Base.NS.SacreBLEU
2770,Which material is the Gloucester Candlestick m...,gilt bronze,What is one of the rarest items in the collect...,What is the Gloucester Candlestick made from?,4.932352,74.208848
1,Which NFL team won Super Bowl 50?,Denver Broncos,Who was the AFC champion?,Which team won Super Bowl 50?,5.815868,72.895452
1505,What treaty ended the Wars of Religion?,the Edict of Nantes,"What gave the Huguenots substantial religious,...",What law ended the Wars of Religion?,4.456883,70.710678
1243,What book did Luther read in 1542?,Qur'an,What was the Latin translation of?,What did Luther read in 1542?,6.770186,72.895452
12,What team was the NFC champion?,Carolina Panthers,Who did the Denver Broncos defeat in Super Bow...,Who was the NFC champion?,4.456883,64.318702


In [98]:
# Top 5 rows by (beam - nucleus) SacreBLEU gap for the GPP model, restricted to
# rows where the GPP beam prediction scored below 80.
(
    df.loc[df['GPP.BS.SacreBLEU'] < 80]
    .sort_values(by='GPP.Beam.vs.NS.SacreBLEU', ascending=False)
    .loc[:, ['Reference', 'Answer', 'GPP.BS.Prediction', 'GPP.NS.Prediction', 'GPP.BS.SacreBLEU', 'GPP.NS.SacreBLEU']]
    .head(5)
)

Unnamed: 0,Reference,Answer,GPP.BS.Prediction,GPP.NS.Prediction,GPP.BS.SacreBLEU,GPP.NS.SacreBLEU
4530,Which conjecture holds that every even integer...,Goldbach's conjecture,What conjecture asserts that every even intege...,What does Goldbach's conjecture assert?,72.003913,0.79232
507,Who does the statue of Little Insurgent commem...,children,What did the statue of Little Insurgent commem...,What was the name of the child in the memorial?,72.597953,4.932352
3444,"Even in large firms, architects, interior desi...",entirely separate companies,"In the past, architects, interior designers, e...",What was the past trend in integrating the dif...,65.759095,1.040711
116,How many total touchdowns did Cam Newton score?,45,How many total touchdowns did Cam Newton have?,What was the career-high for total touchdowns?,75.062385,10.786826
36,How many fumbles did Von Miller force?,two,How many fumbles did Von Miller have?,What was Von Miller's first sack?,70.710678,6.770186


In [114]:
# Top 5 rows where GPP nucleus beats GPP beam by the widest SacreBLEU margin
# (smallest beam - nucleus), restricted to rows where the nucleus prediction scored below 80.
(
    df.loc[df['GPP.NS.SacreBLEU'] < 80]
    .sort_values(by='GPP.Beam.vs.NS.SacreBLEU', ascending=True)
    .loc[:, ['Reference', 'Answer', 'GPP.BS.Prediction', 'GPP.NS.Prediction', 'GPP.BS.SacreBLEU', 'GPP.NS.SacreBLEU']]
    .head(5)
)

Unnamed: 0,Reference,Answer,GPP.BS.Prediction,GPP.NS.Prediction,GPP.BS.SacreBLEU,GPP.NS.SacreBLEU
2880,When did ABC's New York flagship stations chan...,"May 1, 1953",On what date did ABC change their callsigns to...,When did ABC's New York flagship stations chan...,6.250382,72.925717
2770,Which material is the Gloucester Candlestick m...,gilt bronze,What is one of the rarest items in the collect...,What type of material is the Gloucester Candle...,4.932352,67.865027
3541,In what year did Harvard President Joseph Will...,1804,When did the Hollis Professor of Divinity Davi...,In what year did Harvard President Willard die?,8.295194,66.904844
1068,What did Luther think was the only source of k...,Bible,What did Luther teach that salvation and etern...,What was the only source of knowledge of God?,9.782376,66.940483
4107,When was Zhu Shijie born?,1249,When did Zhu Shijie solve simultaneous equatio...,What year was Zhu Shijie born?,7.347053,61.478815


In [112]:
# First 5 rows on which every run's SacreBLEU exceeds the threshold.
threshold = 95
(
    df.loc[
        (df[['Base.BS.SacreBLEU', 'Base.NS.SacreBLEU', 'GPP.BS.SacreBLEU', 'GPP.NS.SacreBLEU']] > threshold).all(axis=1),
        ['Reference', 'Answer', 'Context', 'Base.BS.Prediction', 'Base.NS.Prediction', 'GPP.BS.Prediction', 'GPP.NS.Prediction'],
    ]
    .head(5)
)

Unnamed: 0,Reference,Answer,Context,Base.BS.Prediction,Base.NS.Prediction,GPP.BS.Prediction,GPP.NS.Prediction
448,What is the oldest exhibition site in Warsaw?,Zachęta National Gallery of Art,The 17th century Royal Ujazdów Castle currentl...,What is the oldest exhibition site in Warsaw?,What is the oldest exhibition site in Warsaw?,What is the oldest exhibition site in Warsaw?,What is the oldest exhibition site in Warsaw?
519,Who was the first Warsaw President?,Jan Andrzej Menich,The mayor of Warsaw is called President. Gener...,Who was the first Warsaw President?,Who was the first Warsaw President?,Who was the first Warsaw President?,Who was the first Warsaw President?
658,What did Tesla Electric Light & Manufacturing do?,installed electrical arc light based illuminat...,After leaving Edison's company Tesla partnered...,What did Tesla Electric Light & Manufacturing do?,What did Tesla Electric Light & Manufacturing do?,What did Tesla Electric Light & Manufacturing do?,What did Tesla Electric Light & Manufacturing do?
800,How tall was Tesla?,6 feet 2 inches,Tesla was 6 feet 2 inches (1.88 m) tall and we...,How tall was Tesla?,How tall was Tesla?,How tall was Tesla?,How tall was Tesla?
1373,What is the central business district of San D...,Downtown San Diego,Downtown San Diego is the central business dis...,What is the central business district of San D...,What is the central business district of San D...,What is the central business district of San D...,What is the central business district of San D...


In [111]:
# First 5 rows on which every run's SacreBLEU falls below the threshold.
threshold = 5
(
    df.loc[
        (df[['Base.BS.SacreBLEU', 'Base.NS.SacreBLEU', 'GPP.BS.SacreBLEU', 'GPP.NS.SacreBLEU']] < threshold).all(axis=1),
        ['Reference', 'Answer', 'Context', 'Base.BS.Prediction', 'Base.NS.Prediction', 'GPP.BS.Prediction', 'GPP.NS.Prediction'],
    ]
    .head(5)
)

Unnamed: 0,Reference,Answer,Context,Base.BS.Prediction,Base.NS.Prediction,GPP.BS.Prediction,GPP.NS.Prediction
26,Which Carolina Panthers team member was picked...,Cam Newton,The Panthers finished the regular season with ...,Who was named the NFL Most Valuable Player?,Who was named the NFL Most Valuable Player?,Who was named the NFL Most Valuable Player?,What quarterback was named the NFL Most Valuab...
53,Who is the Commissioner of the National Footba...,Roger Goodell,"In early 2012, NFL Commissioner Roger Goodell ...",Who stated that the 50th Super Bowl would be a...,"Who stated that the 50th Super Bowl would be ""...","Who stated that the 50th Super Bowl was ""spect...","Who stated that the 50th Super Bowl would be ""..."
57,What venue in Miami was a candidate for the si...,Sun Life Stadium,The league eventually narrowed the bids to thr...,What stadium did Miami have?,What stadium was Miami's bid for?,What was the name of Miami's Sun Life Stadium?,What stadium was the league's bid to the three...
61,"One of the sites, Merceds-Benz Superdome, is l...",New Orleans,The league eventually narrowed the bids to thr...,Where was the Mercedes-Benz Superdome located?,Where was the Mercedes-Benz Superdome located?,Where did the league narrow its bids to three ...,What city did the league narrow its bids to th...
77,Who decided not to approve paying for renovati...,Florida legislature,"The league announced on October 16, 2012, that...",Who refused to approve the funding plan to pay...,Who refused to approve the funding plan for th...,What did the Florida legislature refuse to app...,What did the Florida legislature refuse to app...


In [190]:
# The 5 highest-SacreBLEU GPP beam predictions that are flagged for answer contamination.
(
    df.loc[
        df['GPP.BS.AC'].eq(True),
        ['Reference', 'Answer', 'Context', 'GPP.BS.Prediction', 'GPP.BS.SacreBLEU', 'GPP.BS.METEOR', 'GPP.BS.ROUGE'],
    ]
    .sort_values(by='GPP.BS.SacreBLEU', ascending=False)
    .head(5)
)

Unnamed: 0,Reference,Answer,Context,GPP.BS.Prediction,GPP.BS.SacreBLEU,GPP.BS.METEOR,GPP.BS.ROUGE
4853,The Brotherhood was the only opposition group ...,field candidates,"Despite periodic repression, the Brotherhood h...",What was the only opposition group in Egypt ab...,60.578244,0.812586,0.758621
922,What are two complexity classes between L and P?,NL and NC,"Similarly, it is not known if L (the set of al...",What are two complexity classes between NL and...,55.552381,0.778906,0.777778
615,What did Tesla's teachers believe he was doing...,cheating,"In 1870, Tesla moved to Karlovac, to attend sc...",What did Tesla's teachers believe he was cheat...,53.784549,0.704773,0.761905
1491,What was the name of the eastern half of the c...,New South Wales,After the founding of the colony of New South ...,What was the eastern half of the colony of New...,53.553606,0.709712,0.72
2556,Newcastle was one of the first cities in the w...,electric lighting,"In the 19th century, shipbuilding and heavy en...",What was one of the first cities in the world ...,52.417058,0.778186,0.645161


In [130]:
# Using same data split code as the training file.
def parse_data(t_split='train', trim_train_pct = 0.25):
  """Load one SQuAD split and add the text columns built for the model.

  Args:
    t_split: 'train', 'val' or 'test'. 'train' is a random sample of the
      SQuAD train split. 'val' is a 50% random sample of the SQuAD
      validation split and 'test' is the remaining validation rows.
    trim_train_pct: Fraction of the train split to keep. Only used when
      t_split == 'train'.

  Returns:
    A DataFrame with the dataset's columns plus 'answer_text' (the first
    answer string), 'source' ('answer: ... context: ...</s>') and 'target'
    (the question).

  Raises:
    ValueError: If t_split is not one of 'train', 'val', 'test'.
  """
  # Reject a bad split name before loading the dataset.
  if t_split not in ('train', 'val', 'test'):
    raise ValueError("Invalid choice of dataset split.")

  squad = load_dataset('squad')

  # Split handling - validation set further split into 50% dev/test.
  # The fixed random_state keeps the sampled rows the same on every call.
  if t_split == 'train':
    t_df = pd.DataFrame(squad['train'])
    split_df = t_df.sample(frac=trim_train_pct, random_state=266)
  else:
    vt_df = pd.DataFrame(squad['validation'])
    df_val = vt_df.sample(frac=0.5, random_state=266)
    # 'test' is everything in the validation split that 'val' did not take.
    split_df = vt_df.drop(df_val.index) if t_split == 'test' else df_val

  # Work on an explicit copy so the column assignments below cannot target a view.
  split_df = split_df.copy()
  split_df['answer_text'] = split_df['answers'].apply(lambda x: x['text'][0])
  split_df['source'] = 'answer: ' + split_df['answer_text'] + ' context: ' + split_df['context'] + '</s>'
  split_df['target'] = split_df['question']

  return split_df

# Build the train / val / test frames, passing the same 0.25 trim fraction to each call.
train_df, val_df, test_df = [
    parse_data(split_name, 0.25) for split_name in ('train', 'val', 'test')
]

Reusing dataset squad (/home/ec2-user/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]

Reusing dataset squad (/home/ec2-user/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]

Reusing dataset squad (/home/ec2-user/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]

In [184]:
# Load the pretrained 't5-base' fast tokenizer.
tokenizer = T5TokenizerFast.from_pretrained('t5-base')

# For every split, store how many tokens the tokenizer produces for each `source` string.
for split_frame in (train_df, val_df, test_df):
    split_frame['token_count'] = split_frame['source'].map(
        lambda text: len(tokenizer(text).tokens())
    )

Token indices sequence length is longer than the specified maximum sequence length for this model (565 > 512). Running this sequence through the model will result in indexing errors


In [186]:
# One histogram of `token_count` per split, binned in steps of 64 over [0, 1024].
train_hist, val_hist, test_hist = [
    alt.Chart(
        split_frame[['token_count']],
        title=f'Frequency of Full Input Token Lengths in {set_name} Set',
    )
    .mark_bar()
    .encode(
        x=alt.X('token_count:Q', bin=alt.Bin(extent=[0, 1024], step=64), title='Number of Tokens'),
        y=alt.Y('count()', title='Number of Examples'),
    )
    for split_frame, set_name in (
        (train_df, 'Training'),
        (val_df, 'Validation'),
        (test_df, 'Test'),
    )
]

# Show the three histograms side by side.
train_hist | val_hist | test_hist

In [233]:
# Baseline (Beam) scores vs GPP (Beam)
# One scatter panel per metric: x is the baseline score, y is the GPP score,
# both on a fixed 0-100 axis. SacreBLEU is plotted as stored; ROUGE and METEOR
# are multiplied by 100 first.
panels = {}
for metric, base_scores, gpp_scores, colour in (
    ('BLEU', df['Base.BS.SacreBLEU'], df['GPP.BS.SacreBLEU'], '#648FFF'),
    ('ROUGE', df['Base.BS.ROUGE'] * 100, df['GPP.BS.ROUGE'] * 100, '#DC267F'),
    ('METEOR', df['Base.BS.METEOR'] * 100, df['GPP.BS.METEOR'] * 100, '#FFB000'),
):
    panels[metric] = alt.Chart(
        pd.DataFrame({'base': base_scores, 'GPP': gpp_scores}),
        title=f'Baseline vs. GPP (Beam Search) {metric} Scores',
    ).mark_circle(size=20).encode(
        x=alt.X('base', title=f'Baseline {metric}', scale=alt.Scale(domain=[0, 100])),
        y=alt.Y('GPP', title=f'GPP {metric}', scale=alt.Scale(domain=[0, 100])),
        color=alt.value(colour),
    )

chart_BLEU = panels['BLEU']
chart_ROUGE = panels['ROUGE']
chart_METEOR = panels['METEOR']

# Show the three panels side by side.
chart_BLEU | chart_ROUGE | chart_METEOR

In [242]:
# Baseline (Nucleus) scores vs GPP (Nucleus)
# One scatter panel per metric: x is the baseline score, y is the GPP score,
# both on a fixed 0-100 axis. SacreBLEU is plotted as stored; ROUGE and METEOR
# are multiplied by 100 first.
panels = {}
for metric, base_scores, gpp_scores, colour in (
    ('BLEU', df['Base.NS.SacreBLEU'], df['GPP.NS.SacreBLEU'], '#648FFF'),
    ('ROUGE', df['Base.NS.ROUGE'] * 100, df['GPP.NS.ROUGE'] * 100, '#DC267F'),
    ('METEOR', df['Base.NS.METEOR'] * 100, df['GPP.NS.METEOR'] * 100, '#FFB000'),
):
    panels[metric] = alt.Chart(
        pd.DataFrame({'base': base_scores, 'GPP': gpp_scores}),
        title=f'Baseline vs. GPP (Nucleus Search) {metric} Scores',
    ).mark_circle(size=20).encode(
        x=alt.X('base', title=f'Baseline {metric}', scale=alt.Scale(domain=[0, 100])),
        y=alt.Y('GPP', title=f'GPP {metric}', scale=alt.Scale(domain=[0, 100])),
        color=alt.value(colour),
    )

chart_BLEU = panels['BLEU']
chart_ROUGE = panels['ROUGE']
chart_METEOR = panels['METEOR']

# Show the three panels side by side.
chart_BLEU | chart_ROUGE | chart_METEOR

In [243]:
# Baseline (Beam) scores vs Baseline (Nucleus)
# One scatter panel per metric: x is the beam score, y is the nucleus score,
# both on a fixed 0-100 axis. SacreBLEU is plotted as stored; ROUGE and METEOR
# are multiplied by 100 first.
panels = {}
for metric, beam_scores, nucleus_scores, colour in (
    ('BLEU', df['Base.BS.SacreBLEU'], df['Base.NS.SacreBLEU'], '#648FFF'),
    ('ROUGE', df['Base.BS.ROUGE'] * 100, df['Base.NS.ROUGE'] * 100, '#DC267F'),
    ('METEOR', df['Base.BS.METEOR'] * 100, df['Base.NS.METEOR'] * 100, '#FFB000'),
):
    panels[metric] = alt.Chart(
        pd.DataFrame({'beam': beam_scores, 'nucleus': nucleus_scores}),
        title=f'Baseline (Beam Search) vs. Baseline (Nucleus Search) {metric} Scores',
    ).mark_circle(size=20).encode(
        x=alt.X('beam', title=f'Beam Search {metric}', scale=alt.Scale(domain=[0, 100])),
        y=alt.Y('nucleus', title=f'Nucleus Search {metric}', scale=alt.Scale(domain=[0, 100])),
        color=alt.value(colour),
    )

chart_BLEU = panels['BLEU']
chart_ROUGE = panels['ROUGE']
chart_METEOR = panels['METEOR']

# Show the three panels side by side.
chart_BLEU | chart_ROUGE | chart_METEOR

In [244]:
# GPP (Beam) scores vs GPP (Nucleus)
# One scatter panel per metric: x is the beam score, y is the nucleus score,
# both on a fixed 0-100 axis. SacreBLEU is plotted as stored; ROUGE and METEOR
# are multiplied by 100 first.
panels = {}
for metric, beam_scores, nucleus_scores, colour in (
    ('BLEU', df['GPP.BS.SacreBLEU'], df['GPP.NS.SacreBLEU'], '#648FFF'),
    ('ROUGE', df['GPP.BS.ROUGE'] * 100, df['GPP.NS.ROUGE'] * 100, '#DC267F'),
    ('METEOR', df['GPP.BS.METEOR'] * 100, df['GPP.NS.METEOR'] * 100, '#FFB000'),
):
    panels[metric] = alt.Chart(
        pd.DataFrame({'beam': beam_scores, 'nucleus': nucleus_scores}),
        title=f'GPP (Beam Search) vs. GPP (Nucleus Search) {metric} Scores',
    ).mark_circle(size=20).encode(
        x=alt.X('beam', title=f'Beam Search {metric}', scale=alt.Scale(domain=[0, 100])),
        y=alt.Y('nucleus', title=f'Nucleus Search {metric}', scale=alt.Scale(domain=[0, 100])),
        color=alt.value(colour),
    )

chart_BLEU = panels['BLEU']
chart_ROUGE = panels['ROUGE']
chart_METEOR = panels['METEOR']

# Show the three panels side by side.
chart_BLEU | chart_ROUGE | chart_METEOR

In [288]:
# Training loss viz
# Per-checkpoint evaluation metrics and loss, entered as literal values.
tl_df = pd.DataFrame({'Checkpoint': ['ckpt2100','ckpt2400','ckpt2700','ckpt3000','ckpt3300','ckpt3600','ckpt3900','ckpt4200','ckpt4500','ckpt4800','ckpt5100','ckpt5400','ckpt5700','ckpt6000','ckpt6300','ckpt6600','ckpt6900','ckpt7200','ckpt7500','ckpt7800','ckpt8100'],
 'BLEU': [1.397566,3.074543,4.445089,4.693644,5.807746,6.212545,6.825512,8.399519,12.464174,12.213876,15.031967,13.711,13.818584,13.803322,15.013156,15.177581,13.876068,15.842366,11.651171,17.36043,16.924577],
 'METEOR': [0.163511,0.190604,0.21145,0.246232,0.266645,0.291941,0.341923,0.359361,0.387946,0.391901,0.393124,0.407105,0.400393,0.394128,0.396988,0.393732,0.382578,0.399679,0.337545,0.42387,0.413562],
 'ROUGE-L': [0.167276,0.209391,0.225277,0.223424,0.240419,0.254815,0.288655,0.308419,0.361046,0.360301,0.377342,0.377948,0.376598,0.372487,0.381851,0.379609,0.366231,0.388277,0.337639,0.409743,0.403352],
 'Loss': [2.06844639778137,2.01282548904418,1.79218435287475,1.59624195098876,1.27731812000274,1.3270890712738,0.948004305362701,1.02327954769134,0.736889660358428,0.678290545940399,0.752719819545745,0.649805963039398,0.626366317272186,0.632929623126983,0.622269809246063,0.625878453254699,0.641974627971649,0.579615056514739,0.906293988227844,0.578315675258636,0.631578862667083]})

# Transform scale: multiply METEOR and ROUGE-L by 100 so they sit on the same
# axis range as the BLEU values above.
tl_df['METEOR'] = tl_df['METEOR'] * 100
tl_df['ROUGE-L'] = tl_df['ROUGE-L'] * 100
# Keep only the step number by removing the literal 'ckpt' prefix. A fixed
# "last four characters" slice would truncate a step number of five or more digits.
tl_df['Checkpoint'] = tl_df['Checkpoint'].str.replace('ckpt', '', regex=False)

# Long format: one row per (checkpoint, series), with metrics and loss kept
# apart so each can be drawn against its own y axis.
tl_df_metrics = tl_df[['Checkpoint','BLEU','METEOR','ROUGE-L']].melt(id_vars=['Checkpoint'])
tl_df_loss = tl_df[['Checkpoint','Loss']].melt(id_vars=['Checkpoint'])

# Both layers use the same colour range, sort order and legend title.
series_order = ['BLEU','METEOR','ROUGE-L','Loss']
series_colours = ['#648FFF','#DC267F','#FFB000', 'black']

metrics_chart = alt.Chart(tl_df_metrics, title = 'Evaluation Metrics vs. Loss by Training Checkpoint').mark_line().encode(
    x = alt.X('Checkpoint'),
    y = alt.Y('value', title = 'Metric Score'),
    color = alt.Color('variable', title = 'Metric', scale = alt.Scale(range = series_colours), sort = series_order)
)

loss_chart = alt.Chart(tl_df_loss).mark_line().encode(
    x = alt.X('Checkpoint'),
    y = alt.Y('value', title = 'Loss'),
    color = alt.Color('variable', title = 'Metric', scale = alt.Scale(range = series_colours), sort = series_order)
)

# Layer the two charts and give each its own y scale.
(metrics_chart + loss_chart).resolve_scale(
    y = 'independent'
)