In [None]:
# This notebook runs the NND experiment from
# Section 5.3 of the paper, "Model Scaling Effects".
# On the Challenge 300 Generative QA dataset,
# three QA model families are compared:
# T5-ssm, UnifiedQA, and Macaw.

In [1]:
# Project helper module plus json and pandas; pandas (pd) builds the final results table.
import utils_misc, json, pandas as pd
# Ask utils_misc to choose a GPU; this cell's saved output reads "Will use GPU: 1".
# NOTE(review): freer_gpu is not referenced again in the visible cells, so the call
# presumably takes effect through a side effect, and presumably has to run before the
# NND utilities below are imported -- TODO confirm before reordering these lines.
freer_gpu = utils_misc.select_freer_gpu()

# Names used by the experiment loop: load_c300_nnd, GeneratorHF and utils_nnd.run_nnd.
from utils_nnd_dataset import load_c300_nnd
from utils_nnd import GeneratorHF
import utils_nnd

Will use GPU: 1


In [2]:
# Checkpoints evaluated in this notebook, grouped by model family.
# Every entry carries the hub identifier ("model_card") and the short label used in
# the reports ("model_name"); an optional "params" dict is forwarded as extra keyword
# arguments to GeneratorHF by the evaluation loop.
models = [
    # T5-ssm (NQ) checkpoints
    dict(model_card="google/t5-small-ssm-nq", model_name="t5-small-ssm-nq"),
    dict(model_card="google/t5-large-ssm-nq", model_name="t5-large-ssm-nq"),
    dict(model_card="google/t5-xl-ssm-nq", model_name="t5-xl-ssm-nq"),
    # Macaw checkpoints: these entries additionally set force_dec_prepend
    dict(
        model_card="allenai/macaw-large",
        model_name="macaw-large",
        params=dict(force_dec_prepend="$answer$ = "),
    ),
    dict(
        model_card="allenai/macaw-3b",
        model_name="macaw-3b",
        params=dict(force_dec_prepend="$answer$ = "),
    ),
    # UnifiedQA checkpoints
    dict(model_card="allenai/unifiedqa-t5-small", model_name="unifiedqa-t5-small"),
    dict(model_card="allenai/unifiedqa-t5-base", model_name="unifiedqa-t5-base"),
    dict(model_card="allenai/unifiedqa-t5-large", model_name="unifiedqa-t5-large"),
    dict(model_card="allenai/unifiedqa-t5-3b", model_name="unifiedqa-t5-3b"),

    # CPU-only models (kept disabled; uncomment to include them)
    # dict(model_card="google/t5-xxl-ssm-nq", model_name="t5-xxl-ssm-nq", params=dict(device="cpu")),
    # dict(model_card="allenai/macaw-11b", starter_file=None, model_name="macaw-11b", params=dict(force_dec_prepend="$answer$ = ", device="cpu")),
    # dict(model_card="allenai/macaw-answer-11b", starter_file=None, model_name="macaw-answer-11b", params=dict(force_dec_prepend="$answer$ = ", device="cpu")),
    # dict(model_card="allenai/unifiedqa-t5-11b", model_name="unifiedqa-t5-11b", params=dict(device="cpu")),
]

# Show how many checkpoints are enabled.
len(models)

9

In [3]:
# Folder passed to load_c300_nnd as `datafolder`. Kept in one named place so the
# notebook can be pointed at another machine's data without editing the loop body.
DATA_FOLDER = "/export/home/data/"

# One result dict per evaluated checkpoint, in the same order as `models`.
results = []
for gen in models:
    # Drop the reference to the previous iteration's model *before* the next one is
    # built. Without this, the old model stays bound to `model` while the
    # GeneratorHF(...) call below runs, so two checkpoints are held at once.
    # NOTE(review): this only frees memory if nothing else keeps a reference to the
    # previous model -- confirm against GeneratorHF / utils_nnd.
    model = None

    # The dataset is loaded per model because load_c300_nnd takes the model card.
    c300_nnd = load_c300_nnd(datafolder=DATA_FOLDER, model_card=gen["model_card"])
    print(gen["model_name"])

    # Entries without a "params" key pass no extra keyword arguments.
    model = GeneratorHF(model_card=gen["model_card"], **gen.get("params", {}))
    result = utils_nnd.run_nnd(
        c300_nnd,
        model,
        gen["model_name"],
        no_error_label=1.0,
        breakdown_key="cat_group",
        report_type="accuracy",
    )
    results.append(result)
    print(result)

# Last expression of the cell: show all per-model results as one table.
pd.DataFrame(results)

t5-small-ssm-nq


NND t5-small-ssm-nq:   0%|          | 0/807 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
NND t5-small-ssm-nq: 100%|██████████| 807/807 [03:05<00:00,  4.34it/s]


{'model_name': 't5-small-ssm-nq', 'accuracy': 51.42503097893433, 'A_1.0': 0.5142503097893433, 'A_0.0': 0.5142503097893433, 'A_Common Sense': 0.5057915057915058, 'A_Comparison': 0.7333333333333333, 'A_Entity': 0.6808510638297872, 'A_Creativity': 0.7352941176470589, 'A_Science': 0.44041450777202074}
t5-large-ssm-nq


NND t5-large-ssm-nq:   0%|          | 0/807 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
NND t5-large-ssm-nq: 100%|██████████| 807/807 [06:51<00:00,  1.96it/s]


{'model_name': 't5-large-ssm-nq', 'accuracy': 57.372986369268894, 'A_1.0': 0.573729863692689, 'A_0.0': 0.573729863692689, 'A_Common Sense': 0.6312741312741312, 'A_Comparison': 0.26666666666666666, 'A_Entity': 0.7021276595744681, 'A_Creativity': 0.5588235294117647, 'A_Science': 0.41450777202072536}
t5-xl-ssm-nq


NND t5-xl-ssm-nq: 100%|██████████| 807/807 [09:15<00:00,  1.45it/s]


{'model_name': 't5-xl-ssm-nq', 'accuracy': 55.762081784386616, 'A_1.0': 0.5576208178438662, 'A_0.0': 0.5576208178438662, 'A_Common Sense': 0.5907335907335908, 'A_Comparison': 0.4666666666666667, 'A_Entity': 0.723404255319149, 'A_Creativity': 0.6176470588235294, 'A_Science': 0.42487046632124353}
macaw-large


NND macaw-large: 100%|██████████| 807/807 [07:34<00:00,  1.78it/s]


{'model_name': 'macaw-large', 'accuracy': 64.31226765799256, 'A_1.0': 0.6431226765799256, 'A_0.0': 0.6431226765799256, 'A_Common Sense': 0.6583011583011583, 'A_Comparison': 0.7333333333333333, 'A_Entity': 0.5957446808510638, 'A_Creativity': 0.7647058823529411, 'A_Science': 0.5854922279792746}
macaw-3b


NND macaw-3b: 100%|██████████| 807/807 [08:12<00:00,  1.64it/s]


{'model_name': 'macaw-3b', 'accuracy': 68.64931846344486, 'A_1.0': 0.6864931846344485, 'A_0.0': 0.6864931846344486, 'A_Common Sense': 0.6872586872586872, 'A_Comparison': 0.6, 'A_Entity': 0.5957446808510638, 'A_Creativity': 0.8235294117647058, 'A_Science': 0.689119170984456}
unifiedqa-t5-small


NND unifiedqa-t5-small: 100%|██████████| 807/807 [02:18<00:00,  5.82it/s]


{'model_name': 'unifiedqa-t5-small', 'accuracy': 56.25774473358116, 'A_1.0': 0.5625774473358116, 'A_0.0': 0.5625774473358116, 'A_Common Sense': 0.5231660231660231, 'A_Comparison': 0.5333333333333333, 'A_Entity': 0.574468085106383, 'A_Creativity': 0.6176470588235294, 'A_Science': 0.6580310880829016}
unifiedqa-t5-base


NND unifiedqa-t5-base: 100%|██████████| 807/807 [03:54<00:00,  3.45it/s]


{'model_name': 'unifiedqa-t5-base', 'accuracy': 59.97521685254027, 'A_1.0': 0.5997521685254027, 'A_0.0': 0.5997521685254028, 'A_Common Sense': 0.5656370656370656, 'A_Comparison': 0.4, 'A_Entity': 0.6170212765957447, 'A_Creativity': 0.6470588235294118, 'A_Science': 0.694300518134715}
unifiedqa-t5-large


NND unifiedqa-t5-large: 100%|██████████| 807/807 [07:29<00:00,  1.80it/s]


{'model_name': 'unifiedqa-t5-large', 'accuracy': 63.320941759603464, 'A_1.0': 0.6332094175960347, 'A_0.0': 0.6332094175960348, 'A_Common Sense': 0.6332046332046332, 'A_Comparison': 0.4666666666666667, 'A_Entity': 0.6808510638297872, 'A_Creativity': 0.6764705882352942, 'A_Science': 0.6269430051813472}
unifiedqa-t5-3b


NND unifiedqa-t5-3b: 100%|██████████| 807/807 [07:08<00:00,  1.89it/s]

{'model_name': 'unifiedqa-t5-3b', 'accuracy': 64.43618339529121, 'A_1.0': 0.644361833952912, 'A_0.0': 0.644361833952912, 'A_Common Sense': 0.6293436293436293, 'A_Comparison': 0.4666666666666667, 'A_Entity': 0.5957446808510638, 'A_Creativity': 0.8235294117647058, 'A_Science': 0.6787564766839378}





Unnamed: 0,model_name,accuracy,A_1.0,A_0.0,A_Common Sense,A_Comparison,A_Entity,A_Creativity,A_Science
0,t5-small-ssm-nq,51.425031,0.51425,0.51425,0.505792,0.733333,0.680851,0.735294,0.440415
1,t5-large-ssm-nq,57.372986,0.57373,0.57373,0.631274,0.266667,0.702128,0.558824,0.414508
2,t5-xl-ssm-nq,55.762082,0.557621,0.557621,0.590734,0.466667,0.723404,0.617647,0.42487
3,macaw-large,64.312268,0.643123,0.643123,0.658301,0.733333,0.595745,0.764706,0.585492
4,macaw-3b,68.649318,0.686493,0.686493,0.687259,0.6,0.595745,0.823529,0.689119
5,unifiedqa-t5-small,56.257745,0.562577,0.562577,0.523166,0.533333,0.574468,0.617647,0.658031
6,unifiedqa-t5-base,59.975217,0.599752,0.599752,0.565637,0.4,0.617021,0.647059,0.694301
7,unifiedqa-t5-large,63.320942,0.633209,0.633209,0.633205,0.466667,0.680851,0.676471,0.626943
8,unifiedqa-t5-3b,64.436183,0.644362,0.644362,0.629344,0.466667,0.595745,0.823529,0.678756
