In [1]:
import utils_misc
# Pick the GPU to run on; the cell output reports which device was chosen.
# NOTE(review): presumably this has to run before the model imports below so
# that they pick up the selected device -- confirm against utils_misc.
utils_misc.select_freer_gpu()

from sklearn.metrics import accuracy_score, balanced_accuracy_score
from model_baselines import NLIAnswerEquivalence, STS_AE
from model_consolidation import ConsolidationModel
from utils_nanco import load_nanco_pairs
import numpy as np, os, pandas as pd
from collections import Counter

Will use GPU: 2


In [2]:
# Registry of the scorers under comparison, declared as (display name, scorer)
# pairs. The scorers are instantiated here, in the order listed, and the pairs
# are then expanded into the list-of-dicts layout used by the evaluation cells.
_named_scorers = [
    # Sentence Similarity
    ("STS-BertB", STS_AE(model_card="sentence-transformers/stsb-bert-base")),
    ("STS-MPNet", STS_AE(model_card="sentence-transformers/all-mpnet-base-v2")),

    # NLI
    ("MNLI-Eonly", NLIAnswerEquivalence(model_card="roberta-large-mnli", mode="eonly")),
    ("VitC-Eonly", NLIAnswerEquivalence(mode="eonly")),

    # Answer equivalence-based models
    # ("lerc", LERCScorer(archive_path="/export/home/models/lerc-2020-11-18.tar.gz")), # Put a request on Github if you want this model
    ("quip_reg_ae_mocha", ConsolidationModel(model_card="/export/home/models/quip-hf/", model_file="/export/share/plaban/models/quip_ae_mocha_mae_0.5352.bin")),
    ("discord_qg/mocha", ConsolidationModel(model_card="Salesforce/qa_consolidation")),
]

# Each entry exposes the scorer under "model" and its column label under "model_name".
models = [{"model_name": name, "model": scorer} for name, scorer in _named_scorers]



Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at /export/home/models/quip-hf/ were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias']
- This IS e

<All keys matched successfully>


In [3]:
def get_validation_threshold(scores, labels, min_threshold=0.0, max_threshold=5.0, step=0.01):
    """Grid-search the decision threshold that maximises balanced accuracy.

    An example is predicted as class 1 when its score is strictly greater than
    the candidate threshold, and as class 0 otherwise. Candidates are taken
    from ``np.arange(min_threshold, max_threshold, step)``.

    Args:
        scores: Re-iterable collection of numeric scores, one per example
            (it is traversed once per candidate threshold).
        labels: Gold labels aligned with ``scores``.
        min_threshold: First candidate threshold (inclusive). Defaults to 0.0.
        max_threshold: End of the sweep (exclusive). Defaults to 5.0.
        step: Spacing between consecutive candidates. Defaults to 0.01.

    Returns:
        Tuple ``(best_threshold, max_acc)``: the candidate with the highest
        balanced accuracy and that accuracy. On ties the lowest candidate is
        kept. If no candidate reaches a balanced accuracy above 0, the result
        is ``(min_threshold, 0.0)``.
    """
    # The sweep bounds are parameters because scorers do not share a score
    # scale; the defaults reproduce the original fixed 0-5 grid.
    thresholds = np.arange(min_threshold, max_threshold, step)
    max_acc = 0.0
    best_threshold = min_threshold
    for t in thresholds:
        y_pred = [1 if s > t else 0 for s in scores]
        acc = balanced_accuracy_score(labels, y_pred)
        # Strict comparison: the first (lowest) threshold wins among ties.
        if acc > max_acc:
            max_acc = acc
            best_threshold = t
    return best_threshold, max_acc

def dataset_builder(ac_dataset):
    """Split a list of pair records into parallel per-field lists.

    Args:
        ac_dataset: Re-iterable collection of dicts, each holding the keys
            "question", "answer1", "answer2", "paragraph1", "paragraph2"
            and "label".

    Returns:
        Tuple ``(questions, answers1, answers2, contexts1, contexts2, labels)``
        where every element is a list aligned with ``ac_dataset``.
    """
    # Field order here fixes the order of the returned tuple.
    field_order = ("question", "answer1", "answer2", "paragraph1", "paragraph2", "label")
    columns = []
    for field in field_order:
        columns.append([record[field] for record in ac_dataset])
    return tuple(columns)

def add_average_row(results, model_names=None):
    """Append an "Average" row with each model's mean score to ``results``.

    Args:
        results: List of per-question row dicts; each row must contain one
            numeric entry per model name. The list is modified in place.
        model_names: Optional iterable of column names to average. When
            omitted, the names are read from the notebook-level ``models``
            list (its "model_name" entries).

    Returns:
        None. The new row ``{"Question": "Average", <name>: <mean>, ...}`` is
        appended to ``results``.
    """
    if model_names is None:
        # Fall back to the notebook-level registry so existing calls keep working.
        model_names = [mod["model_name"] for mod in models]

    average_row = {"Question": "Average"}
    for name in model_names:
        # Averages are computed before the row is appended, so the new row
        # never contributes to its own mean.
        average_row[name] = np.mean([row[name] for row in results])
    results.append(average_row)

In [4]:
# Validation
# For every validation question, score the pairs with each model, sweep a
# decision threshold, and keep both the best threshold and the balanced
# accuracy it reaches on that question.
model_thresholds = {}
for mod in models:
    model_thresholds[mod["model_name"]] = []

results = []
for q in ("Q2", "Q3", "Q5", "Q6"):  # Validation questions
    ac_dataset = load_nanco_pairs(q_includes=[q])
    questions, answers1, answers2, contexts1, contexts2, labels = dataset_builder(ac_dataset)

    D = {"Question": q}
    for mod in models:
        name = mod["model_name"]
        scored = mod["model"].score(
            questions=questions,
            answers1=answers1,
            answers2=answers2,
            contexts1=contexts1,
            contexts2=contexts2,
        )
        val_scores = scored["scores"]
        validation_threshold, best_val_acc = get_validation_threshold(val_scores, labels)
        # One threshold per (model, question); these are averaged later.
        model_thresholds[name].append(validation_threshold)
        D[name] = best_val_acc
    results.append(D)

add_average_row(results)
pd.DataFrame(results)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unnamed: 0,Question,STS-BertB,STS-MPNet,MNLI-Eonly,VitC-Eonly,quip_reg_ae_mocha,discord_qg/mocha
0,Q2,0.912879,0.783517,0.525317,0.541682,0.846658,0.904112
1,Q3,0.986928,0.880719,0.790033,0.798611,0.996732,0.97835
2,Q5,0.912811,0.797274,0.543675,0.615011,0.809285,0.879529
3,Q6,0.973956,0.955495,0.571429,0.590659,0.942088,0.95544
4,Average,0.946644,0.854251,0.607613,0.636491,0.898691,0.929358


In [7]:
# Threshold selection
# Report, for each model, the mean of the thresholds stored in model_thresholds.
for model_name, thresholds in model_thresholds.items():
    mean_threshold = np.mean(thresholds)
    print("[%s] Mean validation threshold: %.3f" % (model_name, mean_threshold))

[STS-BertB] Mean validation threshold: 0.488
[STS-MPNet] Mean validation threshold: 0.463
[MNLI-Eonly] Mean validation threshold: 0.385
[VitC-Eonly] Mean validation threshold: 0.058
[quip_reg_ae_mocha] Mean validation threshold: 1.330
[discord_qg/mocha] Mean validation threshold: 2.745


In [6]:
# Test
# Apply each model's mean validation threshold to the held-out test questions
# and report balanced accuracy per question plus the average.
results = []

# Store the averaged thresholds under their own name instead of rebinding
# `model_thresholds`: that dict keeps its per-question lists, so it never
# changes from lists to scalars depending on which cells have been run.
mean_thresholds = {mn: np.mean(ts) for mn, ts in model_thresholds.items()}

for q in ["Q4", "Q7", "Q8", "Q9"]: # Test questions
    D = {"Question": q}
    ac_dataset = load_nanco_pairs(q_includes=[q])
    questions, answers1, answers2, contexts1, contexts2, labels = dataset_builder(ac_dataset)

    for mod in models:
        test_scores = mod["model"].score(questions=questions, answers1=answers1, answers2=answers2, contexts1=contexts1, contexts2=contexts2)["scores"]
        # Look the threshold up once per model rather than once per example.
        threshold = mean_thresholds[mod["model_name"]]
        # Same decision rule as in validation: strictly above the threshold -> 1.
        test_preds = [1 if s > threshold else 0 for s in test_scores]
        test_bacc = balanced_accuracy_score(labels, test_preds)
        D[mod["model_name"]] = test_bacc
    results.append(D)

add_average_row(results)
pd.DataFrame(results)

Unnamed: 0,Question,STS-BertB,STS-MPNet,MNLI-Eonly,VitC-Eonly,quip_reg_ae_mocha,discord_qg/mocha
0,Q4,0.542101,0.706304,0.497609,0.512246,0.594928,0.65913
1,Q7,0.804275,0.775066,0.57449,0.578128,0.723267,0.888872
2,Q8,0.607563,0.695238,0.5,0.5,0.704762,0.752381
3,Q9,0.978528,0.743152,0.748466,0.75986,0.939641,0.951475
4,Average,0.733117,0.72994,0.580141,0.587559,0.740649,0.812965
