In [70]:
import os, dill
import json
from scipy import stats
import matplotlib.pyplot as plt
import sys
import glob
import numpy as np
import torch
import torch.nn as nn
import math

sys.path.append('..')

import ptm_recommender.graph_models.graph_util as util
from ptm_recommender.graph_models.gcnn.gin_utils import graph_to_s2vgraphs

torch.set_num_threads(2)
torch.cuda.is_available = lambda : False
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


DILL_PATH = '../ptm_model_bench/base_model_dataset/'
MODEL_PATH = '../results/gcnn_ptm_model_bench/'
PERFORMANCE_PATH = '../ptm_model_bench/performance/seen_base_models/'
OUTPUT_NAME = 'result'

## Recomendation on testsets

In [71]:
## Basic setup
task_to_metric = {
            "mrpc" : "eval_accuracy",  "cola" : "eval_matthews_correlation", #"cola" : "eval_accuracy",
            "rte" :'eval_accuracy', "sst2" :'eval_accuracy', "stsb" :'eval_pearson',
            "wnli" :'eval_accuracy', "squad_v2": "f1", "mnli": "eval_accuracy", 
            "qnli": "eval_accuracy", "qqp": "eval_accuracy"}

base_model_list = [
    "albert-base-v2", 
    "albert-large-v2",
    "bert-base-uncased", 
    "bert-large-uncased", 
    "distilbert-base-uncased",
    "distilroberta-base", 
    "electra-base-discriminator",
    "electra-large-discriminator",
    "roberta-base", 
    "roberta-large",
    "xlm-roberta-base", 
    "xlm-roberta-large"
]
# Task for title
task_title = {
    "cola":'CoLA', 
    "mrpc":'MRPC',
    "rte" :'RTE', 
    "sst2" :'SST2', 
    "stsb" :'STSB',
    "wnli" :'WNLI', 
    "squad_v2": 'SQuADv2',
    "mnli": 'MNLI', 
    "qnli": 'QNLI', 
    "qqp": 'QQP'
}


In [119]:
tasks = list(task_title.keys())
tasks = ['cola', 'mnli', 'mrpc', 'qnli', 'rte', 'stsb', 'sst2', 'qqp']
output_name = 'result'
performance_results_base_path='../ptm_model_bench/performance/merged_models/'
import dill, torch
import os,json
import matplotlib.pyplot as plt
import scipy.stats as stats
import numpy as np
from matplotlib import gridspec
import math

def get_perf_and_logme(task):
    model_results_base_path = f'../results/gcnn_ptm_model_bench/'  ## 이부분만 제가 쓰는 폴더
    with  open(os.path.join(model_results_base_path,'{}_{}.dill'.format(output_name ,task)),'rb') as f:
        model_result = dill.load(f)
    with  open(os.path.join(performance_results_base_path, '{}_performance_score.json'.format(task)), 'r') as f:
        performance = json.load(f)
    try:
        with  open(os.path.join(performance_results_base_path, '{}_logme.json'.format(task)), 'r') as f:
            logme = json.load(f) 
    except:
        logme=None
    return model_result, performance, logme

In [129]:
tasks = list(task_title.keys())
tasks = ['cola', 'mnli', 'mrpc', 'qnli', 'rte', 'stsb', 'sst2', 'qqp']
for tdx, task in enumerate(tasks):
    if task =='wnli' or task == 'squad_v2':
        continue
    model_result, performance, logme = get_perf_and_logme(task)
    for x in model_result['total_result']:
        x['logme'] = logme[x['model_name']]
    perf_list =model_result['total_result']
    logme_sorted = sorted(perf_list, key=lambda d:-d['logme'])
    pred_acc_sorted = sorted(perf_list, key=lambda d:-d['pred_accuracy'])
    true_acc_sorted = sorted(perf_list, key=lambda d:-d['true_accuracy'])
    logme_selected = logme_sorted[0]['model_name']
    gnn_based_selected = pred_acc_sorted[0]['model_name']

    for idx, x in enumerate(true_acc_sorted):
        if x['model_name'] ==  logme_selected:
            logme_rank = (idx+1)/len(true_acc_sorted)
        if x['model_name'] ==  gnn_based_selected:
            ptm_recommender_rank = (idx+1)/len(true_acc_sorted)
    print(task, ": logme {:.2f}%, ptm_recommender {:.2f}%".format(logme_rank*100,  ptm_recommender_rank*100))

task = 'squad_v2'
model_result, performance, logme = get_perf_and_logme(task)
perf_list =model_result['total_result']
pred_acc_sorted = sorted(perf_list, key=lambda d:-d['pred_accuracy'])
true_acc_sorted = sorted(perf_list, key=lambda d:-d['true_accuracy'])
gnn_based_selected = pred_acc_sorted[0]['model_name']

for idx, x in enumerate(true_acc_sorted):
    if x['model_name'] ==  gnn_based_selected:
        ptm_recommender_rank = (idx+1)/len(true_acc_sorted)
print('squad_v2', ": logme {}, ptm_recommender {:.2f}%".format('-',  ptm_recommender_rank*100))


cola : logme 1.04%, ptm_recommender 2.08%
mnli : logme 10.19%, ptm_recommender 40.74%
mrpc : logme 7.41%, ptm_recommender 11.11%
qnli : logme 13.89%, ptm_recommender 0.93%
rte : logme 4.63%, ptm_recommender 20.37%
stsb : logme 43.52%, ptm_recommender 4.63%
sst2 : logme 12.96%, ptm_recommender 12.04%
qqp : logme 8.33%, ptm_recommender 7.41%
squad_v2 : logme -, ptm_recommender 1.85%


## Recomendation on seen base 

In [132]:
def transform_to_s2graph(base_model, performance, feature_dim ):
    graph_info = [base_model]
    graph_info_graphs = [x[0] for x in graph_info]
    graph_info_params = [x[1] for x in graph_info]
    graph_info_model_nm = [x[2] for x in graph_info]
    for (G, p) in zip(graph_info_graphs, graph_info_params):
        G.graph['feat_dim'] = feature_dim
        param_keys = p.keys()
        for u in util.node_iter(G):
            label = util.node_dict(G)[u]['label']
            if label in param_keys:
                feature = p[label]
                if feature.shape[0] != feature_dim:
                    assert feature.shape[0] == feature_dim
            else:
                feature = torch.zeros(feature_dim).squeeze()
            util.node_dict(G)[u]['feat'] = feature.float() 
    test_accuracies = [performance[nm][task_to_metric[task]] for nm in graph_info_model_nm if nm in performance.keys()]
    test_graphs = graph_to_s2vgraphs(graph_info_graphs, test_accuracies, graph_info_model_nm)
    return test_graphs

In [133]:
def load_base_model_result(task, performance, prefix = 'base_'):
    model_path = f'{MODEL_PATH}/model_{OUTPUT_NAME}_{task}.pth'
   
    model = torch.load(model_path, map_location=device)
    model.device=device
    
    list_of_models = glob.glob("../ptm_model_bench/base_model_dataset/*")
    feature_dim = 128
    result = {'total_result' : []}
    for idx, model_path in enumerate(list_of_models):
        
        model_type =  model_path.split('/')[-1]
        if model_type.startswith(prefix) is False:
            continue
        with open(model_path, 'rb') as f:
            base_model_dataset=dill.load(f)
        
        for base_model in base_model_dataset:
            if base_model[2] not in performance.keys():
                continue
            test_graphs = transform_to_s2graph(base_model, performance, feature_dim)
            y_pred = model(test_graphs)
        model_nm = base_model_dataset[0][2]
        true_accuracy = performance[model_nm][task_to_metric[task]] if task!='squad_v2' \
                        else performance[model_nm][task_to_metric[task]]/100
        result['total_result'].append({'true_accuracy': true_accuracy,
                             'pred_accuracy': y_pred.view(-1).detach().cpu().numpy()[0],
                             'model_name': base_model_dataset[0][2]})
    return result, test_graphs

In [134]:
DILL_PATH = '../ptm_model_bench/base_model_dataset/'
MODEL_PATH = '../results/gcnn_ptm_model_bench/'
PERFORMANCE_PATH = '../ptm_model_bench/performance/seen_base_models/'
OUTPUT_NAME = 'result'
for tdx, task in enumerate(tasks):
    if task =='wnli' or task == 'squad_v2':
        continue
    with  open( f'{PERFORMANCE_PATH}/base_{task}_performance_score.json', 'r') as f:
        performance = json.load(f)
        performance = performance[task]
    if task != 'squad_v2':
        with  open( f'{PERFORMANCE_PATH}/base_{task}_logme.json', 'r') as f:
            logme = json.load(f)
    model_result, test_graph = load_base_model_result(task, performance)
    for x in model_result['total_result']:
        x['logme'] = logme[x['model_name']]
    perf_list =model_result['total_result']
    logme_sorted = sorted(perf_list, key=lambda d:-d['logme'])
    pred_acc_sorted = sorted(perf_list, key=lambda d:-d['pred_accuracy'])
    true_acc_sorted = sorted(perf_list, key=lambda d:-d['true_accuracy'])
    logme_selected = logme_sorted[0]['model_name']
    gnn_based_selected = pred_acc_sorted[0]['model_name']
    for idx, x in enumerate(true_acc_sorted):
        if x['model_name'] ==  logme_selected:
            logme_rank = (idx+1)/len(true_acc_sorted)
        if x['model_name'] ==  gnn_based_selected:
            ptm_recommender_rank = (idx+1)/len(true_acc_sorted)
    print(task, ": logme {:.2f}%, ptm_recommender {:.2f}%".format(logme_rank*100,  ptm_recommender_rank*100))

task = 'squad_v2'
with  open( f'{PERFORMANCE_PATH}/base_{task}_performance_score.json', 'r') as f:
    performance = json.load(f)
    performance = performance[task]
if task != 'squad_v2':
    with  open( f'{PERFORMANCE_PATH}/base_{task}_logme.json', 'r') as f:
        logme = json.load(f)
model_result, test_graph = load_base_model_result(task, performance)
perf_list =model_result['total_result']
pred_acc_sorted = sorted(perf_list, key=lambda d:-d['pred_accuracy'])
true_acc_sorted = sorted(perf_list, key=lambda d:-d['true_accuracy'])
gnn_based_selected = pred_acc_sorted[0]['model_name']

for idx, x in enumerate(true_acc_sorted):
    if x['model_name'] ==  gnn_based_selected:
        ptm_recommender_rank = (idx+1)/len(true_acc_sorted)
print(task, ": logme {}, ptm_recommender {:.2f}%".format('-',  ptm_recommender_rank*100))

cola : logme 8.33%, ptm_recommender 8.33%
mnli : logme 8.33%, ptm_recommender 16.67%
mrpc : logme 58.33%, ptm_recommender 16.67%
qnli : logme 50.00%, ptm_recommender 8.33%
rte : logme 58.33%, ptm_recommender 8.33%
stsb : logme 50.00%, ptm_recommender 16.67%
sst2 : logme 66.67%, ptm_recommender 8.33%
qqp : logme 50.00%, ptm_recommender 8.33%
squad_v2 : logme -, ptm_recommender 8.33%


## Recomendation on unseen base 

In [135]:
interested_models =[ 'bert-base-cased',
                    'bert-large-cased',
                    'bert-base-multilingual-uncased',
                    'bert-base-multilingual-cased',
                    'bert-large-uncased-whole-word-masking',
                    'bert-large-cased-whole-word-masking',
                    'roberta-base-openai-detector',
                    'roberta-large-openai-detector',
                    'distilbert-base-cased',
                    'distilbert-base-multilingual-cased'
                 ]
PERFORMANCE_PATH = '../ptm_model_bench/performance/unseen_base_models/'
for tdx, task in enumerate(tasks):
    if task =='wnli' or task == 'squad_v2':
        continue
    with  open( f'{PERFORMANCE_PATH}/unrelated_base_{task}_performance_score.json', 'r') as f:
        performance = json.load(f)
        performance = performance[task]
    if task != 'squad_v2':
        with  open( f'{PERFORMANCE_PATH}/unrelated_base_{task}_logme.json', 'r') as f:
            logme = json.load(f)
    model_result, test_graph = load_base_model_result(task, performance, prefix='unseen_base_')
    for x in model_result['total_result']:
        x['logme'] = logme[x['model_name']]
    perf_list =model_result['total_result']
    logme_sorted = sorted(perf_list, key=lambda d:-d['logme'])
    pred_acc_sorted = sorted(perf_list, key=lambda d:-d['pred_accuracy'])
    true_acc_sorted = sorted(perf_list, key=lambda d:-d['true_accuracy'])
    logme_selected = logme_sorted[0]['model_name']
    gnn_based_selected = pred_acc_sorted[0]['model_name']
    for idx, x in enumerate(true_acc_sorted):
        if x['model_name'] ==  logme_selected:
            logme_rank = (idx+1)/len(true_acc_sorted)
        if x['model_name'] ==  gnn_based_selected:
            ptm_recommender_rank = (idx+1)/len(true_acc_sorted)
    print(task, ": logme {:.2f}%, ptm_recommender {:.2f}%".format(logme_rank*100,  ptm_recommender_rank*100))

task = 'squad_v2'
with  open( f'{PERFORMANCE_PATH}/unrelated_base_{task}_performance_score.json', 'r') as f:
    performance = json.load(f)
    performance = performance[task]
model_result, test_graph = load_base_model_result(task, performance, prefix='unseen_base_')
perf_list =model_result['total_result']
pred_acc_sorted = sorted(perf_list, key=lambda d:-d['pred_accuracy'])
true_acc_sorted = sorted(perf_list, key=lambda d:-d['true_accuracy'])
gnn_based_selected = pred_acc_sorted[0]['model_name']

for idx, x in enumerate(true_acc_sorted):
    if x['model_name'] ==  gnn_based_selected:
        ptm_recommender_rank = (idx+1)/len(true_acc_sorted)
print(task, ": logme {}, ptm_recommender {:.2f}%".format('-',  ptm_recommender_rank*100))

cola : logme 10.00%, ptm_recommender 10.00%
mnli : logme 10.00%, ptm_recommender 10.00%
mrpc : logme 80.00%, ptm_recommender 50.00%
qnli : logme 70.00%, ptm_recommender 10.00%
rte : logme 10.00%, ptm_recommender 30.00%
stsb : logme 70.00%, ptm_recommender 20.00%
sst2 : logme 50.00%, ptm_recommender 10.00%
qqp : logme 60.00%, ptm_recommender 10.00%
squad_v2 : logme -, ptm_recommender 10.00%


In [136]:
performance

{'bert-base-cased': {'exact': 69.2411353491,
  'f1': 72.3685457206,
  'total': 11873,
  'HasAns_exact': 70.0573549258,
  'HasAns_f1': 76.3211442882,
  'HasAns_total': 5928,
  'NoAns_exact': 68.4272497897,
  'NoAns_f1': 68.4272497897,
  'NoAns_total': 5945,
  'best_exact': 69.2411353491,
  'best_exact_thresh': 0.0,
  'best_f1': 72.3685457206,
  'best_f1_thresh': 0.0},
 'bert-large-cased': {'exact': 77.2256380022,
  'f1': 80.3405868502,
  'total': 11873,
  'HasAns_exact': 77.6315789474,
  'HasAns_f1': 83.8704095265,
  'HasAns_total': 5928,
  'NoAns_exact': 76.8208578638,
  'NoAns_f1': 76.8208578638,
  'NoAns_total': 5945,
  'best_exact': 77.2256380022,
  'best_exact_thresh': 0.0,
  'best_f1': 80.3405868502,
  'best_f1_thresh': 0.0},
 'bert-base-multilingual-uncased': {'exact': 63.9012886381,
  'f1': 66.7672928832,
  'total': 11873,
  'HasAns_exact': 61.032388664,
  'HasAns_f1': 66.7726161272,
  'HasAns_total': 5928,
  'NoAns_exact': 66.7619848612,
  'NoAns_f1': 66.7619848612,
  'NoAns_to

In [137]:
model_result

{'total_result': [{'true_accuracy': 0.860995693752,
   'pred_accuracy': 0.6209824,
   'model_name': 'bert-large-cased-whole-word-masking'},
  {'true_accuracy': 0.753892736933,
   'pred_accuracy': 0.6198833,
   'model_name': 'bert-large-uncased-whole-word-masking'},
  {'true_accuracy': 0.667672928832,
   'pred_accuracy': 0.6178845,
   'model_name': 'bert-base-multilingual-uncased'},
  {'true_accuracy': 0.6721428507999999,
   'pred_accuracy': 0.59631,
   'model_name': 'distilbert-base-multilingual-cased'},
  {'true_accuracy': 0.880149829905,
   'pred_accuracy': 0.7760251,
   'model_name': 'roberta-large-openai-detector'},
  {'true_accuracy': 0.669276993168,
   'pred_accuracy': 0.5926028,
   'model_name': 'distilbert-base-cased'},
  {'true_accuracy': 0.808699004967,
   'pred_accuracy': 0.71738327,
   'model_name': 'roberta-base-openai-detector'},
  {'true_accuracy': 0.803405868502,
   'pred_accuracy': 0.6240389,
   'model_name': 'bert-large-cased'},
  {'true_accuracy': 0.723685457206,
   