# Test the weaving code on the base models

In [1]:
# install dependencies

! pip install -q joblib  # joblib for memoizing functions
! pip install -q ipywidgets widgetsnbextension pandas-profiling # IProgress for progress bars

[0m

In [2]:
# Add model_merging to the python path

import os
import sys

# Resolve the model_merging directory relative to the current working directory.
model_merging_base = os.path.abspath("../model_merging/")
# Fail early, and report the resolved path, if the directory is missing.
assert os.path.exists(
    model_merging_base
), f"model_merging directory not found: {model_merging_base}"
# Guarded so that re-running this cell does not append duplicate entries.
if model_merging_base not in sys.path:
    sys.path.append(model_merging_base)

In [3]:
# import joblib for caching and distributed computing
from math import sqrt

from joblib import Memory, Parallel, delayed

# Disk-backed memoization store rooted at the relative "cache" directory.
# Switch to verbose=10 when joblib's caching log output is wanted for debugging.
memory = Memory(verbose=0, location="cache")

# Two-worker pool configured to hand results back as a generator.
parallel = Parallel(return_as="generator", n_jobs=2)
# Small demo workload: sqrt of the first ten perfect squares.
# NOTE(review): `output_generator` is not consumed in the visible cells.
output_generator = parallel(delayed(sqrt)(n**2) for n in range(10))

In [4]:
# Imports and cached functions

import os

from llm_weaver import (
    calculate_score_from_weaving_config,
    get_score_from_named_model,
    test_weaver,
)

# Tokenizer parallelism is switched off to avoid deadlocks.
os.environ.update(TOKENIZERS_PARALLELISM="false")

# Memoized variants of the llm_weaver entry points, all backed by `memory`.
test_weaver_cached = memory.cache(test_weaver)
get_score_from_named_model_cached = memory.cache(get_score_from_named_model)
calculate_score_from_weaving_config_cached = memory.cache(
    calculate_score_from_weaving_config
)

## Make sure you can build using `.build()`

In [16]:
import re

import transformers
from llm_weaver import get_blank_model, get_model_config


def _version_tuple(version: str) -> tuple:
    """Return the leading numeric release components of ``version`` as ints.

    Examples: "4.35.0" -> (4, 35, 0); "4.36.0.dev0" -> (4, 36, 0).
    A string with no leading digits yields an empty tuple, which compares
    lower than any real version.
    """
    release = re.match(r"\d+(?:\.\d+)*", version)
    if release is None:
        return ()
    return tuple(int(part) for part in release.group().split("."))


# Minimum version guess, see:
# https://github.com/huggingface/transformers/commit/4a55e4787760fdb6c40a972a60d814ba05425da1#diff-648ec06beb5ae6380c7f611a0f513a5d392509497d245a09f06b6549358afdffR1151
MIN_TRANSFORMERS_VERSION = (4, 3, 1)

# Compare numerically: a plain string comparison is lexicographic and would
# wrongly treat e.g. "4.10.0" as older than "4.3.1".
if _version_tuple(transformers.__version__) < MIN_TRANSFORMERS_VERSION:
    raise ValueError(
        "Need transformers >= 4.3.1, or something like that. Not sure of the version."
    )

print(f"You have transformers version {transformers.__version__}!")

# Instantiate a blank model from the checkpoint's config and check that
# `.build()` can be called on it.
model = get_blank_model(get_model_config("textattack/roberta-base-RTE"))
model.build()

type(model)

You have transformers version 4.35.0!


All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


transformers.models.roberta.modeling_tf_roberta.TFRobertaForSequenceClassification

## Step 0: Test weaving code

This test checks that a model reconstructed by the weaver from all of its own parts gets the same evaluation score as the original model.

In [15]:
# Hugging Face model ids to run through the weaving test and the baseline
# scoring below. Commented-out entries are currently disabled.
model_ids = [
    "textattack/roberta-base-RTE",
    "textattack/roberta-base-MNLI",  # <--- this one has a very low score (0.3 accuracy, see below)
    # "howey/roberta-large-rte",
    # "howey/roberta-large-mnli",
    # "howey/roberta-large-qnli",
    # "howey/roberta-large-sst2",
    # "howey/roberta-large-cola",
    # "howey/roberta-large-mrpc",
    # "howey/roberta-large-qqp",
    # "howey/roberta-large-stsb",
    # "JeremiahZ/roberta-base-rte",
    # "JeremiahZ/roberta-base-mnli",
    # "JeremiahZ/roberta-base-qnli",
    # "JeremiahZ/roberta-base-sst2",
    # "JeremiahZ/roberta-base-cola",
    # "JeremiahZ/roberta-base-mrpc",
    # "JeremiahZ/roberta-base-qqp",
    # "JeremiahZ/roberta-base-stsb",
    # "l-yohai/bigbird-roberta-base-mnli",
    # "howey/roberta-large-squad2",
]
# Results recorded from an earlier run, one line per model id.
# NOTE(review): each tuple presumably pairs the original model's score with the
# weaved model's score -- confirm the order against `test_weaver`.
# textattack/roberta-base-RTE ({'accuracy': 0.7}, {'accuracy': 0.7})
# textattack/roberta-base-MNLI ({'accuracy': 0.3}, {'accuracy': 0.3})
# howey/roberta-large-rte ({'accuracy': 0.65}, {'accuracy': 0.65})
# howey/roberta-large-mnli ({'accuracy': 0.68}, {'accuracy': 0.68})
# howey/roberta-large-qnli ({'accuracy': 0.86}, {'accuracy': 0.86})
# howey/roberta-large-sst2 ({'accuracy': 0.77}, {'accuracy': 0.77})
# howey/roberta-large-cola ({'matthews_correlation': 0.19169538058831714}, {'matthews_correlation': 0.19169538058831714})
# howey/roberta-large-mrpc ({'accuracy': 0.61, 'f1': 0.6486486486486487}, {'accuracy': 0.61, 'f1': 0.6486486486486487})
# howey/roberta-large-qqp ({'accuracy': 0.77, 'f1': 0.5490196078431372}, {'accuracy': 0.77, 'f1': 0.5490196078431372})
# JeremiahZ/roberta-base-rte ({'accuracy': 0.61}, {'accuracy': 0.61})
# JeremiahZ/roberta-base-mnli ({'accuracy': 0.83}, {'accuracy': 0.83})
# JeremiahZ/roberta-base-qnli ({'accuracy': 0.82}, {'accuracy': 0.82})
# JeremiahZ/roberta-base-sst2 ({'accuracy': 0.89}, {'accuracy': 0.89})
# JeremiahZ/roberta-base-cola ({'matthews_correlation': 0.15285569591066622}, {'matthews_correlation': 0.15285569591066622})
# JeremiahZ/roberta-base-mrpc ({'accuracy': 0.33, 'f1': 0.10666666666666666}, {'accuracy': 0.33, 'f1': 0.10666666666666666})
# JeremiahZ/roberta-base-qqp ({'accuracy': 0.79, 'f1': 0.7341772151898734}, {'accuracy': 0.79, 'f1': 0.7341772151898734})


# pandas is imported in this cell because no earlier cell imports it; without
# this the cell raises NameError on `pd` after Restart Kernel -> Run All.
import pandas as pd

# One row per model; the value returned by the cached weaving test is stored
# as its string representation.
records = [
    dict(model_id=model_id, results=str(test_weaver_cached(model_id)))
    for model_id in model_ids
]

weaving_test_results_df = pd.DataFrame(records)
# weaving_test_results_df.to_csv("test-weaving-on-base-models.weaving_test_results.csv")
weaving_test_results_df

Unnamed: 0,model_id,results
0,textattack/roberta-base-RTE,"({'accuracy': 0.7}, {'accuracy': 0.7})"
1,textattack/roberta-base-MNLI,"({'accuracy': 0.3}, {'accuracy': 0.3})"


## Step 1: Get original model baselines

In [6]:
from llm_weaver import normalize_glue_task_name
from tqdm import tqdm

# Number of examples passed to the scorer for every (model, split) pair.
n_examples = 256

# Collect one record per (split, model) combination using the cached scorer.
records = []
for split in tqdm(["train", "validation", "test"]):
    for model_id in tqdm(model_ids):
        score = get_score_from_named_model_cached(
            model_id=model_id,
            split=split,
            n_examples=n_examples,
            max_length=128,
            batch_size=128,
        )
        records.append(
            dict(model_id=model_id, split=split, score=score, n_examples=n_examples)
        )
import pandas as pd

# Warning text noted while running (kept for reference):
# Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
df = pd.DataFrame.from_records(records)
df = (
    # Expand each score dict into its own metric columns, then drop the raw dict.
    df.join(pd.json_normalize(df["score"]))
    .drop(columns=["score"])
    .assign(
        task=lambda frame: frame["model_id"].apply(normalize_glue_task_name),
        roberta=lambda frame: frame["model_id"].apply(
            lambda x: "large" if "large" in x else "base"
        ),
    )
    # Optional filters, currently disabled:
    # .loc[lambda frame: frame["split"] == "train"]
    # .loc[lambda frame: ~frame["accuracy"].isna()]
    .sort_values(["task", "roberta", "split"])
    # Metrics a model does not report become empty strings instead of NaN.
    .fillna("")
)
df.to_csv("test-weaving-on-base-models.original-scores.csv", index=False)
df

100%|██████████| 16/16 [00:00<00:00, 1537.71it/s]
100%|██████████| 16/16 [00:00<00:00, 1681.76it/s]
100%|██████████| 16/16 [00:00<00:00, 1742.54it/s]
100%|██████████| 3/3 [00:00<00:00, 74.60it/s]


Unnamed: 0,model_id,split,n_examples,accuracy,matthews_correlation,f1,task,roberta
45,JeremiahZ/roberta-base-cola,test,256,,0.0,,cola,base
13,JeremiahZ/roberta-base-cola,train,256,,0.452776,,cola,base
29,JeremiahZ/roberta-base-cola,validation,256,,0.172932,,cola,base
38,howey/roberta-large-cola,test,256,,0.0,,cola,large
6,howey/roberta-large-cola,train,256,,0.451394,,cola,large
22,howey/roberta-large-cola,validation,256,,0.235292,,cola,large
33,textattack/roberta-base-MNLI,test,256,0.296875,,,mnli,base
42,JeremiahZ/roberta-base-mnli,test,256,0.339844,,,mnli,base
1,textattack/roberta-base-MNLI,train,256,0.234375,,,mnli,base
10,JeremiahZ/roberta-base-mnli,train,256,0.960938,,,mnli,base
