# Test the weaving code on the base models

In [1]:
# install dependencies

! pip install -q joblib  # joblib for memoizing functions
! pip install -q ipywidgets widgetsnbextension pandas-profiling # IProgress for progress bars

zsh:1: /home/brian/2023-fall-cs-194-294-merging-llms/.venv/bin/pip: bad interpreter: .venv/bin/python3.8: no such file or directory
zsh:1: /home/brian/2023-fall-cs-194-294-merging-llms/.venv/bin/pip: bad interpreter: .venv/bin/python3.8: no such file or directory


In [3]:
# Put ../model_merging/ on sys.path so its modules can be imported below

import os
import sys

model_merging_base = os.path.abspath("../model_merging/")
# Stop here if the directory does not exist
assert os.path.exists(model_merging_base)
if sys.path.count(model_merging_base) == 0:
    sys.path.append(model_merging_base)

In [4]:
# joblib is used for caching and distributed computing
from math import sqrt

from joblib import Memory, Parallel, delayed

# Results are cached under ./cache; pass verbose=10 instead for detailed cache logging.
memory = Memory(verbose=0, location="cache")

# Two-worker pool; `sqrt` is applied to the squares of 0..9.
parallel = Parallel(return_as="generator", n_jobs=2)
output_generator = parallel(delayed(sqrt)(n * n) for n in range(10))

In [5]:
# Project imports, plus `memory.cache` wrappers around the scoring helpers

import os

from llm_weaver import (
    calculate_score_from_weaving_config,
    get_score_from_named_model,
    test_weaver,
)

# Tokenizer parallelism is switched off to avoid deadlocks
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Each helper is wrapped with `memory.cache`; the wrapper keeps the helper's
# name with a `_cached` suffix.
(
    calculate_score_from_weaving_config_cached,
    test_weaver_cached,
    get_score_from_named_model_cached,
) = (
    memory.cache(helper)
    for helper in (
        calculate_score_from_weaving_config,
        test_weaver,
        get_score_from_named_model,
    )
)

## Make sure you can build using `.build()`

In [6]:
import re

import transformers
from llm_weaver import get_blank_model, get_model_config

# Compare versions numerically. As plain strings the comparison is
# lexicographic, so e.g. "4.10.0" < "4.3.1" would be True and the check would
# wrongly reject a newer release.
MIN_TRANSFORMERS_VERSION = (4, 3, 1)
installed_transformers_version = tuple(
    int(part) for part in re.findall(r"\d+", transformers.__version__)[:3]
)

# NOTE(review): the exact minimum is uncertain (see the error message);
# presumably this commit is what motivated it:
# https://github.com/huggingface/transformers/commit/4a55e4787760fdb6c40a972a60d814ba05425da1#diff-648ec06beb5ae6380c7f611a0f513a5d392509497d245a09f06b6549358afdffR1151
if installed_transformers_version < MIN_TRANSFORMERS_VERSION:
    raise ValueError(
        "Need transformers >= 4.3.1, or something like that. Not sure of the version."
    )

print(f"You have transformers version {transformers.__version__}!")

# Create a blank model from the RTE checkpoint's config and call `.build()` on it.
model = get_blank_model(get_model_config("textattack/roberta-base-RTE"))
model.build()

type(model)

You have transformers version 4.35.0!


2023-11-27 10:38:35.081384: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-11-27 10:38:35.081848: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1960] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, 

transformers.models.roberta.modeling_tf_roberta.TFRobertaForSequenceClassification

## Step 0: Get cross-task scores


In [7]:
# Checkpoints to cross-evaluate. Keep a trailing comma after every entry so
# that un-commenting a neighbouring line never silently concatenates two
# adjacent string literals into one bogus model id.
model_ids = [
    "textattack/roberta-base-RTE",
    # "textattack/bert-base-uncased-RTE",
    "textattack/distilbert-base-uncased-RTE",
    # "textattack/roberta-base-MNLI",
    # "howey/roberta-large-rte",
    # "howey/roberta-large-mnli",
    # "JeremiahZ/roberta-base-rte",
    # "JeremiahZ/roberta-base-mnli",
]

# Map every head model to the full list of body-donor models. Each head gets
# its own copy of the list, so editing one entry later cannot change the others.
model_ids_head_to_bodies = {head: list(model_ids) for head in model_ids}
model_ids_head_to_bodies

{'textattack/roberta-base-RTE': ['textattack/roberta-base-RTE',
  'textattack/distilbert-base-uncased-RTE'],
 'textattack/distilbert-base-uncased-RTE': ['textattack/roberta-base-RTE',
  'textattack/distilbert-base-uncased-RTE']}

In [8]:
import numpy as np
from llm_weaver import dict_overwrite, get_model_config, normalize_glue_task_name

from model_merging import hdf5_util, sample_layers


def multi_task_configs_iter(model_ids_head_to_bodies, max_configs=None):
    """Yield one weaving config per (head model, body model) pair.

    In every config the tokenizer, classification head and embeddings come
    from the head model, while every hidden layer is a ``SingleLayer``
    donated by the body model. The blank model config is the head model's
    config passed through ``dict_overwrite`` with the body model's
    ``num_hidden_layers``.

    Args:
        model_ids_head_to_bodies: Mapping from a head model id to an iterable
            of body model ids to pair it with.
        max_configs: Maximum number of configs to yield. ``None`` (or ``0``)
            means no limit.

    Yields:
        dict: A weaving config with the keys ``glue_task``,
        ``tokenizer_model_id``, ``blank_model_config``, ``layer_assignments``,
        ``classification_head`` and ``embeddings``.
    """
    num_configs = 0
    for head_model_id, body_model_ids in model_ids_head_to_bodies.items():
        for body_model_id in body_model_ids:
            # Check the limit before any lookups and end the whole generator:
            # a `break` would only leave the inner loop, and config lookups
            # would still run for each remaining head.
            if max_configs and num_configs >= max_configs:
                return
            # Use the task model as the "blank model"
            task = normalize_glue_task_name(head_model_id)
            # The head config is fetched fresh for each pair rather than
            # shared, since dict_overwrite is not shown to leave its input untouched.
            head_model_config = get_model_config(head_model_id)
            body_model_config = get_model_config(body_model_id)
            num_hidden_layers = body_model_config["num_hidden_layers"]
            config = {
                "glue_task": task,
                "tokenizer_model_id": head_model_id,
                # The task (i.e. the classification head output size should match the task at hand)
                "blank_model_config": dict_overwrite(
                    head_model_config,
                    {"num_hidden_layers": num_hidden_layers},
                ),
                # Layer assignments: layer i of the woven model is layer i of the body model
                "layer_assignments": [
                    {
                        "type": "SingleLayer",
                        "params": {
                            "donor": body_model_id,
                            "hidden_layer_number": i,
                        },
                    }
                    for i in range(num_hidden_layers)
                ],
                # The head (i.e. the classification head should match the task at hand)
                # THESE ARE DIFFERENT BETWEEN RTE AND MNLI
                "classification_head": {
                    "type": "SingleClassificationHead",
                    "params": {
                        "donor": head_model_id,
                    },
                },
                # The embeddings layer
                # THESE ARE DIFFERENT BETWEEN RTE AND MNLI
                "embeddings": {
                    "type": "SingleEmbeddings",
                    "params": {
                        # "donor": body_model_id,
                        "donor": head_model_id,
                    },
                },
            }
            num_configs += 1
            yield config


# Materialise at most four configs so they are displayed as the cell output
list(multi_task_configs_iter(model_ids_head_to_bodies, max_configs=4))

All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can 

[{'glue_task': 'rte',
  'tokenizer_model_id': 'textattack/roberta-base-RTE',
  'blank_model_config': {'return_dict': True,
   'output_hidden_states': False,
   'output_attentions': False,
   'torchscript': False,
   'torch_dtype': None,
   'use_bfloat16': False,
   'tf_legacy_loss': False,
   'pruned_heads': {},
   'tie_word_embeddings': True,
   'is_encoder_decoder': False,
   'is_decoder': False,
   'cross_attention_hidden_size': None,
   'add_cross_attention': False,
   'tie_encoder_decoder': False,
   'max_length': 20,
   'min_length': 0,
   'do_sample': False,
   'early_stopping': False,
   'num_beams': 1,
   'num_beam_groups': 1,
   'diversity_penalty': 0.0,
   'temperature': 1.0,
   'top_k': 50,
   'top_p': 1.0,
   'typical_p': 1.0,
   'repetition_penalty': 1.0,
   'length_penalty': 1.0,
   'no_repeat_ngram_size': 0,
   'encoder_no_repeat_ngram_size': 0,
   'bad_words_ids': None,
   'num_return_sequences': 1,
   'chunk_size_feed_forward': 0,
   'output_scores': False,
   'return

## Step 1: Get original model baselines

In [9]:
# Imports come first so that a missing package is reported before the long
# scoring loop runs, not after it.
import pandas as pd
from llm_weaver import normalize_glue_task_name
from tqdm import tqdm

# Number of examples passed to the scoring function for each config
n_examples = 256

# Dataset splits to score; "train" and "test" can be added to this list.
splits = ["validation"]

records = []
for split in tqdm(splits):
    for config in tqdm(
        multi_task_configs_iter(
            model_ids_head_to_bodies=model_ids_head_to_bodies,
        )
    ):
        # Distinct donor models used for the hidden layers, in sorted order
        layer_donors = sorted(
            {layer["params"]["donor"] for layer in config["layer_assignments"]}
        )
        records.append(
            {
                "task": config["glue_task"],
                "classification_head_model": config["classification_head"]["params"][
                    "donor"
                ],
                "layers_models": layer_donors,
                "score": calculate_score_from_weaving_config_cached(
                    weaving_config=config,
                    split=split,
                    n_examples=n_examples,
                ),
                "split": split,
                "n_examples": n_examples,
            }
        )

df = pd.DataFrame.from_records(records)
# Expand each score into its own columns and drop the original "score" column
df = df.join(pd.json_normalize(df["score"])).drop(columns=["score"])
# replace nan with ''
df = df.fillna("")
df

  0%|          | 0/1 [00:00<?, ?it/s]All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of th

calculating score for weaving config md5sum: c523724abaf659994a2680f089e4610a
Loading textattack/roberta-base-RTE


All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
You are using a model of type distilbert to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.


Loading textattack/distilbert-base-uncased-RTE


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['distilbert.transformer.layer.0.attention.v_lin.weight', 'distilbert.transformer.layer.3.attention.q_lin.bias', 'distilbert.transformer.layer.0.attention.v_lin.bias', 'distilbert.transformer.layer.1.attention.q_lin.weight', 'distilbert.transformer.layer.1.attention.k_lin.weight', 'distilbert.transformer.layer.3.attention.q_lin.weight', 'distilbert.transformer.layer.4.attention.k_lin.bias', 'distilbert.transformer.layer.5.sa_layer_norm.weight', 'distilbert.transformer.layer.3.sa_layer_norm.weight', 'distilbert.transformer.layer.4.attention.v_lin.weight', 'distilbert.transformer.layer.2.attention.v_lin.weight', 'distilbert.transformer.layer.1.sa_layer_norm.weight', 'distilbert.transformer.layer.4.attention.k_lin.weight', 'distilbert.transformer.layer.3.output_layer_norm.weight', 'distilbert.transformer.layer.4.ffn.lin1.weight', 'distilbert.transformer.layer.2.ffn.lin1.w



  return hfds.load_metric("glue", task)




2023-11-27 10:39:29.269835: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
You are using a model of type distilbert to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.




Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['distilbert.transformer.layer.0.attention.v_lin.weight', 'distilbert.transformer.layer.3.attention.q_lin.bias', 'distilbert.transformer.layer.0.attention.v_lin.bias', 'distilbert.transformer.layer.1.attention.q_lin.weight', 'distilbert.transformer.layer.1.attention.k_lin.weight', 'distilbert.transformer.layer.3.attention.q_lin.weight', 'distilbert.transformer.layer.4.attention.k_lin.bias', 'distilbert.transformer.layer.5.sa_layer_norm.weight', 'distilbert.transformer.layer.3.sa_layer_norm.weight', 'distilbert.transformer.layer.4.attention.v_lin.weight', 'distilbert.transformer.layer.2.attention.v_lin.weight', 'distilbert.transformer.layer.1.sa_layer_norm.weight', 'distilbert.transformer.layer.4.attention.k_lin.weight', 'distilbert.transformer.layer.3.output_layer_norm.weight', 'distilbert.transformer.layer.4.ffn.lin1.weight', 'distilbert.transformer.layer.2.ffn.lin1.w

calculating score for weaving config md5sum: 950868d95040810431f2959422e87d51
Loading textattack/roberta-base-RTE


All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
You are using a model of type distilbert to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.


Loading textattack/distilbert-base-uncased-RTE


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['distilbert.transformer.layer.0.attention.v_lin.weight', 'distilbert.transformer.layer.3.attention.q_lin.bias', 'distilbert.transformer.layer.0.attention.v_lin.bias', 'distilbert.transformer.layer.1.attention.q_lin.weight', 'distilbert.transformer.layer.1.attention.k_lin.weight', 'distilbert.transformer.layer.3.attention.q_lin.weight', 'distilbert.transformer.layer.4.attention.k_lin.bias', 'distilbert.transformer.layer.5.sa_layer_norm.weight', 'distilbert.transformer.layer.3.sa_layer_norm.weight', 'distilbert.transformer.layer.4.attention.v_lin.weight', 'distilbert.transformer.layer.2.attention.v_lin.weight', 'distilbert.transformer.layer.1.sa_layer_norm.weight', 'distilbert.transformer.layer.4.attention.k_lin.weight', 'distilbert.transformer.layer.3.output_layer_norm.weight', 'distilbert.transformer.layer.4.ffn.lin1.weight', 'distilbert.transformer.layer.2.ffn.lin1.w



Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]





2023-11-27 10:39:59.854676: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
You are using a model of type distilbert to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['distilbert.transformer.layer.0.attention.v_lin.weight', 'distilbert.transformer.layer.3.attention.q_lin.bias', 'distilbert.transformer.layer.0.attention.v_lin.bias', 'distilbert.transformer.layer.1.attention.q_lin.weight', 'distilbert.transformer.layer.1.attention.k_lin.weight', 'distilbert.trans

calculating score for weaving config md5sum: c0f1099f5513b5e5324caac782631618


You are using a model of type distilbert to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.


Loading textattack/distilbert-base-uncased-RTE


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['distilbert.transformer.layer.0.attention.v_lin.weight', 'distilbert.transformer.layer.3.attention.q_lin.bias', 'distilbert.transformer.layer.0.attention.v_lin.bias', 'distilbert.transformer.layer.1.attention.q_lin.weight', 'distilbert.transformer.layer.1.attention.k_lin.weight', 'distilbert.transformer.layer.3.attention.q_lin.weight', 'distilbert.transformer.layer.4.attention.k_lin.bias', 'distilbert.transformer.layer.5.sa_layer_norm.weight', 'distilbert.transformer.layer.3.sa_layer_norm.weight', 'distilbert.transformer.layer.4.attention.v_lin.weight', 'distilbert.transformer.layer.2.attention.v_lin.weight', 'distilbert.transformer.layer.1.sa_layer_norm.weight', 'distilbert.transformer.layer.4.attention.k_lin.weight', 'distilbert.transformer.layer.3.output_layer_norm.weight', 'distilbert.transformer.layer.4.ffn.lin1.weight', 'distilbert.transformer.layer.2.ffn.lin1.w







2023-11-27 10:40:24.429033: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
4it [01:24, 21.23s/it]
100%|██████████| 1/1 [01:24<00:00, 84.93s/it]






Unnamed: 0,task,classification_head_model,layers_models,split,n_examples,accuracy
0,rte,textattack/roberta-base-RTE,[textattack/roberta-base-RTE],validation,256,0.726562
1,rte,textattack/roberta-base-RTE,[textattack/distilbert-base-uncased-RTE],validation,256,0.519531
2,rte,textattack/distilbert-base-uncased-RTE,[textattack/roberta-base-RTE],validation,256,0.476562
3,rte,textattack/distilbert-base-uncased-RTE,[textattack/distilbert-base-uncased-RTE],validation,256,0.523438


In [14]:
from llm_weaver import get_model

# Load the distilbert RTE checkpoint and list the name and shape of every weight.
# NOTE(review): the recorded output warns that this distilbert checkpoint is
# instantiated as a roberta model and that PyTorch weights were not used when
# initializing the TF model; confirm that this is intended.
model = get_model("textattack/distilbert-base-uncased-RTE")

for weight in model.weights:
    print(weight.name, weight.shape)

You are using a model of type distilbert to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['distilbert.transformer.layer.0.sa_layer_norm.weight', 'distilbert.transformer.layer.0.attention.v_lin.bias', 'distilbert.transformer.layer.1.attention.k_lin.weight', 'distilbert.transformer.layer.3.ffn.lin2.bias', 'distilbert.transformer.layer.2.attention.out_lin.weight', 'distilbert.transformer.layer.1.sa_layer_norm.bias', 'distilbert.transformer.layer.3.attention.q_lin.bias', 'distilbert.transformer.layer.2.ffn.lin2.weight', 'distilbert.transformer.layer.4.output_layer_norm.weight', 'distilbert.transformer.layer.0.attention.k_lin.bias', 'distilbert.transformer.layer.1.attention.q_lin.bias', 'distilbert.transformer.layer.3.attention.out_lin.bias', 'distilbert.transformer.layer.3.attention.q_lin.weight', 'distil

tf_roberta_for_sequence_classification_22/roberta/encoder/layer_._0/attention/self/query/kernel:0 (768, 768)
tf_roberta_for_sequence_classification_22/roberta/encoder/layer_._0/attention/self/query/bias:0 (768,)
tf_roberta_for_sequence_classification_22/roberta/encoder/layer_._0/attention/self/key/kernel:0 (768, 768)
tf_roberta_for_sequence_classification_22/roberta/encoder/layer_._0/attention/self/key/bias:0 (768,)
tf_roberta_for_sequence_classification_22/roberta/encoder/layer_._0/attention/self/value/kernel:0 (768, 768)
tf_roberta_for_sequence_classification_22/roberta/encoder/layer_._0/attention/self/value/bias:0 (768,)
tf_roberta_for_sequence_classification_22/roberta/encoder/layer_._0/attention/output/dense/kernel:0 (768, 768)
tf_roberta_for_sequence_classification_22/roberta/encoder/layer_._0/attention/output/dense/bias:0 (768,)
tf_roberta_for_sequence_classification_22/roberta/encoder/layer_._0/attention/output/LayerNorm/gamma:0 (768,)
tf_roberta_for_sequence_classification_22/

In [10]:
from llm_weaver import get_model

# Load the roberta RTE checkpoint and list the name and shape of every weight
model = get_model("textattack/roberta-base-RTE")

for weight in model.weights:
    print(weight.name, weight.shape)

All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


tf_roberta_for_sequence_classification_26/roberta/encoder/layer_._0/attention/self/query/kernel:0 (768, 768)
tf_roberta_for_sequence_classification_26/roberta/encoder/layer_._0/attention/self/query/bias:0 (768,)
tf_roberta_for_sequence_classification_26/roberta/encoder/layer_._0/attention/self/key/kernel:0 (768, 768)
tf_roberta_for_sequence_classification_26/roberta/encoder/layer_._0/attention/self/key/bias:0 (768,)
tf_roberta_for_sequence_classification_26/roberta/encoder/layer_._0/attention/self/value/kernel:0 (768, 768)
tf_roberta_for_sequence_classification_26/roberta/encoder/layer_._0/attention/self/value/bias:0 (768,)
tf_roberta_for_sequence_classification_26/roberta/encoder/layer_._0/attention/output/dense/kernel:0 (768, 768)
tf_roberta_for_sequence_classification_26/roberta/encoder/layer_._0/attention/output/dense/bias:0 (768,)
tf_roberta_for_sequence_classification_26/roberta/encoder/layer_._0/attention/output/LayerNorm/gamma:0 (768,)
tf_roberta_for_sequence_classification_26/

In [None]:
from llm_weaver import get_model

# NOTE(review): this cell repeats the roberta weight listing from the cell
# before it and has not been executed; consider removing it.
model = get_model("textattack/roberta-base-RTE")

for weight in model.weights:
    print(weight.name, weight.shape)

In [None]:
# Write the score table to CSV, leaving out the index column
df.to_csv(path_or_buf="get-multi-task-scores.csv", index=False)

In [None]:
# Display the scores sorted in descending order by accuracy, then task, then split
df.sort_values(by=["accuracy", "task", "split"], ascending=False)

Unnamed: 0,task,classification_head_model,layers_models,split,n_examples,accuracy
11,mnli,JeremiahZ/roberta-base-mnli,[JeremiahZ/roberta-base-mnli],validation,256,0.855469
9,rte,JeremiahZ/roberta-base-rte,[JeremiahZ/roberta-base-mnli],validation,256,0.746094
0,rte,textattack/roberta-base-RTE,[textattack/roberta-base-RTE],validation,256,0.726562
7,mnli,howey/roberta-large-mnli,[howey/roberta-large-mnli],validation,256,0.707031
4,rte,howey/roberta-large-rte,[howey/roberta-large-rte],validation,256,0.644531
8,rte,JeremiahZ/roberta-base-rte,[JeremiahZ/roberta-base-rte],validation,256,0.621094
1,rte,textattack/roberta-base-RTE,[textattack/roberta-base-MNLI],validation,256,0.515625
6,mnli,howey/roberta-large-mnli,[howey/roberta-large-rte],validation,256,0.390625
10,mnli,JeremiahZ/roberta-base-mnli,[JeremiahZ/roberta-base-rte],validation,256,0.289062
3,mnli,textattack/roberta-base-MNLI,[textattack/roberta-base-MNLI],validation,256,0.265625
