# Test the weaving code on the base models

In [1]:
# install dependencies

! pip install -q joblib  # joblib for memoizing functions
! pip install -q ipywidgets widgetsnbextension pandas-profiling # IProgress for progress bars

zsh:1: /home/brian/2023-fall-cs-194-294-merging-llms/.venv/bin/pip: bad interpreter: .venv/bin/python3.8: no such file or directory
zsh:1: /home/brian/2023-fall-cs-194-294-merging-llms/.venv/bin/pip: bad interpreter: .venv/bin/python3.8: no such file or directory


In [2]:
# Add model_merging to the python path

import os
import sys

# Absolute path of the sibling `model_merging` directory, resolved relative to the
# current working directory.
model_merging_base = os.path.abspath("../model_merging/")
# Fail fast if the directory does not exist.
assert os.path.exists(model_merging_base)
# Register the directory only once, so re-running this cell does not add duplicates.
if sys.path.count(model_merging_base) == 0:
    sys.path.append(model_merging_base)

In [3]:
# import joblib for caching and distributed computing
from math import sqrt

from joblib import Memory, Parallel, delayed

# joblib memoization store rooted at "cache"; swap in verbose=10 for chatty logging.
memory = Memory(verbose=0, location="cache")

# Two-worker joblib pool that hands results back lazily as a generator.
# NOTE(review): `output_generator` is not consumed anywhere in the visible part of
# this notebook; this looks like a leftover joblib example. Confirm before removing.
parallel = Parallel(return_as="generator", n_jobs=2)
sqrt_jobs = [delayed(sqrt)(value**2) for value in range(10)]
output_generator = parallel(sqrt_jobs)

In [4]:
# Imports and cached functions

import os

# NOTE(review): `ca` looks like a truncated/accidental name. Confirm that
# llm_weaver really exports it; if it does not, this import raises ImportError.
from llm_weaver import (
    calculate_score_from_weaving_config,
    get_score_from_named_model,
    test_weaver,
    ca,
)

# Disable parallelism in tokenizers to avoid deadlocks
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Wrap the llm_weaver helpers with the joblib `memory` store so their results are
# memoized; the `*_cached` names are what the rest of the notebook calls.
calculate_score_from_weaving_config_cached = memory.cache(
    calculate_score_from_weaving_config
)
test_weaver_cached = memory.cache(test_weaver)

get_score_from_named_model_cached = memory.cache(get_score_from_named_model)

2023-11-26 15:06:08.264426: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-26 15:06:08.298760: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-26 15:06:08.299324: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Make sure you can build using `.build()`

In [5]:
import transformers
from llm_weaver import get_blank_model, get_model_config

def _release_tuple(version):
    """Return the leading all-digit, dot-separated fields of ``version`` as ints.

    Parsing stops at the first field that is not purely numeric, so
    ``"4.36.0.dev0"`` gives ``(4, 36, 0)`` and ``"4.3.0rc1"`` gives ``(4, 3)``.
    """
    fields = []
    for field in version.split("."):
        if not field.isdecimal():
            break
        fields.append(int(field))
    return tuple(fields)


# Minimum release is a best guess (see the error message); related commit:
# https://github.com/huggingface/transformers/commit/4a55e4787760fdb6c40a972a60d814ba05425da1#diff-648ec06beb5ae6380c7f611a0f513a5d392509497d245a09f06b6549358afdffR1151
MIN_TRANSFORMERS_VERSION = (4, 3, 1)

# Compare numerically: as plain strings "4.10.0" sorts before "4.3.1", which would
# wrongly reject a newer release.
if _release_tuple(transformers.__version__) < MIN_TRANSFORMERS_VERSION:
    raise ValueError(
        "Need transformers >= 4.3.1, or something like that. Not sure of the version."
    )

print(f"You have transformers version {transformers.__version__}!")

# Instantiate a blank model from the RTE checkpoint's config and check that
# `.build()` works on it.
model = get_blank_model(get_model_config("textattack/roberta-base-RTE"))
model.build()

type(model)

You have transformers version 4.35.0!


2023-11-26 15:06:11.802542: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-11-26 15:06:11.803269: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1960] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, 

transformers.models.roberta.modeling_tf_roberta.TFRobertaForSequenceClassification

## Step 0: Get cross-task scores


In [9]:
# Each group holds checkpoints from one author at one RoBERTa size. Only the
# RTE and MNLI members are enabled; the other candidates are listed as comments.
_TEXTATTACK_BASE = [
    "textattack/roberta-base-RTE",
    "textattack/roberta-base-MNLI",
]  # <--- this one has a very low score

_HOWEY_LARGE = ["howey/roberta-large-rte", "howey/roberta-large-mnli"]
# Disabled howey candidates:
#   "howey/roberta-large-qnli", "howey/roberta-large-sst2",
#   "howey/roberta-large-cola", "howey/roberta-large-mrpc",
#   "howey/roberta-large-qqp", "howey/roberta-large-stsb"

_JEREMIAHZ_BASE = ["JeremiahZ/roberta-base-rte", "JeremiahZ/roberta-base-mnli"]
# Disabled JeremiahZ candidates:
#   "JeremiahZ/roberta-base-qnli", "JeremiahZ/roberta-base-sst2",
#   "JeremiahZ/roberta-base-cola", "JeremiahZ/roberta-base-mrpc",
#   "JeremiahZ/roberta-base-qqp", "JeremiahZ/roberta-base-stsb"

# Other disabled candidates:
#   "l-yohai/bigbird-roberta-base-mnli", "howey/roberta-large-squad2"

model_ids_grouped = [_TEXTATTACK_BASE, _HOWEY_LARGE, _JEREMIAHZ_BASE]

In [16]:
import numpy as np
from llm_weaver import dict_overwrite, get_model_config, normalize_glue_task_name

from model_merging import hdf5_util, sample_layers


def multi_task_configs_iter(model_ids_grouped, max_configs=None):
    """Yield one weaving config per (task, donor model) pair within each group.

    For every group in ``model_ids_grouped`` the model ids are keyed by the task
    name that ``normalize_glue_task_name`` derives from them. Each task in the
    group is then paired with each model of the same group: the task's own model
    supplies the tokenizer, the blank-model config and the classification head,
    while the paired model donates every hidden layer and the embeddings.

    Args:
        model_ids_grouped: Iterable of groups, each an iterable of model ids.
        max_configs: Upper bound on the number of configs yielded. ``None`` (or
            any other falsy value such as ``0``) means no limit.

    Yields:
        dict: A weaving config with the keys ``glue_task``,
        ``tokenizer_model_id``, ``blank_model_config``, ``layer_assignments``,
        ``classification_head`` and ``embeddings``.
    """
    # Extract the task names from the model ids
    task_to_model_ids_map_list = [
        {normalize_glue_task_name(model_id): model_id for model_id in group}
        for group in model_ids_grouped
    ]

    # Falsy max_configs (None or 0) means "no limit".
    limit = max_configs if max_configs else None

    num_configs = 0
    for task_to_model_ids_map in task_to_model_ids_map_list:
        model_ids = list(task_to_model_ids_map.values())

        for task, task_model_id in task_to_model_ids_map.items():
            # Stop before fetching another model config once the limit is hit.
            if limit is not None and num_configs >= limit:
                return

            # Use the task model as the "blank model"
            task_model_config = get_model_config(task_model_id)

            for model_id in model_ids:
                # End the whole generator here. A bare `break` would only leave
                # this inner loop, so the outer loops would keep running and
                # call get_model_config for every remaining task without
                # yielding anything more.
                if limit is not None and num_configs >= limit:
                    return
                num_configs += 1

                yield {
                    "glue_task": task,
                    "tokenizer_model_id": task_model_id,
                    # The task (i.e. the classification head output size should match the task at hand)
                    "blank_model_config": task_model_config,
                    # Layer assignments
                    "layer_assignments": [
                        {
                            "type": "SingleLayer",
                            "params": {
                                "donor": model_id,
                                "hidden_layer_number": i,
                            },
                        }
                        for i in range(task_model_config["num_hidden_layers"])
                    ],
                    # The head (i.e. the classification head should match the task at hand)
                    # THESE ARE DIFFERENT BETWEEN RTE AND MNLI
                    "classification_head": {
                        "type": "SingleClassificationHead",
                        "params": {
                            "donor": task_model_id,
                        },
                    },
                    # The embeddings layer
                    # THESE ARE DIFFERENT BETWEEN RTE AND MNLI
                    "embeddings": {
                        "type": "SingleEmbeddings",
                        "params": {
                            "donor": model_id,
                        },
                    },
                }
        # yield generate_layer_config(fishers, sample_config)
    # # Need to decide what to do about embeddings and classification head.


# sample_layers.generate_fisher_distributions(fishers)


# Count the configs generated for the full grouping (pass max_configs=4 to cap it).
sum(1 for _ in multi_task_configs_iter(model_ids_grouped=model_ids_grouped))

All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another t

12

## Step 1: Get original model baselines

In [6]:
from llm_weaver import normalize_glue_task_name
from tqdm import tqdm

import pandas as pd

n_examples = 256
# Only the validation split is scored; add "train" / "test" to score them as well.
splits = ["validation"]


def _baseline_record(config, split):
    """Score one weaving config on ``split`` and return it as a flat record."""
    donors = {layer["params"]["donor"] for layer in config["layer_assignments"]}
    return {
        "task": config["glue_task"],
        "classification_head_model": config["classification_head"]["params"]["donor"],
        # Unique donor ids across all hidden layers, in sorted order.
        "layers_models": sorted(donors),
        "score": calculate_score_from_weaving_config_cached(
            weaving_config=config,
            split=split,
            n_examples=n_examples,
        ),
        "split": split,
        "n_examples": n_examples,
    }


records = []
for split in tqdm(splits):
    for config in tqdm(multi_task_configs_iter(model_ids_grouped=model_ids_grouped)):
        records.append(_baseline_record(config, split))

scores_raw = pd.DataFrame.from_records(records)
# Expand each score entry into its own columns.
# NOTE(review): presumably a dict of metric name -> value; confirm.
score_columns = pd.json_normalize(scores_raw["score"])
# replace nan with ''
df = scores_raw.join(score_columns).drop(columns=["score"]).fillna("")
# df.to_csv("test-weaving-on-base-models.original-scores.csv", index=False)
df

100%|██████████| 16/16 [00:00<00:00, 1537.71it/s]
100%|██████████| 16/16 [00:00<00:00, 1681.76it/s]
100%|██████████| 16/16 [00:00<00:00, 1742.54it/s]
100%|██████████| 3/3 [00:00<00:00, 74.60it/s]


Unnamed: 0,model_id,split,n_examples,accuracy,matthews_correlation,f1,task,roberta
45,JeremiahZ/roberta-base-cola,test,256,,0.0,,cola,base
13,JeremiahZ/roberta-base-cola,train,256,,0.452776,,cola,base
29,JeremiahZ/roberta-base-cola,validation,256,,0.172932,,cola,base
38,howey/roberta-large-cola,test,256,,0.0,,cola,large
6,howey/roberta-large-cola,train,256,,0.451394,,cola,large
22,howey/roberta-large-cola,validation,256,,0.235292,,cola,large
33,textattack/roberta-base-MNLI,test,256,0.296875,,,mnli,base
42,JeremiahZ/roberta-base-mnli,test,256,0.339844,,,mnli,base
1,textattack/roberta-base-MNLI,train,256,0.234375,,,mnli,base
10,JeremiahZ/roberta-base-mnli,train,256,0.960938,,,mnli,base
