# End-to-end notebook!

Here is the workflow:
* Sampling configs (sampling parameters, etc.) lead to...
* Weaving configs (blank model settings, donor model settings, layer assignments) lead to...
* Models (probably TFRobertaForSequenceClassification in all cases) lead to...
* Performance scores (numbers from 0-100)

In [1]:
# install dependencies

! pip install -q joblib  # joblib for memoizing functions
! pip install -q ipywidgets widgetsnbextension pandas-profiling # IProgress for progress bars

# ! pip install -q tensorflow==2.13.0 tensorflow-datasets==4.9.2 tensorflow-probability==0.21.0 transformers==4.35.0  datasets==2.14.6 torch==2.1.0 scipy==1.10.1 scikit-learn==1.3.2

[0m

In [2]:
# Add model_merging to the python path

import os
import sys

model_merging_base = os.path.abspath("../model_merging/")
# assert it exist
assert os.path.exists(model_merging_base)
if model_merging_base not in sys.path:
    sys.path.append(model_merging_base)

In [3]:
# import joblib for caching and distributed computing
from math import sqrt

from joblib import Memory, Parallel, delayed

memory = Memory(location="cache", verbose=10)

parallel = Parallel(n_jobs=2, return_as="generator")
output_generator = parallel(delayed(sqrt)(i**2) for i in range(10))

In [4]:
# Imports and cached functions

import os

from llm_weaver import (
    calculate_score_from_weaving_config,
    test_weaver,
)

# Disable parallelism in tokenizers to avoid deadlocks
os.environ["TOKENIZERS_PARALLELISM"] = "false"

calculate_score_from_weaving_config_cached = memory.cache(
    calculate_score_from_weaving_config
)
test_weaver_cached = memory.cache(test_weaver)

## Step 0: Get RTE scores

* RTE vanilla
* RTE isotropically merged with MNLI score with a weight chosen properly
* RTE fisher merge with MNLI with a weight chosen properly
* replacing with certain layers?
* Shifting?

## Steps: configs to graph


In [5]:
model_id = "textAttack/roberta-base-RTE"

In [13]:
import pandas as pd
from llm_weaver import dict_overwrite, get_model_config, normalize_glue_task_name

model_id = "textAttack/roberta-base-RTE"


def RTEVanilla(model_id):
    num_layers = get_model_config(model_id)["num_hidden_layers"]
    layer_assignments = [
        {
            "type": "SingleLayer",
            "params": {
                "donor": model_id,
                "hidden_layer_number": i,
            },
        }
        for i in range(num_layers)
    ]

    blank_model_config = dict_overwrite(
        get_model_config(model_id),
        {
            "num_hidden_layers": len(layer_assignments),
        },
    )
    config = {
        "glue_task": normalize_glue_task_name(model_id),
        "tokenizer_model_id": model_id,
        "blank_model_config": blank_model_config,
        "layer_assignments": layer_assignments,
        "classification_head": {
            "type": "SingleClassificationHead",
            "params": {
                "donor": model_id,
            },
        },
        "embeddings": {
            "type": "SingleEmbeddings",
            "params": {
                "donor": model_id,
            },
        },
    }

    yield config


weave_configs = list(RTEVanilla(model_id))

scores = Parallel(n_jobs=5, return_as="list")(
    delayed(calculate_score_from_weaving_config_cached)(
        weave_config,
        # n_examples=4096,
        n_examples=128,
        split="validation",
    )
    for weave_config in weave_configs
)
accuracies = [score["accuracy"] for score in scores]

records = []
for weave_config, accuracy in zip(weave_configs, accuracies):
    record = {}
    record["name"] = "RTEVanilla"
    record["accuracy"] = accuracy
    records.append(record)
df_rte_vanilla = pd.DataFrame.from_records(records)
df_rte_vanilla

[Memory]: Loading calculate_score_from_weaving_config from cache/joblib/llm_weaver/calculate_score_from_weaving_config/5dd88dc88dcbcca6c00d08955e614e0d
_________________calculate_score_from_weaving_config cache loaded - 0.0s, 0.0min


Unnamed: 0,name,accuracy
0,RTEVanilla,0.726562


In [15]:
import pandas as pd
from llm_weaver import dict_overwrite, get_model_config, normalize_glue_task_name

model_id = "textAttack/roberta-base-RTE"


def RTEMNLIIsotropic(model_id):
    for alpha in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 0.8, 0.9, 1.0]:
        num_layers = get_model_config(model_id)["num_hidden_layers"]
        layer_assignments = [
            {
                "type": "IsotropicLinearCombination",
                "params": {
                    "donors": [
                        {"donor": model_id, "hidden_layer_number": i, "weight": alpha},
                        {
                            "donor": "textAttack/roberta-base-MNLI",
                            "hidden_layer_number": i,
                            "weight": 1.0 - alpha,
                        },
                    ]
                },
            }
            for i in range(num_layers)
        ]

        blank_model_config = dict_overwrite(
            get_model_config(model_id),
            {
                "num_hidden_layers": len(layer_assignments),
            },
        )
        config = {
            "glue_task": normalize_glue_task_name(model_id),
            "tokenizer_model_id": model_id,
            "blank_model_config": blank_model_config,
            "layer_assignments": layer_assignments,
            "classification_head": {
                "type": "SingleClassificationHead",
                "params": {
                    "donor": model_id,
                },
            },
            "embeddings": {
                "type": "SingleEmbeddings",
                "params": {
                    "donor": model_id,
                },
            },
        }

        yield config


weave_configs = list(RTEMNLIIsotropic(model_id))

scores = Parallel(n_jobs=5, return_as="list")(
    delayed(calculate_score_from_weaving_config_cached)(
        weave_config,
        # n_examples=4096,
        n_examples=128,
        split="validation",
    )
    for weave_config in weave_configs
)
accuracies = [score["accuracy"] for score in scores]

records = []
for weave_config, accuracy in zip(weave_configs, accuracies):
    record = {}
    record["name"] = "RTEMNLIIsotropic"
    record["accuracy"] = accuracy
    records.append(record)
df_rte_vanilla = pd.DataFrame.from_records(records)
df_rte_vanilla

________________________________________________________________________________
[Memory] Calling llm_weaver.calculate_score_from_weaving_config...
calculate_score_from_weaving_config({ 'blank_model_config': { 'add_cross_attention': False,
                          'architectures': ['RobertaForSequenceClassification'],
                          'attention_probs_dropout_prob': 0.1,
                          'bad_words_ids': None,
                          'begin_suppress_tokens': None,
                          'bos_token_id': 0,
                          'chunk_size_feed_forward': 0,
                          'classifier_dropout': None,
                          'cross_attention_hidden_size': None,
                          'decoder_start_token_id': None,
                          'diversity_penalty': 0.0,
                          'do_sample': False,
                    ..., n_examples=128, split='validation')
calculating score for weaving config md5sum: 4211acb62720b8f6ecd6452e8fe39d

All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Loading textAttack/roberta-base-RTE


All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Loading textAttack/roberta-base-RTE


All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Loading textAttack/roberta-base-RTE


2023-11-28 14:35:09.015958: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2023-11-28 14:35:11.035174: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2023-11-28 14:35:12.356048: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. I

_____________________________calculate_score_from_weaving_config - 63.1s, 1.1min
________________________________________________________________________________
[Memory] Calling llm_weaver.calculate_score_from_weaving_config...
calculate_score_from_weaving_config({ 'blank_model_config': { 'add_cross_attention': False,
                          'architectures': ['RobertaForSequenceClassification'],
                          'attention_probs_dropout_prob': 0.1,
                          'bad_words_ids': None,
                          'begin_suppress_tokens': None,
                          'bos_token_id': 0,
                          'chunk_size_feed_forward': 0,
                          'classifier_dropout': None,
                          'cross_attention_hidden_size': None,
                          'decoder_start_token_id': None,
                          'diversity_penalty': 0.0,
                          'do_sample': False,
                    ..., n_examples=128, split='validat

2023-11-28 14:36:04.328508: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2023-11-28 14:36:08.480609: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2023-11-28 14:36:15.703373: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. I

_____________________________calculate_score_from_weaving_config - 53.8s, 0.9min
_____________________________calculate_score_from_weaving_config - 56.6s, 0.9min
_____________________________calculate_score_from_weaving_config - 61.1s, 1.0min
_____________________________calculate_score_from_weaving_config - 64.3s, 1.1min
_____________________________calculate_score_from_weaving_config - 65.4s, 1.1min


Unnamed: 0,name,accuracy
0,RTEMNLIIsotropic,0.53125
1,RTEMNLIIsotropic,0.5625
2,RTEMNLIIsotropic,0.5625
3,RTEMNLIIsotropic,0.578125
4,RTEMNLIIsotropic,0.585938
5,RTEMNLIIsotropic,0.609375
6,RTEMNLIIsotropic,0.742188
7,RTEMNLIIsotropic,0.734375
8,RTEMNLIIsotropic,0.726562
9,RTEMNLIIsotropic,0.726562


In [9]:
import pandas as pd
from llm_weaver import dict_overwrite, get_model_config, normalize_glue_task_name

model_id = "textAttack/roberta-base-RTE"


def RTEMNLIIsotropicMarenLayers(model_id):
    replacement_layers = [0, 1, 4, 11]
    for alpha in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 0.8, 0.9, 1.0]:
        num_layers = get_model_config(model_id)["num_hidden_layers"]
        layer_assignments = [
            {
                "type": "IsotropicLinearCombination",
                "params": {
                    "donors": [
                        {
                            "donor": model_id,
                            "hidden_layer_number": i,
                            "weight": alpha if (i in replacement_layers) else 1.0,
                        },
                        {
                            "donor": "textAttack/roberta-base-MNLI",
                            "hidden_layer_number": i,
                            "weight": (1.0 - alpha)
                            if (i in replacement_layers)
                            else 0.0,
                        },
                    ]
                },
            }
            for i in range(num_layers)
        ]

        blank_model_config = dict_overwrite(
            get_model_config(model_id),
            {
                "num_hidden_layers": len(layer_assignments),
            },
        )
        config = {
            "glue_task": normalize_glue_task_name(model_id),
            "tokenizer_model_id": model_id,
            "blank_model_config": blank_model_config,
            "layer_assignments": layer_assignments,
            "classification_head": {
                "type": "SingleClassificationHead",
                "params": {
                    "donor": model_id,
                },
            },
            "embeddings": {
                "type": "SingleEmbeddings",
                "params": {
                    "donor": model_id,
                },
            },
        }

        yield config


weave_configs = list(RTEMNLIIsotropicMarenLayers(model_id))

scores = Parallel(n_jobs=5, return_as="list")(
    delayed(calculate_score_from_weaving_config_cached)(
        weave_config,
        # n_examples=4096,
        n_examples=128,
        split="validation",
    )
    for weave_config in weave_configs
)
accuracies = [score["accuracy"] for score in scores]

records = []
for weave_config, accuracy in zip(weave_configs, accuracies):
    record = {}
    record["name"] = "RTEMNLIIsotropicMarenLayers"
    record["accuracy"] = accuracy
    records.append(record)
df_rte_vanilla = pd.DataFrame.from_records(records)
df_rte_vanilla

________________________________________________________________________________
[Memory] Calling llm_weaver.calculate_score_from_weaving_config...
calculate_score_from_weaving_config({ 'blank_model_config': { 'add_cross_attention': False,
                          'architectures': ['RobertaForSequenceClassification'],
                          'attention_probs_dropout_prob': 0.1,
                          'bad_words_ids': None,
                          'begin_suppress_tokens': None,
                          'bos_token_id': 0,
                          'chunk_size_feed_forward': 0,
                          'classifier_dropout': None,
                          'cross_attention_hidden_size': None,
                          'decoder_start_token_id': None,
                          'diversity_penalty': 0.0,
                          'do_sample': False,
                    ..., n_examples=128, split='validation')
___________________________________________________________________________

All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Loading textAttack/roberta-base-MNLI


All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can 

Loading textAttack/roberta-base-RTE
Loading textAttack/roberta-base-MNLI
Loading textAttack/roberta-base-RTE
Loading textAttack/roberta-base-RTE


All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can 

_____________________________calculate_score_from_weaving_config - 29.5s, 0.5min
________________________________________________________________________________
[Memory] Calling llm_weaver.calculate_score_from_weaving_config...
calculate_score_from_weaving_config({ 'blank_model_config': { 'add_cross_attention': False,
                          'architectures': ['RobertaForSequenceClassification'],
                          'attention_probs_dropout_prob': 0.1,
                          'bad_words_ids': None,
                          'begin_suppress_tokens': None,
                          'bos_token_id': 0,
                          'chunk_size_feed_forward': 0,
                          'classifier_dropout': None,
                          'cross_attention_hidden_size': None,
                          'decoder_start_token_id': None,
                          'diversity_penalty': 0.0,
                          'do_sample': False,
                    ..., n_examples=128, split='validat

2023-11-28 15:08:11.733240: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2023-11-28 15:08:14.638119: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2023-11-28 15:08:15.252743: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. I

_____________________________calculate_score_from_weaving_config - 21.9s, 0.4min
_____________________________calculate_score_from_weaving_config - 22.9s, 0.4min
_____________________________calculate_score_from_weaving_config - 22.4s, 0.4min
_____________________________calculate_score_from_weaving_config - 22.6s, 0.4min


Unnamed: 0,name,accuracy
0,RTEMNLIIsotropicMarenLayers,0.710938
1,RTEMNLIIsotropicMarenLayers,0.734375
2,RTEMNLIIsotropicMarenLayers,0.734375
3,RTEMNLIIsotropicMarenLayers,0.75
4,RTEMNLIIsotropicMarenLayers,0.773438
5,RTEMNLIIsotropicMarenLayers,0.765625
6,RTEMNLIIsotropicMarenLayers,0.75
7,RTEMNLIIsotropicMarenLayers,0.734375
8,RTEMNLIIsotropicMarenLayers,0.734375
9,RTEMNLIIsotropicMarenLayers,0.726562


In [6]:
import pandas as pd
from llm_weaver import dict_overwrite, get_model_config, normalize_glue_task_name

model_id = "textAttack/roberta-base-RTE"


def FisherAllLayers(model_id):
    for alpha in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 0.8, 0.9, 1.0]:
        num_layers = get_model_config(model_id)["num_hidden_layers"]
        layer_assignments = [
            {
                "type": "ElementWiseLinearCombination",
                "params": {
                    "donors": [
                        {
                            "donor": model_id,
                            "hidden_layer_number": i,
                            "weight": alpha,
                            "element_wise_multiplier_filename": f"../data/fisher_info/{model_id.replace('/', '_')}-fisher-info.h5",
                        },
                        {
                            "donor": "textAttack/roberta-base-MNLI",
                            "hidden_layer_number": i,
                            "weight": 1.0 - alpha,
                            "element_wise_multiplier_filename": "../data/fisher_info/textAttack_roberta-base-MNLI-fisher-info.h5",
                        },
                    ],
                    "normalize": True,
                },
            }
            for i in range(num_layers)
        ]

        blank_model_config = dict_overwrite(
            get_model_config(model_id),
            {
                "num_hidden_layers": len(layer_assignments),
            },
        )
        config = {
            "glue_task": normalize_glue_task_name(model_id),
            "tokenizer_model_id": model_id,
            "blank_model_config": blank_model_config,
            "layer_assignments": layer_assignments,
            "classification_head": {
                "type": "SingleClassificationHead",
                "params": {
                    "donor": model_id,
                },
            },
            "embeddings": {
                "type": "SingleEmbeddings",
                "params": {
                    "donor": model_id,
                },
            },
        }

        yield config


weave_configs = list(FisherAllLayers(model_id))

scores = Parallel(n_jobs=5, return_as="list")(
    delayed(calculate_score_from_weaving_config_cached)(
        weave_config,
        # n_examples=4096,
        n_examples=129,
        split="validation",
    )
    for weave_config in weave_configs
)
accuracies = [score["accuracy"] for score in scores]

records = []
for weave_config, accuracy in zip(weave_configs, accuracies):
    record = {}
    record["name"] = "FisherAllLayers"
    record["accuracy"] = accuracy
    records.append(record)
df_rte_vanilla = pd.DataFrame.from_records(records)
df_rte_vanilla

________________________________________________________________________________
[Memory] Calling llm_weaver.calculate_score_from_weaving_config...
calculate_score_from_weaving_config({ 'blank_model_config': { 'add_cross_attention': False,
                          'architectures': ['RobertaForSequenceClassification'],
                          'attention_probs_dropout_prob': 0.1,
                          'bad_words_ids': None,
                          'begin_suppress_tokens': None,
                          'bos_token_id': 0,
                          'chunk_size_feed_forward': 0,
                          'classifier_dropout': None,
                          'cross_attention_hidden_size': None,
                          'decoder_start_token_id': None,
                          'diversity_penalty': 0.0,
                          'do_sample': False,
                    ..., n_examples=129, split='validation')
calculating score for weaving config md5sum: b6705706a1a9c88694c11ef5abe3e3

2023-11-28 15:24:56.487792: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2023-11-28 15:24:57.593469: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2023-11-28 15:24:57.987041: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. I

_____________________________calculate_score_from_weaving_config - 27.1s, 0.5min
_____________________________calculate_score_from_weaving_config - 29.1s, 0.5min
_____________________________calculate_score_from_weaving_config - 29.2s, 0.5min
_____________________________calculate_score_from_weaving_config - 29.0s, 0.5min
_____________________________calculate_score_from_weaving_config - 29.3s, 0.5min




________________________________________________________________________________
[Memory] Calling llm_weaver.calculate_score_from_weaving_config...
calculate_score_from_weaving_config({ 'blank_model_config': { 'add_cross_attention': False,
                          'architectures': ['RobertaForSequenceClassification'],
                          'attention_probs_dropout_prob': 0.1,
                          'bad_words_ids': None,
                          'begin_suppress_tokens': None,
                          'bos_token_id': 0,
                          'chunk_size_feed_forward': 0,
                          'classifier_dropout': None,
                          'cross_attention_hidden_size': None,
                          'decoder_start_token_id': None,
                          'diversity_penalty': 0.0,
                          'do_sample': False,
                    ..., n_examples=129, split='validation')____________________________________________________________________________

All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can 

Loading textAttack/roberta-base-RTE
Loading textAttack/roberta-base-RTE
Loading textAttack/roberta-base-RTE
Loading textAttack/roberta-base-RTE
Loading textAttack/roberta-base-MNLI


All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can 

Loading textAttack/roberta-base-MNLI


All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Loading textAttack/roberta-base-RTE


All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
  return hfds.load_metric("glue", task)
  return hfds.load_metric("glue", task)
  return hfds.load_metric("glue", task)
  return hfds.load_metric("glue", task)
  return hfds.load_metric("glue", task)
2023-11-28 15:25:35.221608: W tensorflow/core/kernels/data/cache_dataset_o

_____________________________calculate_score_from_weaving_config - 32.8s, 0.5min
_____________________________calculate_score_from_weaving_config - 34.4s, 0.6min
_____________________________calculate_score_from_weaving_config - 34.0s, 0.6min
_____________________________calculate_score_from_weaving_config - 33.7s, 0.6min
_____________________________calculate_score_from_weaving_config - 32.8s, 0.5min


Unnamed: 0,name,accuracy
0,FisherAllLayers,0.534884
1,FisherAllLayers,0.565891
2,FisherAllLayers,0.565891
3,FisherAllLayers,0.581395
4,FisherAllLayers,0.589147
5,FisherAllLayers,0.612403
6,FisherAllLayers,0.744186
7,FisherAllLayers,0.744186
8,FisherAllLayers,0.728682
9,FisherAllLayers,0.728682


In [5]:
import pandas as pd
from llm_weaver import dict_overwrite, get_model_config, normalize_glue_task_name

model_id = "textAttack/roberta-base-RTE"


def FisherMARENSLayers(model_id):
    replacement_layers = [0, 1, 4, 11]
    for alpha in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 0.8, 0.9, 1.0]:
        num_layers = get_model_config(model_id)["num_hidden_layers"]
        layer_assignments = [
            (
                {
                    "type": "ElementWiseLinearCombination",
                    "params": {
                        "donors": [
                            {
                                "donor": model_id,
                                "hidden_layer_number": i,
                                "weight": alpha,
                                "element_wise_multiplier_filename": f"../data/fisher_info/{model_id.replace('/', '_')}-fisher-info.h5",
                            },
                            {
                                "donor": "textAttack/roberta-base-MNLI",
                                "hidden_layer_number": i,
                                "weight": 1.0 - alpha,
                                "element_wise_multiplier_filename": "../data/fisher_info/textAttack_roberta-base-MNLI-fisher-info.h5",
                            },
                        ],
                        "normalize": True,
                    },
                }
                if (i in replacement_layers)
                else {
                    "type": "SingleLayer",
                    "params": {
                        "donor": model_id,
                        "hidden_layer_number": i,
                    },
                }
            )
            for i in range(num_layers)
        ]

        blank_model_config = dict_overwrite(
            get_model_config(model_id),
            {
                "num_hidden_layers": len(layer_assignments),
            },
        )
        config = {
            "glue_task": normalize_glue_task_name(model_id),
            "tokenizer_model_id": model_id,
            "blank_model_config": blank_model_config,
            "layer_assignments": layer_assignments,
            "classification_head": {
                "type": "SingleClassificationHead",
                "params": {
                    "donor": model_id,
                },
            },
            "embeddings": {
                "type": "SingleEmbeddings",
                "params": {
                    "donor": model_id,
                },
            },
        }

        yield config


weave_configs = list(FisherMARENSLayers(model_id))

scores = Parallel(n_jobs=5, return_as="list")(
    delayed(calculate_score_from_weaving_config_cached)(
        weave_config,
        # n_examples=4096,
        n_examples=128,
        split="validation",
    )
    for weave_config in weave_configs
)
accuracies = [score["accuracy"] for score in scores]

records = []
for weave_config, accuracy in zip(weave_configs, accuracies):
    record = {}
    record["name"] = "FisherMARENSLayers"
    record["accuracy"] = accuracy
    records.append(record)
df_rte_vanilla = pd.DataFrame.from_records(records)
df_rte_vanilla

________________________________________________________________________________
[Memory] Calling llm_weaver.calculate_score_from_weaving_config...
calculate_score_from_weaving_config({ 'blank_model_config': { 'add_cross_attention': False,
                          'architectures': ['RobertaForSequenceClassification'],
                          'attention_probs_dropout_prob': 0.1,
                          'bad_words_ids': None,
                          'begin_suppress_tokens': None,
                          'bos_token_id': 0,
                          'chunk_size_feed_forward': 0,
                          'classifier_dropout': None,
                          'cross_attention_hidden_size': None,
                          'decoder_start_token_id': None,
                          'diversity_penalty': 0.0,
                          'do_sample': False,
                    ..., n_examples=128, split='validation')
calculating score for weaving config md5sum: ee03e67dc871bb16971869b787d611

All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Loading textAttack/roberta-base-MNLI
Loading textAttack/roberta-base-MNLI


All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Loading textAttack/roberta-base-RTE
Loading textAttack/roberta-base-RTE
Loading textAttack/roberta-base-MNLI


All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can 

_____________________________calculate_score_from_weaving_config - 33.9s, 0.6min
_____________________________calculate_score_from_weaving_config - 34.8s, 0.6min
_____________________________calculate_score_from_weaving_config - 35.9s, 0.6min
_____________________________calculate_score_from_weaving_config - 36.0s, 0.6min
_____________________________calculate_score_from_weaving_config - 36.0s, 0.6min




________________________________________________________________________________
[Memory] Calling llm_weaver.calculate_score_from_weaving_config...
calculate_score_from_weaving_config({ 'blank_model_config': { 'add_cross_attention': False,
                          'architectures': ['RobertaForSequenceClassification'],
                          'attention_probs_dropout_prob': 0.1,
                          'bad_words_ids': None,
                          'begin_suppress_tokens': None,
                          'bos_token_id': 0,
                          'chunk_size_feed_forward': 0,
                          'classifier_dropout': None,
                          'cross_attention_hidden_size': None,
                          'decoder_start_token_id': None,
                          'diversity_penalty': 0.0,
                          'do_sample': False,
                    ..., n_examples=128, split='validation')
calculating score for weaving config md5sum: 0671b4be4ed4baaac504493a6e14f6

All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Loading textAttack/roberta-base-MNLI
Loading textAttack/roberta-base-MNLI
Loading textAttack/roberta-base-MNLI
________________________________________________________________________________
[Memory] Calling llm_weaver.calculate_score_from_weaving_config...
calculate_score_from_weaving_config({ 'blank_model_config': { 'add_cross_attention': False,
                          'architectures': ['RobertaForSequenceClassification'],
                          'attention_probs_dropout_prob': 0.1,
                          'bad_words_ids': None,
                          'begin_suppress_tokens': None,
                          'bos_token_id': 0,
                          'chunk_size_feed_forward': 0,
                          'classifier_dropout': None,
                          'cross_attention_hidden_size': None,
                          'decoder_start_token_id': None,
                          'diversity_penalty': 0.0,
                          'do_sample': False,
                    ..., 

All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Loading textAttack/roberta-base-RTE


All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Loading textAttack/roberta-base-RTE
Loading textAttack/roberta-base-RTE


All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can 

Loading textAttack/roberta-base-MNLI


  return hfds.load_metric("glue", task)
  return hfds.load_metric("glue", task)
  return hfds.load_metric("glue", task)
All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
  return hfds.load_metric("glue", task)
  return hfds.load_metric("glue", task)
2023-11-28 15:24:24.507081: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2023-11-28 15

_____________________________calculate_score_from_weaving_config - 30.6s, 0.5min
_____________________________calculate_score_from_weaving_config - 32.0s, 0.5min
_____________________________calculate_score_from_weaving_config - 31.4s, 0.5min
_____________________________calculate_score_from_weaving_config - 31.9s, 0.5min
_____________________________calculate_score_from_weaving_config - 31.3s, 0.5min


Unnamed: 0,name,accuracy
0,FisherMARENSLayers,0.710938
1,FisherMARENSLayers,0.734375
2,FisherMARENSLayers,0.734375
3,FisherMARENSLayers,0.75
4,FisherMARENSLayers,0.765625
5,FisherMARENSLayers,0.765625
6,FisherMARENSLayers,0.734375
7,FisherMARENSLayers,0.726562
8,FisherMARENSLayers,0.726562
9,FisherMARENSLayers,0.726562


In [5]:
import ray

ModuleNotFoundError: No module named 'ray'

In [7]:
import pandas as pd
from llm_weaver import dict_overwrite, get_model_config, normalize_glue_task_name

model_id = "textAttack/roberta-base-RTE"


def FisherMARENSLayers(model_id):
    replacement_layers = [0, 1, 4, 11]
    for alpha in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 0.8, 0.9, 1.0]:
        num_layers = get_model_config(model_id)["num_hidden_layers"]
        layer_assignments = [
            (
                {
                    "type": "ElementWiseLinearCombination",
                    "params": {
                        "donors": [
                            {
                                "donor": model_id,
                                "hidden_layer_number": i,
                                "weight": alpha,
                                "element_wise_multiplier_filename": f"../data/fisher_info/{model_id.replace('/', '_')}-fisher-info.h5",
                            },
                            {
                                "donor": "textAttack/roberta-base-MNLI",
                                "hidden_layer_number": i,
                                "weight": 1.0 - alpha,
                                "element_wise_multiplier_filename": "../data/fisher_info/textAttack_roberta-base-MNLI-fisher-info.h5",
                            },
                        ],
                        "normalize": True,
                    },
                }
                if (i in replacement_layers)
                else {
                    "type": "SingleLayer",
                    "params": {
                        "donor": model_id,
                        "hidden_layer_number": i,
                    },
                }
            )
            for i in range(num_layers)
        ]

        blank_model_config = dict_overwrite(
            get_model_config(model_id),
            {
                "num_hidden_layers": len(layer_assignments),
            },
        )
        config = {
            "glue_task": normalize_glue_task_name(model_id),
            "tokenizer_model_id": model_id,
            "blank_model_config": blank_model_config,
            "layer_assignments": layer_assignments,
            "classification_head": {
                "type": "SingleClassificationHead",
                "params": {
                    "donor": model_id,
                },
            },
            "embeddings": {
                "type": "SingleEmbeddings",
                "params": {
                    "donor": model_id,
                },
            },
        }

        yield config


weave_configs = list(FisherMARENSLayers(model_id))

scores = Parallel(n_jobs=5, return_as="list")(
    delayed(calculate_score_from_weaving_config_cached)(
        weave_config,
        # n_examples=4096,
        n_examples=128,
        split="train",
    )
    for weave_config in weave_configs
)
accuracies = [score["accuracy"] for score in scores]

records = []
for weave_config, accuracy in zip(weave_configs, accuracies):
    record = {}
    record["name"] = "FisherMARENSLayers"
    record["accuracy"] = accuracy
    records.append(record)
df_rte_vanilla = pd.DataFrame.from_records(records)
df_rte_vanilla

________________________________________________________________________________
[Memory] Calling llm_weaver.calculate_score_from_weaving_config...
calculate_score_from_weaving_config({ 'blank_model_config': { 'add_cross_attention': False,
                          'architectures': ['RobertaForSequenceClassification'],
                          'attention_probs_dropout_prob': 0.1,
                          'bad_words_ids': None,
                          'begin_suppress_tokens': None,
                          'bos_token_id': 0,
                          'chunk_size_feed_forward': 0,
                          'classifier_dropout': None,
                          'cross_attention_hidden_size': None,
                          'decoder_start_token_id': None,
                          'diversity_penalty': 0.0,
                          'do_sample': False,
                    ..., n_examples=128, split='train')
________________________________________________________________________________

All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can 

Loading textAttack/roberta-base-RTE
Loading textAttack/roberta-base-MNLI
Loading textAttack/roberta-base-RTE
Loading textAttack/roberta-base-RTE
Loading textAttack/roberta-base-RTE


All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can 

_____________________________calculate_score_from_weaving_config - 36.1s, 0.6min
________________________________________________________________________________
[Memory] Calling llm_weaver.calculate_score_from_weaving_config...
calculate_score_from_weaving_config({ 'blank_model_config': { 'add_cross_attention': False,
                          'architectures': ['RobertaForSequenceClassification'],
                          'attention_probs_dropout_prob': 0.1,
                          'bad_words_ids': None,
                          'begin_suppress_tokens': None,
                          'bos_token_id': 0,
                          'chunk_size_feed_forward': 0,
                          'classifier_dropout': None,
                          'cross_attention_hidden_size': None,
                          'decoder_start_token_id': None,
                          'diversity_penalty': 0.0,
                          'do_sample': False,
                    ..., n_examples=128, split='train')

2023-11-28 16:22:52.340178: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2023-11-28 16:22:57.270929: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2023-11-28 16:22:57.863617: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. I

_____________________________calculate_score_from_weaving_config - 24.6s, 0.4min
_____________________________calculate_score_from_weaving_config - 27.8s, 0.5min
_____________________________calculate_score_from_weaving_config - 28.6s, 0.5min
_____________________________calculate_score_from_weaving_config - 28.8s, 0.5min
_____________________________calculate_score_from_weaving_config - 28.7s, 0.5min


Unnamed: 0,name,accuracy
0,FisherMARENSLayers,0.742188
1,FisherMARENSLayers,0.734375
2,FisherMARENSLayers,0.734375
3,FisherMARENSLayers,0.757812
4,FisherMARENSLayers,0.796875
5,FisherMARENSLayers,0.796875
6,FisherMARENSLayers,0.78125
7,FisherMARENSLayers,0.765625
8,FisherMARENSLayers,0.773438
9,FisherMARENSLayers,0.773438
