# End-to-end notebook!

Here is the workflow:
* Sampling configs (sampling parameters, etc.) lead to...
* Weaving configs (blank model settings, donor model settings, layer assignments) lead to...
* Models (probably TFRobertaForSequenceClassification in all cases) lead to...
* Performance scores (numbers from 0-100)

## Step 1: Sampling configs to weaving configs

In [None]:
# TODO

## Step 2: Weaving configs to models

The weaving config needs to define the following:
* `blank_model_config`
* `layer_assignments`
* `embedding_assignments`
* `classification_head_assignments`

In [54]:
# Some of the functions we need
from transformers.models.roberta.modeling_tf_roberta import TFRobertaForSequenceClassification
from transformers import RobertaConfig

def get_model(model_str):
    from transformers import TFRobertaForSequenceClassification
    model = TFRobertaForSequenceClassification.from_pretrained(model_str, from_pt=True)
    return model

def get_model_config(model_str):
    model = get_model(model_str)
    config = model.config.to_dict()
    del config["_name_or_path"]
    return config

def get_blank_model(config):
    blank_model = TFRobertaForSequenceClassification(
        RobertaConfig(**config)
    )
    blank_model.build()
    return blank_model

def dict_overwrite(d1, d2):
    d1 = d1.copy()
    for k in d2:
        d1[k] = d2[k]
    return d1



All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


True

In [77]:
# This defines a placeholder weaving_configs variable. It will be replaced by the actual weaving
# configs from the previous step.

import random
from random import randint
# Generate model weaving configuration


weaving_configs = [{
    # The task (i.e. the classification head output size should match the task at hand)
    "blank_model_config": dict_overwrite(
        get_model_config("textattack/roberta-base-RTE"),
        {
            "num_hidden_layers": 12,
        }
    ),
    # Layer assignments   
    "layer_assignments": [
        {
            "type": "SingleLayer",
            "params": {
                # Load donor model
                "donor": "textattack/roberta-base-RTE",
                # Pick a layer
                "hidden_layer_number": i,
            },
        } for i in range(12)
    ],
    # The head (i.e. the classification head should match the task at hand)
    # THESE ARE DIFFERENT BETWEEN RTE AND MNLI
    "classification_head": {
        "type": "SingleClassificationHead",
        "params": {
            "donor": "textattack/roberta-base-RTE",
        }
    },
    # The embeddings layer
    # THESE ARE DIFFERENT BETWEEN RTE AND MNLI
    "embeddings": {
        "type": "SingleEmbeddings",
        "params": {
            "donor": "textattack/roberta-base-RTE",
        }
    },

}]

# There's just one config in here
weaving_configs

All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


[{'blank_model_config': {'return_dict': True,
   'output_hidden_states': False,
   'output_attentions': False,
   'torchscript': False,
   'torch_dtype': None,
   'use_bfloat16': False,
   'tf_legacy_loss': False,
   'pruned_heads': {},
   'tie_word_embeddings': True,
   'is_encoder_decoder': False,
   'is_decoder': False,
   'cross_attention_hidden_size': None,
   'add_cross_attention': False,
   'tie_encoder_decoder': False,
   'max_length': 20,
   'min_length': 0,
   'do_sample': False,
   'early_stopping': False,
   'num_beams': 1,
   'num_beam_groups': 1,
   'diversity_penalty': 0.0,
   'temperature': 1.0,
   'top_k': 50,
   'top_p': 1.0,
   'typical_p': 1.0,
   'repetition_penalty': 1.0,
   'length_penalty': 1.0,
   'no_repeat_ngram_size': 0,
   'encoder_no_repeat_ngram_size': 0,
   'bad_words_ids': None,
   'num_return_sequences': 1,
   'chunk_size_feed_forward': 0,
   'output_scores': False,
   'return_dict_in_generate': False,
   'forced_bos_token_id': None,
   'forced_eos_tok

In [89]:
# The functions we need for model weaving

from transformers import TFAutoModelForSequenceClassification, AutoTokenizer

def get_model_and_tokenizer(identifier):
    tokenizer = AutoTokenizer.from_pretrained(identifier)
    model = TFAutoModelForSequenceClassification.from_pretrained(identifier, from_pt=True)
    return model, tokenizer

def _get_layer_to_weights_map(model):
    from collections import defaultdict
    import re

    layer_to_weights_map = defaultdict(dict)
    for weight in model.weights:
        matches = re.findall(r'/layer_._(\d+)/', weight.name)
        if not matches:
            continue

        layer_number = int(matches[0])
        layer_to_weights_map[layer_number][weight.name.partition(f"/layer_._{layer_number}/")[-1]] = weight

    return {
        layer_number: dict(weights)
        for layer_number, weights in layer_to_weights_map.items()
    }
    
def assign_weights_from_one_layer_to_another(source_model, target_model, source_layer_number, target_layer_number):

    # This part is recalculated often, but it's fast. In the future we could 
    # cache it in a class as a cached property, but we'll leave it here for now.
    target_model_layer_to_weights_map = _get_layer_to_weights_map(target_model)
    source_model_layer_to_weights_map = _get_layer_to_weights_map(source_model)

    # Get the layer objects
    source_layer = source_model_layer_to_weights_map[source_layer_number]
    target_layer = target_model_layer_to_weights_map[target_layer_number]

    # Make sure that all the suffixes match
    assert set(source_layer.keys()) == set(target_layer.keys())

    # Make sure that all the shapes match
    for weight_name, weight_object in source_layer.items():
        assert weight_object.shape == target_layer[weight_name].shape
    
    # Assign weights from one layer to another
    for weight_name, weight_object in source_layer.items():
        target_layer[weight_name].assign(weight_object.numpy())


def weave_models(blank_model_config, layer_assignments, classification_head=None, embeddings=None):
    # Create a blank model
    target_model = get_blank_model(blank_model_config)

    # We gather all the names of the donor models we need to load
    source_model_names = set(
        layer_assignment["params"]["donor"]
        for layer_assignment in layer_assignments
    )
    if classification_head is not None:
        source_model_names.add(classification_head["params"]["donor"])
    if embeddings is not None:
        source_model_names.add(embeddings["params"]["donor"])

    # We load all the donor models into a dictionary for easy access
    source_models = {}
    for source_model_name in source_model_names:
        print(f"Loading {source_model_name}")
        source_models[source_model_name] = get_model(source_model_name)

    for layer_assignment in layer_assignments:
        if layer_assignment["type"] == "SingleLayer":
            assign_weights_from_one_layer_to_another(
                source_model=source_models[layer_assignment["params"]["donor"]],
                target_model=target_model,
                source_layer_number=layer_assignment["params"]["hidden_layer_number"],
                target_layer_number=layer_assignment["params"]["hidden_layer_number"]
            )
        else:
            raise NotImplementedError(f"Unknown layer assignment type: {layer_assignment['type']}")

    if classification_head is not None:
        if classification_head["type"] == "SingleClassificationHead":
            # We want to copy weights from the donor model to the target model. There are four parts.
            # tf_roberta_for_sequence_classification_18/classifier/dense/kernel:0 (768, 768)
            # tf_roberta_for_sequence_classification_18/classifier/dense/bias:0 (768,)
            # tf_roberta_for_sequence_classification_18/classifier/out_proj/kernel:0 (768, 3)
            # tf_roberta_for_sequence_classification_18/classifier/out_proj/bias:0 (3,)
            # They live in source_models[classification_head["params"]["donor"]].classifier.weights
            # and need to go to target_model.classifier.weights
            # using something like target_weight.assign(source_weight.numpy())
            raise NotImplementedError("TODO: Kirthi")
        else:
            raise NotImplementedError(f"Unknown classification head type: {classification_head['type']}")
        
    if embeddings is not None:
        if embeddings["type"] == "SingleEmbeddings":
            # We want to copy weights from the donor model to the target model. There are five parts.
            # tf_roberta_for_sequence_classification_18/roberta/embeddings/word_embeddings/weight:0 (50265, 768)
            # tf_roberta_for_sequence_classification_18/roberta/embeddings/token_type_embeddings/embeddings:0 (1, 768)
            # tf_roberta_for_sequence_classification_18/roberta/embeddings/position_embeddings/embeddings:0 (514, 768)
            # tf_roberta_for_sequence_classification_18/roberta/embeddings/LayerNorm/gamma:0 (768,)
            # tf_roberta_for_sequence_classification_18/roberta/embeddings/LayerNorm/beta:0 (768,)
            # They live in source_models[embeddings["params"]["donor"]].roberta.embeddings.weights
            # and need to go to target_model.roberta.embeddings.weights
            # using something like target_weight.assign(source_weight.numpy())
            raise NotImplementedError("TODO: Kirthi")
        else:
            raise NotImplementedError(f"Unknown embeddings type: {embeddings['type']}")

    return target_model


    

weaved_models = (
    weave_models(**weaving_config)
    for weaving_config in weaving_configs
)
model = next(weaved_models)
model.summary()

Loading textattack/roberta-base-RTE


All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


NotImplementedError: TODO: Kirthi

In [95]:
# Kirthi: you can work with the models like this 👀

model_mnli = get_model("textattack/roberta-base-MNLI")
blank_model = get_blank_model(get_model_config("textattack/roberta-base-MNLI"))

print("the names of the variables we want to copy. Hope they are the same")
print(model_mnli.roberta.embeddings.weights[0].name)  # tf_roberta_for_sequence_classification_108/roberta/embeddings/word_embeddings/weight:0
print(blank_model.roberta.embeddings.weights[0].name)  # tf_roberta_for_sequence_classification_110/roberta/embeddings/word_embeddings/weight:0

print("Before")
print(blank_model.roberta.embeddings.weights[0].numpy()[0][0]) # -0.007996951

# Copy the weights from the MNLI model to the blank model, 
blank_model.roberta.embeddings.weights[0].assign(model_mnli.roberta.embeddings.weights[0].numpy())

print("After")
print(blank_model.roberta.embeddings.weights[0].numpy()[0][0]) # 0.14725289

All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


the names of the variables we want to copy. Hope they are the same
tf_roberta_for_sequence_classification_108/roberta/embeddings/word_embeddings/weight:0
tf_roberta_for_sequence_classification_110/roberta/embeddings/word_embeddings/weight:0
Before
-0.007996951
After
0.14725289


## Step 3: Models to scores

In [16]:
import tensorflow as tf
from absl import app, flags
from transformers.data.processors import glue as hf_glue
# Example import statement assuming 'evaluation' is a module within 'model_merging'

FLAGS = flags.FLAGS

#flags.DEFINE_string("glue_task_weave_model", "rte", "GLUE task for evaluation")
#flags.DEFINE_string("split_weave_model", "validation", "Data split for evaluation")
#flags.DEFINE_integer("n_examples_weave_model", 1000, "Number of examples to evaluate")
#flags.DEFINE_integer("batch_size_weave_model", 32, "Batch size for evaluation")
#flags.DEFINE_integer("sequence_length_weave_model", 128, "Maximum sequence length")
#flags.DEFINE_string("favor_target_model_weave_model", "accuracy", "Favor target model based on the metric (e.g., 'accuracy')")
#flags.DEFINE_boolean("normalize_fishers_weave_model", True, "Normalize Fisher scores")

import os
import numpy as np

#from model_merging import data
from model_merging.model_merging import evaluation
#from model_merging.model_merging.evaluation import load_metric_for_glue_task, evaluate_model
#from model_merging.evaluation import load_metric_for_glue_task, evaluate_model

#from model_merging import hdf5_util
#from model_merging import merging

def _to_tfds_task_name(task, split):
    if task == "sts-b":
        task = "stsb"
    elif task == "sst-2":
        task = "sst2"
    elif task == "mnli" and split != "train":
        task = "mnli_matched"
    elif task == "mnli-mm" and split != "train":
        task = "mnli_mismatched"
    return task

_STSB_MIN = 0
_STSB_MAX = 5
_STSB_NUM_BINS = 5 * (_STSB_MAX - _STSB_MIN)

def _convert_dataset_to_features(
    dataset,
    tokenizer,
    max_length,
    task,
):
    """Note that this is only for single examples; won't work with batched inputs."""
    pad_token = tokenizer.pad_token_id
    # NOTE: Not sure if this is correct, but it matches up for BERT. RoBERTa does
    # not appear to use token types
    pad_token_segment_id = tokenizer.pad_token_type_id
    _glue_processors = hf_glue.glue_processors
    _glue_output_modes = hf_glue.glue_output_modes
    processor = _glue_processors[task]()
    output_mode = _glue_output_modes[task]

    if task == "sts-b":
        # STS-B regression
        stsb_bins = np.linspace(_STSB_MIN, _STSB_MAX, num=_STSB_NUM_BINS + 1)
        stsb_bins = stsb_bins[1:-1]
    else:
        label_list = processor.get_labels()
        label_map = {label: i for i, label in enumerate(label_list)}


def load_glue_dataset(task: str, split: str, tokenizer, max_length: int):
    tfds_task = _to_tfds_task_name(task, split)
    ds = tf.load(f"glue/{tfds_task}", split=split)
    ds = _convert_dataset_to_features(
        ds,
        tokenizer,
        max_length,
        task,
    )
    return ds

def main(_):
    # Load the weaved model
    weaved_model = TFRobertaForSequenceClassification.from_pretrained('weaved_model')

    # Load the dataset
    ds = load_glue_dataset(
        task=FLAGS.glue_task,
        split=FLAGS.split,
        tokenizer=weaved_model.tokenizer,
        max_length=FLAGS.sequence_length,
    )
    ds = ds.take(FLAGS.n_examples).batch(FLAGS.batch_size)

    # Load metrics
    metric = evaluation.load_metric_for_glue_task(FLAGS.glue_task)

    # Evaluate the weaved model
    results = evaluation.evaluate_model(weaved_model, ds, metric)

    # Print evaluation results
    print(80 * "*")
    print(" Weaved Model Evaluation")
    print(80 * "*")
    print(f"{FLAGS.glue_task} {FLAGS.split} {FLAGS.n_examples} Examples")
    print(80 * "-")
    print(f"Metric: {FLAGS.favor_target_model}")
    print(f"{metric.name}: {results[FLAGS.favor_target_model]}")

#if __name__ == "__main__":
#    app.run(main)

ModuleNotFoundError: No module named 'model_merging'