# set up blank model

In [1]:
pip install transformers

[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
# OLD
import json
# create target model configuration
target_model_config = {
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": None,
  "eos_token_id": 2,
  "finetuning_task": "glue:rte",
  "gradient_checkpointing": False,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 15,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.35.0",
  "type_vocab_size": 1,
  "use_cache": True,
  "vocab_size": 50265
}
#create blank model
from transformers.models.roberta.modeling_tf_roberta import TFRobertaForSequenceClassification
from transformers import RobertaConfig
#define donor model
donor_model_for_classification = TFRobertaForSequenceClassification.from_pretrained("textattack/roberta-base-MNLI", from_pt=True)
new_config = RobertaConfig(**target_model_config)
blank_model = TFRobertaForSequenceClassification(new_config)
blank_model.build()



All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


In [3]:
#NEW
import json
from transformers import RobertaForSequenceClassification, RobertaConfig
from transformers.models.roberta.modeling_tf_roberta import TFRobertaForSequenceClassification
import random
# Create target model configuration
target_model_config = {
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": None,
  "eos_token_id": 2,
  "finetuning_task": "glue:rte",
  "gradient_checkpointing": False,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 15,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.35.0",
  "type_vocab_size": 1,
  "use_cache": True,
  "vocab_size": 50265
}

# Load the donor model with PyTorch weights
donor_model_name = random.choice(["textattack/roberta-base-MNLI", "textattack/roberta-base-RTE"])
donor_model_for_classification_head = TFRobertaForSequenceClassification.from_pretrained(donor_model_name, from_pt=True)

# Create the blank model
new_config = RobertaConfig(**target_model_config)
blank_model = TFRobertaForSequenceClassification(new_config)
blank_model.build()

# Explore the model structure to find the correct layer
print(blank_model.summary())


All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Model: "tf_roberta_for_sequence_classification_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 roberta (TFRobertaMainLaye  multiple                  145318656 
 r)                                                              
                                                                 
 classifier (TFRobertaClass  multiple                  592130    
 ificationHead)                                                  
                                                                 
Total params: 145910786 (556.61 MB)
Trainable params: 145910786 (556.61 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [4]:
blank_model.save_pretrained('blank_model')


# define weave pattern

In [5]:
import random
from random import randint
# Generate model weaving configuration
model_weaving_config = {
    # The task (i.e. the classification head should match the task at hand)
    "target_model_template": "./blank_model",
    # Layer assignments
    "layer_assignments": [
        {
            "type": "SingleLayer",
            "params": {
                # Load donor model && Pick a model randomly, p=0.5
                "donor": ["textattack/roberta-base-MNLI",
                          "textattack/roberta-base-RTE"][randint(0, 1)],
                # Pick a layer within [i-1,i+1], keeping it between 0 and 11
                "hidden_layer_number": min(11, max(0, randint(i - 1,i + 1))),
            },
        } for i in range(15)
    ],
}

model_weaving_config

{'target_model_template': './blank_model',
 'layer_assignments': [{'type': 'SingleLayer',
   'params': {'donor': 'textattack/roberta-base-RTE',
    'hidden_layer_number': 0}},
  {'type': 'SingleLayer',
   'params': {'donor': 'textattack/roberta-base-MNLI',
    'hidden_layer_number': 1}},
  {'type': 'SingleLayer',
   'params': {'donor': 'textattack/roberta-base-RTE',
    'hidden_layer_number': 1}},
  {'type': 'SingleLayer',
   'params': {'donor': 'textattack/roberta-base-MNLI',
    'hidden_layer_number': 3}},
  {'type': 'SingleLayer',
   'params': {'donor': 'textattack/roberta-base-RTE',
    'hidden_layer_number': 4}},
  {'type': 'SingleLayer',
   'params': {'donor': 'textattack/roberta-base-RTE',
    'hidden_layer_number': 5}},
  {'type': 'SingleLayer',
   'params': {'donor': 'textattack/roberta-base-RTE',
    'hidden_layer_number': 5}},
  {'type': 'SingleLayer',
   'params': {'donor': 'textattack/roberta-base-RTE',
    'hidden_layer_number': 8}},
  {'type': 'SingleLayer',
   'params':

# Weave

In [6]:
from llm_weaver.weave import weave_models

weaved_model = weave_models(target_model_template=blank_model, layer_assignments=model_weaving_config["layer_assignments"])

All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


In [7]:
weaved_model.save_pretrained('weaved_model')

# Evaluate

In [8]:
pip install tensorflow_datasets

[0mNote: you may need to restart the kernel to use updated packages.


In [9]:
import sys
sys.path.append('../')
import tensorflow as tf
from absl import app, flags
from transformers.data.processors import glue as hf_glue
# Example import statement assuming 'evaluation' is a module within 'model_merging'

FLAGS = flags.FLAGS

#flags.DEFINE_string("glue_task_weave_model", "rte", "GLUE task for evaluation")
#flags.DEFINE_string("split_weave_model", "validation", "Data split for evaluation")
#flags.DEFINE_integer("n_examples_weave_model", 1000, "Number of examples to evaluate")
#flags.DEFINE_integer("batch_size_weave_model", 32, "Batch size for evaluation")
#flags.DEFINE_integer("sequence_length_weave_model", 128, "Maximum sequence length")
#flags.DEFINE_string("favor_target_model_weave_model", "accuracy", "Favor target model based on the metric (e.g., 'accuracy')")
#flags.DEFINE_boolean("normalize_fishers_weave_model", True, "Normalize Fisher scores")

import os
import numpy as np

#from model_merging import data
from model_merging.model_merging import evaluation
from model_merging.model_merging import data
flags.DEFINE_string("glue_task", None, "")
#from model_merging.model_merging.evaluation import load_metric_for_glue_task, evaluate_model
#from model_merging.evaluation import load_metric_for_glue_task, evaluate_model

#from model_merging import hdf5_util
#from model_merging import merging

def _to_tfds_task_name(task, split):
    if task == "sts-b":
        task = "stsb"
    elif task == "sst-2":
        task = "sst2"
    elif task == "mnli" and split != "train":
        task = "mnli_matched"
    elif task == "mnli-mm" and split != "train":
        task = "mnli_mismatched"
    return task

_STSB_MIN = 0
_STSB_MAX = 5
_STSB_NUM_BINS = 5 * (_STSB_MAX - _STSB_MIN)

def _convert_dataset_to_features(
    dataset,
    tokenizer,
    max_length,
    task,
):
    """Note that this is only for single examples; won't work with batched inputs."""
    pad_token = tokenizer.pad_token_id
    # NOTE: Not sure if this is correct, but it matches up for BERT. RoBERTa does
    # not appear to use token types
    pad_token_segment_id = tokenizer.pad_token_type_id
    _glue_processors = hf_glue.glue_processors
    _glue_output_modes = hf_glue.glue_output_modes
    processor = _glue_processors[task]()
    output_mode = _glue_output_modes[task]

    if task == "sts-b":
        # STS-B regression
        stsb_bins = np.linspace(_STSB_MIN, _STSB_MAX, num=_STSB_NUM_BINS + 1)
        stsb_bins = stsb_bins[1:-1]
    else:
        label_list = processor.get_labels()
        label_map = {label: i for i, label in enumerate(label_list)}


def load_glue_dataset(task: str, split: str, tokenizer, max_length: int):
    tfds_task = _to_tfds_task_name(task, split)
    ds = tf.load(f"glue/{tfds_task}", split=split)
    ds = _convert_dataset_to_features(
        ds,
        tokenizer,
        max_length,
        task,
    )
    return ds

def main():
    # Load the weaved model
    weaved_model = TFRobertaForSequenceClassification.from_pretrained('weaved_model')
    print('hi')
    # Load the dataset
    ds = data.load_glue_dataset(
        task=FLAGS.glue_task,
        split=FLAGS.split,
        tokenizer=weaved_model.tokenizer,
        max_length=FLAGS.sequence_length,
    )
    ds = ds.take(FLAGS.n_examples).batch(FLAGS.batch_size)

    # Load metrics
    metric = evaluation.load_metric_for_glue_task(FLAGS.glue_task)

    # Evaluate the weaved model
    results = evaluation.evaluate_model(weaved_model, ds, metric)

    # Print evaluation results
    print(80 * "*")
    print(" Weaved Model Evaluation")
    print(80 * "*")
    print(f"{FLAGS.glue_task} {FLAGS.split} {FLAGS.n_examples} Examples")
    print(80 * "-")
    print(f"Metric: {FLAGS.favor_target_model}")
    print(f"{metric.name}: {results[FLAGS.favor_target_model]}")

main()
#if __name__ == "__main__":
#    app.run(main)

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at weaved_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


hi


UnparsedFlagAccessError: Trying to access flag --glue_task before flags were parsed.