# Test the weaving code on the base models

In [1]:
# install dependencies

! pip install -q joblib  # joblib for memoizing functions
! pip install -q ipywidgets widgetsnbextension pandas-profiling # IProgress for progress bars

zsh:1: /home/brian/2023-fall-cs-194-294-merging-llms/.venv/bin/pip: bad interpreter: .venv/bin/python3.8: no such file or directory
zsh:1: /home/brian/2023-fall-cs-194-294-merging-llms/.venv/bin/pip: bad interpreter: .venv/bin/python3.8: no such file or directory


In [2]:
# Make the `model_merging` directory (one level above the working directory) importable

import os
import sys

model_merging_base = os.path.abspath("../model_merging/")

# Stop right away if the directory is not where we expect it
assert os.path.exists(model_merging_base)

# Register the path only once, so re-running this cell does not add duplicates
if model_merging_base not in sys.path:
    sys.path.append(model_merging_base)

In [3]:
# joblib supplies the cache (`memory`) used to memoize functions in later cells,
# plus a small parallel example
from math import sqrt

from joblib import Memory, Parallel, delayed

# Alternative with verbose logging, handy when debugging the cache:
# memory = Memory(location="cache", verbose=10)
memory = Memory(location="cache", verbose=0)

# Example parallel run: sqrt(k**2) for k = 0..9 across two jobs, returned as a generator
parallel = Parallel(n_jobs=2, return_as="generator")
output_generator = parallel(delayed(sqrt)(k**2) for k in range(10))

In [4]:
# Import the llm_weaver entry points and create memoized variants of them

import os

from llm_weaver import (
    calculate_score_from_weaving_config,
    get_score_from_named_model,
    test_weaver,
)

# Tokenizer parallelism is switched off to avoid deadlocks
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# `memory` is the joblib Memory object created in an earlier cell; each `*_cached`
# name is the corresponding function wrapped with `memory.cache`
calculate_score_from_weaving_config_cached = memory.cache(calculate_score_from_weaving_config)
test_weaver_cached = memory.cache(test_weaver)
get_score_from_named_model_cached = memory.cache(get_score_from_named_model)

2023-11-27 14:03:50.012065: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-27 14:03:50.046002: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-27 14:03:50.046591: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Make sure you can build using `.build()`

In [5]:
import re

import transformers
from llm_weaver import get_blank_model, get_model_config

# Oldest transformers release assumed to work here. The exact cutoff is uncertain
# (see the error message below); it is based on this commit:
# https://github.com/huggingface/transformers/commit/4a55e4787760fdb6c40a972a60d814ba05425da1#diff-648ec06beb5ae6380c7f611a0f513a5d392509497d245a09f06b6549358afdffR1151
MIN_TRANSFORMERS_VERSION = (4, 3, 1)


def _version_tuple(version):
    """Return the leading ``major.minor.patch`` of a version string as a tuple of ints.

    Missing components count as 0 and any trailing suffix (for example ``.dev0`` or
    ``rc1``) is ignored, so ``"4.35.0.dev0"`` becomes ``(4, 35, 0)``.

    Args:
        version: Version string such as ``"4.35.0"``.

    Returns:
        A 3-tuple ``(major, minor, patch)`` of ints.

    Raises:
        ValueError: If ``version`` does not start with a number.
    """
    match = re.match(r"(\d+)(?:\.(\d+))?(?:\.(\d+))?", version)
    if match is None:
        raise ValueError(f"Unrecognised version string: {version!r}")
    return tuple(int(part or 0) for part in match.groups())


# Compare numerically: as plain strings, "4.10.0" < "4.3.1" is True, which would
# wrongly reject releases newer than the minimum.
if _version_tuple(transformers.__version__) < MIN_TRANSFORMERS_VERSION:
    raise ValueError(
        "Need transformers >= 4.3.1, or something like that. Not sure of the version."
    )

print(f"You have transformers version {transformers.__version__}!")

# Smoke test: `.build()` must run without error on the model from `get_blank_model`
model = get_blank_model(get_model_config("textattack/roberta-base-RTE"))
model.build()

type(model)

You have transformers version 4.35.0!


2023-11-27 14:03:53.514819: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-11-27 14:03:53.515609: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1960] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, 

transformers.models.roberta.modeling_tf_roberta.TFRobertaForSequenceClassification

## Step 0: Test weaving code

This test checks that a model reconstructed by the weaver from all of its own parts gets the same evaluation score as the original model.

In [6]:
import pandas as pd

# Model ids that are passed to `test_weaver` one at a time further down in this cell.
# Commented-out entries are skipped in the current run; the trailing notes on some
# entries record problems seen with that model.
model_ids = [
    "textattack/roberta-base-RTE",
    "textattack/roberta-base-MNLI",  # <--- this one has a very low score
    # "howey/roberta-large-rte",
    # "howey/roberta-large-mnli",
    # "howey/roberta-large-qnli",
    # "howey/roberta-large-sst2",
    # "howey/roberta-large-cola",
    # "howey/roberta-large-mrpc",
    # "howey/roberta-large-qqp",
    # "howey/roberta-large-stsb", # <--- did not work
    # "JeremiahZ/roberta-base-rte",
    # "JeremiahZ/roberta-base-mnli",
    # "JeremiahZ/roberta-base-qnli",
    # "JeremiahZ/roberta-base-sst2",
    # "JeremiahZ/roberta-base-cola",
    # "JeremiahZ/roberta-base-mrpc",
    # "JeremiahZ/roberta-base-qqp",
    # "JeremiahZ/roberta-base-stsb", # <--- did not work
    # "l-yohai/bigbird-roberta-base-mnli",
    # "howey/roberta-large-squad2",
]
# Results recorded from a previous run, one line per model id.
# NOTE(review): each pair looks like (original score, weaved score) -- confirm. The
# current run prints a third "Linear combo weaved score" that is not recorded here.
# textattack/roberta-base-RTE ({'accuracy': 0.7}, {'accuracy': 0.7})
# textattack/roberta-base-MNLI ({'accuracy': 0.3}, {'accuracy': 0.3})
# howey/roberta-large-rte ({'accuracy': 0.65}, {'accuracy': 0.65})
# howey/roberta-large-mnli ({'accuracy': 0.68}, {'accuracy': 0.68})
# howey/roberta-large-qnli ({'accuracy': 0.86}, {'accuracy': 0.86})
# howey/roberta-large-sst2 ({'accuracy': 0.77}, {'accuracy': 0.77})
# howey/roberta-large-cola ({'matthews_correlation': 0.19169538058831714}, {'matthews_correlation': 0.19169538058831714})
# howey/roberta-large-mrpc ({'accuracy': 0.61, 'f1': 0.6486486486486487}, {'accuracy': 0.61, 'f1': 0.6486486486486487})
# howey/roberta-large-qqp ({'accuracy': 0.77, 'f1': 0.5490196078431372}, {'accuracy': 0.77, 'f1': 0.5490196078431372})
# JeremiahZ/roberta-base-rte ({'accuracy': 0.61}, {'accuracy': 0.61})
# JeremiahZ/roberta-base-mnli ({'accuracy': 0.83}, {'accuracy': 0.83})
# JeremiahZ/roberta-base-qnli ({'accuracy': 0.82}, {'accuracy': 0.82})
# JeremiahZ/roberta-base-sst2 ({'accuracy': 0.89}, {'accuracy': 0.89})
# JeremiahZ/roberta-base-cola ({'matthews_correlation': 0.15285569591066622}, {'matthews_correlation': 0.15285569591066622})
# JeremiahZ/roberta-base-mrpc ({'accuracy': 0.33, 'f1': 0.10666666666666666}, {'accuracy': 0.33, 'f1': 0.10666666666666666})
# JeremiahZ/roberta-base-qqp ({'accuracy': 0.79, 'f1': 0.7341772151898734}, {'accuracy': 0.79, 'f1': 0.7341772151898734})


# Run the weaver test once per model id and collect one row per model.
# The result of `test_weaver` is stored as its string form.
records = []
for model_id in model_ids:
    records.append({"model_id": model_id, "results": str(test_weaver(model_id))})

# Build the table in a single step from the collected rows
weaving_test_results_df = pd.DataFrame(records)
# Uncomment to persist the table:
# weaving_test_results_df.to_csv("test-weaving-on-base-models.weaving_test_results.csv")
weaving_test_results_df

All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


calculating score for weaving config md5sum: 3f302d2841d5e9508a8e33464b313985
Loading textattack/roberta-base-RTE


All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
  return hfds.load_metric("glue", task)
2023-11-27 14:04:02.610929: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


calculating score for weaving config md5sum: 64b8d72806f9123d7a952fb0df71b630
Loading textattack/roberta-base-RTE


All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
2023-11-27 14:04:14.396119: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to 

Original score (textattack/roberta-base-RTE): {'accuracy': 0.7}
Weaved score (textattack/roberta-base-RTE): {'accuracy': 0.7}
Linear combo weaved score (textattack/roberta-base-RTE): {'accuracy': 0.7}


All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


calculating score for weaving config md5sum: c87dd0438cfc04d8acd9371cc8fb05f9
Loading textattack/roberta-base-MNLI


All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
2023-11-27 14:04:41.125919: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


calculating score for weaving config md5sum: 2de68b10f030533e464532058b3b6081
Loading textattack/roberta-base-MNLI


All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
2023-11-27 14:04:53.160203: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to 

Original score (textattack/roberta-base-MNLI): {'accuracy': 0.3}
Weaved score (textattack/roberta-base-MNLI): {'accuracy': 0.3}
Linear combo weaved score (textattack/roberta-base-MNLI): {'accuracy': 0.3}


Unnamed: 0,model_id,results
0,textattack/roberta-base-RTE,"[{'accuracy': 0.7}, {'accuracy': 0.7}, {'accur..."
1,textattack/roberta-base-MNLI,"[{'accuracy': 0.3}, {'accuracy': 0.3}, {'accur..."
