# Experiments
We check where typological grouping can be most effective,
and which type of typological grouping works best.

1. Grouping in function aggregation
- Phylogeny OR typology inspired stacks
- Greater dataset for adapter training
2. Parameter aggregation
- Arithmetic: typology-informed weights for aggregation
3. Representation aggregation
- Roughly what EMEA does; not efficient at inference time
    - EMEA even worse as they "learn" at inference

## 1. Stacks
Train a joint language adapter on a group of languages through MLM.
Here, a distinction could still be made between:
- training jointly, no stack
    - equal presence of all languages
    - weighted presence of all languages
- training jointly in a stack with target language adapter on top
    - e.g. We already have a "Romance" adapter, train "Asturian" adapter on top of this
- training jointly with a *changing stack*, activating the adapter for the language batch
    - What Faisal does?
## 2. Parameter aggregation
Arithmetic operations on adapters:
- adding existing adapters and comparing the result with jointly trained family adapters
    - "average" of adapters == jointly trained? (cf. Linear mode connectivity)
- re-creating typological profile of a language
    - preparation step to then "fine-tune" on little data (typologically inspired initialization)



In [17]:
from adapters import AutoAdapterModel, Stack


# Base model that hosts the language adapters.
model = AutoAdapterModel.from_pretrained("xlm-roberta-base")

# Monolingual adapters to attach, as {name: checkpoint directory}.
# Insertion order is the load order and the order used in the stack below.
adapter_dirs = {
    "de": "./trained_adapters/mono/de",
    "en": "./trained_adapters/mono/en",
}
for adapter_name, adapter_dir in adapter_dirs.items():
    model.load_adapter(adapter_dir, load_as=adapter_name)
# model.load_adapter("./trained_adapters/family/en-de-nl-af/mlm", load_as="fam")

# Activate every loaded adapter as one stack.
model.active_adapters = Stack(*adapter_dirs)

Some weights of XLMRobertaAdapterModel were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
import re

# Snapshot of all model parameters; later cells read tensors from it by key.
sd = model.state_dict()

# Bottleneck-adapter parameters inside the encoder layers.
# group 1: layer number
# group 2: adapter name
# group 3: projection (e.g. "adapter_down")
# group 4: parameter kind (weight/bias)
# Raw string: "\." in a plain string literal is an invalid escape sequence.
pattern = r"roberta\.encoder\.layer\.([\d\w]+)\.output\.adapters\.(\w+)\.(\w+)(?:\.0)?\.(\w+)"

# Invertible-adapter parameters.
# group 1: adapter name
# group 2: F/G identifier
# group 3: 0/2 layer number
# group 4: parameter kind (weight/bias)
inv_pattern = r"roberta\.invertible_adapters\.(\w+)\.(\w+)\.(\d)\.(\w+)"

# Compile once instead of handing the pattern string to re.search per key.
layer_re = re.compile(pattern)
inv_re = re.compile(inv_pattern)

# organized_layers[layer_num][projection][kind] -> state-dict keys, one per
# adapter, in state-dict order (the adapter name is not part of the grouping).
organized_layers = {}
# inv_adapters[identifier][layer_num][kind] -> state-dict keys, same ordering.
inv_adapters = {}

# Iterate the snapshot taken above rather than building a second state dict.
for key in sd:
    match = layer_re.search(key)
    if match:
        layer_num, adapter_name, projection, projection_type = match.groups()
        # print(f"Layer: {layer_num}, Adapter: {adapter_name}, Projection: {projection}, Type: {projection_type}")
        (
            organized_layers.setdefault(layer_num, {})
            .setdefault(projection, {})
            .setdefault(projection_type, [])
            .append(key)
        )
    inv_match = inv_re.search(key)
    if inv_match:
        adapter_name, identifier, layer_num, projection_type = inv_match.groups()
        (
            inv_adapters.setdefault(identifier, {})
            .setdefault(layer_num, {})
            .setdefault(projection_type, [])
            .append(key)
        )

In [31]:
from collections import OrderedDict

# Combine the parameters of the active adapters into one new set of tensors,
# keyed as if they belonged to an adapter called "joined_adapter".
new_state_dict = OrderedDict()

# Adapter name embedded in the generated keys.
joined_name = "joined_adapter"

# Mixing coefficients, applied to each key list in order (state-dict order).
# NOTE(review): the previous comment read '2/3 "en", 1/3 "de"', but the first
# key in each list presumably belongs to the adapter loaded first ("de"), which
# would give "de" the 2/3 share -- confirm which weighting is intended.
# Plain average alternative: sum(sd[key] for key in keys) / len(keys)
mix_weights = (2 / 3, 1 / 3)


def _mix(keys):
    """Return the weighted sum of the ``sd`` tensors stored under *keys*.

    Only the first ``len(mix_weights)`` keys are used; extra keys are ignored.

    Raises:
        ValueError: if fewer keys than mixing coefficients are available.
    """
    if len(keys) < len(mix_weights):
        raise ValueError(
            f"Need {len(mix_weights)} tensors to combine, got {len(keys)}: {keys}"
        )
    blended = mix_weights[0] * sd[keys[0]]
    for coefficient, key in zip(mix_weights[1:], keys[1:]):
        blended = blended + coefficient * sd[key]
    return blended


# The element-wise combination only makes sense when every adapter has the
# same config, so compare all of them with the first one.
all_adapters = model.active_adapters
config_id = model.adapters_config.adapters[all_adapters[0]]
config = model.adapters_config.config_map[config_id]
for i in range(1, len(all_adapters)):
    config_id = model.adapters_config.adapters[all_adapters[i]]
    config_i = model.adapters_config.config_map[config_id]
    # Explicit raise instead of assert: asserts are stripped under `python -O`.
    if config != config_i:
        raise ValueError(
            f"Config mismatch: {config} vs {config_i}\nCurrent methodology only works for same config"
        )

# Bottleneck adapters inside the encoder layers.
for layer_num, projections in organized_layers.items():
    for projection, types in projections.items():
        # "adapter_down" parameters sit one level deeper (".0") than the other
        # projections. Weight and bias belong to the same sub-module, so both
        # must use the same infix; the bias keys previously had it inverted.
        infix = ".0" if projection == "adapter_down" else ""
        for projection_type, keys in types.items():
            if projection_type not in ("weight", "bias"):
                continue
            new_state_dict[
                f"roberta.encoder.layer.{layer_num}.output.adapters.{joined_name}.{projection}{infix}.{projection_type}"
            ] = _mix(keys)

# Invertible adapters. Key layout follows inv_pattern:
# <adapter name>.<F/G identifier>.<layer number>.<weight/bias>.
# Previously the identifier was written where the adapter name belongs and
# "F"/"G" were hard-coded per parameter kind, so entries overwrote each other.
for identifier, inv_layers in inv_adapters.items():
    for layer_num, projections in inv_layers.items():
        for projection_type, keys in projections.items():
            if projection_type not in ("weight", "bias"):
                continue
            new_state_dict[
                f"roberta.invertible_adapters.{joined_name}.{identifier}.{layer_num}.{projection_type}"
            ] = _mix(keys)

In [32]:
# Create the target adapter with the config captured in the previous step.
joined_adapter_name = "joined_adapter_v2"

# Remove the old "joined_adapter" (as before) and also any earlier copy of the
# adapter being added: the existence check used to look at a different name
# than the one passed to add_adapter, so re-running the cell would try to add
# an adapter that already exists.
for stale_name in ("joined_adapter", joined_adapter_name):
    if stale_name in model.adapters_config.adapters:
        model.delete_adapter(stale_name)
model.add_adapter(joined_adapter_name, config=config)

In [33]:
# Copy the combined tensors into the parameters of "joined_adapter_v2".
# The combined state dict is keyed with the adapter name "joined_adapter",
# while the parameters live under "joined_adapter_v2". Looking the parameter
# name up unchanged therefore never matched and nothing was copied, so try the
# exact name first and then the name with the "_v2" suffix mapped away.
copied = 0
not_copied = []
for name, param in model.named_parameters():
    # e.g. "roberta.encoder.layer.0.output.adapters.joined_adapter_v2.adapter_down.0.weight"
    if "joined_adapter_v2" not in name:
        continue
    source_key = name
    if source_key not in new_state_dict:
        source_key = name.replace("joined_adapter_v2", "joined_adapter")
    if source_key in new_state_dict:
        param.data.copy_(new_state_dict[source_key])
        copied += 1
    else:
        not_copied.append(name)

# Report the outcome so a silent no-op (adapter left at its initial values)
# cannot go unnoticed.
print(f"joined_adapter_v2: copied {copied} tensors, {len(not_copied)} parameters left unchanged")
for name in not_copied:
    print(f"  no combined tensor for {name}")

In [27]:
# Display the invertible-adapter modules currently held by model.roberta.
model.roberta.invertible_adapters

ModuleDict(
  (de): NICECouplingBlock(
    (F): Sequential(
      (0): Linear(in_features=384, out_features=192, bias=True)
      (1): Activation_Function_Class(
        (f): ReLU()
      )
      (2): Linear(in_features=192, out_features=384, bias=True)
    )
    (G): Sequential(
      (0): Linear(in_features=384, out_features=192, bias=True)
      (1): Activation_Function_Class(
        (f): ReLU()
      )
      (2): Linear(in_features=192, out_features=384, bias=True)
    )
  )
  (en): NICECouplingBlock(
    (F): Sequential(
      (0): Linear(in_features=384, out_features=192, bias=True)
      (1): Activation_Function_Class(
        (f): ReLU()
      )
      (2): Linear(in_features=192, out_features=384, bias=True)
    )
    (G): Sequential(
      (0): Linear(in_features=384, out_features=192, bias=True)
      (1): Activation_Function_Class(
        (f): ReLU()
      )
      (2): Linear(in_features=192, out_features=384, bias=True)
    )
  )
  (joined_adapter): NICECouplingBlock(
   

In [34]:
# Save the "joined_adapter_v2" adapter (note: the target directory is under mono/).
model.save_adapter("./trained_adapters/mono/joined_adapter_v2", "joined_adapter_v2")

In [29]:
# we evaluated the adapter (along with de and en) on ner in another script
import json

# Read the evaluation results; the context manager closes the file handle,
# which the bare open() call left open.
with open("methods/eval_dict_joined.json", encoding="utf-8") as results_file:
    results = json.load(results_file)

In [30]:
# Print, per metric, the mean of the "de" and "en" scores next to the score of
# the joined adapter. Metrics are matched by name instead of by dict position,
# so a different key order in one of the result dicts cannot pair up unrelated
# values. Metrics missing from any of the three dicts are skipped.
de_scores = results["de"]
en_scores = results["en"]
joined_scores = results["joined_adapter"]
for name, de in de_scores.items():
    if name not in en_scores or name not in joined_scores:
        continue
    en = en_scores[name]
    joined = joined_scores[name]
    print(f"{name}, avg en/de: {(en + de) / 2}, joined: {joined}")

eval_loss, avg en/de: 0.467184379696846, joined: 0.4572905898094177
eval_model_preparation_time, avg en/de: 0.0086, joined: 0.006
eval_precision, avg en/de: 0.5284809848704373, joined: 0.5575268817204301
eval_recall, avg en/de: 0.7186234817813766, joined: 0.6997300944669366
eval_f1, avg en/de: 0.6085899656003557, joined: 0.6205864751645721
eval_accuracy, avg en/de: 0.8567322573513155, joined: 0.8606566438204731
eval_runtime, avg en/de: 4.626099999999999, joined: 4.6817
eval_samples_per_second, avg en/de: 216.16500000000002, joined: 213.599
eval_steps_per_second, avg en/de: 27.0205, joined: 26.7


# Copying the approach from language-arithmetic

In [46]:
import copy
import torch.nn as nn

# Two independent CPU copies of the model; each keeps one source-language
# adapter per encoder layer: model1 the lang1 adapter, model2 the lang2 adapter.
model1 = copy.deepcopy(model.cpu())
model2 = copy.deepcopy(model.cpu())
lang1, lang2 = "de", "en"

# Common name under which the kept adapter is re-registered in both copies.
tgt = "j_adapter"

layers1 = model1.roberta.encoder.layer
layers2 = model2.roberta.encoder.layer
for layer_a, layer_b in zip(layers1, layers2):
    # Show both layers as they look before their adapters are replaced.
    print(layer_a, "\n~\n", layer_b, "\n" + "-" * 50)
    # Pick out the adapter to keep in each copy ...
    adapter1 = layer_a.output.adapters[lang1]
    adapter2 = layer_b.output.adapters[lang2]
    # ... and swap in a fresh container that holds only that adapter.
    layer_a.output.adapters = nn.ModuleDict({tgt: adapter1})
    layer_b.output.adapters = nn.ModuleDict({tgt: adapter2})

XLMRobertaLayer(
  (attention): XLMRobertaAttention(
    (self): XLMRobertaSdpaSelfAttentionWithAdapters(
      (query): LoRALinearTorch(
        in_features=768, out_features=768, bias=True
        (loras): ModuleDict()
      )
      (key): LoRALinearTorch(
        in_features=768, out_features=768, bias=True
        (loras): ModuleDict()
      )
      (value): LoRALinearTorch(
        in_features=768, out_features=768, bias=True
        (loras): ModuleDict()
      )
      (dropout): Dropout(p=0.1, inplace=False)
      (prefix_tuning): PrefixTuningLayer(
        (prefix_gates): ModuleDict()
        (pool): PrefixTuningPool(
          (prefix_tunings): ModuleDict()
        )
      )
    )
    (output): XLMRobertaSelfOutputWithAdapters(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (adapters): ModuleDict()
      (adapter_fusion_layer): Modul

In [45]:
# Inspect the adapters container on the output block of model1's first encoder layer.
# NOTE(review): the recorded output below ("False") does not look like the repr
# of such a container -- probably stale from an earlier version of this cell; re-run to confirm.
model1.roberta.encoder.layer[0].output.adapters

False