We do not want to initialize the new (child) model from scratch. Instead, we copy selected weights from the pretrained parent model into a smaller child model.

In [1]:
from copy import deepcopy
import torch
from pathlib import Path
from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
from typing import Dict

In [2]:
# --- Child model geometry -------------------------------------------------
# Number of encoder layers and attention heads the child model is built with.
num_hidden_layers, num_attention_heads = 3, 4

# --- Filesystem locations -------------------------------------------------
# Directory the child model and tokenizer are written to.
output_dir = Path("output")
# Directory holding the parent (pretrained) checkpoint.
pretrained_dir = Path("pretrained") / "xlm-roberta-base"
# Alternative parent checkpoint, kept for reference:
#pretrained_dir = Path("pretrained/microsoft/mdeberta-v3-base")

In [3]:
# Read the configuration of the parent checkpoint stored under pretrained_dir.
# The child configuration is later derived from this object.
parent_config = AutoConfig.from_pretrained(str(pretrained_dir))
# Print it so the parent's settings (e.g. num_hidden_layers,
# num_attention_heads, hidden_size) are visible before they are changed.
print(parent_config)

XLMRobertaConfig {
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.9.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}



In [4]:
# Derive the child's configuration from an independent copy of the parent's,
# so the parent_config object itself is left unmodified.
child_config = deepcopy(parent_config)
# Only the depth and the number of attention heads are overridden; every other
# setting (hidden size, vocabulary, ...) is inherited from the parent.
# NOTE(review): hidden_size is unchanged, so the Q/K/V matrices keep the
# parent's shape, but with fewer heads each head presumably spans a wider
# slice of them than in the parent - confirm that the copied attention
# weights are still meaningful with this head count.
child_config.update(
    {
        "num_hidden_layers": num_hidden_layers,
        "num_attention_heads": num_attention_heads,
    }
)
# Build the child from the config alone; its weights are not loaded from any
# checkpoint at this point.
child = AutoModel.from_config(child_config)
print(child)

XLMRobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(250002, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): 

In [5]:
# Load the pretrained parent model from pretrained_dir. It is loaded through
# AutoModel, and the checkpoint's lm_head.* weights are reported as unused
# (see the warning printed below this cell).
parent = AutoModel.from_pretrained(str(pretrained_dir))
# Parameter-name -> tensor mapping of the parent; this is the source from
# which the child's weights are looked up by name.
parent_sd = parent.state_dict()
# List the available parameter names.
print(parent_sd.keys())

Some weights of the model checkpoint at pretrained\xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


odict_keys(['embeddings.position_ids', 'embeddings.word_embeddings.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.LayerNorm.weight', 'embeddings.LayerNorm.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.1.attention.self.query.weight', 'encoder.laye

In [6]:
# Start from the child's own state dict: its keys define exactly which
# parameters the child needs. Because the child has fewer encoder layers than
# the parent, only the parent's layers with matching indices are looked up;
# the parent's remaining layers are never referenced.
child_sd = child.state_dict()

# Fail fast with a readable message, instead of a bare KeyError in the middle
# of the copy loop or a size-mismatch error only when the state dict is loaded.
missing_keys = [k for k in child_sd if k not in parent_sd]
if missing_keys:
    raise KeyError(
        f"Parent state dict has no entry for child keys: {missing_keys}"
    )
shape_mismatches = [
    f"{k}: child {tuple(child_sd[k].size())} vs parent {tuple(parent_sd[k].size())}"
    for k in child_sd
    if child_sd[k].size() != parent_sd[k].size()
]
if shape_mismatches:
    raise ValueError(
        f"Shape mismatch between child and parent weights: {shape_mismatches}"
    )

# Replace each entry in place so child_sd keeps the child's key set and order.
# This only rewrites the dictionary; the child model itself is updated when
# child_sd is passed to child.load_state_dict.
for k in child_sd.keys():
    print(f"{k}, {child_sd[k].size()}")
    child_sd[k] = parent_sd[k]

embeddings.position_ids, torch.Size([1, 514])
embeddings.word_embeddings.weight, torch.Size([250002, 768])
embeddings.position_embeddings.weight, torch.Size([514, 768])
embeddings.token_type_embeddings.weight, torch.Size([1, 768])
embeddings.LayerNorm.weight, torch.Size([768])
embeddings.LayerNorm.bias, torch.Size([768])
encoder.layer.0.attention.self.query.weight, torch.Size([768, 768])
encoder.layer.0.attention.self.query.bias, torch.Size([768])
encoder.layer.0.attention.self.key.weight, torch.Size([768, 768])
encoder.layer.0.attention.self.key.bias, torch.Size([768])
encoder.layer.0.attention.self.value.weight, torch.Size([768, 768])
encoder.layer.0.attention.self.value.bias, torch.Size([768])
encoder.layer.0.attention.output.dense.weight, torch.Size([768, 768])
encoder.layer.0.attention.output.dense.bias, torch.Size([768])
encoder.layer.0.attention.output.LayerNorm.weight, torch.Size([768])
encoder.layer.0.attention.output.LayerNorm.bias, torch.Size([768])
encoder.layer.0.intermedi

In [7]:
%%time
# Load the parent-derived tensors collected in child_sd into the child model.
# The call's return value is the cell output and reports whether the keys
# matched.
child.load_state_dict(child_sd)

Wall time: 106 ms


<All keys matched successfully>

In [8]:
%%time
tokenizer = AutoTokenizer.from_pretrained(str(pretrained_dir))
tokenizer.save_pretrained(str(output_dir))
child.save_pretrained(str(output_dir))

Wall time: 3.45 s


In [9]:
# Reload the saved child checkpoint as a sequence-classification model.
# As the warnings below this cell report, the checkpoint's pooler.* weights
# are not used and the classifier.* weights are newly initialized, so the
# model still has to be trained on a downstream task.
model = AutoModelForSequenceClassification.from_pretrained(str(output_dir))
# Print the resulting architecture for inspection.
print(model)

Some weights of the model checkpoint at output were not used when initializing XLMRobertaForSequenceClassification: ['pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at output and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

XLMRobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (La