# Instantiating Models

In [1]:
# Loading a model from a checkpoint (or local folder) requires 3 ingredients:
#   1. Config file            -> config class
#   2. Config + model class   -> model (architecture only)
#   3. Model file (weights)   -> pretrained model
#
# There are different ways to instantiate a model.
# 1. AutoModel

import torch
from transformers import BertConfig, BertModel, AutoModel

# Name of the checkpoint; the following cells reuse it.
ckpt = "bert-base-uncased"

# from_pretrained() loads the checkpoint through the generic AutoModel entry point;
# the printed type shows which concrete model class was actually instantiated.
bert_model = AutoModel.from_pretrained(ckpt)
print(type(bert_model))

  from .autonotebook import tqdm as notebook_tqdm
Downloading (…)lve/main/config.json: 100%|██████████| 570/570 [00:00<00:00, 1.21MB/s]
Downloading model.safetensors: 100%|██████████| 440M/440M [00:09<00:00, 45.4MB/s] 


<class 'transformers.models.bert.modeling_bert.BertModel'>


In [3]:
# 2. From a config

from transformers import BertConfig, BertModel

# Calling the config class with no arguments builds the default configuration.
bert_config = BertConfig()
print(bert_config)

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.31.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [5]:
# Read the configuration from the pre-trained checkpoint
# (note the additional "architectures" attribute compared with the default config).
bert_config = BertConfig.from_pretrained(ckpt)
print(bert_config)

# Building the model directly from a config initialises it with random weights!
bert_model = BertModel(bert_config)
print(bert_model)

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.31.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertL

In [6]:
# Override part of the architecture while loading the checkpoint's config:
# here the number of hidden layers is set to 10.
bert_config = BertConfig.from_pretrained(
    ckpt,
    num_hidden_layers=10,
)
bert_model = BertModel(bert_config)

# Instantiating Tokenizers

In [11]:
from transformers import BertTokenizer, AutoTokenizer

# Loading a tokenizer works like loading a model: we need a tokenizer class
# and a checkpoint that provides the algorithm and the vocabulary.
# Note: BertTokenizer() with no arguments won't work since it requires a vocab file.

# Load the same checkpoint first through the concrete class, then through the
# Auto class, and print the encoding each one produces for the same sentence.
for tokenizer_cls in (BertTokenizer, AutoTokenizer):
    bert_token = tokenizer_cls.from_pretrained(ckpt)
    encoded_sequence = bert_token('I go to school by bus')
    print(encoded_sequence)

{'input_ids': [101, 1045, 2175, 2000, 2082, 2011, 3902, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}


Downloading (…)/main/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 748kB/s]

{'input_ids': [101, 1045, 2175, 2000, 2082, 2011, 3902, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}





In [12]:
# Tokenization in action, one step at a time:
# raw text -> tokens -> input ids -> add special tokens -> model inputs (input ids, mask, ...)

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(ckpt)

# Step 1: split the raw text into tokens.
tokens = tokenizer.tokenize('I go to school by bus')
print(tokens)

['i', 'go', 'to', 'school', 'by', 'bus']


In [13]:
# Step 2: map each token to its integer id (no special tokens are added yet).
input_ids = tokenizer.convert_tokens_to_ids(tokens)
print(input_ids)

[1045, 2175, 2000, 2082, 2011, 3902]


In [14]:
# Step 3: add the special tokens ([CLS] ... [SEP]) and build the full model inputs.
#
# The ids are re-derived from `tokens` rather than read from `input_ids`, because
# this cell rebinds `input_ids` from a plain list to a dict. Feeding the name back
# into itself would make a second run of this cell pass that dict to
# prepare_for_model instead of the list of ids.
token_ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = tokenizer.prepare_for_model(token_ids)
print(input_ids)  # note how the ids are now wrapped into a dict of model inputs

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': [101, 1045, 2175, 2000, 2082, 2011, 3902, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}


In [17]:
# Turn the prepared ids back into text; the special tokens show up in the result.
decoded_string = tokenizer.decode(input_ids["input_ids"])
print(decoded_string)

[CLS] i go to school by bus [SEP]


# Batching Inputs Together

In [None]:
from transformers import AutoTokenizer
