In [1]:
from transformers import BertConfig, BertModel

# Build a configuration with no overrides, i.e. the library defaults
# (the resulting values are displayed in the next cell).
config = BertConfig()

# Build the model from the config alone. No pretrained checkpoint is
# loaded here, so the weights are randomly initialized.
model = BertModel(config)

In [2]:
# Bare last expression: the notebook renders the config's repr as the cell output.
config

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.37.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [None]:
# GitHub Copilot: This configuration is for a BERT model. Here's what each parameter means:

# - `"attention_probs_dropout_prob": 0.1`: The dropout probability for the attention probabilities.
# - `"classifier_dropout": null`: The dropout probability for the classifier. `null` means it's not set.
# - `"hidden_act": "gelu"`: The non-linear activation function in the encoder and pooler layers. Here, it's Gaussian Error Linear Units (GELU).
# - `"hidden_dropout_prob": 0.1`: The dropout probability for all fully connected layers in the embeddings and encoder.
# - `"hidden_size": 768`: The size of the "hidden" layer in the Transformer encoder.
# - `"initializer_range": 0.02`: The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
# - `"intermediate_size": 3072`: The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
# - `"layer_norm_eps": 1e-12`: The epsilon to use in the layer normalization layers.
# - `"max_position_embeddings": 512`: The maximum sequence length that this model might ever be used with.
# - `"model_type": "bert"`: The type of the model.
# - `"num_attention_heads": 12`: The number of attention heads for each attention layer in the Transformer encoder.
# - `"num_hidden_layers": 12`: The number of hidden layers in the Transformer encoder.
# - `"pad_token_id": 0`: The id of the padding token.
# - `"position_embedding_type": "absolute"`: The type of position embeddings.
# - `"transformers_version": "4.37.1"`: The version of the transformers library.
# - `"type_vocab_size": 2`: The vocabulary size of the `token_type_ids` passed when calling `BertModel` or `TFBertModel`.
# - `"use_cache": true`: Whether or not the model uses the past key/values to speed up decoding.
# - `"vocab_size": 30522`: The vocabulary size of the `input_ids` passed when calling `BertModel` or `TFBertModel`, i.e. the number of distinct token ids the model can represent.

In [3]:
from transformers import BertModel

# Load the pretrained "bert-base-cased" checkpoint. This rebinds `model`,
# replacing the randomly initialized instance created in the first cell.
model = BertModel.from_pretrained("bert-base-cased")

# Files fetched for this checkpoint (see the download progress output):
# config.json, model.safetensors

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [None]:
# To save the model to a local directory, uncomment the line below:
# model.save_pretrained("directory_on_my_computer")

In [9]:
ls  ~/.cache/huggingface/hub/models--bert-base-cased/snapshots/cd5ef92a9fb2f889e972770a36d4ed042daf221e

[0m[01;36mconfig.json[0m@  [01;36mmodel.safetensors[0m@
