**BERT模型的基本结构**

In [1]:
from transformers import AutoModel

# Pretrained checkpoint to load; the weights are fetched (or read from the
# local cache) by from_pretrained.
checkpoint = "bert-base-cased"
model = AutoModel.from_pretrained(checkpoint)

# Printing the model lists its nested submodules (embeddings, encoder layers, ...).
print(model)


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

**BERT模型的数据预处理**

In [2]:
from transformers import AutoProcessor  # kept in case other cells still reference it
from transformers import AutoTokenizer

# bert-base-cased is a text-only checkpoint, so load its tokenizer through
# AutoTokenizer directly. AutoProcessor targets multimodal processors and only
# yields a tokenizer here via its fallback behaviour.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Shows the tokenizer class, vocabulary size, max length and special tokens.
print(tokenizer)

BertTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})


**BERT模型基于PyTorch的实现（由ChatGPT（GPT-4）生成）**

In [None]:
import math

import torch
import torch.nn as nn

class BertClassifier(nn.Module):
    """BERT-style Transformer encoder with a first-token classification head.

    Token, position and segment embeddings are summed, passed through a stack
    of ``nn.TransformerEncoderLayer`` modules, and the hidden state at
    position 0 is pooled and projected to ``num_classes`` logits.

    Args:
        num_classes: Number of output classes (size of the logits' last dim).
        hidden_size: Embedding / model width; the feed-forward width is 4x this.
        num_layers: Number of stacked encoder layers.
        num_heads: Number of attention heads per encoder layer.
        dropout: Dropout probability used inside the encoder layers and
            before the classifier.
        vocab_size: Number of rows in the token embedding table.
        max_position_embeddings: Longest sequence length that can be embedded.
    """

    def __init__(self, num_classes, hidden_size=768, num_layers=12, num_heads=12,
                 dropout=0.1, vocab_size=30000, max_position_embeddings=512):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.embedding_size = hidden_size
        self.max_position_embeddings = max_position_embeddings

        # Index 0 is the padding token: its embedding row stays at zero.
        self.embedding = nn.Embedding(vocab_size, self.embedding_size, padding_idx=0)
        self.position_encoding = nn.Embedding(max_position_embeddings, self.embedding_size)
        self.segment_encoding = nn.Embedding(2, self.embedding_size)

        self.encoder_layers = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=self.embedding_size,
                nhead=self.num_heads,
                dim_feedforward=self.hidden_size * 4,
                dropout=dropout,
                activation='gelu',
                # forward() feeds (batch, seq, dim) tensors; the layer's
                # default layout is (seq, batch, dim).
                batch_first=True,
            )
            for _ in range(self.num_layers)
        ])

        self.pooler = nn.Linear(self.hidden_size, self.hidden_size)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(self.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Compute classification logits for a batch of token sequences.

        Args:
            input_ids: Integer tensor of token ids, shape ``(batch, seq)``.
            attention_mask: Tensor of shape ``(batch, seq)``; non-zero marks a
                real token, zero marks padding.
            token_type_ids: Integer tensor of segment ids (0 or 1), shape
                ``(batch, seq)``.

        Returns:
            Tensor of shape ``(batch, num_classes)`` with unnormalised logits.

        Raises:
            ValueError: If the sequence is longer than
                ``max_position_embeddings``.
        """
        seq_len = input_ids.size(1)
        if seq_len > self.max_position_embeddings:
            raise ValueError(
                f"sequence length {seq_len} exceeds "
                f"max_position_embeddings={self.max_position_embeddings}"
            )

        positions = torch.arange(seq_len, device=input_ids.device)
        # The (seq, dim) position embeddings broadcast over the batch dimension.
        hidden = (
            self.embedding(input_ids) * math.sqrt(self.embedding_size)
            + self.position_encoding(positions)
            + self.segment_encoding(token_type_ids)
        )

        # The encoder layer expects True where a key must be ignored, i.e. the
        # inverse of the usual "1 = keep" attention mask.
        padding_mask = attention_mask == 0

        for layer in self.encoder_layers:
            hidden = layer(hidden, src_key_padding_mask=padding_mask)

        # Without a non-linearity here the pooler and classifier would
        # collapse into a single linear map.
        pooled_output = torch.tanh(self.pooler(hidden[:, 0]))
        pooled_output = self.dropout(pooled_output)
        return self.classifier(pooled_output)