In [3]:
from transformers import BertTokenizer,BertModel

In [2]:
import torch

In [6]:
tokenizer = BertTokenizer.from_pretrained("E:/bert-base-uncased")
model = BertModel.from_pretrained("E:/bert-base-uncased")

Some weights of the model checkpoint at E:/bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# 把文字直接输入到tokenizer中，tokenizer处理后的结果作为BERT模型的输入   “pt”表示返回数据格式是pytorch的张量格式
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
print(inputs)

{'input_ids': tensor([[  101,  7592,  1010,  2026,  3899,  2003, 10140,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}


In [8]:
# token_type_ids表示含义是  是否是一句话，一个字符串中的句子相当于一句话
inputs_ = tokenizer("Hello, my dog is cute","hello world", return_tensors="pt")
print(inputs_)

{'input_ids': tensor([[  101,  7592,  1010,  2026,  3899,  2003, 10140,   102,  7592,  2088,
           102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [9]:
# 超过最大长度的部分，attention_mask变成0
inputs__ = tokenizer("Hello, my dog is cute", return_tensors="pt", padding='max_length', max_length=10)
print(inputs__)

{'input_ids': tensor([[  101,  7592,  1010,  2026,  3899,  2003, 10140,   102,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])}


In [10]:
'''
模型的输出有last_hidden_state 模型最后一层输出的隐藏层状态
pooler_output  <CLS>对应的最后一层隐藏层的输出
hidden_states 
attentions 
cross_attentions 
past_key_values 
'''
outputs = model(**inputs)

In [11]:
print(outputs)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.1144,  0.1937,  0.1250,  ..., -0.3827,  0.2107,  0.5407],
         [ 0.5308,  0.3207,  0.3665,  ..., -0.0036,  0.7579,  0.0388],
         [-0.4877,  0.8849,  0.4256,  ..., -0.6976,  0.4458,  0.1231],
         ...,
         [-0.7003, -0.1815,  0.3297,  ..., -0.4838,  0.0680,  0.8901],
         [-1.0355, -0.2567, -0.0317,  ...,  0.3197,  0.3999,  0.1795],
         [ 0.6080,  0.2610, -0.3131,  ...,  0.0311, -0.6283, -0.1994]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-7.1946e-01, -2.1445e-01, -2.9576e-01,  3.6603e-01,  2.7968e-01,
          2.2184e-02,  5.7299e-01,  6.2331e-02,  5.9586e-02, -9.9965e-01,
          5.0146e-02,  4.4756e-01,  9.7612e-01,  3.3989e-02,  8.4494e-01,
         -3.6905e-01,  9.8649e-02, -3.7169e-01,  1.7371e-01,  1.1515e-01,
          4.4133e-01,  9.9525e-01,  3.7221e-01,  8.2881e-02,  2.1402e-01,
          6.8965e-01, -6.1042e-01,  8.7136e-01,  9.4158e-01,  5.737

In [12]:
import torch as nn
import torch.utils.data as Data
import torch.optim as optim

In [13]:
class MyDataSet(Data.Dataset):
    
    # 初始化MyDataSet对象需要传入参数data和label
    def __init__(self, data, label):
        self.data = data # ['今天天气很好',1]
        self.label = label # [1,0,2]
        self.tokenzier = BertTokenizer.from_pretrained("E:/bert-base-uncased")
        
    def __getitem__(self, idx):
        text = self.data[idx] # str
        label = self.label[idx]
        '使用预训练模型的tokenizer分词器，把输入文本数据处理成模型需要的tensor格式'
        inputs = self.tokenizer(text, return_tensors="pt", padding='max_length', max_length=10, trucation=True) # trucation为True表示超过最大长度进行截断
        input_ids = inputs.input_ids.squeeze(0) # 默认输出是二维的,转成一维的
        token_type_ids = inputs.token_type_ids.squeeze(0)
        attention_mask = inputs.attention_mask.squeeze(0)
        return input_ids,token_type_ids,attention_mask,label
        
        
    def __len__(self):
        return len(self.data)

In [18]:
data, label = [], []
with open("E:/testdata.txt", 'r', encoding='utf-8') as f:
    for line in f.readlines():
        data_, label_ = line.strip().split('\t')
        data.append(data_)
        label.append(int(label_))

dataset = MyDataSet(data, label)
dataloader = Data.DataLoader(dataset, batch_size=2, shuffle=True)  # torch.utils.data中的方法

In [31]:
"""
Bert后接Linear层
"""
class MyModel(nn.Module):
    def __init__(self):
        super(MyModule, self).__init__()
        self.bert = BertModel.from_pretrained("E:/bert-base-uncased")
        self.linear = nn.Linear(768, 3) # bert-base-uncased/config.json中hidden_size的值是768  分类结果有3类 
    
    def forward(self, input_ids, token_type_ids, attention_mask):
        output = self.bert(input_ids, token_type_ids, attention_mask).pooler_output  # .pooler_output获得<CLS>对应的输出向量 维度是[batch,hidden_size]
        output = self.linear(output)
        return output    

AttributeError: module 'torch' has no attribute 'Module'

In [None]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = MyModel().to(device)
loss_fn = nn.CrossEntropyLoss()  # 定义损失函数
optimizer = optim.Adam(model.parameters(), lr=le-5) # import torch.optim Adam优化器

In [None]:
for epoch in range(10):
    for input_ids, token_type_ids, attention_mask, label in dataloader:
        input_ids, token_type_ids, attention_mask, label = input_ids.to(device), token_type_ids.to(device), attention_mask.to(device), label.to(device)
        pred = model(input_ids, token_type_ids, attention_mask)
        
        loss = loss_fn(pred, label)
        print(loss.item())
        
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

In [None]:
"""
Bert后接CNN
"""
class MyModel(nn.Module):
    def __init__(self):
        super(MyModule, self).__init__()
        self.bert = BertModel.from_pretrained("E:/bert-base-uncased")
        self.conv = nn.Conv2d(1, 3, kernel_size=(2,768))  # 卷积层 输入通道是1，输出通道是3，卷积核大小是2*768
        self.linear = nn.Linear(27, 3) # CNN处理得到的输出维度是[batch,27]
    
    def forward(self, input_ids, token_type_ids, attention_mask):
        batch = input_ids.size(0)
        '''
        CNN的输入应该是4维的，[batch, channel, width, height]
        '''
        output = self.bert(input_ids, token_type_ids, attention_mask).last_hidden_states  # [batch, seq, hidden_size] 
        output = output.unsqueeze(1) # 在第1维上展开 [batch, 1， seq, hidden_size] 
        '''
        长是hidden_size=768，宽是epoch=10
        [（x + 2padding - kernal）/stride ] + 1
        长768作为x kernal大小是768带入上面的公式得到 （768+0-768）/1 +1 =0+1 =1
        宽10作为x kernal大小是2带入上公式得到（10-2）/1 +1=9
        '''
        output = self.conv(output) # 输出的维度是[batch, 3, 9, 1]  宽度为9，高度为1   
        output = output.view(batch, -1)  # 转化为二维的[batch,***]  得到[batch, 3*9*1]
        output = self.linear(output)
        return output    

In [None]:
"""
Bert后接LSTM
"""
class MyModel(nn.Module):
    def __init__(self):
        super(MyModule, self).__init__()
        self.bert = BertModel.from_pretrained("E:/bert-base-uncased")
        self.lstm = nn.LSTM(input_size=768, hidden_size=512, batch_first=True, bidirectional=True)  # 参数信息https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html#torch.nn.LSTM
        self.linear = nn.Linear(1024, 3) # LSTM处理得到的输出维度是[batch,1024]
    
    def forward(self, input_ids, token_type_ids, attention_mask):
        batch = input_ids.size(0)
        '''
        x:[batch, seq]
        x=>word2Vex [batch, seq, dim]
        '''
        output = self.bert(input_ids, token_type_ids, attention_mask).last_hidden_states  # [batch, seq, hidden_size] 
        output,_ = self.lstm(output)  # output维度是[2,10,1024]  10是sequence的长度，1024=512*2
        output = output[:, -1, :]  # 处理成[batch,1024]的维度
        output = self.linear(output)
        return output    

In [None]:
"""
使用pytorch，最重要的是要弄明白每个层输入和输出的维度

nn.LSTM  nn.Linear 需要的维度是什么
"""