In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
import numpy as np
from torch.utils.data import TensorDataset,RandomSampler,DataLoader
from dataset_extractor import *
import matplotlib.pyplot as plt
from NERtable import NERtable

In [2]:
tokenizer=AutoTokenizer.from_pretrained("./bert-base-chinese")
model=AutoModelForTokenClassification.from_pretrained("./bert-base-chinese",num_labels=3)
print(model)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at ./bert-base-chinese and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [3]:
train_dataset=extract_format('PRC_legal_dataset/data_train_format_with_NER.json')
print(NERtable)

{0: 'O', 1: 'B-criminal', 2: 'I-criminal', 3: 'B_keyfact', 4: 'I_keyfact'}


In [4]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import AdamW
from transformers import get_scheduler
max_length=512
train_tokens=tokenizer(train_dataset['content'],max_length=max_length,truncation=True,padding="max_length",return_tensors='pt')
print(train_tokens['input_ids'].shape[1])

512


In [5]:
NER_label_padded=[
    labels + [0]*(max_length-len(labels)) for labels in train_dataset['NER']
]
NER_label_padded_truncate=[labels[:512] for labels in NER_label_padded]
print(len(NER_label_padded_truncate[1]))

512


In [6]:
batch_size=45
train_tensor=TensorDataset(train_tokens['input_ids'],train_tokens['attention_mask'],torch.tensor(NER_label_padded_truncate))
train_sampler=RandomSampler(train_tensor)
train_dataloader=DataLoader(dataset=train_tensor,batch_size=batch_size,sampler=train_sampler)
optimizer=AdamW(model.parameters(),lr=2e-5)
num_epochs=8
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [7]:
device=torch.device("cuda",0)
def train(model):
    model.to(device)
    for epoch in range(num_epochs):
        model.train() 
        total_loss=0
        for step,batch in enumerate(train_dataloader):
            if step % 10 == 0 and not step == 0:
                print("step: ",step, "  loss:",total_loss/(step*batch_size))
            b_input_ids=batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            model.zero_grad()
            outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask, 
                        labels=b_labels)
            loss = outputs.loss     # include cross-entropy loss   
            total_loss += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) #防止梯度爆炸
            optimizer.step()
            lr_scheduler.step()
        avg_train_loss = total_loss / len(train_dataloader)      
        print("avg_loss:",avg_train_loss)
        model.save_pretrained(f"bert-base-legal-chinese-NER-epoch{epoch+1}")

In [8]:
train(model)

step:  10   loss: 0.008028993556896846
step:  20   loss: 0.00462384881451726
step:  30   loss: 0.003241845752905916
step:  40   loss: 0.002513374341134396
step:  50   loss: 0.0020598426171474986
step:  60   loss: 0.001749931666482654
step:  70   loss: 0.0015243163007119345
step:  80   loss: 0.0013491198681165567
step:  90   loss: 0.001212960228346932
step:  100   loss: 0.0011017430122527811
step:  110   loss: 0.001010968280448155
step:  120   loss: 0.0009356184983281073
step:  130   loss: 0.0008719570635475664
step:  140   loss: 0.0008162597243097567
step:  150   loss: 0.000766235354822129
step:  160   loss: 0.0007215626137945543
step:  170   loss: 0.0006836800754307678
step:  180   loss: 0.00064884242212923
step:  190   loss: 0.0006179426012530771
step:  200   loss: 0.0005900089748902246
step:  210   loss: 0.0005649888581379499
step:  220   loss: 0.000542355342509432
step:  230   loss: 0.0005212996898956849
step:  240   loss: 0.0005024993087944393
step:  250   loss: 0.0004859564739253