In [1]:
import utils
import config
import logging
import numpy as np
from data_process import Processor
from data_loader import NERDataset
from model_small import BertNER
from train import train, evaluate
from transformers import (
  BertTokenizerFast,
)
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from transformers.optimization import get_cosine_schedule_with_warmup, AdamW

import warnings
import torch
import os
warnings.filterwarnings('ignore')
import pickle


def dev_split(dataset_dir):
    """Load a saved corpus and hold out part of it as a validation set.

    Args:
        dataset_dir: Path handed to ``np.load``; the loaded object must
            expose ``"words"`` and ``"labels"`` entries.

    Returns:
        A 4-tuple ``(x_train, x_dev, y_train, y_dev)``: the words kept for
        training, the words held out, and the matching label splits.
    """
    archive = np.load(dataset_dir, allow_pickle=True)
    tokens, tags = archive["words"], archive["labels"]
    # The fixed seed keeps the train/dev partition identical across runs;
    # the held-out fraction comes from config.dev_split_size.
    parts = train_test_split(
        tokens,
        tags,
        test_size=config.dev_split_size,
        random_state=0,
    )
    x_train, x_dev, y_train, y_dev = parts
    return x_train, x_dev, y_train, y_dev


def load_dev(mode):
    """Return the word/label arrays to train on and to evaluate on.

    Args:
        mode: ``'train'`` splits a validation set off ``config.train_dir``
            via ``dev_split``; ``'test'`` pairs the full training file with
            ``config.test_dir``; ``'predict'`` pairs it with
            ``config.predict_dir``. Any other value yields four ``None``s.

    Returns:
        A 4-tuple ``(word_train, word_dev, label_train, label_dev)``.
    """
    if mode == 'train':
        # Split a validation set off the training data.
        train_words, dev_words, train_tags, dev_tags = dev_split(config.train_dir)
        return train_words, dev_words, train_tags, dev_tags

    if mode not in ('test', 'predict'):
        # Unknown mode: nothing is loaded.
        return None, None, None, None

    # Both remaining modes use the complete training file and differ only
    # in which file supplies the evaluation side.
    train_archive = np.load(config.train_dir, allow_pickle=True)
    eval_archive = np.load(
        config.test_dir if mode == 'test' else config.predict_dir,
        allow_pickle=True,
    )
    train_words = train_archive["words"]
    train_tags = train_archive["labels"]
    dev_words = eval_archive["words"]
    dev_tags = eval_archive["labels"]
    return train_words, dev_words, train_tags, dev_tags

def _cache_file(cache_dir, stem):
    """Return the path of the pickle cache ``<stem>.pkl`` inside *cache_dir*."""
    # os.path.join works whether or not cache_dir ends with a separator.
    return os.path.join(cache_dir, stem + ".pkl")


def _read_pickle(path):
    """Unpickle and return the object stored at *path*."""
    # NOTE(security): unpickling can execute arbitrary code. Only cache files
    # written by this script should ever be placed in the model directory.
    with open(path, 'rb') as f:
        return pickle.load(f)


def _write_pickle(obj, path):
    """Pickle *obj* to *path*, overwriting any existing file."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f)


def _load_cached_pair(train_path, dev_path):
    """Load a (train, dev) pair of cached objects, or return None on a miss.

    A miss is any failure to read or unpickle either file. The cause is
    logged so that a corrupt or stale cache is not silently hidden.
    """
    try:
        return _read_pickle(train_path), _read_pickle(dev_path)
    except Exception as err:  # best-effort cache: fall back to rebuilding
        logging.info("cache miss ({} / {}): {!r}".format(train_path, dev_path, err))
        return None


def _load_or_build_datasets(cache_dir):
    """Return ``(train_dataset, dev_dataset)``, building and caching if needed."""
    train_path = _cache_file(cache_dir, "train_dataset")
    dev_path = _cache_file(cache_dir, "dev_dataset")

    cached = _load_cached_pair(train_path, dev_path)
    if cached is not None:
        train_dataset, dev_dataset = cached
    else:
        # Process the raw data, separating text from labels.
        processor = Processor(config)
        processor.process()
        logging.info("--------Process Done!--------")
        # Split off the validation set.
        word_train, word_dev, label_train, label_dev = load_dev('train')
        logging.info("--------load_dev !--------")
        # Build the datasets and cache them for later runs.
        tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')
        train_dataset = NERDataset(word_train, label_train, config, tokenizer)
        dev_dataset = NERDataset(word_dev, label_dev, config, tokenizer)
        _write_pickle(train_dataset, train_path)
        _write_pickle(dev_dataset, dev_path)

    logging.info("--------Dataset Build!--------")
    return train_dataset, dev_dataset


def _load_or_build_loaders(cache_dir, batch_size):
    """Return ``(train_loader, dev_loader)``, building and caching if needed.

    The cache file names include *batch_size*, so changing the batch size
    triggers a rebuild instead of reusing loaders with a stale setting.
    """
    train_path = _cache_file(cache_dir, "train_loader" + str(batch_size))
    dev_path = _cache_file(cache_dir, "dev_loader" + str(batch_size))

    cached = _load_cached_pair(train_path, dev_path)
    if cached is not None:
        train_loader, dev_loader = cached
    else:
        train_dataset, dev_dataset = _load_or_build_datasets(cache_dir)
        logging.info("train_size: {}".format(len(train_dataset)))
        train_loader = DataLoader(train_dataset, batch_size=batch_size,
                                  shuffle=True, collate_fn=train_dataset.collate_fn)
        dev_loader = DataLoader(dev_dataset, batch_size=batch_size,
                                shuffle=True, collate_fn=dev_dataset.collate_fn)
        _write_pickle(train_loader, train_path)
        _write_pickle(dev_loader, dev_path)

    logging.info("--------Get Dataloader!--------")
    return train_loader, dev_loader


def _optimizer_parameter_groups(model):
    """Return the parameter groups handed to the optimizer.

    With ``config.full_fine_tuning`` the encoder, classifier and CRF are all
    trained: the classifier and CRF use 20x the base learning rate, and bias /
    LayerNorm parameters are exempt from weight decay. Otherwise only the
    classifier head is trained.
    """
    if not config.full_fine_tuning:
        # Only fine-tune the head classifier.
        return [{'params': [p for _, p in model.classifier.named_parameters()]}]

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

    def split_by_decay(named_params):
        """Partition parameters into (decayed, not decayed) lists by name."""
        decayed, exempt = [], []
        for name, param in named_params:
            if any(marker in name for marker in no_decay):
                exempt.append(param)
            else:
                decayed.append(param)
        return decayed, exempt

    bert_decay, bert_exempt = split_by_decay(model.bert.named_parameters())
    head_decay, head_exempt = split_by_decay(model.classifier.named_parameters())
    head_lr = config.learning_rate * 20

    # If a BiLSTM layer is added to the model, give its parameters two more
    # groups here, using head_lr and the same decay split.
    return [
        {'params': bert_decay, 'weight_decay': config.weight_decay},
        {'params': bert_exempt, 'weight_decay': 0.0},
        {'params': head_decay, 'lr': head_lr, 'weight_decay': config.weight_decay},
        {'params': head_exempt, 'lr': head_lr, 'weight_decay': 0.0},
        {'params': model.crf.parameters(), 'lr': head_lr},
    ]


def run(check_model_dir=''):
    """Train the NER model, reusing cached datasets/loaders when available.

    Args:
        check_model_dir: Directory of a previously saved model to resume
            from. When empty, a fresh model is created from the
            ``ckiplab/albert-tiny-chinese`` checkpoint.

    Side effects:
        Creates ``config.model_dir`` if missing, writes dataset and loader
        pickle caches into it, configures logging via ``utils.set_logger``
        and passes ``config.model_dir`` to ``train``.
    """
    cache_dir = config.model_dir
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(cache_dir, exist_ok=True)
    device = config.device

    # Set the logger.
    utils.set_logger(config.log_dir)
    logging.info("device: {}".format(device))
    batch_size = config.batch_size
    logging.info("batch_size: {}".format(batch_size))

    train_loader, dev_loader = _load_or_build_loaders(cache_dir, batch_size)

    # Prepare model.
    if check_model_dir:
        model = BertNER.from_pretrained(check_model_dir)
        logging.info("--------Load model from {}--------".format(check_model_dir))
    else:
        model = BertNER.from_pretrained('ckiplab/albert-tiny-chinese', num_labels=len(config.label2id))
        logging.info("--------Create model from {}--------".format('ckiplab/albert-tiny-chinese'))

    model.to(device)

    # Prepare optimizer.
    optimizer = AdamW(_optimizer_parameter_groups(model),
                      lr=config.learning_rate, correct_bias=False)

    # len(train_loader) is the real number of batches per epoch, including
    # the final partial batch, and is correct for cached loaders too, so no
    # hard-coded dataset size is needed.
    # NOTE(review): assumes train() steps the scheduler once per batch;
    # confirm against train.py.
    train_steps_per_epoch = len(train_loader)
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=(config.epoch_num // 10) * train_steps_per_epoch,
        num_training_steps=config.epoch_num * train_steps_per_epoch)

    # Train the model.
    logging.info("--------Start Training!--------")
    train(train_loader, dev_loader, model, optimizer, scheduler, config.model_dir)



cuda:0


In [2]:
run()

device: cuda:0
batch_size: 128
  7%|▋         | 3549/48820 [00:03<00:46, 970.68it/s]

bad token：【 　 】


 19%|█▉        | 9315/48820 [00:10<00:45, 868.90it/s] 

bad token：【 ﻿ 】


 21%|██        | 10023/48820 [00:10<00:39, 976.51it/s]

bad token：【 　 】


 36%|███▌      | 17335/48820 [00:18<00:31, 1002.65it/s]

bad token：【 　 】


 62%|██████▏   | 30239/48820 [00:32<00:19, 938.80it/s] 

bad token：【 　 】


 63%|██████▎   | 30518/48820 [00:32<00:20, 905.23it/s]

bad token：【 　 】


 65%|██████▌   | 31878/48820 [00:34<00:19, 882.12it/s]

bad token：【 　 】


 76%|███████▋  | 37278/48820 [00:40<00:12, 906.34it/s] 

bad token：【 　 】


 79%|███████▊  | 38366/48820 [00:41<00:12, 867.63it/s]

bad token：【 	 】
bad token：【 	 】
bad token：【 	 】


 88%|████████▊ | 42993/48820 [00:46<00:05, 972.85it/s] 

bad token：【 　 】
bad token：【 	 】
bad token：【 　 】
bad token：【 　 】


 97%|█████████▋| 47449/48820 [00:51<00:01, 941.35it/s]

bad token：【 　 】


100%|██████████| 48820/48820 [00:52<00:00, 924.54it/s]
--------train data process DONE!--------
--------Process Done!--------
--------load_dev !--------
100%|██████████| 43092/43092 [00:48<00:00, 896.24it/s]
100%|██████████| 4789/4789 [00:05<00:00, 910.15it/s]
--------Dataset Build!--------


43092


--------Get Dataloader!--------
You are using a model of type albert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at ckiplab/albert-tiny-chinese were not used when initializing AlbertModel: ['predictions.dense.weight', 'predictions.dense.bias', 'predictions.LayerNorm.bias', 'predictions.decoder.bias', 'predictions.decoder.weight', 'predictions.bias', 'predictions.LayerNorm.weight']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertModel were not initialized fro

error: 0


100%|██████████| 38/38 [00:10<00:00,  3.60it/s]
Epoch: 1, dev loss: 1009.946199115954, f1 score: 0.30032864245386365
--------Save best model!--------
 91%|█████████ | 306/337 [00:57<00:05,  5.29it/s]


RuntimeError: CUDA out of memory. Tried to allocate 394.00 MiB (GPU 0; 6.00 GiB total capacity; 4.45 GiB already allocated; 0 bytes free; 5.21 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF