From 43d13c46e65e9ef576caeb87d970f9f36c3a8590 Mon Sep 17 00:00:00 2001 From: dawanqu <54436951+dawanqu-ai@users.noreply.github.com> Date: Sun, 3 Nov 2019 15:31:40 +0800 Subject: [PATCH] update readme.md --- README.md | 208 +----------------- datasets/README.md | 31 +++ examples/README.md | 165 ++++++++++++++ examples/run_sequence_level_classification.py | 17 +- models/README.md | 12 + 5 files changed, 222 insertions(+), 211 deletions(-) create mode 100644 datasets/README.md create mode 100644 examples/README.md create mode 100644 models/README.md diff --git a/README.md b/README.md index fde66cc..0a437da 100644 --- a/README.md +++ b/README.md @@ -1,218 +1,22 @@ # ZEN -## Introduction - -ZEN, a BERT-based Chinese **(Z)** text encoder **E**nhanced by **N**-gram representations, where different combinations of characters are considered during training. The potential word or phrase boundaries are explicitly pre-trained and fine-tuned with the character encoder (BERT). ZEN incorporates the comprehensive information of both the character sequence and words or phrases it contains. ZEN is tested on a series of Chinese NLP tasks, where it requires less resource than other published encoders, and achieves state-of-the-art performance on most tasks. +ZEN is a BERT-based Chinese **(Z)** text encoder **E**nhanced by **N**-gram representations, where different combinations of characters are considered during training. The potential word or phrase boundaries are explicitly pre-trained and fine-tuned with the character encoder (BERT). ZEN incorporates the comprehensive information of both the character sequence and the words or phrases it contains. ZEN is tested on a series of Chinese NLP tasks, where it requires fewer resources than other published encoders and achieves state-of-the-art performance on most tasks.
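The sketch below is only a toy illustration of the n-gram enhancement idea described above, not the actual ZEN implementation: every name, shape, and the random n-gram matching matrix is made up for illustration.

```python
# Toy sketch of the idea: representations of matched n-grams are added back
# onto the character positions they cover, so the encoder sees both the
# character-level and the n-gram-level information. NOT the real ZEN code.
import torch
import torch.nn as nn

hidden_size = 768                      # hypothetical hidden size
ngram_vocab_size = 10000               # hypothetical n-gram lexicon size
ngram_embedding = nn.Embedding(ngram_vocab_size, hidden_size)

def enhance_with_ngrams(char_states, ngram_ids, ngram_positions):
    # char_states:     (batch, seq_len, hidden)      character encoder output
    # ngram_ids:       (batch, num_ngrams)           ids of the matched n-grams
    # ngram_positions: (batch, seq_len, num_ngrams)  1 where an n-gram covers a character
    ngram_states = ngram_embedding(ngram_ids)        # (batch, num_ngrams, hidden)
    # each character receives the sum of the representations of the n-grams covering it
    return char_states + ngram_positions.float() @ ngram_states

# random inputs, just to show the shapes involved
chars = torch.randn(2, 8, hidden_size)
ids = torch.randint(0, ngram_vocab_size, (2, 3))
positions = torch.randint(0, 2, (2, 8, 3))
print(enhance_with_ngrams(chars, ids, positions).shape)   # torch.Size([2, 8, 768])
```

In ZEN itself the n-gram representations are produced by an encoder trained jointly with the character encoder rather than by a plain embedding table; `run_pre_train.py` below is the actual pre-training entry point.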
## Quick tour of pre-training and fine-tune using ZEN -The library comprises several example scripts for conducting Chinese NLP tasks: +The library comprises several example scripts for conducting [**Chinese NLP tasks**](/datasets): - `run_pre_train.py`: an example pre-training ZEN - `run_sequence_level_classification.py`: an example fine-tuning ZEN on DC, SA, SPM and NLI tasks (*sequence-level classification*) - `run_token_level_classification.py`: an example fine-tuning ZEN on CWS, POS and NER tasks (*token-level classification*) -*(Abbreviations for the tasks are explained in the Task and data section)* - - -Three quick usage examples for these scripts: - -### `run_pre_train.py`: Pre-train ZEN model from scratch or BERT model - -```shell -python run_pre_train.py \ - --pregenerated_data /path/to/pregenerated_data \ - --bert_model /path/to/bert_model \ - --do_lower_case \ - --output_dir /path/to/output_dir \ - --epochs 20 \ - --train_batch_size 128 \ - --reduce_memory \ - --fp16 \ - --scratch \ - --save_name ZEN_pretrain_base_ -``` - -### `run_sequence_level_classification.py`: Fine-tune on tasks for sequence classification - -```shell -python run_sequence_level_classification.py \ - --task_name TASKNAME \ - --do_train \ - --do_eval \ - --do_lower_case \ - --data_dir /path/to/dataset \ - --bert_model /path/to/zen_model \ - --max_seq_length 512 \ - --train_batch_size 32 \ - --learning_rate 2e-5 \ - --num_train_epochs 30.0 -``` -where TASKNAME can be one of DC, SA, SPM and NLI - -script of fine-tuning thucnews -```shell -python run_sequence_level_classification.py \ - --task_name thucnews \ - --do_train \ - --do_eval \ - --do_lower_case \ - --data_dir /path/to/dataset/thucnews \ - --bert_model /path/to/zen_model \ - --max_seq_length 512 \ - --train_batch_size 32 \ - --learning_rate 2e-5 \ - --num_train_epochs 30.0 -``` - -script of fine-tuning chnsenticorp -```shell -python run_sequence_level_classification.py \ - --task_name ChnSentiCorp \ - --do_train \ - --do_eval \ - --do_lower_case \ - --data_dir /path/to/dataset/ChnSentiCorp \ - --bert_model /path/to/zen_model \ - --max_seq_length 512 \ - --train_batch_size 32 \ - --learning_rate 2e-5 \ - --num_train_epochs 30.0 -``` - -script of fine-tuning LCQMC -```shell -python run_sequence_level_classification.py \ - --task_name lcqmc \ - --do_train \ - --do_eval \ - --do_lower_case \ - --data_dir /path/to/dataset/lcqmc \ - --bert_model /path/to/zen_model \ - --max_seq_length 128 \ - --train_batch_size 128 \ - --learning_rate 5e-5 \ - --num_train_epochs 30.0 -``` - -script of fine-tuning XNLI -```shell -python run_sequence_level_classification.py \ - --task_name xnli \ - --do_train \ - --do_eval \ - --do_lower_case \ - --data_dir /path/to/dataset/xnli \ - --bert_model /path/to/zen_model \ - --max_seq_length 128 \ - --train_batch_size 128 \ - --learning_rate 5e-5 \ - --num_train_epochs 30.0 -``` - - -### `run_token_level_classification.py`: Fine-tune on tasks for sequence classification - -```shell -python run_token_level_classification.py \ - --task_name TASKNAME \ - --do_train \ - --do_eval \ - --do_lower_case \ - --data_dir /path/to/dataset \ - --bert_model /path/to/zen_model \ - --max_seq_length 128 \ - --do_train \ - --do_eval \ - --train_batch_size 128 \ - --num_train_epochs 30 \ - --warmup_proportion 0.1 -``` -where TASKNAME can be one of CWS, POS and NER - -script of fine-tuning msra -```shell -python run_token_level_classification.py \ - --task_name cwsmsra \ - --do_train \ - --do_eval \ - --do_lower_case \ - --data_dir /path/to/dataset \ 
- --bert_model /path/to/zen_model \ - --max_seq_length 256 \ - --do_train \ - --do_eval \ - --train_batch_size 96 \ - --num_train_epochs 30 \ - --warmup_proportion 0.1 -``` - -script of fine-tuning CTB5 -```shell -python run_token_level_classification.py \ - --task_name pos \ - --do_train \ - --do_eval \ - --do_lower_case \ - --data_dir /path/to/dataset \ - --bert_model /path/to/zen_model \ - --max_seq_length 256 \ - --do_train \ - --do_eval \ - --train_batch_size 96 \ - --num_train_epochs 30 \ - --warmup_proportion 0.1 -``` - -script of fine-tuning msra_ner -```shell -python run_token_level_classification.py \ - --task_name msra \ - --do_train \ - --do_eval \ - --do_lower_case \ - --data_dir /path/to/dataset \ - --bert_model /path/to/zen_model \ - --max_seq_length 128 \ - --do_train \ - --do_eval \ - --train_batch_size 128 \ - --num_train_epochs 30 \ - --warmup_proportion 0.1 -``` - -## Tasks and datasets used in our experiments - - -### Chinese word segmentation (CWS): -[CWS dataset](http://sighan.cs.uchicago.edu/bakeoff2005/) -MSR dataset from SIGHAN2005 Chinese word segmentation Bakeoff. - -### Part-of-speech (POS) tagging: -CTB5 (Xue et al., 2005) dataset with standard splits from [CTB5 dataset](https://catalog.ldc.upenn.edu/LDC2005T01) - - -### Named entity recognition (NER): -MSRA dataset from international Chinese language -processing Bakeoff 2006. [**NER**](http://sighan.cs.uchicago.edu/bakeoff2006/) - - -### Document classification (DC): -THUCNews (News) dataset (Sun et al., 2016) from Sina -news with 10 evenly distributed classes.[**THUCNews**](http://thuctc.thunlp.org) +[**Examples**](/examples) of pre-training and fine-tune using ZEN. -### Sentiment analysis (SA): -The ChnSentiCorp (CSC) dataset with 12,000 documents from three domains, i.e., book, computer and hotel. -[**ChnSentiCorp**](https://github.com/pengming617/bert_classification) -### Sentence pair matching (SPM): -The LCQMC (a large-scale Chinese question matching corpus) proposed by Liu et al. (2018), where each -instance is a pair of two sentences with a label -indicating whether their intent is matched. -[**LCQMC**](http://icrc.hitsz.edu.cn/info/1037/1146.htm) +## Contact information -### Natural language inference (NLI): -The Chinese part of the XNLI (Conneau et al., 2018) [**XNLI**](https://github.com/google-research/bert/blob/master/multilingual.md) +For help or issues using ZEN, please submit a GitHub issue. +For personal communication related to ZEN, please contact chenguimin(`chenguimin@chuangxin.com`). diff --git a/datasets/README.md b/datasets/README.md new file mode 100644 index 0000000..2fa4edf --- /dev/null +++ b/datasets/README.md @@ -0,0 +1,31 @@ +## Tasks and datasets used in our experiments + + +### Chinese word segmentation (CWS): +[**MSR dataset**](http://sighan.cs.uchicago.edu/bakeoff2005/) from SIGHAN2005 Chinese word segmentation Bakeoff. + + +### Part-of-speech (POS) tagging: +[**CTB5**](https://catalog.ldc.upenn.edu/LDC2005T01) dataset with standard splits. + + +### Named entity recognition (NER): +[**MSRA dataset**](http://sighan.cs.uchicago.edu/bakeoff2006/) from international Chinese language processing Bakeoff 2006. + + +### Document classification (DC): +[**THUCNews**](http://thuctc.thunlp.org) dataset from Sina +news with 10 evenly distributed classes. + + +### Sentiment analysis (SA): +The [**ChnSentiCorp**](https://github.com/pengming617/bert_classification) dataset with 12,000 documents from three domains, i.e., book, computer and hotel. 
+ + +### Sentence pair matching (SPM): +The [**LCQMC**](http://icrc.hitsz.edu.cn/info/1037/1146.htm) (a large-scale Chinese question matching corpus) dataset, where each +instance is a pair of sentences with a label indicating whether their intents match. + + +### Natural language inference (NLI): +The Chinese part of the [**XNLI**](https://github.com/google-research/bert/blob/master/multilingual.md) dataset. \ No newline at end of file diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 0000000..bc6bc00 --- /dev/null +++ b/examples/README.md @@ -0,0 +1,165 @@ +Three quick usage examples for these scripts: + +### `run_pre_train.py`: Pre-train a ZEN model from scratch or from an existing BERT model + +```shell +python run_pre_train.py \ + --pregenerated_data /path/to/pregenerated_data \ + --bert_model /path/to/bert_model \ + --do_lower_case \ + --output_dir /path/to/output_dir \ + --epochs 20 \ + --train_batch_size 128 \ + --reduce_memory \ + --fp16 \ + --scratch \ + --save_name ZEN_pretrain_base_ +``` + +### `run_sequence_level_classification.py`: Fine-tune on sequence-level classification tasks + +```shell +python run_sequence_level_classification.py \ + --task_name TASKNAME \ + --do_train \ + --do_eval \ + --do_lower_case \ + --data_dir /path/to/dataset \ + --bert_model /path/to/zen_model \ + --max_seq_length 512 \ + --train_batch_size 32 \ + --learning_rate 2e-5 \ + --num_train_epochs 30.0 +``` +where TASKNAME can be one of DC, SA, SPM and NLI. + +Script for fine-tuning on THUCNews: +```shell +python run_sequence_level_classification.py \ + --task_name thucnews \ + --do_train \ + --do_eval \ + --do_lower_case \ + --data_dir /path/to/dataset/thucnews \ + --bert_model /path/to/zen_model \ + --max_seq_length 512 \ + --train_batch_size 32 \ + --learning_rate 2e-5 \ + --num_train_epochs 30.0 +``` + +Script for fine-tuning on ChnSentiCorp: +```shell +python run_sequence_level_classification.py \ + --task_name ChnSentiCorp \ + --do_train \ + --do_eval \ + --do_lower_case \ + --data_dir /path/to/dataset/ChnSentiCorp \ + --bert_model /path/to/zen_model \ + --max_seq_length 512 \ + --train_batch_size 32 \ + --learning_rate 2e-5 \ + --num_train_epochs 30.0 +``` + +Script for fine-tuning on LCQMC: +```shell +python run_sequence_level_classification.py \ + --task_name lcqmc \ + --do_train \ + --do_eval \ + --do_lower_case \ + --data_dir /path/to/dataset/lcqmc \ + --bert_model /path/to/zen_model \ + --max_seq_length 128 \ + --train_batch_size 128 \ + --learning_rate 5e-5 \ + --num_train_epochs 30.0 +``` + +Script for fine-tuning on XNLI: +```shell +python run_sequence_level_classification.py \ + --task_name xnli \ + --do_train \ + --do_eval \ + --do_lower_case \ + --data_dir /path/to/dataset/xnli \ + --bert_model /path/to/zen_model \ + --max_seq_length 128 \ + --train_batch_size 128 \ + --learning_rate 5e-5 \ + --num_train_epochs 30.0 +``` + + +### `run_token_level_classification.py`: Fine-tune on token-level classification tasks + +```shell +python run_token_level_classification.py \ + --task_name TASKNAME \ + --do_train \ + --do_eval \ + --do_lower_case \ + --data_dir /path/to/dataset \ + --bert_model /path/to/zen_model \ + --max_seq_length 128 \ + --train_batch_size 128 \ + --num_train_epochs 30 \ + --warmup_proportion 0.1 +``` +where TASKNAME can be one of CWS, POS and NER. + +Script for fine-tuning on MSRA (CWS): +```shell +python run_token_level_classification.py \ + --task_name cwsmsra \ + --do_train \ + --do_eval \ + --do_lower_case \ + --data_dir /path/to/dataset \ + --bert_model 
/path/to/zen_model \ + --max_seq_length 256 \ + --train_batch_size 96 \ + --num_train_epochs 30 \ + --warmup_proportion 0.1 +``` + +Script for fine-tuning on CTB5 (POS): +```shell +python run_token_level_classification.py \ + --task_name pos \ + --do_train \ + --do_eval \ + --do_lower_case \ + --data_dir /path/to/dataset \ + --bert_model /path/to/zen_model \ + --max_seq_length 256 \ + --train_batch_size 96 \ + --num_train_epochs 30 \ + --warmup_proportion 0.1 +``` + +Script for fine-tuning on MSRA (NER): +```shell +python run_token_level_classification.py \ + --task_name msra \ + --do_train \ + --do_eval \ + --do_lower_case \ + --data_dir /path/to/dataset \ + --bert_model /path/to/zen_model \ + --max_seq_length 128 \ + --train_batch_size 128 \ + --num_train_epochs 30 \ + --warmup_proportion 0.1 +``` \ No newline at end of file diff --git a/examples/run_sequence_level_classification.py b/examples/run_sequence_level_classification.py index b2b7170..38f6fa8 100644 --- a/examples/run_sequence_level_classification.py +++ b/examples/run_sequence_level_classification.py @@ -322,18 +322,17 @@ def main(): level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) if args.local_rank == -1 or args.no_cuda: - device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") - n_gpu = torch.cuda.device_count() + args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") + args.n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) - device = torch.device("cuda", args.local_rank) - n_gpu = 1 + args.device = torch.device("cuda", args.local_rank) + args.n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') - args.device = device logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( - device, n_gpu, bool(args.local_rank != -1), args.fp16)) + args.device, args.n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( @@ -344,7 +343,7 @@ def main(): random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) - if n_gpu > 0: + if args.n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: @@ -375,13 +374,13 @@ def main(): if args.fp16: model.half() - model.to(device) + model.to(args.device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) - elif n_gpu > 1: + elif args.n_gpu > 1: model = torch.nn.DataParallel(model) if args.do_train: diff --git a/models/README.md b/models/README.md new file mode 100644 index 0000000..ebff694 --- /dev/null +++ b/models/README.md @@ -0,0 +1,12 @@ +## Pre-trained and fine-tuned models + +| Model | Description | +|-|-| +|[ZEN_pretrain_base](http://zen.chuangxin.com/ZEN/models/ZEN_pretrain_base_v0.1.0.zip)| pre-trained model, base| +|[ZEN_ft_CWS](http://zen.chuangxin.com/ZEN/models/ZEN_ft_CWS_v0.1.0.zip)| fine-tuned model for CWS task| +|[ZEN_ft_POS](http://zen.chuangxin.com/ZEN/models/ZEN_ft_POS_v0.1.0.zip)| fine-tuned model for POS task| +|[ZEN_ft_NER](http://zen.chuangxin.com/ZEN/models/ZEN_ft_NER_v0.1.0.zip)| fine-tuned model for NER task|
+|[ZEN_ft_DC](http://zen.chuangxin.com/ZEN/models/ZEN_ft_DC_v0.1.0.zip)| fine-tuned model for DC task| +|[ZEN_ft_SA](http://zen.chuangxin.com/ZEN/models/ZEN_ft_SA_v0.1.0.zip)| fine-tuned model for SA task| +|[ZEN_ft_SPM](http://zen.chuangxin.com/ZEN/models/ZEN_ft_SPM_v0.1.0.zip)| fine-tuned model for SPM task| +|[ZEN_ft_NLI](http://zen.chuangxin.com/ZEN/models/ZEN_ft_NLI_v0.1.0.zip)| fine-tuned model for NLI task|
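Below is a minimal sketch of fetching one of the released checkpoints listed in the table and unpacking it locally. It only uses the Python standard library; the directory names are arbitrary, and the layout of the extracted archive is an assumption, so inspect the files after unpacking.

```python
# Minimal sketch: download and unpack one of the checkpoints from the table above.
# The URL comes from the table; everything else (paths, layout) is an assumption.
import urllib.request
import zipfile
from pathlib import Path

url = "http://zen.chuangxin.com/ZEN/models/ZEN_pretrain_base_v0.1.0.zip"
out_dir = Path("models/ZEN_pretrain_base")
out_dir.mkdir(parents=True, exist_ok=True)

archive = out_dir / "ZEN_pretrain_base_v0.1.0.zip"
urllib.request.urlretrieve(url, archive)        # fetch the release archive
with zipfile.ZipFile(archive) as zf:
    zf.extractall(out_dir)                      # unpack it next to the zip

print(sorted(p.name for p in out_dir.rglob("*")))  # check what the archive contains
```

The extracted model directory is what the example scripts take as the `--bert_model` argument (adjust the path if the archive unpacks into its own subfolder).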