In [1]:
import os

In [2]:
from transformers import AutoConfig
model_path = "qwen-1.5b"
config = AutoConfig.from_pretrained(model_path)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
config

Qwen2Config {
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "dtype": "bfloat16",
  "eos_token_id": 151643,
  "hidden_act": "silu",
  "hidden_size": 1536,
  "initializer_range": 0.02,
  "intermediate_size": 8960,
  "layer_types": [
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention"
  ],
  "max_position_embeddings": 131072,
  "max_window_layers": 28,
  "model_type": "qwen2",
 

In [4]:
# 从零初始化的模型
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
model

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1536)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=1536, out_features=1536, bias=True)
          (k_proj): Linear(in_features=1536, out_features=256, bias=True)
          (v_proj): Linear(in_features=1536, out_features=256, bias=True)
          (o_proj): Linear(in_features=1536, out_features=1536, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (up_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (down_proj): Linear(in_features=8960, out_features=1536, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((1536,), eps=1e-06)
    (rotar

In [5]:
# 加载预训练模型
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(model_path)
model

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1536)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=1536, out_features=1536, bias=True)
          (k_proj): Linear(in_features=1536, out_features=256, bias=True)
          (v_proj): Linear(in_features=1536, out_features=256, bias=True)
          (o_proj): Linear(in_features=1536, out_features=1536, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (up_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (down_proj): Linear(in_features=8960, out_features=1536, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((1536,), eps=1e-06)
    (rotar

In [6]:
# 加载预训练好的tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer

Qwen2TokenizerFast(name_or_path='qwen-1.5b', vocab_size=151643, model_max_length=131072, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|endoftext|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, normalized=Fa

In [7]:
from datasets import load_dataset
ds = load_dataset('json', data_files='/data/zhanghudong/dataset/mobvoi_seq_monkey_general_open_corpus.jsonl')

In [8]:
ds

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 13000000
    })
})

In [9]:
ds['train'][0]

{'text': '在查处虚开增值税专用发票案件中，常常涉及进项留抵税额和税款损失的认定和处理。在计算税款损失时，要不要将进项留抵税额包括在内？\n对此，实务中存在意见分歧。\n有人主张归并，即计算税款损失时包括进项留抵税额；\n有人主张剥离，即计算税款损失时剔除进项留抵税额。分析这个问题，需要确定进项留抵税额与税款损失之间是什么关系。\n理清这二者之间的关系，首先需要了解增值税的概念和其抵扣机制。增值税是以商品（货物、服务等）在流转过程中产生的增值额作为计税依据而征收的一种流转税。为避免重复征税，在增值税中存在抵扣链条机制。\n一般而言，交易上游企业缴纳的税额，交易下游企业可以对相应的税额进行抵扣。\n对增值税一般纳税人来说，其购进货物、服务等取得增值税专用发票，发票上的税额是进项税额。\n其出售货物、服务等，向购买方开具增值税专用发票，发票的税额是销项税额。\n一般情况下，销项税额减去进项税额的金额是应纳税额，企业根据应纳税额按期申报纳税。\n其次需要了解进项留抵税额的概念及产生原因。\n在计算销项税额和进项税额的差额时，有时会出现负数，即当期进项税额大于当期销项税额。这个差额在当期未实现抵扣，为进项留抵税额，在以后纳税人有销项税额时再进行抵扣。\n企业产生进项留抵税额的主要原因是其进项税额和销项税额时间上的不一致。\n例如，企业前期集中采购货物和服务，投资大，销项税率低于进项税率等。\n从税款抵扣的角度看，进项留抵税额只是购进的这部分进项税额参与到增值税应纳税额的计算过程中，但是其对应的进项税额抵扣还未真正实现，一般要等到其未来有相应的销项税额时，才能真正实现进项税额抵扣。\n可见，进项留抵税额处于不确定状态，能否抵扣受到很多因素影响，例如企业经营中断，没有销项税额，这时进项留抵税额就无法实现抵扣。但如果企业按照税收政策规定申请进项留抵退税，进项税额抵扣就随之实现。\n最后需要了解税款损失的概念。\n税款损失，通常是指因虚开增值税专用发票，导致国家税款被骗或者流失的金额。关于税款损失，实务中有多种表述。\n例如，北京大学法学院教授陈兴良曾谈到虚开行为本身不会造成国家税款损失，只有利用发票抵扣时才会造成国家税款损失。刘兵等编著的《虚开增值税专用发票案例司法观点和案例解析》一书中提到：“给国家税款造成损失的数额，实际上就是被骗取的国家税款在侦查终结以前无法追回的部

In [10]:
for key in tokenizer(ds['train'][0]['text']).keys():
    print(key)

input_ids
attention_mask


In [11]:
column_names = list(ds["train"].features)
column_names

['text']

In [12]:
def tokenize_function(examples):
    output = tokenizer([item for item in examples["text"]])
    return output

In [13]:
tokenize_function(ds['train'][0])

{'input_ids': [[18493], [32876], [44290], [100226], [29767], [49185], [25511], [84088], [95411], [11622], [28291], [94444], [80642], [14224], [15946], [3837], [38953], [38953], [100068], [81217], [41299], [47882], [99337], [99990], [84088], [61191], [33108], [84088], [68153], [99644], [20726], [9370], [28951], [22382], [33108], [44290], [21887], [1773], [18493], [37643], [69103], [84088], [68153], [99644], [20726], [13343], [3837], [30534], [16530], [30534], [44063], [41299], [47882], [99337], [99990], [84088], [61191], [67279], [100139], [18493], [31843], [11319], [198], [32664], [31991], [3837], [39973], [31952], [15946], [24360], [18493], [36589], [88970], [17177], [101802], [1773], [198], [18830], [17340], [35568], [86341], [100040], [62926], [3837], [91676], [37643], [69103], [84088], [68153], [99644], [20726], [13343], [67279], [100139], [41299], [47882], [99337], [99990], [84088], [61191], [24968], [198], [18830], [17340], [35568], [86341], [101819], [99372], [3837], [91676], [3

In [14]:
tokenized_datasets = ds.map(
    tokenize_function,
    batched=True,
    num_proc=100,
    remove_columns=column_names,
    load_from_cache_file=True,
    desc="Running tokenizer on dataset",
)

In [15]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 13000000
    })
})

In [16]:
tokenized_datasets['train'][0]['input_ids']

[18493,
 106416,
 100226,
 29767,
 109742,
 105223,
 107717,
 101995,
 15946,
 3837,
 104495,
 102031,
 117743,
 99337,
 99990,
 84088,
 61191,
 33108,
 84088,
 68153,
 102170,
 9370,
 104585,
 33108,
 54542,
 1773,
 18493,
 100768,
 84088,
 68153,
 102170,
 13343,
 3837,
 111343,
 44063,
 117743,
 99337,
 99990,
 84088,
 61191,
 100630,
 18493,
 31843,
 94432,
 104270,
 3837,
 118603,
 15946,
 47606,
 100065,
 110691,
 8997,
 101114,
 106509,
 100040,
 62926,
 3837,
 91676,
 100768,
 84088,
 68153,
 102170,
 13343,
 100630,
 117743,
 99337,
 99990,
 84088,
 61191,
 59217,
 101114,
 106509,
 118266,
 3837,
 91676,
 100768,
 84088,
 68153,
 102170,
 13343,
 103869,
 20755,
 117743,
 99337,
 99990,
 84088,
 61191,
 1773,
 101042,
 105073,
 3837,
 85106,
 60610,
 117743,
 99337,
 99990,
 84088,
 61191,
 57218,
 84088,
 68153,
 102170,
 101920,
 102021,
 100145,
 8997,
 21887,
 79766,
 43288,
 110566,
 104186,
 100145,
 3837,
 101140,
 85106,
 99794,
 109742,
 107402,
 33108,
 41146,
 9999

In [17]:
from itertools import chain

block_size=2048
def group_texts(examples):
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    
    result = {
        k: [t[i:i+block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result['labels'] = result["input_ids"].copy()
    return result

In [None]:
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    num_proc=100,
    load_from_cache_file=True,
    desc = f'Grouping text in chunks of {block_size}',
    batch_size=40000,
)
train_dataset = lm_datasets["train"]

In [19]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='output',
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    logging_steps=10,
    num_train_epochs=1,
    save_steps=100,
    learning_rate=1e-4,
    gradient_checkpointing=True,
)

In [None]:
from transformers import Trainer, default_data_collator
from torchdata.datapipes.iter import IterableWrapper

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=IterableWrapper(train_dataset),
    eval_dataset=None,
    tokenizer=tokenizer,
    data_collator=default_data_collator
)
trainer.train()

In [1]:
import torch
print(torch.__version__)

2.8.0+cu129
