### 1. 引入依赖

In [1]:
# PyTorch 框架
import torch
# 模型加载器
from transformers import AutoModelForCausalLM
# 分词器加载器
from transformers import AutoTokenizer

In [2]:
# 模型地址
# chat = instruct
model_dir = "./glm-4-9b-chat/"
# GPU
device = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
device

'cuda'

### 2. 加载分词器

In [4]:
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_dir,
                                          trust_remote_code=True)

### 3. 加载模型

In [5]:
model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_dir,
                                             torch_dtype=torch.bfloat16,
                                             low_cpu_mem_usage=True,
                                             trust_remote_code=True,
                                             device_map="auto"
                                            )

Loading checkpoint shards:   0%|          | 0/10 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


In [6]:
model

ChatGLMForConditionalGeneration(
  (transformer): ChatGLMModel(
    (embedding): Embedding(
      (word_embeddings): Embedding(151552, 4096)
    )
    (rotary_pos_emb): RotaryEmbedding()
    (encoder): GLMTransformer(
      (layers): ModuleList(
        (0-39): 40 x GLMBlock(
          (input_layernorm): RMSNorm()
          (self_attention): SelfAttention(
            (query_key_value): Linear(in_features=4096, out_features=4608, bias=True)
            (core_attention): SdpaAttention(
              (attention_dropout): Dropout(p=0.0, inplace=False)
            )
            (dense): Linear(in_features=4096, out_features=4096, bias=False)
          )
          (post_attention_layernorm): RMSNorm()
          (mlp): MLP(
            (dense_h_to_4h): Linear(in_features=4096, out_features=27392, bias=False)
            (dense_4h_to_h): Linear(in_features=13696, out_features=4096, bias=False)
          )
        )
      )
      (final_layernorm): RMSNorm()
    )
    (output_layer): Linear(in

### 4. 准备提问

In [7]:
# 方便编程
messages = [{"role":"system", "content":"You are a helpful assistant!"},
           {"role":"user", "content":"你是谁？"}]
# 格式化成文本
text = tokenizer.apply_chat_template(conversation=messages,
                                    add_generation_prompt=True,
                                    tokenize=False)

In [8]:
print(text)

[gMASK]<sop><|system|>
You are a helpful assistant!<|user|>
你是谁？<|assistant|>


In [9]:
#  return_tensors="pt" 意思是返回一个pytorch
inputs = tokenizer(text=[text], return_tensors="pt")

In [10]:
inputs

{'input_ids': tensor([[151331, 151333, 151331, 151333, 151335,    198,   2610,    525,    264,
          10945,  17821,      0, 151336,    198, 103408,  99668,  11314, 151337]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'position_ids': tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17]])}

In [11]:
# 推理过程需要的输入
input_ids = inputs["input_ids"].to(device=device)
attention_mask = inputs["attention_mask"].to(device=device)

### 5. 生成答案

In [12]:
# 设为 评估模式 dropout
model.eval()

gen_kwargs = {"max_length": 2500, "do_sample": True, "top_k": 1, "temperature": 0.1}

with torch.no_grad():
    outputs = model.generate(input_ids =input_ids,
                             attention_mask=attention_mask,
                             **gen_kwargs)
    # outputs = outputs[:, inputs['input_ids'].shape[1]:]
    print(tokenizer.decode(outputs[0], skip_special_tokens=False))

[gMASK] <sop> [gMASK] <sop> <|system|> 
You are a helpful assistant! <|user|> 
你是谁？ <|assistant|> 
我是一个名为 ChatGLM 的人工智能助手，是基于清华大学 KEG 实验室和智谱 AI 公司于 2024 年共同训练的语言模型开发的。我的任务是针对用户的问题和要求提供适当的答复和支持。 <|user|>
