In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Configure 4-bit weight loading through bitsandbytes.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # store the linear-layer weights in 4 bits
    bnb_4bit_use_double_quant=True,        # also quantize the quantization constants (extra memory savings)
    bnb_4bit_quant_type="nf4",             # NF4 = 4-bit NormalFloat data type
    bnb_4bit_compute_dtype=torch.float16,  # run the matmuls in float16
)

# Model name
model_name = "meta-llama/Llama-2-7b-chat-hf"

# Load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained(model_name)

load_kwargs = {
    "torch_dtype": torch.float16,  # low-precision inference
    "device_map": "auto",          # let accelerate place the weights on the available device(s)
}
# Previously bnb_config was built but never handed to from_pretrained, so the
# model was always loaded as plain float16. Only request quantization when a
# CUDA GPU is present; without one the cell keeps loading float16 weights.
# NOTE(review): bitsandbytes support on CPU / Apple MPS depends on the installed
# version -- confirm before widening this condition.
if torch.cuda.is_available():
    load_kwargs["quantization_config"] = bnb_config

model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
from threading import Thread
from transformers import StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer

import warnings

# Longest prompt (in tokens) that is passed to the model.
MAX_INPUT_TOKEN_LENGTH = 4096
system_prompt = "summarize in no more than 30 words"
chat_history = []
message = """
iPhone and iPad apps will be made available on the Mac App Store to users running macOS 11 or later on Macs with Apple silicon, unless you edit your apps’ availability. There's no porting process since these apps use the same frameworks, resources, and runtime environments as they do on iOS and iPadOS.

If your iOS app offers universal purchase and has an existing macOS platform, the option to offer the iOS app on the Mac App Store won’t be available. Additionally, if your iOS app is available on the Mac App Store by using this option and you later add a macOS app by adding the macOS platform in App Store Connect, releasing it to the store will replace the iOS app on the Mac App Store. All existing users for the iOS app will be updated to the macOS app.

You can opt in and out from having your iPhone and iPad apps available on the Mac App Store for users running macOS 11 or later on Macs with Apple silicon at any point. This is set at the app level and will apply to all versions of your app.

Apple automatically chooses the minimum macOS version required for compatibility, but you can select a different version when editing availability on an individual app basis. If an LSMinimumSystemVersion is set in your app, this selection will override it.
"""

# Assemble the chat: optional system prompt, prior turns, then the new user message.
conversation = []
if system_prompt:
    conversation.append({"role": "system", "content": system_prompt})
conversation += chat_history
conversation.append({"role": "user", "content": message})

# Tokenize with the model's chat template; keep only the most recent tokens if too long.
input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
    input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
    # The original called gr.Warning here, but gradio is never imported in this
    # notebook, so the truncation path raised NameError instead of warning.
    warnings.warn(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
input_ids = input_ids.to(model.device)

# skip_special_tokens=True keeps markers such as "</s>" out of the printed text.
streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
generate_kwargs = dict(
    input_ids=input_ids,
    streamer=streamer,
    max_new_tokens=1024,
    do_sample=True,
    top_p=0.9,
    top_k=50,
    temperature=0.6,
    num_beams=1,
    repetition_penalty=1.2,
)

# Run generation in a background thread so this cell can consume the streamer
# and print text as it arrives.
t = Thread(target=model.generate, kwargs=generate_kwargs)
t.start()

try:
    outputs = []
    for text in streamer:
        outputs.append(text)
        print(text, end="", flush=True)
finally:
    # Always wait for the generation thread, even if streaming raised.
    t.join()

 Sure! Here's a summary of the article in 30 words or less:

Mac App Store now offers iPhone & iPad apps to macOS users; developers can choose to make them available or not; existing users will be upgraded to macOS app.</s>

In [None]:
# Display the list of chat messages (system prompt, history, user message) that was fed to the chat template.
conversation