In [None]:
!pip install -U torch==2.0.1

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [3]:
def generate_prompts(tokenizer, dataset, messages_column_name="messages", num_prompts=3):
    messages = dataset.shuffle().select(range(num_prompts))[messages_column_name]
    system_prompt = {
        "content": "You are a helpful chatbot",
        "role": "system"
    }
    prompts = []
    for message in messages:
        convo = [system_prompt] + message[:-1]
        prompt = tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
        prompts.append(prompt)
    return prompts

In [4]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "robertgshaw2/mistral-sft-dense-run3-10k-samples"
dataset_path = "HuggingFaceH4/ultrachat_200k"

model = AutoModelForCausalLM.from_pretrained(model_path, device_map={"":0}, torch_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained(model_path)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:07<00:00,  3.58s/it]
Downloading (…)neration_config.json: 100%|██████████| 111/111 [00:00<00:00, 877kB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 1.34k/1.34k [00:00<00:00, 17.7MB/s]
Downloading tokenizer.model: 100%|██████████| 493k/493k [00:00<00:00, 27.3MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.80M/1.80M [00:00<00:00, 35.8MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 551/551 [00:00<00:00, 3.69MB/s]


In [6]:
print(tokenizer.chat_template)

{% for message in messages %}
{% if message['role'] == 'user' %}
{{ '<|user|>
' + message['content'] + eos_token }}
{% elif message['role'] == 'system' %}
{{ '<|system|>
' + message['content'] + eos_token }}
{% elif message['role'] == 'assistant' %}
{{ '<|assistant|>
'  + message['content'] + eos_token }}
{% endif %}
{% if loop.last and add_generation_prompt %}
{{ '<|assistant|>' }}
{% endif %}
{% endfor %}


In [7]:
from datasets import load_dataset

dataset = load_dataset(dataset_path, split="train_sft")

In [17]:
tokenizer.pad_token_id = tokenizer.eos_token_id 

In [29]:
import time

prompts = generate_prompts(tokenizer, dataset, num_prompts=1)

start = time.perf_counter()
for prompt in prompts:
    inps = tokenizer(prompt, return_tensors="pt")
    for key in inps:
        inps[key] = inps[key].to("cuda")
    
    outputs_tokens = model.generate(**inps, max_new_tokens=1000)

    print(tokenizer.batch_decode(outputs_tokens)[0])
    print("\n\n\n")
torch.cuda.synchronize()
end = time.perf_counter()

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> <|system|>
You are a helpful chatbot</s> 
<|user|>
Can you provide a summary of the latest PMI surveys indicating global economic growth in September?: The following is an extract from IHS Markit's monthly PMI overview presentation. For the full report please click on the link at the bottom of the article.
Global economic growth moderated to a two-year low in September in a broad-based slowdown, according to the latest PMI surveys. The headline JPMorgan Global Composite PMI, compiled by IHS Markit, fell for a third successive month in September, down from 53.4 in August to 52.8, its lowest since September 2016. The latest reading is indicative of annual global GDP growth slipping below 2.5% (at market exchange rates). Business confidence about the year ahead likewise fell to the gloomiest for two years, suggesting growth may weaken further in coming months.
Slower growth was recorded across both manufacturing and services, the former down to a two-year low while the latter showed t

In [30]:
start_tokens = inps["input_ids"].shape[1]
total_tokens = outputs_tokens.shape[1]

print(f"total_tokens = {total_tokens}")
print(f"new_tokens = {total_tokens - start_tokens}")
print(f"num_seconds = {end - start}")
print(f"new tok/sec = {(total_tokens - start_tokens) / (end - start)}")

total_tokens = 1106
new_tokens = 168
num_seconds = 7.8967392379418015
new tok/sec = 21.274603977399074


In [2]:
# from transformers import AutoModelForCausalLM, AutoTokenizer
# import torch

# model = AutoModelForCausalLM.from_pretrained(
#     model_id,
#     torch_dtype=torch.float16,
#     device_map={'':0}
# )

# tokenizer = AutoTokenizer.from_pretrained(model_id)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:18<00:00,  9.06s/it]


In [1]:
from fastchat.model import load_model, get_conversation_template
import torch

# model_path = "/home/rshaw/zephyr-training/pruning/data/zephyr-50sparse-fp16-one-shot-v0"
model_path = "HuggingFaceH4/zephyr-7b-beta"
model_id = "zephyr-beta"

model, tokenizer = load_model(
    model_path,
    device="cuda",
    num_gpus=1,
    dtype=torch.float16,
    load_8bit=False,
    cpu_offloading=False,
    debug=False,
)

  from .autonotebook import tqdm as notebook_tqdm


<fastchat.model.model_adapter.ZephyrBetaAdapter object at 0x7fe693d9c850>


Loading checkpoint shards: 100%|██████████| 8/8 [00:27<00:00,  3.41s/it]


In [4]:
from FastChat.fastchat.llm_judge.common import load_questions, temperature_config

question_file = "FastChat/fastchat/llm_judge/data/mt_bench/question.jsonl"
questions = load_questions(question_file, None, None)
question = questions[2]
print(question)

conv = get_conversation_template(model_id)

turns = []
for j in range(len(question["turns"])):
    qs = question["turns"][j]
    conv.append_message(conv.roles[0], qs)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()

    do_sample=True
    temperature=0.7
    print(prompt)
    
    input_ids = tokenizer([prompt]).input_ids

    output_ids = model.generate(
        torch.as_tensor(input_ids).cuda(),
        do_sample=do_sample,
        temperature=temperature,
        max_new_tokens=1024,
        use_cache=True
    )

    break

{'question_id': 83, 'category': 'writing', 'turns': ['Imagine you are writing a blog post comparing two popular smartphone models. Develop an outline for the blog post, including key points and subheadings to effectively compare and contrast the features, performance, and user experience of the two models. Please answer in fewer than 200 words.', 'Take your previous response and rephrase it as a limerick.']}
<|system|>
</s>
<|user|>
Imagine you are writing a blog post comparing two popular smartphone models. Develop an outline for the blog post, including key points and subheadings to effectively compare and contrast the features, performance, and user experience of the two models. Please answer in fewer than 200 words.</s>
<|assistant|>



In [6]:
print(tokenizer.batch_decode(output_ids)[0])

<s> <|system|>
</s> 
<|user|>
Imagine you are writing a blog post comparing two popular smartphone models. Develop an outline for the blog post, including key points and subheadings to effectively compare and contrast the features, performance, and user experience of the two models. Please answer in fewer than 200 words.</s> 
<|assistant|>
Title: Smartphone Showdown: iPhone 12 vs. Samsung Galaxy S21

Introduction:

Smartphones have become a vital part of our daily lives, and with new models coming out every year, it can be overwhelming to decide which one to pick. In this blog post, we will compare two popular smartphone models, the iPhone 12, and Samsung Galaxy S21. We will take a closer look at their features, performance, and user experience to help you make an informed decision.

1. Design and Display

- iPhone 12: The iPhone 12 comes with a new design, featuring a flat-edged rectangular shape. It has a 6.1-inch Super Retina XDR OLED display with a 2532 x 1170 resolution. The displ