<a href="https://colab.research.google.com/github/softmurata/colab_notebooks/blob/main/llm/orca13B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Installation

In [None]:
# Install AutoGPTQ, which provides the `auto_gptq` package imported by the
# model-loading cell; `-q` keeps pip's output quiet.
!pip install -q auto-gptq

Load model

In [None]:
from transformers import AutoTokenizer, pipeline, logging
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
import argparse

# Repository id of the quantized checkpoint and the basename of the weights
# file to load from it.
model_name_or_path = "TheBloke/orca_mini_13B-GPTQ"
model_basename = "orca-mini-13b-GPTQ-4bit-128g.no-act.order"

# Forwarded unchanged to AutoGPTQ's `use_triton` option below.
use_triton = False

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

# Loader options collected in one place so they are easy to scan and tweak.
quantized_load_options = {
    "model_basename": model_basename,
    "use_safetensors": True,
    "trust_remote_code": False,
    "device": "cuda:0",
    "use_triton": use_triton,
    # No explicit quantize config is supplied.
    "quantize_config": None,
}
model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path,
    **quantized_load_options,
)

Inference with generate and decode

In [None]:
# Translation demo: build a USER/ASSISTANT prompt, generate a completion, and
# print only the newly generated text.
# Note: check the prompt template is correct for this model.
#
# The prompt is written as adjacent string literals (joined by Python at
# compile time). The earlier single-line literal contained "\ " (backslash +
# space) between the instruction and "Context:", which is an invalid escape
# sequence: Python warns about it and a stray backslash ended up in the prompt.
prompt = (
    "Please translate the following English context into Japanese. "
    "Context: "
    "In the realm of large language models (LLMs), there has been a constant pursuit to enhance the capabilities of smaller models without compromising their efficiency. "
    "The traditional approach has been to use imitation learning, where smaller models learn from the outputs generated by large foundation models (LFMs). "
    "However, this approach has been marred by several challenges, including limited imitation signals from shallow LFM outputs, small-scale homogeneous training data, and a lack of rigorous evaluation. "
    "This often leads to smaller models imitating the style but not the reasoning process of LFMs. "
    "The paper Orca: Progressive Learning from Complex Explanation Traces of GPT-4 introduces Orca, a 13-billion parameter model designed to imitate the reasoning process of large foundation models (LFMs) such as GPT-4. "
    "Unlike traditional large language models (LLMs), Orca employs a unique training approach that combines progressive learning and teacher assistance to overcome the capacity gap between smaller student models and their larger counterparts."
)
prompt_template = f'''USER: {prompt}
ASSISTANT:'''

print("\n\n*** Generate:")

input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
# `temperature` only influences generation when sampling is enabled; without
# `do_sample=True` decoding is greedy and the temperature is ignored.
output = model.generate(
    inputs=input_ids,
    do_sample=True,
    temperature=0.7,
    max_new_tokens=1024,
)
# Strip the prompt by token count rather than by character count: the decoded
# text is not guaranteed to reproduce the prompt string character-for-character,
# so slicing the decoded string by len(prompt_template) can cut at the wrong place.
prompt_token_count = input_ids.shape[-1]
completion_ids = output[0][prompt_token_count:]
print(tokenizer.decode(completion_ids, skip_special_tokens=True))

In [None]:
# Few-shot demo: three worked USER/ASSISTANT examples followed by a final USER
# request whose ASSISTANT answer the model is asked to complete.
prompt_template = '''You act as a prompt generator. Please do the classification task like the following examples.

USER: replace chair into sofa at the left side
ASSISTANT: {"part": "left", "source object": ["chair"], "target object": ["sofa"]}

USER: change wooden table into white table in the right part
ASSISTANT: {"part": "right", "source object": ["wooden table"], "target object": ["white table"]}

USER: replace blue chair and red sofa into yellow table and green chair at the bottom
ASSISTANT: {"part": "bottom", "source object": ["blue chair", "red sofa"], "target object": ["yellow table", "green chair"]}

USER: replace sofa and shelf into chair and picture at the right side in the room
ASSISTANT:'''

input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
# `temperature` only influences generation when sampling is enabled; without
# `do_sample=True` decoding is greedy and the temperature is ignored.
output = model.generate(
    inputs=input_ids,
    do_sample=True,
    temperature=0.7,
    max_new_tokens=1024,
)
# Strip the prompt by token count rather than by character count: the decoded
# text is not guaranteed to reproduce the prompt string character-for-character,
# so slicing the decoded string by len(prompt_template) can cut at the wrong place.
prompt_token_count = input_ids.shape[-1]
completion_ids = output[0][prompt_token_count:]
print(tokenizer.decode(completion_ids, skip_special_tokens=True))

Inference with pipeline

In [None]:
# Prevent printing spurious transformers error when using pipeline with AutoGPTQ
logging.set_verbosity(logging.CRITICAL)

print("*** Pipeline:")
# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
# `do_sample=True` is required for `temperature` and `top_p` to have any
# effect; without it decoding is greedy and both settings are ignored.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    top_p=0.95,
    repetition_penalty=1.15,
)

# NOTE: `prompt_template` is whatever an earlier cell last assigned, so this
# cell must be run after one of the prompt-building cells.
print(pipe(prompt_template)[0]['generated_text'])