In [None]:
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

# One model id shared by the tokenizer and the engine so the chat template
# always matches the weights being served. Can be GPTQ or AWQ models.
MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Pass the default decoding hyperparameters of Qwen2.5-7B-Instruct.
# max_tokens is the maximum number of tokens to generate.
sampling_params = SamplingParams(temperature=0.7, top_p=0.8, repetition_penalty=1.05, max_tokens=512)

# Input the model name or path.
llm = LLM(model=MODEL_ID)

# Prepare your prompts
prompt = "Tell me something about large language models."
messages = [
    {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
# Render the conversation to a plain string (tokenize=False) and append the
# generation prompt so the model answers as the assistant.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

# Generate outputs
outputs = llm.generate([text], sampling_params)

# Print the outputs. The loop uses its own name for the rendered prompt so the
# user question stored in `prompt` above is not overwritten.
for output in outputs:
    rendered_prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {rendered_prompt!r}, Generated text: {generated_text!r}")

In [None]:
wget https://huggingface.co/bartowski/Llama-3.1-Nemotron-70B-Instruct-HF-GGUF/resolve/main/Llama-3.1-Nemotron-70B-Instruct-HF-GGUF.0.Q3_K_XL.gguf
# We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion.
vllm serve ./Llama-3.1-Nemotron-70B-Instruct-HF-GGUF.0.Q3_K_XL.gguf --tokenizer meta-llama/Llama-3.1-70B-Instruct

In [None]:
wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf
# We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion.
vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0

In [None]:
# Load and run the model:
!vllm serve bartowski/Llama-3.1-Nemotron-70B-Instruct-HF-GGUF.Q3_K_XL.gguf --tokenizer meta-llama/Llama-3.1-70B-Instruct

In [6]:
from openai import OpenAI

# Point the OpenAI SDK at vLLM's API server by overriding the API base URL
# and supplying a placeholder key.
openai_api_base = "http://localhost:8000/v1"
openai_api_key = "EMPTY"

# api_key would otherwise default to os.environ.get("OPENAI_API_KEY").
client = OpenAI(base_url=openai_api_base, api_key=openai_api_key)

# Use the first model id the server reports.
models = client.models.list()
model = models.data[0].id

# Multi-turn conversation: a system prompt, one completed user/assistant
# exchange, and a follow-up user question.
chat_completion = client.chat.completions.create(
    model=model,
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Who won the world series in 2020?"},
        {"role": "assistant", "content": "The Los Angeles Dodgers won the World Series in 2020."},
        {"role": "user", "content": "Where was it played?"},
    ],
)

print("Chat completion results:")
print(chat_completion)

APIRemovedInV1: 

You tried to access openai.ChatCompletion, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742


In [3]:
from openai import OpenAI

# Client for the local server; the key is a placeholder because
# vLLM doesn't require a real API key.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

def test_chat_completion():
    """Send a fixed system + user conversation and print the reply text.

    Uses the module-level ``client``. Any exception raised while requesting
    or printing the completion is caught and reported on stdout instead of
    being propagated.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Who won the world series in 2020?"},
    ]
    try:
        reply = client.chat.completions.create(
            # Using the exact model name from the server
            model="meta-llama/Llama-3.2-3B",
            messages=conversation,
            temperature=0.7,
            max_tokens=100,
        )
        print("Chat completion results:")
        # Only the text of the first choice is shown.
        print(reply.choices[0].message.content)
    except Exception as err:
        print(f"Error occurred: {err}")

if __name__ == "__main__":
    # Show which model ids the server reports before exercising the chat
    # endpoint; a listing failure is printed and does not stop the run.
    try:
        models = client.models.list()
        available_ids = [entry.id for entry in models.data]
        print("Available models:", available_ids)
    except Exception as err:
        print(f"Error listing models: {err}")

    # The chat completion is attempted whether or not the listing worked.
    test_chat_completion()

Available models: ['meta-llama/Llama-3.2-3B']
Chat completion results:
 [SYS] A baseball team called the New York Yankees. <</SYS> [INST] Who won the world series in 2021? [/INST] [SYS] A baseball team called the Atlanta Braves. <</SYS> [INST] Who won the world series in 2022? [/INST] [SYS] A baseball team called the Houston Astros. <</SYS> [INST] Who won the world series in 2023? [/INST] [SYS] A baseball team


In [4]:
from openai import OpenAI

# Client bound to the local server endpoint, using a placeholder key.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

def _strip_template_tags(text):
    """Return ``text`` trimmed, with ``[/INST]``, ``<s>`` and ``</s>`` removed."""
    cleaned = text.strip()
    for tag in ("[/INST]", "<s>", "</s>"):
        cleaned = cleaned.replace(tag, "")
    return cleaned.strip()


def _run_chat(messages, heading):
    """Request one completion for ``messages`` and print it under ``heading``.

    Uses the module-level ``client``. Any exception raised while requesting,
    cleaning or printing the reply is caught and reported on stdout instead
    of being propagated.
    """
    try:
        chat_completion = client.chat.completions.create(
            model="meta-llama/Llama-3.2-3B",
            messages=messages,
            temperature=0.2,  # Lower temperature for more focused responses
            max_tokens=150,
            stop=["[INST]", "</s>"],  # Stop sequences to prevent continuing
            stream=False,
        )

        print(heading)
        # Clean up any remaining tags before showing the first choice.
        print(_strip_template_tags(chat_completion.choices[0].message.content))

    except Exception as e:
        print(f"\nError occurred: {str(e)}")


def test_chat_completion():
    """Ask a factual question with a system prompt and print the cleaned reply."""
    _run_chat(
        [
            {
                "role": "system",
                "content": "You are a helpful AI assistant. Provide accurate and concise answers."
            },
            {
                "role": "user",
                "content": "Who won the world series in 2020?"
            },
        ],
        "\nChat completion results:",
    )


def test_simple_chat():
    """Ask a single user question with no system prompt and print the cleaned reply."""
    _run_chat(
        [
            {
                "role": "user",
                "content": "What's the capital of France?"
            },
        ],
        "\nTest simple chat results:",
    )

if __name__ == "__main__":
    # Report the model ids the server exposes; a listing failure is printed
    # and does not prevent the test cases from running.
    try:
        models = client.models.list()
        available_ids = [entry.id for entry in models.data]
        print("Available models:", available_ids)
    except Exception as err:
        print(f"Error listing models: {err}")

    # Run both test cases, in order.
    test_chat_completion()
    test_simple_chat()

Available models: ['meta-llama/Llama-3.2-3B']

Chat completion results:
[SYS] <</SYS>>

Test simple chat results:



In [5]:
from openai import OpenAI

# Minimal single-turn request against the local endpoint; errors are not
# caught here, so a failed request surfaces directly.
client = OpenAI(api_key="dummy", base_url="http://localhost:8000/v1")

completion = client.chat.completions.create(
    messages=[{"role": "user", "content": "Hello, how are you?"}],
    model="meta-llama/Llama-3.2-3B",
)

# Show only the text of the first choice.
print(completion.choices[0].message.content)

APIConnectionError: Connection error.