In this notebook we will find and deploy the most affordable GPUs on the market with the Shadeform API, then deploy a model-serving framework called vLLM to serve Mistral. We limit our search to 1xA6000 machines, but that is easily configurable below.

This notebook reuses code from [find_and_use_gpus.ipynb](https://github.com/shadeform/examples/blob/main/find_and_use_gpus.ipynb) for using the Shadeform API to find available instances, and leverages code from [deploy_container.ipynb](https://github.com/shadeform/examples/blob/main/deploy_container.ipynb).

In [None]:
import requests
import json

In [None]:
# Shadeform REST endpoints used throughout this notebook.
base_url = "https://api.shadeform.ai/v1/instances"
instance_type_url = base_url + "/types"
create_url = base_url + "/create"

# Headers sent with every Shadeform request; replace the placeholder with your API key.
headers = {
    "X-API-KEY": "<Add-your-key-here>",
    "Content-Type": "application/json",
}

# Search criteria -- change these to look for a different machine shape.
# NOTE(review): shade_instance_type is set independently of gpu_type/num_gpus and is
# sent in the create request later on -- confirm it still matches the instance found
# by the search when changing the criteria.
shade_instance_type = "A6000"
gpu_type = "A6000"
num_gpus = 1

params = {
    'gpu_type': gpu_type,
    'sort': 'price',
    'available': True,
    'num_gpus': num_gpus,
}

response = requests.request("GET", instance_type_url, headers=headers, params=params)
# Fail with a clear HTTP error (e.g. for a missing or invalid API key) instead of a
# KeyError when the error body has no "instance_types" field.
response.raise_for_status()
instance_types = response.json()["instance_types"]

# Both stay None when no usable instance is found, so later cells can detect that.
best_instance = None
region = None
if instance_types:
    # The search asks for results sorted by price, so the first entry is treated as the cheapest.
    candidate = instance_types[0]
    availability = candidate.get('availability') or []
    if availability:
        best_instance = candidate
        region = availability[0]['region']
        print(f"The cheapest {gpu_type} instance with {num_gpus} gpu(s) is:", best_instance)
    else:
        # Without a region there is nothing to launch in, so nothing is selected.
        print(f"The cheapest {gpu_type} instance with {num_gpus} gpu(s) lists no region:", candidate)
else:
    print(f"No instances of type {gpu_type} instance with {num_gpus} gpu(s) found.")

In [None]:
# If the model you need requires authenticated access, paste your Hugging Face token here.
huggingface_token = ""

model_id = "mistralai/Mistral-7B-v0.1"

# The instance search leaves best_instance as None when nothing was found. Stop here
# with a clear message rather than failing on best_instance["cloud"] below.
if best_instance is None:
    raise RuntimeError(
        "No instance was selected by the search cell; re-run it and make sure it "
        "finds a machine before building the launch payload."
    )

# Request body for the instance-creation call.
payload = {
  "cloud": best_instance["cloud"],
  "region": region,
  "shade_instance_type": shade_instance_type,
  "shade_cloud": True,
  "name": "cool_gpu_server",
  "launch_configuration": {
    "type": "docker",
    # This selects the image to launch and passes the model to serve as a container
    # argument. Environment variables start empty; the token is appended below if set.
    "docker_configuration": {
      "image": "vllm/vllm-openai:latest",
      "args": "--model " + model_id,
      "envs": [],
      "port_mappings": [
        {
          "container_port": 8000,
          "host_port": 8000
        }
      ]
    }
  }
}

# Add the Hugging Face token as an environment variable only when one was provided.
if huggingface_token:
  token_env_json = {
    "name": "HUGGING_FACE_HUB_TOKEN",
    "value": huggingface_token
  }
  payload["launch_configuration"]["docker_configuration"]["envs"].append(token_env_json)

# NOTE: the printed payload includes the Hugging Face token when one is set.
print(payload)

In [None]:
# Submit the launch payload to the create endpoint to request the selected instance.
response = requests.post(create_url, json=payload, headers=headers)
# Printing the raw body is a quick visual check of whether the request succeeded.
print(response.text)

In [None]:
# List the account's instances and read the status/IP of the first one.
instance_response = requests.request("GET", base_url, headers=headers)
ip_addr = ""  # stays empty until the instance reports as active
print(instance_response.text)
# Fail with a clear HTTP error instead of a KeyError on an error response body.
instance_response.raise_for_status()
instances = instance_response.json()["instances"]

instance = None
instance_status = None
if not instances:
    # Nothing has been created (or it is not listed yet); avoid an IndexError below.
    print("No instances were returned; create one first, then re-run this cell.")
else:
    # NOTE(review): this takes the first listed instance -- if the account has other
    # instances, confirm this is the one that was just created.
    instance = instances[0]
    instance_status = instance['status']
    if instance_status == 'active':
        print(f"Instance is active with IP: {instance['ip']}")
        ip_addr = instance['ip']
    else:
        print(f"Instance isn't yet active: {instance}" )

In [None]:
# Wait until the status cell has reported an IP address, then allow a few more minutes
# for the vLLM server to stand up.
# It is usually best to look at the logs on the dashboard to tell when the model is loaded.

# ip_addr is still empty when the instance was not active yet; without this check the
# requests below would be sent to "http://:8000/..." and fail with a confusing URL error.
if not ip_addr:
    raise RuntimeError(
        "No instance IP address is available yet; re-run the status cell until the "
        "instance is active."
    )

# List the models the server exposes.
model_list_response = requests.get(f'http://{ip_addr}:8000/v1/models')
print(model_list_response.text)

vllm_headers = {
    'Content-Type': 'application/json',
}

# Short completion request against the deployed model.
json_data = {
    'model': model_id,
    'prompt': 'San Francisco is a',
    'max_tokens': 7,
    'temperature': 0,
}

completion_response = requests.post(f'http://{ip_addr}:8000/v1/completions', headers=vllm_headers, json=json_data)

print(completion_response.text)

In [None]:
# The same completion can be requested through the OpenAI Python client instead of raw
# HTTP calls. The `openai` package must be installed for this cell to run.
from openai import OpenAI

# Point the client at the vLLM server's API rather than OpenAI's; "EMPTY" is passed as
# a placeholder API key.
openai_api_base = "http://{}:8000/v1".format(ip_addr)
openai_api_key = "EMPTY"
client = OpenAI(base_url=openai_api_base, api_key=openai_api_key)

completion = client.completions.create(
    prompt="San Francisco is a",
    model=model_id,
)
print("Completion result:", completion)

