In [None]:
import requests
import json


In [None]:
# Shadeform REST endpoints used throughout this notebook.
base_url = "https://api.shadeform.ai/v1/instances"
instance_type_url = base_url + "/types"
create_url = base_url + "/create"

# Every request authenticates through the X-API-KEY header; replace the
# placeholder with your own key before running this cell.
headers = {
    "X-API-KEY": "<Add-your-key-here>",
    "Content-Type": "application/json"
}
shade_instance_type = "A6000"
gpu_type = "A6000"
num_gpus = 1

# Filters for the instance-type listing: only offers for the requested GPU
# model and count, flagged as available, sorted by price.
params = {
    'gpu_type': gpu_type,
    'sort': 'price',
    'available': True,
    'num_gpus': num_gpus
}

# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.request("GET", instance_type_url, headers=headers, params=params, timeout=30)
# Fail with the HTTP error itself (e.g. a missing or invalid API key) instead of
# a confusing KeyError on the parsed body further down.
response.raise_for_status()

#We need to filter out any clouds that don't have the launch configuration supported, at the time of writing that is just runpod
launch_config_not_supported = ["runpod"]

instance_types = response.json()["instance_types"]
instance_types = [x for x in instance_types if x['cloud'] not in launch_config_not_supported]
best_instance = None
region = None
if instance_types:
    # The listing was requested with sort=price, so the first remaining entry
    # is taken as the cheapest offer.
    best_instance = instance_types[0]
    availability = best_instance.get('availability') or []
    # Prefer a region whose entry is flagged as available.
    # NOTE(review): assumes each availability entry carries an 'available'
    # boolean -- confirm against the API reference. Entries without the flag
    # are treated as available, which matches the previous behaviour.
    region = next(
        (slot['region'] for slot in availability if slot.get('available', True)),
        None,
    )
    if region is None and availability:
        # Nothing is flagged as available; fall back to the first listed region.
        region = availability[0]['region']
    print(f"The cheapest {gpu_type} instance with {num_gpus} gpu(s) is:", best_instance)
    if region is None:
        print("Warning: the selected instance type lists no regions, so no region could be chosen.")
else:
    print(f"No instances of type {gpu_type} instance with {num_gpus} gpu(s) found.")

In [None]:
model_id = "mistralai/Mistral-7B-v0.1"
port = 8000

#If the model you need requires authenticated access, paste your Hugging Face api key here
huggingface_token = ""

# The instance search leaves best_instance as None when nothing matched; stop
# here with a clear message instead of a TypeError while building the payload.
if best_instance is None:
    raise RuntimeError(
        "No instance type was selected. Re-run the instance search cell once a matching instance is available."
    )

# Instance creation request: where to launch (cloud / region / instance type)
# and which docker container to start on the machine once it boots.
payload = {
  "cloud": best_instance["cloud"],
  "region": region,
  # Send the type of the offer that was actually selected, so it stays
  # consistent with the chosen cloud and region; fall back to the configured
  # default when the offer does not carry one.
  # NOTE(review): assumes offers expose 'shade_instance_type' -- confirm.
  "shade_instance_type": best_instance.get("shade_instance_type", shade_instance_type),
  "shade_cloud": True,
  "name": "text_generation_inference_server",
  "launch_configuration": {
    "type": "docker",
    "docker_configuration": {
      "image": "ghcr.io/huggingface/text-generation-inference:1.4",
      "args": f"--model-id {model_id} --port {port}",
      "envs": [],
      "port_mappings": [
        {
          # Expose the same port the server is told to listen on above, so
          # changing `port` keeps the container reachable from outside.
          "container_port": port,
          "host_port": port
        }
      ]
    }
  }
}

#Add another environment variable to the payload by adding a json
if huggingface_token != "":
  token_env_json = {
    "name": "HUGGING_FACE_HUB_TOKEN",
    "value" : huggingface_token
  }
  payload["launch_configuration"]["docker_configuration"]["envs"].append(token_env_json)

# Show the payload for review, but mask the token in the printed copy so the
# secret does not end up in the saved notebook output. The JSON round-trip
# yields an independent deep copy; `payload` itself is left unchanged.
printable_payload = json.loads(json.dumps(payload))
for env_var in printable_payload["launch_configuration"]["docker_configuration"]["envs"]:
  if env_var["name"] == "HUGGING_FACE_HUB_TOKEN":
    env_var["value"] = "<redacted>"

print(printable_payload)

In [None]:
# Submit the creation request for the instance selected above.
response = requests.post(url=create_url, headers=headers, json=payload)

# Printing the raw body is a quick visual check of whether the request succeeded.
print(response.text)

In [None]:
# List the instances on the account and report whether ours is ready yet.
# Re-run this cell until it prints an IP address.
instance_response = requests.request("GET", base_url, headers=headers, timeout=30)
# Surface HTTP errors (e.g. a bad API key) directly instead of as a KeyError below.
instance_response.raise_for_status()
instances = instance_response.json().get("instances", [])

# Prefer the instance created by the POST request above over whichever instance
# happens to be listed first on the account.
# NOTE(review): assumes the create response body is a JSON object with an "id"
# field that matches the "id" of entries in this listing -- confirm against the
# API reference. Without a match the first listed instance is used, as before.
try:
    created_id = response.json().get("id")
except (ValueError, AttributeError):
    created_id = None

ip_addr = ""
instance = next((item for item in instances if created_id and item.get("id") == created_id), None)
if instance is None and instances:
    instance = instances[0]

if instance is None:
    # An empty listing previously crashed with an IndexError.
    print("No instances were found on this account; check the output of the create request.")
else:
    instance_status = instance['status']
    if instance_status == 'active':
        print(f"Instance is active with IP: {instance['ip']}")
        ip_addr = instance['ip']
    else:
        print(f"Instance isn't yet active: {instance}" )

In [None]:
#Wait until the previous cell has an IP address associated with it, and then add a few minutes for the text-generation-inference (TGI) server to stand up.
#It is usually best to look at the logs on the dashboard to tell when the model is loaded.
# NOTE(review): confirm that the pinned TGI image serves the OpenAI-style
# /v1/completions route used below.

tgi_headers = {
    'Content-Type': 'application/json',
}

# Small completion request used as a smoke test of the deployed model.
json_data = {
    'model': model_id,
    'prompt': 'New York City is the',
    'max_tokens': 7,
}

if not ip_addr:
    # Without an IP the URL would be the malformed "http://:<port>/...".
    print("No instance IP address is available yet; re-run the status cell until the instance is active.")
else:
    # A timeout keeps the cell from hanging while the server is still starting.
    completion_response = requests.post(
        f'http://{ip_addr}:{port}/v1/completions',
        headers=tgi_headers,
        json=json_data,
        timeout=60,
    )

    print(completion_response.text)