# Install and Start VLLM Server


1. Go to TIR Dashboard >> Nodes and Create a new Node with H100 Plan 

2. Once the notebook is in the running state, open JupyterLab

3. Launch a **new terminal** in JupyterLab and run the following commands:

#### Install virtual env

```
sudo apt update
sudo apt install python3.10-venv
```

#### Create virtual env

```
python -m venv vllm_env
```

#### Activate virtual env

```
source vllm_env/bin/activate
```

#### Install vllm

```
pip install vllm
```

#### Start VLLM server with llama3 

```
vllm serve meta-llama/Meta-Llama-3-8B-Instruct
```



# Test the endpoint
Once the server is up, launch a new notebook (from the JupyterLab launcher screen) and run the following test:


In [None]:

from openai import OpenAI

# Point the OpenAI client at the local server's OpenAI-compatible /v1 route.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="token-abc123")

# Send a single one-message chat request to the served model.
completion = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    messages=[{"role": "user", "content": "Hello!"}],
)

# Print the message of the first returned choice.
print(completion.choices[0].message)



# Install dependencies

In [None]:
!pip install pandas
!pip install aiohttp
!pip install openpyxl
!pip install xlrd
!pip install matplotlib

# Imports and basic constants

In [4]:
import asyncio
import time
import aiohttp
import requests
import json
import statistics
import matplotlib.pyplot as plt
import math
import pandas as pd
import os
import ssl

# Creates a default SSL context and queries its certificate-store statistics;
# the result is not stored anywhere.
ssl.create_default_context().cert_store_stats()

# Bearer token placed in the Authorization header of every request.
API_TOKEN = "sdfdsf"

# Headers shared by all requests sent to the endpoint.
headers = {
    "Authorization": "Bearer {}".format(API_TOKEN),
    "Content-Type": "application/json",
}
# Chat-completions route of the locally running server.
endpoint_url = "http://localhost:8000/v1/chat/completions"

# Dataset file path and other generation variables

In [100]:
# Number of output tokens to benchmark.
output_tokens = 256
# Number of queries per thread; higher values give more accurate results.
# The maximum is the number of rows in the dataset column.
num_queries_per_thread = 20
# Dataset path and the name of the column to read.
file_path = "prompts.xlsx"  # Change this to the path of your file
column_name = "prompt_text"  # Change this to the column you want to extract

# Read From Dataset And Convert Request Data 

In [11]:
def read_column(file_path, column_name):
    """Load one column from a spreadsheet or CSV file.

    Args:
        file_path: Path to a ``.xlsx``, ``.xls`` or ``.csv`` file. The
            extension is matched case-insensitively.
        column_name: Header of the column to extract.

    Returns:
        The selected column of the loaded DataFrame (``df[column_name]``).

    Raises:
        ValueError: If the extension is not one of the supported types, or
            if ``column_name`` is not a column of the loaded file.
    """
    # Normalise the extension so files such as "PROMPTS.XLSX" or "data.CSV"
    # are not rejected as unsupported.
    _, file_extension = os.path.splitext(file_path)
    file_extension = file_extension.lower()

    # Load the file into a DataFrame based on its extension
    if file_extension == '.xlsx':
        df = pd.read_excel(file_path)
    elif file_extension == '.xls':
        df = pd.read_excel(file_path, engine='xlrd')
    elif file_extension == '.csv':
        df = pd.read_csv(file_path)
    else:
        raise ValueError("Unsupported file type. Only .xlsx, .xls, and .csv are supported.")

    # Fail with a clear message instead of a bare KeyError on lookup
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in the file.")

    return df[column_name]



# Edit System and User Prompts

In [None]:
# Edit the system prompt to suit your use case. This value is passed to
# get_request_data, which places it in the "system" message of each request.
system_prompt = "You are an expert in ...."

In [4]:
# Edit your user prompt here.
# Each value read from the dataset column (e.g. prompts.xlsx) is substituted
# into the template as user_input.
def generate_prompt(user_input):
    """Return the user-message text for one dataset value."""
    template = "Extract the information from this input in JSON format: {}"
    return template.format(user_input)

# Prepare Model Inputs

In [97]:
import math

def get_request_data(file_path, column_name, num_requests, out_tokens, system_prompt,
                     model="meta-llama/Meta-Llama-3-8B-Instruct"):
    """Build chat-completion request payloads from a dataset column.

    Args:
        file_path: Dataset file, loaded through ``read_column``.
        column_name: Column whose values become the user prompts.
        num_requests: Maximum number of payloads to build. If the dataset
            has fewer rows, one payload per available row is returned.
        out_tokens: Value sent as ``max_tokens``; truncated to an int.
        system_prompt: Text of the "system" message of every payload.
        model: Model name placed in every payload.

    Returns:
        A list of dicts, each a request body with ``model``, ``messages``,
        ``temperature`` and ``max_tokens`` keys.
    """
    target_col = read_column(file_path, column_name)

    # Never read past the end of the dataset; asking for more requests than
    # there are rows previously raised a KeyError.
    count = max(0, min(num_requests, len(target_col)))

    # Callers may pass a computed float (e.g. out_tokens / 2); send a whole
    # number of tokens.
    max_tokens = int(out_tokens)

    # .iloc selects by position, so this also works when the column's index
    # labels are not 0..n-1.
    return [
        {
            "model": model,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": generate_prompt(user_input)},
            ],
            'temperature': 0.0,
            'max_tokens': max_tokens,
        }
        for user_input in target_col.iloc[:count]
    ]

In [None]:
# Test the request generator function by building payloads for 10 prompts,
# using the dataset and prompt settings defined above.
get_request_data(file_path, column_name, 10, output_tokens, system_prompt)

# Warm Up and Validate


In [None]:
# Sends an initial set of warm-up requests and prints the token usage reported
# for each, so the number of input and output tokens can be checked by eye.
def warm_up_and_validate(out_tokens=256, warm_up_requests=10):
    """Send warm-up requests sequentially and print their token usage.

    Args:
        out_tokens: ``max_tokens`` value for every warm-up request.
        warm_up_requests: Number of requests to send.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.

    For every response, prints ``completion_tokens`` and then
    ``prompt_tokens`` from the ``usage`` section of the JSON body.
    """
    input_data = get_request_data(file_path, column_name, warm_up_requests, out_tokens, system_prompt)
    print("total inputs selected:", len(input_data))
    # One session for the whole warm-up, closed on exit, instead of a new
    # unclosed session per request.
    with requests.Session() as session:
        for payload in input_data:
            resp = session.post(endpoint_url, headers=headers, data=json.dumps(payload))
            # Surface HTTP failures directly rather than as a KeyError on
            # the missing 'usage' field.
            resp.raise_for_status()
            usage = resp.json()['usage']
            print(usage['completion_tokens'])
            print(usage['prompt_tokens'])

# Run the warm-up, using the configured output_tokens as the max_tokens value.
warm_up_and_validate(output_tokens)

# Single Worker Execution

In [94]:
# Shared results list: one (prompt_tokens, completion_tokens, latency_seconds)
# tuple per successful request. benchmark() clears it before each run.
latencies = []


async def worker(index, num_requests, out_tokens=256):
    """Send requests one after another and record their latencies.

    Args:
        index: Worker number; the start is delayed by ``0.1 * index``
            seconds so workers do not all begin at the same time.
        num_requests: Number of requests this worker sends.
        out_tokens: ``max_tokens`` value for every request.

    Each request is attempted up to 3 times with a 1 second pause after a
    failed attempt. Successful requests append a tuple to the module-level
    ``latencies`` list; requests that fail all attempts are skipped.
    """
    input_data = get_request_data(file_path, column_name, num_requests, out_tokens, system_prompt)
    await asyncio.sleep(0.1 * index)  # Offset the start time of each worker

    timeout = aiohttp.ClientTimeout(total=3 * 3600)
    # ssl=False is kept from the original; endpoint_url is plain HTTP here.
    connector = aiohttp.TCPConnector(ssl=False)
    # One session per worker: building a session and connector for every
    # attempt is unnecessary and adds setup work to each request.
    async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
        for payload in input_data:
            input_json = json.dumps(payload)

            for _attempt in range(3):
                # Time each attempt on its own so a retried request does not
                # include earlier failures and back-off sleeps. perf_counter
                # is monotonic, unlike time.time().
                request_start_time = time.perf_counter()
                try:
                    async with session.post(endpoint_url, headers=headers, data=input_json) as response:
                        response_text = await response.text()
                        latency = time.perf_counter() - request_start_time

                        if response.ok:
                            usage = json.loads(response_text)['usage']
                            latencies.append((usage['prompt_tokens'],
                                              usage['completion_tokens'], latency))
                            break
                        print(f"Error {response.status}: {response_text}")
                except Exception as e:
                    # Deliberately broad: one failed request must not abort
                    # the whole benchmark run.
                    print(f"Exception: {e}")
                await asyncio.sleep(1)  # Backoff before retrying

async def single_benchmark(num_requests_per_worker, num_workers, out_tokens=256):
    """Run ``num_workers`` workers concurrently and wait for all of them.

    Each worker receives its position (0..num_workers-1) as its index,
    together with ``num_requests_per_worker`` and ``out_tokens``.
    """
    pending = [
        asyncio.create_task(worker(worker_index, num_requests_per_worker, out_tokens))
        for worker_index in range(num_workers)
    ]
    await asyncio.gather(*pending)

# Execute With Parallel Workers

In [91]:
# This runs the benchmark with 1, n/2 and n output tokens. The 1-token run is
# used as the time to first token, and the latency difference between the n/2
# and n runs gives the time per output token.
async def benchmark(parallel_queries=1, out_tokens=256, num_tries=5):
    """Benchmark one concurrency level and record latency and throughput.

    Args:
        parallel_queries: Number of concurrent workers.
        out_tokens: Full output-token budget ``n``; runs are made with
            budgets of 1, ``n // 2`` and ``n``.
        num_tries: Number of requests each worker sends per run.

    Raises:
        RuntimeError: If a run finishes without any successful request.

    Prints per-run statistics and appends
    ``[median latency, tokens per second]`` of the full-budget run to the
    module-level ``data`` list, which must exist before this is called.
    """
    # store statistics about the number of input/output tokens and the latency for each setup.
    avg_num_input_tokens = [0, 0, 0]
    avg_num_output_tokens = [0, 0, 0]
    median_latency = [0, 0, 0]
    print(f"Parallel queries {parallel_queries}")
    # Integer division keeps the token budget a whole number (odd budgets
    # previously produced a fractional max_tokens); never go below 1.
    token_budgets = [1, max(1, out_tokens // 2), out_tokens]
    for i, budget in enumerate(token_budgets):
        # Clear the latencies array so that we get fresh statistics.
        latencies.clear()
        await single_benchmark(num_tries, parallel_queries, budget)
        if not latencies:
            # statistics.mean/median would otherwise fail with an opaque
            # "requires at least one data point" error.
            raise RuntimeError(
                f"No successful requests with {parallel_queries} parallel queries "
                f"and {budget} output tokens; check the endpoint and the dataset."
            )
        # Compute the median latency and the mean number of tokens.
        avg_num_input_tokens[i] = statistics.mean([inp for inp, _, _ in latencies])
        avg_num_output_tokens[i] = statistics.mean([outp for _, outp, _ in latencies])
        median_latency[i] = statistics.median([latency for _, _, latency in latencies])
        tokens_per_sec = (avg_num_input_tokens[i]+avg_num_output_tokens[i])*parallel_queries/median_latency[i]
        print(f'Avg. Input Tokens {avg_num_input_tokens[i]}, Avg. Output tokens {avg_num_output_tokens[i]}, median latency (s): {round(median_latency[i], 2)}, tokens per second {round(tokens_per_sec, 1)}')
    # We use the difference in time between out_tokens/2 and out_tokens to find the time per output token;
    # these are stored in median_latency[1] and median_latency[2] respectively.
    # We use the time to generate just 1 token as the time to first token; this is stored in median_latency[0].
    token_delta = avg_num_output_tokens[2] - avg_num_output_tokens[1]
    if token_delta != 0:
        output_token_time = (median_latency[2] - median_latency[1])*1000/token_delta
    else:
        # Both runs generated the same number of tokens, so the per-token
        # time cannot be derived from them.
        output_token_time = float('nan')
    print(f'Time to first token (s): {round(median_latency[0],2)}, Time per output token (ms) {round(output_token_time,2)}')
    data.append([median_latency[2],
               (avg_num_input_tokens[2]+avg_num_output_tokens[2])*parallel_queries/median_latency[2]])

# Benchmark And Draw Plot

In [None]:
# This will run until the throughput of the model is no longer increasing by 10%.
data = []
for parallel_queries in [1, 2, 4, 8]:
    request_per_worker = 100
    await benchmark(parallel_queries, output_tokens, request_per_worker)
    # Break if the throughput doesn't increase by more than 10%
    if len(data) > 1 and (data[-1][1] - data[-2][1])/data[-2][1] < 0.1:
        break

# Plot the latency vs throughput curve
plt.xlabel("Latency (s)")
plt.ylabel("Throughput (tok/s)")
line = plt.plot([x[0] for x in data], [x[1] for x in data], marker='o')
plt.show()