<a href="https://colab.research.google.com/github/soodaryan/LLM_inference_optimization/blob/main/LLM_inference_opti_with_Gradio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the notebook's dependencies. No versions are pinned, so every run
# gets whatever pip resolves at that moment; AutoAWQ is installed straight from
# its GitHub repository with no tag or commit specified.
!pip install torch transformers accelerate datasets peft gradio
!pip install bitsandbytes flash_attn
!pip install git+https://github.com/casper-hansen/AutoAWQ.git

Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting peft
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting gradio
  Downloading gradio-4.44.0-py3-none-any.whl.metadata (15 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0 (from gradio)
  Downloading fastapi-0.114.2-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-a

In [2]:
import argparse
import os
import time

import numpy as np
import pandas as pd
import psutil
import torch
from awq import AutoAWQForCausalLM
from awq.models.base import BaseAWQForCausalLM
from awq.utils.utils import get_best_device, ipex_available
from peft import PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    GenerationConfig,
    LogitsProcessor,
    LogitsProcessorList,
)


In [3]:
from huggingface_hub import login
from google.colab import userdata
# Fetch the Hugging Face access token from Colab's user data under the key
# 'HF_TOKEN' (so it is not hard-coded in the notebook) and log in with it.
HF_TOKEN = userdata.get('HF_TOKEN')
login(token = HF_TOKEN)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [4]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print("device :", DEVICE)

device : cuda


In [5]:
# Pick the dtype used when loading unquantized weights: float16 on any non-CPU
# device, bfloat16 on CPU. The CPU path is only allowed when AutoAWQ reports
# that intel_extension_for_pytorch is available.
if DEVICE != "cpu":
    torch_dtype = torch.float16
elif ipex_available:
    torch_dtype = torch.bfloat16
else:
    raise ImportError("Please import intel_extension_for_pytorch "
                      "by `pip install intel_extension_for_pytorch`")

print("initialized dtype :", torch_dtype)

initialized dtype : torch.float16


In [6]:
class TimeMeasuringLogitsProcessor(LogitsProcessor):
    """Logits processor that records a wall-clock timestamp on every call.

    The scores are passed through untouched; only the call times are stored so
    that prefill and per-token decode latencies can be derived afterwards.
    """

    def __init__(self):
        # The first entry is the construction time; it is the reference point
        # for the prefill measurement.
        self.token_times = [time.time()]

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
        """Timestamp this call and return ``scores`` unchanged.

        The logits processor is called after the model forward.
        """
        # CUDA runs operations asynchronously, so wait for them to finish
        # before reading the clock.
        if DEVICE != "cpu":
            torch.cuda.synchronize()

        self.token_times.append(time.time())
        return scores

    def get_prefill_duration(self):
        """Return the seconds between construction and the first call."""
        constructed_at, first_call_at = self.token_times[0], self.token_times[1]
        return first_call_at - constructed_at

    def get_decode_durations(self):
        """Return the seconds elapsed between each pair of consecutive calls."""
        call_times = self.token_times[1:]
        return [later - earlier for earlier, later in zip(call_times, call_times[1:])]

def warmup(model, size=4096):
    """Run one throwaway matrix multiplication on the model's device.

    Args:
        model: Any object exposing ``parameters()``; the multiplication runs on
            the device of its first parameter.
        size: Side length of the square random matrix. Defaults to 4096, the
            value that used to be hard-coded.
    """
    device = next(model.parameters()).device
    # Allocate directly on the target device instead of building the matrix on
    # the CPU and copying it across.
    warm_up = torch.randn((size, size), device=device)
    torch.mm(warm_up, warm_up)

def generate_torch(model, input_ids, n_generate):
    """Time greedy generation driven by direct forward calls on ``model``.

    The first forward pass processes the whole prompt (prefill); every later
    pass feeds only the token chosen by argmax on the previous step.

    Args:
        model: Callable as ``model(inputs, use_cache=True)`` whose result holds
            the logits at index 0, and which exposes ``parameters()``.
        input_ids: Prompt token ids; converted with ``torch.as_tensor`` and
            placed on the model's device.
        n_generate: Number of forward passes to run, prefill included.

    Returns:
        ``(context_time, generate_time)``: the seconds spent on the prefill
        pass, and a list holding the seconds of each of the
        ``n_generate - 1`` decode passes.
    """
    device = next(model.parameters()).device
    # CUDA work is asynchronous, so timings are only meaningful between
    # synchronisation points. Both sync calls are skipped when the model is not
    # on a CUDA device, because torch.cuda.synchronize() raises on a machine
    # without CUDA instead of being a no-op.
    on_cuda = device.type == "cuda"

    context_time = 0
    generate_time = []

    with torch.inference_mode():
        for i in range(n_generate):
            if on_cuda:
                torch.cuda.synchronize()
            start = time.time()

            if i == 0:
                # prefill context
                inputs = torch.as_tensor(input_ids, device=device)
            else:
                # decode tokens
                inputs = torch.as_tensor(token, device=device)

            out = model(inputs, use_cache=True)

            if on_cuda:
                torch.cuda.synchronize()
            # Greedy pick: index of the largest logit at the last position,
            # reshaped to (batch, 1) so it can be fed back in.
            token = out[0][:, -1].max(1)[1].unsqueeze(1)

            elapsed = time.time() - start
            if i == 0:
                context_time += elapsed
            else:
                generate_time.append(elapsed)

    return context_time, generate_time

def generate_hf(model: BaseAWQForCausalLM, input_ids, n_generate):
    """Time generation performed through ``model.generate``.

    The generation config sets both the minimum and the maximum number of new
    tokens to ``n_generate``. Timing comes from a
    ``TimeMeasuringLogitsProcessor`` attached to the call.

    Returns:
        ``(context_time, generate_time)``: the prefill duration and the list of
        per-step decode durations reported by the timing processor.
    """
    fixed_length_config = GenerationConfig(
        use_cache=True,
        min_new_tokens=n_generate,
        max_new_tokens=n_generate,
        eos_token_id=1,
        forced_eos_token_id=1,
    )

    # Created right before generate(): its construction time is the start of
    # the prefill measurement.
    timer = TimeMeasuringLogitsProcessor()
    processors = LogitsProcessorList([timer])

    model.generate(
        input_ids,
        generation_config=fixed_length_config,
        logits_processor=processors,
    )

    return timer.get_prefill_duration(), timer.get_decode_durations()

def run_round(generator, model_path, quant_file, n_generate, input_ids, batch_size, no_safetensors, pretrained):
    """Load a model with AutoAWQ, run one timed generation round and summarise it.

    NOTE: a later cell of this notebook defines ``run_round`` again. Later
    definitions replace earlier ones, so that later copy is the one callers run.

    Args:
        generator: Callable ``generator(model, input_ids, n_generate)``
            returning ``(context_time, generate_time)``.
        model_path: Model identifier or path handed to AutoAWQ.
        quant_file: Quantized weights file; only used when ``pretrained`` is false.
        n_generate: Number of tokens to generate; also passed as ``max_seq_len``
            when loading a quantized model.
        input_ids: Prompt tensor; ``input_ids.shape[1]`` is the prefill length.
        batch_size: Batch size used to scale the throughput figures.
        no_safetensors: When true the model is loaded with ``safetensors=False``.
        pretrained: Load via ``from_pretrained`` when true, else ``from_quantized``.

    Returns:
        ``(stats, version)`` where ``stats`` is a dict of benchmark figures
        (speeds are the string ``'OOM'`` when generation ran out of memory) and
        ``version`` is ``"FP16"``/``"BF16"`` for pretrained models or the
        quantization config's version otherwise.

    Raises:
        RuntimeError: Any generation failure other than out-of-memory,
            re-raised unchanged.
    """
    # Peak-memory counters accumulate for the lifetime of the process. Reset
    # them so the figure reported below is not inflated by earlier rounds.
    for device in range(torch.cuda.device_count()):
        torch.cuda.reset_peak_memory_stats(device)

    print(" -- Loading model...")
    if pretrained:
        model = AutoAWQForCausalLM.from_pretrained(
            model_path,
            safetensors=not no_safetensors,
            device_map=DEVICE,
            torch_dtype=torch_dtype,
        )
    else:
        model = AutoAWQForCausalLM.from_quantized(
            model_path, quant_file, max_seq_len=n_generate, batch_size=batch_size, safetensors=not no_safetensors
        )

    print(" -- Warming up...")
    warmup(model)

    print(f" -- Generating {n_generate} tokens, {input_ids.shape[1]} in context...")

    try:
        context_time, generate_time = generator(model, input_ids, n_generate)
        successful_generate = True
    except RuntimeError as ex:
        if 'out of memory' not in str(ex).lower():
            # Bare raise keeps the original exception type and traceback.
            raise
        successful_generate = False

    total_memory_used = 0
    # Reported as-is when generation ran out of memory.
    memory_pct = 100
    if successful_generate:
        # number of tokens in context / time for processing context * batch size
        prefill_tokens_per_second = round(input_ids.shape[1] / context_time * batch_size, 2)
        # 1 second / median time per token in seconds * batch size
        decode_tokens_per_second = round(1 / np.median(generate_time) * batch_size, 2)

        print(f" ** Speed (Prefill): {prefill_tokens_per_second:.2f} tokens/second")
        print(f" ** Speed (Decode): {decode_tokens_per_second:.2f} tokens/second")

        # Calculate total tokens and total time
        total_tokens = input_ids.shape[1] + n_generate
        total_time = context_time + sum(generate_time)

        total_throughput = round(total_tokens / total_time * batch_size, 2)
        print(f" ** Total Throughput: {total_throughput:.2f} tokens/second")

        total_capacity = 0
        for device in range(torch.cuda.device_count()):
            memory_used = torch.cuda.max_memory_allocated(device) / (1024 ** 3)
            capacity = torch.cuda.get_device_properties(device).total_memory / (1024 ** 3)
            total_memory_used += memory_used
            total_capacity += capacity
            print(f" ** Max Memory (device: {device}): {memory_used:.2f} GB ({memory_used / capacity * 100:.2f}%)")
        # Summary percentage covers all devices, matching the summed GB figure;
        # with no CUDA device nothing was used, so report 0%.
        memory_pct = total_memory_used / total_capacity * 100 if total_capacity else 0.0
    else:
        prefill_tokens_per_second = 'OOM'
        decode_tokens_per_second = 'OOM'
        total_tokens = 0
        total_time = 0
        total_throughput = 0

    if pretrained:
        version = "FP16" if DEVICE != "cpu" else "BF16"
    else:
        version = model.quant_config.version

    return {
        "Batch Size": batch_size,
        "Prefill Length": input_ids.shape[1],
        "Decode Length": n_generate,
        "Prefill tokens/s": prefill_tokens_per_second,
        "Decode tokens/s": decode_tokens_per_second,
        "Total tokens" : total_tokens * batch_size,
        "Total time" : total_time,
        "Total Throughput": total_throughput,
        "Memory (VRAM)": f"{total_memory_used:.2f} GB ({memory_pct:.2f}%)"
    }, version

In [7]:
from google.colab import drive
# Mount Google Drive at /content/drive.
drive.mount('/content/drive')

# Switch the working directory to the Drive folder 'lora'; relative paths used
# elsewhere in the notebook (such as "./output") resolve against it.
%cd /content/drive/MyDrive/lora

Mounted at /content/drive
/content/drive/MyDrive/lora


In [9]:
def run_round_lora(generator, model_path, peft_model_dir, quant_file, n_generate, input_ids, batch_size, no_safetensors, pretrained):
    """Load a base model plus a LoRA adapter, run one timed round and summarise it.

    Args:
        generator: Callable ``generator(model, input_ids, n_generate)``
            returning ``(context_time, generate_time)``.
        model_path: Base model identifier or path for
            ``AutoModelForCausalLM.from_pretrained``.
        peft_model_dir: Directory of the adapter loaded with
            ``PeftModel.from_pretrained``.
        quant_file: Accepted for signature parity with ``run_round``; unused.
        n_generate: Number of tokens to generate.
        input_ids: Prompt tensor; ``input_ids.shape[1]`` is the prefill length.
        batch_size: Batch size used to scale the throughput figures.
        no_safetensors: Accepted for signature parity with ``run_round``; unused.
        pretrained: Accepted for signature parity with ``run_round``; unused.

    Returns:
        Dict of benchmark figures; the speed entries are the string ``'OOM'``
        when generation ran out of memory.

    Raises:
        RuntimeError: Any generation failure other than out-of-memory,
            re-raised unchanged.
    """
    # Peak-memory counters accumulate for the lifetime of the process. Reset
    # them so the figure reported below is not inflated by earlier rounds.
    for device in range(torch.cuda.device_count()):
        torch.cuda.reset_peak_memory_stats(device)

    model = AutoModelForCausalLM.from_pretrained(model_path).to(DEVICE)

    model = PeftModel.from_pretrained(model, peft_model_dir)

    print(" -- Warming up...")
    warmup(model)

    print(f" -- Generating {n_generate} tokens, {input_ids.shape[1]} in context...")

    try:
        context_time, generate_time = generator(model, input_ids, n_generate)
        successful_generate = True
    except RuntimeError as ex:
        if 'out of memory' not in str(ex).lower():
            # Bare raise keeps the original exception type and traceback.
            raise
        successful_generate = False

    total_memory_used = 0
    # Reported as-is when generation ran out of memory.
    memory_pct = 100
    if successful_generate:
        # number of tokens in context / time for processing context * batch size
        prefill_tokens_per_second = round(input_ids.shape[1] / context_time * batch_size, 2)
        # 1 second / median time per token in seconds * batch size
        decode_tokens_per_second = round(1 / np.median(generate_time) * batch_size, 2)

        print(f" ** Speed (Prefill): {prefill_tokens_per_second:.2f} tokens/second")
        print(f" ** Speed (Decode): {decode_tokens_per_second:.2f} tokens/second")

        # Calculate total tokens and total time
        total_tokens = input_ids.shape[1] + n_generate
        total_time = context_time + sum(generate_time)

        total_throughput = round(total_tokens / total_time * batch_size, 2)
        print(f" ** Total Throughput: {total_throughput:.2f} tokens/second")

        total_capacity = 0
        for device in range(torch.cuda.device_count()):
            memory_used = torch.cuda.max_memory_allocated(device) / (1024 ** 3)
            capacity = torch.cuda.get_device_properties(device).total_memory / (1024 ** 3)
            total_memory_used += memory_used
            total_capacity += capacity
            print(f" ** Max Memory (device: {device}): {memory_used:.2f} GB ({memory_used / capacity * 100:.2f}%)")
        # Summary percentage covers all devices, matching the summed GB figure;
        # with no CUDA device nothing was used, so report 0%.
        memory_pct = total_memory_used / total_capacity * 100 if total_capacity else 0.0
    else:
        prefill_tokens_per_second = 'OOM'
        decode_tokens_per_second = 'OOM'
        total_tokens = 0
        total_time = 0
        total_throughput = 0

    return {
        "Batch Size": batch_size,
        "Prefill Length": input_ids.shape[1],
        "Decode Length": n_generate,
        "Prefill tokens/s": prefill_tokens_per_second,
        "Decode tokens/s": decode_tokens_per_second,
        "Total tokens" : total_tokens * batch_size,
        "Total time" : total_time,
        "Total Throughput": total_throughput,
        "Memory (VRAM)": f"{total_memory_used:.2f} GB ({memory_pct:.2f}%)"
    }

In [10]:
def run_round(generator, model_path, quant_file, n_generate, input_ids, batch_size, no_safetensors, pretrained):
    """Load a model with AutoAWQ, run one timed generation round and summarise it.

    NOTE: an earlier cell of this notebook also defines ``run_round``; this
    later definition replaces it and is the one callers run.

    Args:
        generator: Callable ``generator(model, input_ids, n_generate)``
            returning ``(context_time, generate_time)``.
        model_path: Model identifier or path handed to AutoAWQ.
        quant_file: Quantized weights file; only used when ``pretrained`` is false.
        n_generate: Number of tokens to generate; also passed as ``max_seq_len``
            when loading a quantized model.
        input_ids: Prompt tensor; ``input_ids.shape[1]`` is the prefill length.
        batch_size: Batch size used to scale the throughput figures.
        no_safetensors: When true the model is loaded with ``safetensors=False``.
        pretrained: Load via ``from_pretrained`` when true, else ``from_quantized``.

    Returns:
        ``(stats, version)`` where ``stats`` is a dict of benchmark figures
        (speeds are the string ``'OOM'`` when generation ran out of memory) and
        ``version`` is ``"FP16"``/``"BF16"`` for pretrained models or the
        quantization config's version otherwise.

    Raises:
        RuntimeError: Any generation failure other than out-of-memory,
            re-raised unchanged.
    """
    # Peak-memory counters accumulate for the lifetime of the process. Reset
    # them so the figure reported below is not inflated by earlier rounds.
    for device in range(torch.cuda.device_count()):
        torch.cuda.reset_peak_memory_stats(device)

    print(" -- Loading model...")
    if pretrained:
        model = AutoAWQForCausalLM.from_pretrained(
            model_path,
            safetensors=not no_safetensors,
            device_map=DEVICE,
            torch_dtype=torch_dtype,
        )
    else:
        model = AutoAWQForCausalLM.from_quantized(
            model_path, quant_file, max_seq_len=n_generate, batch_size=batch_size, safetensors=not no_safetensors
        )

    print(" -- Warming up...")
    warmup(model)

    print(f" -- Generating {n_generate} tokens, {input_ids.shape[1]} in context...")

    try:
        context_time, generate_time = generator(model, input_ids, n_generate)
        successful_generate = True
    except RuntimeError as ex:
        if 'out of memory' not in str(ex).lower():
            # Bare raise keeps the original exception type and traceback.
            raise
        successful_generate = False

    total_memory_used = 0
    # Reported as-is when generation ran out of memory.
    memory_pct = 100
    if successful_generate:
        # number of tokens in context / time for processing context * batch size
        prefill_tokens_per_second = round(input_ids.shape[1] / context_time * batch_size, 2)
        # 1 second / median time per token in seconds * batch size
        decode_tokens_per_second = round(1 / np.median(generate_time) * batch_size, 2)

        print(f" ** Speed (Prefill): {prefill_tokens_per_second:.2f} tokens/second")
        print(f" ** Speed (Decode): {decode_tokens_per_second:.2f} tokens/second")

        # Calculate total tokens and total time
        total_tokens = input_ids.shape[1] + n_generate
        total_time = context_time + sum(generate_time)

        total_throughput = round(total_tokens / total_time * batch_size, 2)
        print(f" ** Total Throughput: {total_throughput:.2f} tokens/second")

        total_capacity = 0
        for device in range(torch.cuda.device_count()):
            memory_used = torch.cuda.max_memory_allocated(device) / (1024 ** 3)
            capacity = torch.cuda.get_device_properties(device).total_memory / (1024 ** 3)
            total_memory_used += memory_used
            total_capacity += capacity
            print(f" ** Max Memory (device: {device}): {memory_used:.2f} GB ({memory_used / capacity * 100:.2f}%)")
        # Summary percentage covers all devices, matching the summed GB figure;
        # with no CUDA device nothing was used, so report 0%.
        memory_pct = total_memory_used / total_capacity * 100 if total_capacity else 0.0
    else:
        prefill_tokens_per_second = 'OOM'
        decode_tokens_per_second = 'OOM'
        total_tokens = 0
        total_time = 0
        total_throughput = 0

    if pretrained:
        version = "FP16" if DEVICE != "cpu" else "BF16"
    else:
        version = model.quant_config.version

    return {
        "Batch Size": batch_size,
        "Prefill Length": input_ids.shape[1],
        "Decode Length": n_generate,
        "Prefill tokens/s": prefill_tokens_per_second,
        "Decode tokens/s": decode_tokens_per_second,
        "Total tokens" : total_tokens * batch_size,
        "Total time" : total_time,
        "Total Throughput": total_throughput,
        "Memory (VRAM)": f"{total_memory_used:.2f} GB ({memory_pct:.2f}%)"
    }, version

In [11]:
def no_lora(model_path = "casperhansen/mistral-7b-instruct-v0.1-awq",
     quant_file = "",
     batch_size = 1,
     no_safetensors = False,
     generator = "torch",
     pretrained = False):
    """Benchmark a model without any LoRA adapter and return the stats table.

    Args:
        model_path: Model identifier or path; also used to load the tokenizer.
        quant_file: Forwarded to ``run_round``.
        batch_size: Number of random prompts per round.
        no_safetensors: Forwarded to ``run_round``.
        generator: ``"torch"`` selects ``generate_torch``, ``"hf"`` selects
            ``generate_hf``.
        pretrained: Forwarded to ``run_round``.

    Returns:
        A DataFrame with one row per completed round.

    Raises:
        ValueError: If ``generator`` is neither ``"torch"`` nor ``"hf"``.
    """
    # Reject unknown generator names before anything is downloaded.
    if generator not in ("torch", "hf"):
        raise ValueError(f"Unknown generator method passed: {generator}")
    generate_fn = generate_torch if generator == "torch" else generate_hf

    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    # Each round: prompt length ("context") and number of tokens to generate.
    round_configs = ({"context": 128, "n_generate": 128},)

    collected = []
    for round_cfg in round_configs:
        # Random token ids are enough for a throughput benchmark.
        prompt_shape = (batch_size, round_cfg["context"])
        input_ids = torch.randint(0, tokenizer.vocab_size, prompt_shape)
        if DEVICE != "cpu":
            input_ids = input_ids.cuda()

        round_stats, model_version = run_round(
            generate_fn,
            model_path,
            quant_file,
            round_cfg["n_generate"],
            input_ids,
            batch_size,
            no_safetensors,
            pretrained,
        )
        collected.append(round_stats)

        # Stop at the first round that ran out of memory.
        if round_stats["Prefill tokens/s"] == 'OOM':
            break

    df = pd.DataFrame(collected)
    print('Device:', DEVICE)
    if DEVICE != "cpu":
        print('GPU:', torch.cuda.get_device_name())
    print('Model:', model_path)
    print('Version:', model_version)
    summary = df.drop(columns = ["Prefill tokens/s", "Decode tokens/s"])
    print(summary.to_markdown(index=False))
    return df

In [12]:
def lora_function(model_path,
         peft_model_dir = "./output",
         quant_file = "",
         batch_size = 1,
         no_safetensors = False,
         generator = "torch",
         pretrained = False):
    """Benchmark a model with a LoRA adapter applied and return the stats table.

    Args:
        model_path: Base model identifier or path; also used to load the
            tokenizer.
        peft_model_dir: Directory holding the LoRA adapter.
        quant_file: Forwarded to ``run_round_lora``.
        batch_size: Number of random prompts per round.
        no_safetensors: Forwarded to ``run_round_lora``.
        generator: ``"torch"`` selects ``generate_torch``, ``"hf"`` selects
            ``generate_hf``.
        pretrained: Forwarded to ``run_round_lora``.

    Returns:
        A DataFrame with one row per completed round.

    Raises:
        ValueError: If ``generator`` is neither ``"torch"`` nor ``"hf"``.
    """
    rounds = [
        {"context": 128, "n_generate": 128},
    ]

    if generator == "torch":
        generator = generate_torch
    elif generator == "hf":
        generator = generate_hf
    else:
        raise ValueError(f"Unknown generator method passed: {generator}")

    # Load the tokenizer here instead of reading a notebook-level `tokenizer`
    # variable, which only exists if some other cell happened to create it.
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    all_stats = []

    for settings in rounds:
        # Random token ids are enough for a throughput benchmark.
        input_ids = torch.randint(0, tokenizer.vocab_size, (batch_size, settings["context"]))
        if DEVICE != "cpu":
            input_ids = input_ids.cuda()

        stats = run_round_lora(
            generator,
            model_path,
            peft_model_dir,
            quant_file,
            settings["n_generate"],
            input_ids,
            batch_size,
            no_safetensors,
            pretrained
        )

        all_stats.append(stats)

        # Stop at the first round that ran out of memory.
        if stats["Prefill tokens/s"] == 'OOM':
            break

    df = pd.DataFrame(all_stats)
    print('Device:', DEVICE)
    if DEVICE != "cpu":
        print('GPU:', torch.cuda.get_device_name())
    print('Model:', model_path)
    print(df.drop(columns = ["Prefill tokens/s", "Decode tokens/s"]).to_markdown(index=False))
    return df

In [13]:
# model_name =  "TheBloke/Mistral-7B-v0.1-AWQ"
# lora_function(model_path = model_name,
#      peft_model_dir = "./output",
#      quant_file = "",
#      batch_size = 32,
#      no_safetensors = True,
#      generator = "torch",
#      pretrained = False)

In [None]:
import datasets
from awq import AutoAWQForCausalLM
from transformers import (
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import get_peft_model, LoraConfig, TaskType

def prepare_split(tokenizer):
    """Load the "mhenrichsen/alpaca_2k_test" train split and tokenize it.

    Each example's ``instruction`` and ``output`` fields are merged into a
    single ``text`` field using an ``[INST] ... [/INST]`` template, every other
    column is dropped, and the text is then run through ``tokenizer`` in
    batched mode.

    Args:
        tokenizer: Callable applied to a batch of ``text`` strings.

    Returns:
        The mapped dataset containing ``text`` plus whatever the tokenizer adds.
    """
    def to_text(example):
        # Render one example through the instruction template.
        rendered = "<s>[INST] {prompt} [/INST] {output}</s>".format(
            prompt=example["instruction"],
            output=example["output"],
        )
        return {"text": rendered}

    raw_split = datasets.load_dataset("mhenrichsen/alpaca_2k_test", split="train")
    text_only = raw_split.map(to_text).select_columns(["text"])
    tokenized = text_only.map(lambda batch: tokenizer(batch["text"]), batched=True)

    return tokenized

# Fine-tune a LoRA adapter on top of an AWQ-quantized Mistral checkpoint and
# save it under ./output. Every name assigned here is a notebook-level global.
model_path = "TheBloke/Mistral-7B-v0.1-AWQ"

# Load model
# fuse_layers=False is passed explicitly when loading the quantized checkpoint.
model = AutoAWQForCausalLM.from_quantized(model_path, fuse_layers=False)
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Prepare data
data_train = prepare_split(tokenizer)

# Config Lora
lora_config = LoraConfig(
    r=4,
    lora_alpha=8,
    lora_dropout=0.5,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False
)

# The adapter is attached to `model.model`, not to the AutoAWQ object itself
# (presumably the wrapped transformers model - TODO confirm). From here on
# `model` names the PEFT-wrapped model.
model = get_peft_model(model.model, lora_config)

model.print_trainable_parameters()

# One epoch, batch size 1, no evaluation.
# NOTE(review): save_strategy is "epoch", so save_steps=100 presumably has no
# effect - confirm against the TrainingArguments documentation.
training_arguments = TrainingArguments(
    output_dir="./output",
    per_device_train_batch_size=1,
    optim="adamw_torch",
    num_train_epochs=1,
    learning_rate=1e-4,
    evaluation_strategy="no",
    save_strategy="epoch",
    save_steps=100,
    logging_steps=50,
    eval_steps=None,
    load_best_model_at_end=False
)

# mlm=False is passed to the language-modeling collator.
trainer = Trainer(
    model=model,
    train_dataset=data_train,
    args=training_arguments,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

trainer.train()
# Save into "output", the same relative directory as output_dir above.
trainer.save_model("output")

In [18]:
import gradio as gr
import pandas as pd

# Non-LoRA benchmark driven by the raw PyTorch generation loop
def no_lora_pytorch():
    """Run ``no_lora`` with the "torch" generator at batch size 32.

    Returns:
        The DataFrame produced by ``no_lora``.
    """
    return no_lora(
        model_path="casperhansen/mistral-7b-instruct-v0.1-awq",
        quant_file="",
        batch_size=32,
        no_safetensors=True,
        generator="torch",
        pretrained=False,
    )

# Non-LoRA benchmark driven by Hugging Face `generate`
def no_lora_huggingface():
    """Run ``no_lora`` with the "hf" generator at batch size 32.

    Returns:
        The DataFrame produced by ``no_lora``.
    """
    return no_lora(
        model_path="casperhansen/mistral-7b-instruct-v0.1-awq",
        quant_file="",
        batch_size=32,
        no_safetensors=True,
        generator="hf",
        pretrained=False,
    )

# LoRA benchmark driven by the raw PyTorch generation loop
def lora_function_pytorch(train_or_input, adapter_name=None):
    """Run ``lora_function`` with the "torch" generator at batch size 32.

    Args:
        train_or_input: The LoRA option chosen in the UI. Only
            ``"Input LoRA Adapter"`` makes ``adapter_name`` take effect.
        adapter_name: Path of the adapter directory to load when
            ``train_or_input`` is ``"Input LoRA Adapter"``. Blank or ``None``
            falls back to ``"./output"``.

    Returns:
        The DataFrame produced by ``lora_function``.

    No training is started from here; for any other option the adapter in
    ``"./output"`` is used.
    """
    peft_model_dir = "./output"
    # Honour the adapter typed into the UI instead of silently ignoring it.
    if train_or_input == "Input LoRA Adapter" and adapter_name and adapter_name.strip():
        peft_model_dir = adapter_name.strip()

    model_name = "TheBloke/Mistral-7B-v0.1-AWQ"
    df = lora_function(model_path = model_name,
                  peft_model_dir = peft_model_dir,
                  quant_file = "",
                  batch_size = 32,
                  no_safetensors = True,
                  generator = "torch",
                  pretrained = False
    )
    return df

# LoRA benchmark driven by Hugging Face `generate`
def lora_function_huggingface(train_or_input, adapter_name=None):
    """Run ``lora_function`` with the "hf" generator at batch size 32.

    Args:
        train_or_input: The LoRA option chosen in the UI. Only
            ``"Input LoRA Adapter"`` makes ``adapter_name`` take effect.
        adapter_name: Path of the adapter directory to load when
            ``train_or_input`` is ``"Input LoRA Adapter"``. Blank or ``None``
            falls back to ``"./output"``.

    Returns:
        The DataFrame produced by ``lora_function``.

    No training is started from here; for any other option the adapter in
    ``"./output"`` is used.
    """
    peft_model_dir = "./output"
    # Honour the adapter typed into the UI instead of silently ignoring it.
    if train_or_input == "Input LoRA Adapter" and adapter_name and adapter_name.strip():
        peft_model_dir = adapter_name.strip()

    model_name = "TheBloke/Mistral-7B-v0.1-AWQ"
    df = lora_function(model_path = model_name,
                  peft_model_dir = peft_model_dir,
                  quant_file = "",
                  batch_size = 32,
                  no_safetensors = True,
                  generator = "hf",
                  pretrained = False
    )
    return df

def process_input(is_lora, model_name, lora_option, adapter_name, inference_type):
    """Dispatch a UI submission to the matching benchmark function.

    Args:
        is_lora: Truthy to benchmark with a LoRA adapter.
        model_name: Received from the UI but not used for dispatch.
        lora_option: Forwarded to the LoRA benchmark functions.
        adapter_name: Forwarded to the LoRA benchmark functions.
        inference_type: ``"PyTorch"`` selects the PyTorch variant; any other
            value selects the Hugging Face variant.

    Returns:
        Whatever the selected benchmark function returns.
    """
    use_pytorch = inference_type == "PyTorch"

    if is_lora:
        runner = lora_function_pytorch if use_pytorch else lora_function_huggingface
        return runner(lora_option, adapter_name)

    runner = no_lora_pytorch if use_pytorch else no_lora_huggingface
    return runner()

# Define the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Model Selection, LoRA Configuration, and Inference Type")

    with gr.Row():
        is_lora = gr.Checkbox(label="Use LoRA")
        model_name = gr.Dropdown(["MISTRAL"], label="Model Name")
        inference_type = gr.Radio(["PyTorch", "Hugging Face"], label="Inference Type")

    # Both LoRA controls start hidden and are revealed by the handlers below.
    with gr.Row():
        lora_option = gr.Radio(["Train LoRA", "Input LoRA Adapter"], label="LoRA Option", visible=False)
        adapter_name = gr.Textbox(label="LoRA Adapter File Name", visible=False)

    output = gr.Dataframe(label="Model Details")

    submit_btn = gr.Button("Submit")

    def update_lora_visibility(use_lora, option):
        """Show or hide the LoRA controls when the "Use LoRA" box changes.

        ``option`` is the radio's current selection, delivered as an event
        input. Reading ``lora_option.value`` would only give the value the
        component was constructed with, not what the user picked.
        """
        return {
            lora_option: gr.update(visible=bool(use_lora)),
            adapter_name: gr.update(visible=bool(use_lora) and option == "Input LoRA Adapter")
        }

    def update_adapter_visibility(option):
        """Show the adapter textbox only for the "Input LoRA Adapter" option."""
        return gr.update(visible=option == "Input LoRA Adapter")

    # The radio is passed as a second input so the handler sees its live value.
    is_lora.change(update_lora_visibility, [is_lora, lora_option], [lora_option, adapter_name])
    lora_option.change(update_adapter_visibility, lora_option, adapter_name)

    submit_btn.click(
        process_input,
        inputs=[is_lora, model_name, lora_option, adapter_name, inference_type],
        outputs=output
    )

# Launch the interface
demo.launch(debug=True)

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://9abe0adda0eefbbfbd.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


tokenizer_config.json:   0%|          | 0.00/915 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

 -- Loading model...


config.json:   0%|          | 0.00/859 [00:00<?, ?B/s]

Fetching 10 files:   0%|          | 0/10 [00:00<?, ?it/s]

quant_config.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.87k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/4.15G [00:00<?, ?B/s]

Replacing layers...: 100%|██████████| 32/32 [00:14<00:00,  2.15it/s]
  return torch.load(checkpoint_file, map_location=torch.device("cpu"))


 -- Warming up...
 -- Generating 128 tokens, 128 in context...


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


 ** Speed (Prefill): 448.99 tokens/second
 ** Speed (Decode): 334.89 tokens/second
 ** Total Throughput: 377.55 tokens/second
 ** Max Memory (device: 0): 5.14 GB (34.83%)
Device: cuda
GPU: Tesla T4
Model: casperhansen/mistral-7b-instruct-v0.1-awq
Version: gemm
|   Batch Size |   Prefill Length |   Decode Length |   Total tokens |   Total time |   Total Throughput | Memory (VRAM)    |
|-------------:|-----------------:|----------------:|---------------:|-------------:|-------------------:|:-----------------|
|           32 |              128 |             128 |           8192 |      21.6979 |             377.55 | 5.14 GB (34.83%) |
 -- Loading model...


Fetching 10 files:   0%|          | 0/10 [00:00<?, ?it/s]

Replacing layers...: 100%|██████████| 32/32 [00:11<00:00,  2.82it/s]
  return torch.load(checkpoint_file, map_location=torch.device("cpu"))


 -- Warming up...


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


 -- Generating 128 tokens, 128 in context...
 ** Speed (Prefill): 1082.96 tokens/second
 ** Speed (Decode): 207.48 tokens/second
 ** Total Throughput: 348.76 tokens/second
 ** Max Memory (device: 0): 5.14 GB (34.83%)
Device: cuda
GPU: Tesla T4
Model: casperhansen/mistral-7b-instruct-v0.1-awq
Version: gemm
|   Batch Size |   Prefill Length |   Decode Length |   Total tokens |   Total time |   Total Throughput | Memory (VRAM)    |
|-------------:|-----------------:|----------------:|---------------:|-------------:|-------------------:|:-----------------|
|           32 |              128 |             128 |           8192 |      23.4891 |             348.76 | 5.14 GB (34.83%) |
 -- Loading model...


Fetching 10 files:   0%|          | 0/10 [00:00<?, ?it/s]

Replacing layers...: 100%|██████████| 32/32 [00:11<00:00,  2.88it/s]
  return torch.load(checkpoint_file, map_location=torch.device("cpu"))


 -- Warming up...
 -- Generating 128 tokens, 128 in context...
 ** Speed (Prefill): 1080.17 tokens/second
 ** Speed (Decode): 288.62 tokens/second
 ** Total Throughput: 459.35 tokens/second
 ** Max Memory (device: 0): 5.14 GB (34.83%)
Device: cuda
GPU: Tesla T4
Model: casperhansen/mistral-7b-instruct-v0.1-awq
Version: gemm
|   Batch Size |   Prefill Length |   Decode Length |   Total tokens |   Total time |   Total Throughput | Memory (VRAM)    |
|-------------:|-----------------:|----------------:|---------------:|-------------:|-------------------:|:-----------------|
|           32 |              128 |             128 |           8192 |       17.834 |             459.35 | 5.14 GB (34.83%) |
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7861 <> https://9abe0adda0eefbbfbd.gradio.live


