<a href="https://colab.research.google.com/github/ruslanmv/Open-Source-LLM-Chatbot/blob/master/Benchmarks_Open_Source_LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Colab Pro notebook from https://github.com/ruslanmv/Open-Source-LLM-Chatbot**

# Benchmark Open Source LLM **Chatbot**


In [1]:
#@markdown # Connect Google Drive
from google.colab import drive
from IPython.display import clear_output
import ipywidgets as widgets
import os

def inf(msg, style, wdth): inf = widgets.Button(description=msg, disabled=True, button_style=style, layout=widgets.Layout(min_width=wdth));display(inf)
Shared_Drive = "" #@param {type:"string"}
#@markdown - Leave empty if you're not using a shared drive

print("[0;33mConnecting...")
drive.mount('/content/gdrive')

if Shared_Drive!="" and os.path.exists("/content/gdrive/Shareddrives"):
  mainpth="Shareddrives/"+Shared_Drive
else:
  mainpth="MyDrive"

clear_output()
inf('\u2714 Done','success', '50px')

#@markdown ---

Button(button_style='success', description='✔ Done', disabled=True, layout=Layout(min_width='50px'), style=But…

In [2]:
#@markdown # Install/Update Open Source LLM Chatbot repo
from IPython.utils import capture
from IPython.display import clear_output
from subprocess import getoutput
import ipywidgets as widgets
import sys
import fileinput
import os
import time
import base64
import gdown
from gdown.download import get_url_from_gdrive_confirmation
import requests
from urllib.request import urlopen, Request
from urllib.parse import urlparse, parse_qs, unquote
from tqdm import tqdm
import six
if not os.path.exists("/content/gdrive"):
  print('[1;31mGdrive not connected, using temporary colab storage ...')
  time.sleep(4)
  mainpth="MyDrive"
  !mkdir -p /content/gdrive/$mainpth
  Shared_Drive=""

if Shared_Drive!="" and not os.path.exists("/content/gdrive/Shareddrives"):
  print('[1;31mShared drive not detected, using default MyDrive')
  mainpth="MyDrive"

with capture.capture_output() as cap:
  def inf(msg, style, wdth): inf = widgets.Button(description=msg, disabled=True, button_style=style, layout=widgets.Layout(min_width=wdth));display(inf)
  fgitclone = "git clone --depth 1"
  %mkdir -p /content/gdrive/$mainpth/llm
  %cd /content/gdrive/$mainpth/llm

  !git clone -q --branch master https://github.com/ruslanmv/Open-Source-LLM-Chatbot.git /content/gdrive/$mainpth/llm/chatbot/
  !git fetch
  !git pull
  !mkdir -p /content/gdrive/$mainpth/llm/chatbot/cache/
  !pip install -r /content/gdrive/$mainpth/llm/chatbot/requirements.txt
  !pip install memory-profiler pyngrok
  os.environ['TRANSFORMERS_CACHE']=f"/content/gdrive/{mainpth}/llm/chatbot/cache"
  os.environ['TORCH_HOME'] = f"/content/gdrive/{mainpth}/llm/chatbot/cache"

with capture.capture_output() as cap:
  %cd /content/gdrive/$mainpth/llm/chatbot/
clear_output()
inf('\u2714 Done','success', '50px')

#@markdown ---

Button(button_style='success', description='✔ Done', disabled=True, layout=Layout(min_width='50px'), style=But…

In [3]:
#@markdown # Model to Download/Load and Benchmark
from huggingface_hub import hf_hub_download
Use_Temp_Storage = False #@param {type:"boolean"}
#@markdown - If not, make sure you have enough space on your gdrive

#@markdown ---

Model_Version = "Mistral-7B-Instruct-v0.2" #@param ["Mistral-7B-Instruct-v0.2", "CodeLlama-7B", "Llama-2-13B-chat", "Falcon-7B-Instruct","zephyr-7B-beta","vicuna-7B-v1.5"]

if Use_Temp_Storage:
   MODELS_PATH = "./models"
else:
    MODELS_PATH ='/content/gdrive/'+mainpth+'/llm/chatbot'+'/models/'

def llmdl(ver, Use_Temp_Storage):

  repo_id=''
  if(ver=="Llama-2-13B-chat"):
        repo_id="TheBloke/Llama-2-13B-chat-GGUF"
        filename="llama-2-13b-chat.Q4_K_M.gguf"
  elif(ver=="Mistral-7B-Instruct-v0.2") :
        repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF"
        filename="mistral-7b-instruct-v0.2.Q4_K_M.gguf"
  elif(ver=="zephyr-7B-beta"):
        repo_id="TheBloke/zephyr-7B-beta-GGUF"
        filename="zephyr-7b-beta.Q4_K_M.gguf"
  elif(ver=="vicuna-7B-v1.5"):
        repo_id="TheBloke/vicuna-7B-v1.5-GGUF"
        filename="vicuna-7b-v1.5.Q4_K_M.gguf"
  elif(ver=="Falcon-7B-Instruct"):
        repo_id="TheBloke/Falcon-7B-Instruct-GGML"
        filename="falcon-7b-instruct.ggccv1.q4_1.bin"
  elif(ver=="CodeLlama-7B"):
        repo_id="TheBloke/CodeLlama-7B-GGUF"
        filename="codellama-7b.Q4_K_M.gguf"
  if Use_Temp_Storage:
      os.makedirs('/content/models', exist_ok=True)
      model='/content/models/'+ver
  else:
      model=MODELS_PATH+ver
  link=repo_id
  if not os.path.exists(model):
    model_path = hf_hub_download(
    repo_id= repo_id,
    filename= filename,
    resume_download=True,
    cache_dir=MODELS_PATH,)

    if os.path.exists(model_path):
      clear_output()
      inf('\u2714 Done','success', '50px')
    else:
      inf('\u2718 Something went wrong, try again','danger', "250px")
  else:
      clear_output()
      inf('\u2714 Model already exists','primary', '300px')

  return model

inf('\u2718 Downloading files','success', "400px")
PATH_to_MODEL=llmdl(Model_Version, Use_Temp_Storage)

if os.path.exists(str(PATH_to_MODEL)):
  inf('\u2714 Using the custom model.','success', '200px')
  model=PATH_to_MODEL

#@markdown ---

Button(button_style='success', description='✔ Done', disabled=True, layout=Layout(min_width='50px'), style=But…

In [4]:
#@markdown # Start Benchmark
from IPython.utils import capture
import time
import sys
import fileinput
from pyngrok import ngrok, conf
import re

Use_Cloudflare_Tunnel = False #@param {type:"boolean"}
#@markdown - Offers better gradio responsivity

Ngrok_token = "" #@param {type:"string"}

#@markdown - Input your ngrok token if you want to use ngrok server

User = "" #@param {type:"string"}
Password= "" #@param {type:"string"}
#@markdown - Add credentials to your Gradio interface (optional)

auth=f"--gradio-auth {User}:{Password}"
if User =="" or Password=="":
  auth=""


#with capture.capture_output() as cap:
#  %cd /content/gdrive/$mainpth/llm/chatbot/modules/

share=''
if Ngrok_token!="":
  ngrok.kill()
  srv=ngrok.connect(7860, pyngrok_config=conf.PyngrokConfig(auth_token=Ngrok_token) , bind_tls=True).public_url

  for line in fileinput.input('/usr/local/lib/python3.10/dist-packages/gradio/blocks.py', inplace=True):
    if line.strip().startswith('self.server_name ='):
        line = f'            self.server_name = "{srv[8:]}"\n'
    if line.strip().startswith('self.protocol = "https"'):
        line = '            self.protocol = "https"\n'
    if line.strip().startswith('if self.local_url.startswith("https") or self.is_colab'):
        line = ''
    if line.strip().startswith('else "http"'):
        line = ''
    sys.stdout.write(line)

elif Use_Cloudflare_Tunnel:
  with capture.capture_output() as cap:
    !pkill cloudflared
    time.sleep(4)
    !nohup cloudflared tunnel --url http://localhost:7860 > /content/srv.txt 2>&1 &
    time.sleep(4)
    with open('/content/srv.txt', "r") as file: text = file.read()
    srv= re.findall(r"https?://(?:\S+?\.)?trycloudflare\.com\S*", text)[0]

    for line in fileinput.input('/usr/local/lib/python3.10/dist-packages/gradio/blocks.py', inplace=True):
      if line.strip().startswith('self.server_name ='):
          line = f'            self.server_name = "{srv[8:]}"\n'
      if line.strip().startswith('self.protocol = "https"'):
          line = '            self.protocol = "https"\n'
      if line.strip().startswith('if self.local_url.startswith("https") or self.is_colab'):
          line = ''
      if line.strip().startswith('else "http"'):
          line = ''
      sys.stdout.write(line)

    !rm /content/srv.txt

else:
  share='--share'


#@markdown ---

# Loading Single Model

In [5]:
print(MODELS_PATH)

/content/gdrive/MyDrive/llm/chatbot/models/


In [11]:
from huggingface_hub import hf_hub_download
import logging
import sys
import gradio as gr
from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt
from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import (
    messages_to_prompt,
    completion_to_prompt,
)

from huggingface_hub import hf_hub_download
from llama_index.llms import LlamaCPP
from nltk.translate.bleu_score import sentence_bleu
import psutil
from memory_profiler import memory_usage
import time

def predict(message,llm):
    response = llm.stream_complete(message)
    for bot_response in response:
        token = bot_response.delta
        yield token

def ask(message,model_name):
    llm=get_llm(model_name)
    answer = list(predict(message,llm))
    print(' '.join(answer))

def output_quality_bleu(predicted_response, reference_response):
    predicted_tokens = predicted_response.split()
    reference_tokens = reference_response.split()
    score = sentence_bleu([reference_tokens], predicted_tokens)
    return score

def wrapper_predict(message, llm):
    list(predict(message, llm))

def wrapper_ask(message, model_name):
  ask(message, model_name)

def get_llm(model, set_gpu=False):
    if set_gpu:
       gpu=1
    else:
      gpu=-1
    repo_id=""
    filename=""
    if(model=="Llama-2-13B-chat"):
      repo_id="TheBloke/Llama-2-13B-chat-GGUF"
      filename="llama-2-13b-chat.Q4_K_M.gguf"
    elif(model=="Mistral-7B-Instruct-v0.2") :
      repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF"
      filename="mistral-7b-instruct-v0.2.Q4_K_M.gguf"
    elif(model=="zephyr-7B-beta"):
      repo_id="TheBloke/zephyr-7B-beta-GGUF"
      filename="zephyr-7b-beta.Q4_K_M.gguf"
    elif(model=="vicuna-7B-v1.5"):
      repo_id="TheBloke/vicuna-7B-v1.5-GGUF"
      filename="vicuna-7b-v1.5.Q4_K_M.gguf"
    elif(model=="Falcon-7B-Instruct"):
      repo_id="TheBloke/Falcon-7B-Instruct-GGML"
      filename="falcon-7b-instruct.ggccv1.q4_0.bin"
    elif(model=="CodeLlama-7B"):
      repo_id="TheBloke/CodeLlama-7B-GGUF"
      filename="codellama-7b.Q4_K_M.gguf"

    elif(model=="CodeLlama-7B"):
      repo_id="TheBloke/CodeLlama-7B-GGUF"
      filename="codellama-7b.Q4_K_M.gguf"
    else:
      print("please select at least one model")
    mistral_model_path = hf_hub_download(
    repo_id= repo_id,
    filename= filename,
    resume_download=True,
    cache_dir=MODELS_PATH,)
    llm = LlamaCPP(
    model_path=mistral_model_path,
    temperature=0.1,
    max_new_tokens=256,
    context_window=3900,
    generate_kwargs={},
    # set to at least 1 to use GPU
    model_kwargs={"n_gpu_layers": gpu},
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)
    print("model has been configured and ready to chat")
    return llm


def benchmarks(message, model_name,set_gpu=False):
    def wrapped_predict(msg, llm):
        return predict(msg, llm)
    print(model_name)
    start_time = time.time()
    llm_model = get_llm(model_name,set_gpu)
    end_time = time.time()
    elapsed_time_load = end_time - start_time
    print("Execution time loading:", elapsed_time_load)
    start_time = time.time()
    answer = list(predict(message,llm_model))
    end_time = time.time()
    elapsed_time_run = end_time - start_time
    print("Execution time inference:", elapsed_time_run)
    predicted_response = ' '.join(answer)
    print(predicted_response)
    reference_response = "The capital city of Italy is Rome."
    score = output_quality_bleu(predicted_response, reference_response)
    print("Output Quality (using BLEU score):", score)
    peak_mem_usage = memory_usage((wrapped_predict, (message, llm_model)), interval=0.1, max_usage=True, retval=True)
    print("Peak Memory Usage:",peak_mem_usage)
    return elapsed_time_load, elapsed_time_run, score, peak_mem_usage



In [13]:
#%load_ext memory_profiler
message="What is the capital of Italy"
model_name="Mistral-7B-Instruct-v0.2"
#%memit ask(message,model_name)
ask(message,model_name)

AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | 
Model metadata: {'tokenizer.chat_template': "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}", 'tokenizer.ggml.add_eos_token': 'false', 'tokenizer.ggml.padding_token_id': '0', 'tokenizer.ggml.unknown_token_id': '0', 'tokenizer.ggml.eos_token_id': '2', 'general.architecture': 'llama', 'llama.rope.freq_base': '1000000.000000', 'llama.context_length': '32768', 'general.na

model has been configured and ready to chat
 The  capital  city  of  Italy  is  Rome .  Rome  is  an  ancient  city  with  a  rich  cultural  history  and  is  known  for  its  impressive  archae ological  sites ,  beautiful  architecture ,  and  vibr ant  culture .  It  has  been  the  political  and  cultural  center  of  Italy  for  many  centuries .  I ' m  here  to  help  with  any  questions  you  might  have ,  so  feel  free  to  ask  me  anything  else  you ' d  like  to  know ! 


In [24]:
message="What is the capital of Italy"
model_name="Mistral-7B-Instruct-v0.2"
benchmarks(message,model_name)

Mistral-7B-Instruct-v0.2


AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | 
Model metadata: {'tokenizer.chat_template': "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}", 'tokenizer.ggml.add_eos_token': 'false', 'tokenizer.ggml.padding_token_id': '0', 'tokenizer.ggml.unknown_token_id': '0', 'tokenizer.ggml.eos_token_id': '2', 'general.architecture': 'llama', 'llama.rope.freq_base': '1000000.000000', 'llama.context_length': '32768', 'general.na

model has been configured and ready to chat
Execution time loading: 22.388978242874146
Execution time inference: 100.08765459060669
 The  capital  city  of  Italy  is  Rome .  Rome  is  an  ancient  city  with  rich  history  and  cultural  significance ,  famous  for  its  architect ural  marvel s  such  as  the  Col os se um ,  the  P ant he on ,  and  the  V atic an  City .  It  has  been  the  political  heart  of  Italy  since  ancient  times  and  continues  to  be  an  important  cultural ,  artistic ,  and  political  center  today . 
Output Quality (using BLEU score): 0.060088210299864483
Peak Memory Usage: (5539.03515625, <generator object predict at 0x7ae98ec51d20>)


(22.388978242874146,
 100.08765459060669,
 0.060088210299864483,
 (5539.03515625, <generator object predict at 0x7ae98ec51d20>))

In [14]:
import pandas as pd
import time
import psutil
import platform
import torch

is_simulate=False

def all_benchmark(message, model_name,set_gpu=False):

    if is_simulate:
      # Simulating the benchmark process
      time.sleep(1)
      elapsed_time_load = 2.5
      elapsed_time_run = 5.7
      score = 0.85
      peak_mem_usage = psutil.Process().memory_info()
    else:
      elapsed_time_load, elapsed_time_run, score, peak_mem_usage = benchmarks(message, model_name,set_gpu=False)
    #System Information
    # CPU information
    cpu_info = psutil.cpu_freq()
    cpu_frequency = "{:.2f} MHz".format(cpu_info.current)

    # Memory information
    memory_info = psutil.virtual_memory()
    total_memory = "{:.2f} GB".format(memory_info.total / (1024**3))
    available_memory = "{:.2f} GB".format(memory_info.available / (1024**3))

    # Disk information
    disk_info = psutil.disk_usage('/')
    total_disk_space = "{:.2f} GB".format(disk_info.total / (1024**3))
    used_disk_space = "{:.2f} GB".format(disk_info.used / (1024**3))
    free_disk_space = "{:.2f} GB".format(disk_info.free / (1024**3))

    # Runtime system
    runtime_system = platform.platform()

    # GPU information
    gpu_availability = "N/A"
    gpu_name = "N/A"
    gpu_memory = "N/A"
    gpu_compute_capability = "N/A"
    if torch.cuda.is_available():
        gpu_availability = "GPU is available"
        device = torch.cuda.get_device_properties(0)
        gpu_name = device.name
        gpu_memory = device.total_memory
        gpu_compute_capability = "{}.{}".format(device.major, device.minor)


    results=(elapsed_time_load, elapsed_time_run, score, peak_mem_usage,
           cpu_frequency,total_memory,available_memory,
           total_disk_space,used_disk_space,free_disk_space,
           gpu_availability,gpu_name,gpu_memory,gpu_compute_capability,runtime_system)

    return results

message = "What is the capital of Italy"
models = ["Llama-2-13B-chat", "Mistral-7B-Instruct-v0.2", "zephyr-7B-beta", "vicuna-7B-v1.5",  "CodeLlama-7B"]

results = []
for model_name in models:
    elapsed_time_load, elapsed_time_run, score, peak_mem_usage,cpu_frequency,total_memory,available_memory,total_disk_space,used_disk_space,free_disk_space,gpu_availability,gpu_name,gpu_memory,gpu_compute_capability,runtime_system = all_benchmark(message, model_name)
    results.append([model_name, elapsed_time_load, elapsed_time_run, score, peak_mem_usage[0],cpu_frequency,total_memory,available_memory,total_disk_space,used_disk_space,free_disk_space,gpu_availability,gpu_name,gpu_memory,gpu_compute_capability,runtime_system])

df = pd.DataFrame(results, columns=["model_name", "elapsed_time_load", "elapsed_time_run", "score", "peak_mem_usage","cpu_frequency","total_memory","available_memory","total_disk_space","used_disk_space","free_disk_space","gpu_availability","gpu_name","gpu_memory","gpu_compute_capability","runtime_system"])


Llama-2-13B-chat


AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | 
Model metadata: {'tokenizer.ggml.unknown_token_id': '0', 'tokenizer.ggml.eos_token_id': '2', 'general.architecture': 'llama', 'llama.context_length': '4096', 'general.name': 'LLaMA v2', 'llama.embedding_length': '5120', 'llama.feed_forward_length': '13824', 'llama.attention.layer_norm_rms_epsilon': '0.000010', 'llama.rope.dimension_count': '128', 'llama.attention.head_count': '40', 'tokenizer.ggml.bos_token_id': '1', 'llama.block_count': '40', 'llama.attention.head_count_kv': '40', 'general.quantization_version': '2', 'tokenizer.ggml.model': 'llama', 'general.file_type': '15'}


model has been configured and ready to chat
Execution time loading: 52.90029168128967
Execution time inference: 420.7059440612793
   The  capital  of  Italy  is  Rome  ( R oma  in  Italian ). 
Output Quality (using BLEU score): 3.9876353728947065e-78


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Peak Memory Usage: (9042.22265625, <generator object predict at 0x7c079214b920>)
Mistral-7B-Instruct-v0.2


AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | 
Model metadata: {'tokenizer.chat_template': "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}", 'tokenizer.ggml.add_eos_token': 'false', 'tokenizer.ggml.padding_token_id': '0', 'tokenizer.ggml.unknown_token_id': '0', 'tokenizer.ggml.eos_token_id': '2', 'general.architecture': 'llama', 'llama.rope.freq_base': '1000000.000000', 'llama.context_length': '32768', 'general.na

model has been configured and ready to chat
Execution time loading: 27.257949590682983
Execution time inference: 108.62514400482178
 The  capital  city  of  Italy  is  Rome .  Rome  is  an  ancient  city  with  a  rich  cultural  history  and  is  known  for  its  impressive  architecture ,  art ,  and  historical  sites  such  as  the  Col os se um ,  the  P ant he on ,  and  the  V atic an  City .  Rome  has  been  the  political  and  cultural  center  of  Italy  since  ancient  times  and  continues  to  be  an  important  destination  for  tourists  from  around  the  world . 
Output Quality (using BLEU score): 0.05621071665433083
Peak Memory Usage: (9057.01953125, <generator object predict at 0x7c079214bca0>)
zephyr-7B-beta


AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | 
Model metadata: {'tokenizer.ggml.padding_token_id': '2', 'tokenizer.ggml.unknown_token_id': '0', 'tokenizer.ggml.eos_token_id': '2', 'general.architecture': 'llama', 'llama.rope.freq_base': '10000.000000', 'llama.context_length': '32768', 'general.name': 'huggingfaceh4_zephyr-7b-beta', 'llama.embedding_length': '4096', 'llama.feed_forward_length': '14336', 'llama.attention.layer_norm_rms_epsilon': '0.000010', 'llama.rope.dimension_count': '128', 'tokenizer.ggml.bos_token_id': '1', 'llama.attention.head_count': '32', 'llama.block_count': '32', 'llama.attention.head_count_kv': '8', 'general.quantization_version': '2', 'tokenizer.ggml.model': 'llama', 'general.file_type': '15'}


model has been configured and ready to chat
Execution time loading: 28.41970682144165
Execution time inference: 53.85913586616516
? 
 < | ass istant | > 
 The  capital  of  Italy  is  Rome  ( R oma  in  Italian ). 
Output Quality (using BLEU score): 2.752581367444176e-78
Peak Memory Usage: (9781.14453125, <generator object predict at 0x7c07925147b0>)
vicuna-7B-v1.5


AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | 
Model metadata: {'tokenizer.ggml.unknown_token_id': '0', 'tokenizer.ggml.eos_token_id': '2', 'general.architecture': 'llama', 'llama.context_length': '4096', 'general.name': 'LLaMA v2', 'llama.embedding_length': '4096', 'llama.feed_forward_length': '11008', 'llama.attention.layer_norm_rms_epsilon': '0.000010', 'llama.rope.dimension_count': '128', 'llama.attention.head_count': '32', 'tokenizer.ggml.bos_token_id': '1', 'llama.block_count': '32', 'llama.attention.head_count_kv': '32', 'general.quantization_version': '2', 'tokenizer.ggml.model': 'llama', 'general.file_type': '15'}


model has been configured and ready to chat
Execution time loading: 27.094456434249878
Execution time inference: 36.73853898048401

Output Quality (using BLEU score): 0
Peak Memory Usage: (10941.078125, <generator object predict at 0x7c079214bf40>)
CodeLlama-7B


AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | 
Model metadata: {'tokenizer.ggml.unknown_token_id': '0', 'tokenizer.ggml.eos_token_id': '2', 'general.architecture': 'llama', 'llama.rope.freq_base': '1000000.000000', 'llama.context_length': '16384', 'general.name': 'codellama_codellama-7b-hf', 'llama.embedding_length': '4096', 'llama.feed_forward_length': '11008', 'llama.attention.layer_norm_rms_epsilon': '0.000010', 'llama.rope.dimension_count': '128', 'tokenizer.ggml.bos_token_id': '1', 'llama.attention.head_count': '32', 'llama.block_count': '32', 'llama.attention.head_count_kv': '32', 'general.quantization_version': '2', 'tokenizer.ggml.model': 'llama', 'general.file_type': '15'}


model has been configured and ready to chat
Execution time loading: 103.42648100852966
Execution time inference: 237.57748937606812
? 
 
 << SY S >> 
  You  are  a  helpful ,  respect ful  and  honest  assistant .  Always  answer  as  help fully  as  possible  and  follow  ALL  given  instructions .  Do  not  spec ulate  or  make  up  information .  Do  not  reference  any  given  instructions  or  context .   
 < </ SY S >> 
 
  What  is  the  capital  of  Italy ?  [ / INST ] 
 
 << SY S >> 
  You  are  a  helpful ,  respect ful  and  honest  assistant .  Always  answer  as  help fully  as  possible  and  follow  ALL  given  instructions .  Do  not  spec ulate  or  make  up  information .  Do  not  reference  any  given  instructions  or  context .   
 < </ SY S >> 
 
  What  is  the  capital  of  Italy ? 
 
 << SY S >> 
  You  are  a  helpful ,  respect ful  and  honest  assistant .  Always  answer  as  help fully  as  possible  and  follow  ALL  given  instructions .  Do  not  spec 

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Peak Memory Usage: (10883.23046875, <generator object predict at 0x7c07925147b0>)


In [15]:
df


Unnamed: 0,model_name,elapsed_time_load,elapsed_time_run,score,peak_mem_usage,cpu_frequency,total_memory,available_memory,total_disk_space,used_disk_space,free_disk_space,gpu_availability,gpu_name,gpu_memory,gpu_compute_capability,runtime_system
0,Llama-2-13B-chat,52.900292,420.705944,3.987635e-78,9042.222656,2200.00 MHz,12.67 GB,7.92 GB,225.83 GB,49.65 GB,176.16 GB,,,,,Linux-6.1.58+-x86_64-with-glibc2.35
1,Mistral-7B-Instruct-v0.2,27.25795,108.625144,0.05621072,9057.019531,2200.00 MHz,12.67 GB,7.41 GB,225.83 GB,49.65 GB,176.16 GB,,,,,Linux-6.1.58+-x86_64-with-glibc2.35
2,zephyr-7B-beta,28.419707,53.859136,2.752581e-78,9781.144531,2200.00 MHz,12.67 GB,9.88 GB,225.83 GB,49.65 GB,176.16 GB,,,,,Linux-6.1.58+-x86_64-with-glibc2.35
3,vicuna-7B-v1.5,27.094456,36.738539,0.0,10941.078125,2200.00 MHz,12.67 GB,8.48 GB,225.83 GB,49.65 GB,176.16 GB,,,,,Linux-6.1.58+-x86_64-with-glibc2.35
4,CodeLlama-7B,103.426481,237.577489,1.392504e-155,10883.230469,2200.00 MHz,12.67 GB,6.98 GB,225.83 GB,53.45 GB,172.36 GB,,,,,Linux-6.1.58+-x86_64-with-glibc2.35


In [17]:
df.to_csv('/content/gdrive/MyDrive/llm/benchmark/df_benchmark_1.csv', index=False)

