<a href="https://colab.research.google.com/github/ruslanmv/Open-Source-LLM-Chatbot/blob/master/Benchmarks_Open_Source_LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Colab Pro notebook from https://github.com/ruslanmv/Open-Source-LLM-Chatbot**

# Benchmark Open Source LLM **Chatbot**


In [1]:
#@markdown # Connect Google Drive
from google.colab import drive
from IPython.display import clear_output
import ipywidgets as widgets
import os

def inf(msg, style, wdth): inf = widgets.Button(description=msg, disabled=True, button_style=style, layout=widgets.Layout(min_width=wdth));display(inf)
Shared_Drive = "" #@param {type:"string"}
#@markdown - Leave empty if you're not using a shared drive

print("[0;33mConnecting...")
drive.mount('/content/gdrive')

if Shared_Drive!="" and os.path.exists("/content/gdrive/Shareddrives"):
  mainpth="Shareddrives/"+Shared_Drive
else:
  mainpth="MyDrive"

clear_output()
inf('\u2714 Done','success', '50px')

#@markdown ---

Button(button_style='success', description='✔ Done', disabled=True, layout=Layout(min_width='50px'), style=But…

In [2]:
#@markdown # Install/Update Open Source LLM Chatbot repo
from IPython.utils import capture
from IPython.display import clear_output
from subprocess import getoutput
import ipywidgets as widgets
import sys
import fileinput
import os
import time
import base64
import gdown
from gdown.download import get_url_from_gdrive_confirmation
import requests
from urllib.request import urlopen, Request
from urllib.parse import urlparse, parse_qs, unquote
from tqdm import tqdm
import six
if not os.path.exists("/content/gdrive"):
  print('[1;31mGdrive not connected, using temporary colab storage ...')
  time.sleep(4)
  mainpth="MyDrive"
  !mkdir -p /content/gdrive/$mainpth
  Shared_Drive=""

if Shared_Drive!="" and not os.path.exists("/content/gdrive/Shareddrives"):
  print('[1;31mShared drive not detected, using default MyDrive')
  mainpth="MyDrive"

with capture.capture_output() as cap:
  def inf(msg, style, wdth): inf = widgets.Button(description=msg, disabled=True, button_style=style, layout=widgets.Layout(min_width=wdth));display(inf)
  fgitclone = "git clone --depth 1"
  %mkdir -p /content/gdrive/$mainpth/llm
  %cd /content/gdrive/$mainpth/llm

  !git clone -q --branch master https://github.com/ruslanmv/Open-Source-LLM-Chatbot.git /content/gdrive/$mainpth/llm/chatbot/
  !git fetch
  !git pull
  !mkdir -p /content/gdrive/$mainpth/llm/chatbot/cache/
  !pip install -r /content/gdrive/$mainpth/llm/chatbot/requirements.txt
  os.environ['TRANSFORMERS_CACHE']=f"/content/gdrive/{mainpth}/llm/chatbot/cache"
  os.environ['TORCH_HOME'] = f"/content/gdrive/{mainpth}/llm/chatbot/cache"

with capture.capture_output() as cap:
  %cd /content/gdrive/$mainpth/llm/chatbot/
clear_output()
inf('\u2714 Done','success', '50px')

#@markdown ---

Button(button_style='success', description='✔ Done', disabled=True, layout=Layout(min_width='50px'), style=But…

In [4]:
#@markdown # Model to Download/Load and Benchmark
from huggingface_hub import hf_hub_download
Use_Temp_Storage = False #@param {type:"boolean"}
#@markdown - If not, make sure you have enough space on your gdrive

#@markdown ---

Model_Version = "Mistral-7B-Instruct-v0.2" #@param ["Mistral-7B-Instruct-v0.2", "CodeLlama-7B", "Llama-2-13B-chat", "Falcon-7B-Instruct","zephyr-7B-beta","vicuna-7B-v1.5"]

if Use_Temp_Storage:
   MODELS_PATH = "./models"
else:
    MODELS_PATH ='/content/gdrive/'+mainpth+'/llm/chatbot'+'/models/'

def llmdl(ver, Use_Temp_Storage):

  repo_id=''
  if(ver=="Llama-2-13B-chat"):
        repo_id="TheBloke/Llama-2-13B-chat-GGUF"
        filename="llama-2-13b-chat.Q4_K_M.gguf"
  elif(ver=="Mistral-7B-Instruct-v0.2") :
        repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF"
        filename="mistral-7b-instruct-v0.2.Q4_K_M.gguf"
  elif(ver=="zephyr-7B-beta"):
        repo_id="TheBloke/zephyr-7B-beta-GGUF"
        filename="zephyr-7b-beta.Q4_K_M.gguf"
  elif(ver=="vicuna-7B-v1.5"):
        repo_id="TheBloke/vicuna-7B-v1.5-GGUF"
        filename="vicuna-7b-v1.5.Q4_K_M.gguf"
  elif(ver=="Falcon-7B-Instruct"):
        repo_id="TheBloke/Falcon-7B-Instruct-GGML"
        filename="falcon-7b-instruct.ggccv1.q4_1.bin"
  elif(ver=="CodeLlama-7B"):
        repo_id="TheBloke/CodeLlama-7B-GGUF"
        filename="codellama-7b.Q4_K_M.gguf"
  if Use_Temp_Storage:
      os.makedirs('/content/models', exist_ok=True)
      model='/content/models/'+ver
  else:
      model=MODELS_PATH+ver
  link=repo_id
  if not os.path.exists(model):
    model_path = hf_hub_download(
    repo_id= repo_id,
    filename= filename,
    resume_download=True,
    cache_dir=MODELS_PATH,)

    if os.path.exists(model_path):
      clear_output()
      inf('\u2714 Done','success', '50px')
    else:
      inf('\u2718 Something went wrong, try again','danger', "250px")
  else:
      clear_output()
      inf('\u2714 Model already exists','primary', '300px')

  return model

inf('\u2718 Downloading files','success', "400px")
PATH_to_MODEL=llmdl(Model_Version, Use_Temp_Storage)

if os.path.exists(str(PATH_to_MODEL)):
  inf('\u2714 Using the custom model.','success', '200px')
  model=PATH_to_MODEL

#@markdown ---

Button(button_style='success', description='✔ Done', disabled=True, layout=Layout(min_width='50px'), style=But…

In [8]:
#@markdown # Start Benchmark
from IPython.utils import capture
import time
import sys
import fileinput
from pyngrok import ngrok, conf
import re

Use_Cloudflare_Tunnel = False #@param {type:"boolean"}
#@markdown - Offers better gradio responsivity

Ngrok_token = "" #@param {type:"string"}

#@markdown - Input your ngrok token if you want to use ngrok server

User = "" #@param {type:"string"}
Password= "" #@param {type:"string"}
#@markdown - Add credentials to your Gradio interface (optional)

auth=f"--gradio-auth {User}:{Password}"
if User =="" or Password=="":
  auth=""


#with capture.capture_output() as cap:
#  %cd /content/gdrive/$mainpth/llm/chatbot/modules/

share=''
if Ngrok_token!="":
  ngrok.kill()
  srv=ngrok.connect(7860, pyngrok_config=conf.PyngrokConfig(auth_token=Ngrok_token) , bind_tls=True).public_url

  for line in fileinput.input('/usr/local/lib/python3.10/dist-packages/gradio/blocks.py', inplace=True):
    if line.strip().startswith('self.server_name ='):
        line = f'            self.server_name = "{srv[8:]}"\n'
    if line.strip().startswith('self.protocol = "https"'):
        line = '            self.protocol = "https"\n'
    if line.strip().startswith('if self.local_url.startswith("https") or self.is_colab'):
        line = ''
    if line.strip().startswith('else "http"'):
        line = ''
    sys.stdout.write(line)

elif Use_Cloudflare_Tunnel:
  with capture.capture_output() as cap:
    !pkill cloudflared
    time.sleep(4)
    !nohup cloudflared tunnel --url http://localhost:7860 > /content/srv.txt 2>&1 &
    time.sleep(4)
    with open('/content/srv.txt', "r") as file: text = file.read()
    srv= re.findall(r"https?://(?:\S+?\.)?trycloudflare\.com\S*", text)[0]

    for line in fileinput.input('/usr/local/lib/python3.10/dist-packages/gradio/blocks.py', inplace=True):
      if line.strip().startswith('self.server_name ='):
          line = f'            self.server_name = "{srv[8:]}"\n'
      if line.strip().startswith('self.protocol = "https"'):
          line = '            self.protocol = "https"\n'
      if line.strip().startswith('if self.local_url.startswith("https") or self.is_colab'):
          line = ''
      if line.strip().startswith('else "http"'):
          line = ''
      sys.stdout.write(line)

    !rm /content/srv.txt

else:
  share='--share'


#@markdown ---

# Loading Single Model

In [10]:
print(MODELS_PATH)

/content/gdrive/MyDrive/llm/chatbot/models/


In [38]:
from huggingface_hub import hf_hub_download
import logging
import sys
import gradio as gr
from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt
from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import (
    messages_to_prompt,
    completion_to_prompt,
)

from huggingface_hub import hf_hub_download
from llama_index.llms import LlamaCPP

#MODELS_PATH = "./models"

mistral_model_path = hf_hub_download(
    repo_id= "TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
    filename="mistral-7b-instruct-v0.2.Q4_K_M.gguf",
    resume_download=True,
    cache_dir=MODELS_PATH,
)

llm = LlamaCPP(
    model_path=mistral_model_path,
    temperature=0.1,
    max_new_tokens=256,
    context_window=3900,
    model_kwargs={"n_gpu_layers": -1},
    verbose=True,
)

def predict(message):
    response = llm.stream_complete(message)
    for bot_response in response:
        token = bot_response.delta
        yield token

def ask(message):
    answer = list(predict(message))
    print(' '.join(answer))


AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | 
Model metadata: {'tokenizer.chat_template': "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}", 'tokenizer.ggml.add_eos_token': 'false', 'tokenizer.ggml.padding_token_id': '0', 'tokenizer.ggml.unknown_token_id': '0', 'tokenizer.ggml.eos_token_id': '2', 'general.architecture': 'llama', 'llama.rope.freq_base': '1000000.000000', 'llama.context_length': '32768', 'general.na

In [39]:
ask("What is the capital of Italy")

"?  Rome  is  not  the  capital  city  of  Italy .  The  capital  city  of  Italy  is  actually  Rome ' s  political  and  administrative  rival ,  Milan ,  or  more  specifically ,  the  city  of  Rome  itself  is  the  capital  city  of  the  L az io  region ,  while  the  political  and  administrative  center  of  the  country  is  located  in  Milan  in  the  northern  region  of  L omb ard y .  This  mis con ception  likely  ar ises  due  to  Rome ' s  historical  significance  as  the  cr ad le  of  Western  civilization  and  its  status  as  a  major  tourist  destination ,  overs h adow ing  Milan ' s  role  as  the  nation ' s  capital . "

In [41]:
def get_llm(model):
    repo_id=""
    filename=""
    if(model=="Llama-2-13B-chat"):
      repo_id="TheBloke/Llama-2-13B-chat-GGUF"
      filename="llama-2-13b-chat.Q4_K_M.gguf"
    elif(model=="Mistral-7B-Instruct-v0.2") :
      repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF"
      filename="mistral-7b-instruct-v0.2.Q4_K_M.gguf"
    elif(model=="zephyr-7B-beta"):
      repo_id="TheBloke/zephyr-7B-beta-GGUF"
      filename="zephyr-7b-beta.Q4_K_M.gguf"
    elif(model=="vicuna-7B-v1.5"):
      repo_id="TheBloke/vicuna-7B-v1.5-GGUF"
      filename="vicuna-7b-v1.5.Q4_K_M.gguf"
    elif(model=="Falcon-7B-Instruct"):
      repo_id="TheBloke/Falcon-7B-Instruct-GGML"
      filename="falcon-7b-instruct.ggccv1.q4_1.bin"
    elif(model=="CodeLlama-7B"):
      repo_id="TheBloke/CodeLlama-7B-GGUF"
      filename="codellama-7b.Q4_K_M.gguf"
    else:
      print("please select at least one model")
    mistral_model_path = hf_hub_download(
    repo_id= repo_id,
    filename= filename,
    resume_download=True,
    cache_dir=MODELS_PATH,)
    llm = LlamaCPP(
    model_path=mistral_model_path,
    temperature=0.1,
    max_new_tokens=256,
    context_window=3900,
    generate_kwargs={},
    # set to at least 1 to use GPU
    model_kwargs={"n_gpu_layers": -1},
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)
    print("model has been configured and ready to chat")
    return llm

In [80]:
def predict(message,llm):
    response = llm.stream_complete(message)
    for bot_response in response:
        token = bot_response.delta
        yield token

def ask(message,model_name):
    llm=get_llm(model_name)
    answer = list(predict(message,llm))
    print(' '.join(answer))

In [45]:
message="What is the capital of Italy"
model_name="Mistral-7B-Instruct-v0.2"
ask(message,model_name)

AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | 
Model metadata: {'tokenizer.chat_template': "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}", 'tokenizer.ggml.add_eos_token': 'false', 'tokenizer.ggml.padding_token_id': '0', 'tokenizer.ggml.unknown_token_id': '0', 'tokenizer.ggml.eos_token_id': '2', 'general.architecture': 'llama', 'llama.rope.freq_base': '1000000.000000', 'llama.context_length': '32768', 'general.na

model has been configured and ready to chat
 The  capital  city  of  Italy  is  Rome .  Rome  is  an  ancient  city  with  a  rich  cultural  history  and  is  known  for  its  impressive  archae ological  sites ,  beautiful  architecture ,  and  vibr ant  culture .  It  has  been  the  political  and  cultural  center  of  Italy  for  many  centuries .  I ' m  here  to  help  with  any  questions  you  might  have ,  so  feel  free  to  ask  me  anything  else  you ' d  like  to  know ! 


In [50]:
import time
def benchmkark_time(message,model_name):
    start_time = time.time()
    llm_model=get_llm(model_name)
    end_time = time.time()
    elapsed_time = end_time - start_time
    print("Execution time loading:" ,elapsed_time)
    start_time = time.time()
    answer = list(predict(message,llm_model))
    end_time = time.time()
    elapsed_time = end_time - start_time
    print("Execution time inference:", elapsed_time)
    print(' '.join(answer))

In [51]:
message="What is the capital of Italy"
model_name="Mistral-7B-Instruct-v0.2"
benchmkark_time(message,model_name)

AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | 
Model metadata: {'tokenizer.chat_template': "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}", 'tokenizer.ggml.add_eos_token': 'false', 'tokenizer.ggml.padding_token_id': '0', 'tokenizer.ggml.unknown_token_id': '0', 'tokenizer.ggml.eos_token_id': '2', 'general.architecture': 'llama', 'llama.rope.freq_base': '1000000.000000', 'llama.context_length': '32768', 'general.na

model has been configured and ready to chat
Execution time loading: 25.78208065032959
Execution time inference: 94.06341600418091
 The  capital  city  of  Italy  is  Rome .  Rome  is  an  ancient  city  with  a  rich  cultural  history ,  famous  for  its  architect ural  marvel s  such  as  the  Col os se um ,  the  P ant he on ,  and  the  V atic an  City ,  which  is  the  headquarters  of  the  Roman  Catholic  Church .  Rome  has  been  an  influential  center  of  art ,  culture ,  and  politics  for  over  two  thousand  years . 


In [54]:
predicted_response=" The  capital  city  of  Italy  is  Rome .  Rome  is  an  ancient  city  with  a  rich  cultural  history ,  famous  for  its  architect ural  marvel s  such  as  the  Col os se um ,  the  P ant he on ,  and  the  V atic an  City ,  which  is  the  headquarters  of  the  Roman  Catholic  Church .  Rome  has  been  an  influential  center  of  art ,  culture ,  and  politics  for  over  two  thousand  years . "
reference_response="The  capital  city  of  Italy  is  Rome ."

In [53]:
from nltk.translate.bleu_score import sentence_bleu

In [55]:
def output_quality_bleu(predicted_response, reference_response):
    predicted_tokens = predicted_response.split()
    reference_tokens = reference_response.split()
    score = sentence_bleu([reference_tokens], predicted_tokens)
    return score

In [56]:
output_quality_bleu(predicted_response, reference_response)

0.08594487050311704

In [57]:
import psutil

def memory_usage():
    process = psutil.Process()
    mem_info = process.memory_info()
    return mem_info.rss

In [58]:
memory_usage()

1975603200

In [59]:
!pip install memory-profiler



Collecting memory-profiler
  Downloading memory_profiler-0.61.0-py3-none-any.whl (31 kB)
Installing collected packages: memory-profiler
Successfully installed memory-profiler-0.61.0


In [60]:
%load_ext memory_profiler

In [62]:
message="What is the capital of Italy"
model_name="Mistral-7B-Instruct-v0.2"
%memit ask(message,model_name)

AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | 
Model metadata: {'tokenizer.chat_template': "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}", 'tokenizer.ggml.add_eos_token': 'false', 'tokenizer.ggml.padding_token_id': '0', 'tokenizer.ggml.unknown_token_id': '0', 'tokenizer.ggml.eos_token_id': '2', 'general.architecture': 'llama', 'llama.rope.freq_base': '1000000.000000', 'llama.context_length': '32768', 'general.na

model has been configured and ready to chat
 The  capital  city  of  Italy  is  Rome . 
peak memory: 6550.66 MiB, increment: 4664.22 MiB


In [63]:
from memory_profiler import memory_usage
import time


In [64]:
def wrapper_predict(message, llm):
    list(predict(message, llm))

In [67]:
def wrapper_ask(message, model_name):
    ask(message, model_name)

In [69]:
message="What is the capital of Italy"
model_name="Mistral-7B-Instruct-v0.2"
start_time = time.time()
mem_usage = memory_usage((wrapper_ask, (message, model_name)), interval=0.1, max_usage=True)
end_time = time.time()
elapsed_time = end_time - start_time

AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | 
Model metadata: {'tokenizer.chat_template': "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}", 'tokenizer.ggml.add_eos_token': 'false', 'tokenizer.ggml.padding_token_id': '0', 'tokenizer.ggml.unknown_token_id': '0', 'tokenizer.ggml.eos_token_id': '2', 'general.architecture': 'llama', 'llama.rope.freq_base': '1000000.000000', 'llama.context_length': '32768', 'general.na

model has been configured and ready to chat
 The  capital  city  of  Italy  is  Rome .  Rome  is  an  ancient  city  with  a  rich  cultural  history  and  is  known  for  its  architect ural  marvel s  such  as  the  Col os se um  and  the  V atic an  City .  It  has  been  the  political  heart  of  Italy  since  ancient  times  and  continues  to  be  an  important  cultural ,  artistic ,  and  political  center  today . 


In [86]:
import time
def benchmkarks(message,model_name):
    print(model_name)
    start_time = time.time()
    llm_model=get_llm(model_name)
    end_time = time.time()
    elapsed_time_load = end_time - start_time
    print("Execution time loading:" ,elapsed_time_load)
    start_time = time.time()
    answer = list(predict(message,llm_model))
    end_time = time.time()
    elapsed_time_run = end_time - start_time
    print("Execution time inference:", elapsed_time_run)
    predicted_response=' '.join(answer)
    print(predicted_response)
    reference_response="The  capital  city  of  Italy  is  Rome."
    score=output_quality_bleu(predicted_response, reference_response)
    print("Output Quality (using BLEU score):",score)

    return elapsed_time_load, elapsed_time_run,score

In [87]:
message="What is the capital of Italy"
model_name="Mistral-7B-Instruct-v0.2"
benchmkarks(message,model_name)

Mistral-7B-Instruct-v0.2


AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | 
Model metadata: {'tokenizer.chat_template': "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}", 'tokenizer.ggml.add_eos_token': 'false', 'tokenizer.ggml.padding_token_id': '0', 'tokenizer.ggml.unknown_token_id': '0', 'tokenizer.ggml.eos_token_id': '2', 'general.architecture': 'llama', 'llama.rope.freq_base': '1000000.000000', 'llama.context_length': '32768', 'general.na

model has been configured and ready to chat
Execution time loading: 27.022007703781128
Execution time inference: 93.70565366744995
 The  capital  city  of  Italy  is  Rome .  Rome  is  an  ancient  city  with  a  rich  cultural  history ,  famous  for  its  architect ural  marvel s  such  as  the  Col os se um  and  the  P ant he on ,  as  well  as  its  artistic  and  historical  sites  like  the  V atic an  City .  It  has  been  an  influential  center  of  art ,  culture ,  and  politics  for  centuries . 
Output Quality (using BLEU score): 0.06268260360206092


(27.022007703781128, 93.70565366744995, 0.06268260360206092)

In [98]:
import time
from memory_profiler import memory_usage

def benchmarks(message, model_name):
    def wrapped_predict(msg, llm):
        return predict(msg, llm)
    print(model_name)
    start_time = time.time()
    llm_model = get_llm(model_name)
    end_time = time.time()
    elapsed_time_load = end_time - start_time
    print("Execution time loading:", elapsed_time_load)
    start_time = time.time()
    answer = list(predict(message,llm_model))
    end_time = time.time()
    elapsed_time_run = end_time - start_time
    print("Execution time inference:", elapsed_time_run)
    predicted_response = ' '.join(answer)
    print(predicted_response)
    reference_response = "The capital city of Italy is Rome."
    score = output_quality_bleu(predicted_response, reference_response)
    print("Output Quality (using BLEU score):", score)
    peak_mem_usage = memory_usage((wrapped_predict, (message, llm_model)), interval=0.1, max_usage=True, retval=True)
    print("Peak Memory Usage:",peak_mem_usage)
    return elapsed_time_load, elapsed_time_run, score, peak_mem_usage



In [99]:
message="What is the capital of Italy"
model_name="Mistral-7B-Instruct-v0.2"
benchmarks(message,model_name)

Mistral-7B-Instruct-v0.2


AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | 
Model metadata: {'tokenizer.chat_template': "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}", 'tokenizer.ggml.add_eos_token': 'false', 'tokenizer.ggml.padding_token_id': '0', 'tokenizer.ggml.unknown_token_id': '0', 'tokenizer.ggml.eos_token_id': '2', 'general.architecture': 'llama', 'llama.rope.freq_base': '1000000.000000', 'llama.context_length': '32768', 'general.na

model has been configured and ready to chat
Execution time loading: 25.448474645614624
Execution time inference: 43.61459922790527
 The  capital  city  of  Italy  is  Rome . 
Output Quality (using BLEU score): 0.6803749333171202
Peak Memory Usage: (8138.94921875, <generator object predict at 0x7fc5f67c2490>)


(25.448474645614624,
 43.61459922790527,
 0.6803749333171202,
 (8138.94921875, <generator object predict at 0x7fc5f67c2490>))