# Install packages

In [None]:
!git clone https://github.com/ggerganov/llama.cpp
!cd llama.cpp && git pull && make clean && LLAMA_CUBLAS=1 make
!pip install -r llama.cpp/requirements.txt

Cloning into 'llama.cpp'...
remote: Enumerating objects: 9308, done.[K
remote: Counting objects: 100% (3703/3703), done.[K
remote: Compressing objects: 100% (498/498), done.[K
remote: Total 9308 (delta 3430), reused 3356 (delta 3205), pack-reused 5605[K
Receiving objects: 100% (9308/9308), 8.48 MiB | 16.95 MiB/s, done.
Resolving deltas: 100% (6417/6417), done.
Already up to date.
I llama.cpp build info: 
I UNAME_S:   Linux
I UNAME_P:   x86_64
I UNAME_M:   x86_64
I CFLAGS:    -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_K_QUANTS  -std=c11   -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Wno-unused-function -pthread -march=native -mtune=native 
I CXXFLAGS:  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_K_QUANTS  -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wmissing-declarations -Wno-unused-function -Wno-multichar -Wno-form

In [None]:
# Hugging Face Hub repository id ("<owner>/<model>") of the model to convert.
HF_MODEL_PATH = "/".join(
    ("VietnamAIHub", "Vietnamese_llama2_7B_8K_SFT_General_domain")
)

In [None]:
# Download model
!git lfs install
!git clone https://huggingface.co/{HF_MODEL_PATH}

Git LFS initialized.
Cloning into 'Vietnamese_llama2_7B_8K_SFT_General_domain'...
remote: Enumerating objects: 21, done.[K
remote: Counting objects: 100% (17/17), done.[K
remote: Compressing objects: 100% (16/16), done.[K
remote: Total 21 (delta 3), reused 0 (delta 0), pack-reused 4[K
Unpacking objects: 100% (21/21), 6.38 KiB | 1.06 MiB/s, done.
Filtering content: 100% (3/3), 4.55 GiB | 8.30 MiB/s, done.
Encountered 1 file(s) that may not have been copied correctly on Windows:
	pytorch_model-00001-of-00002.bin

See: `git lfs help smudge` for more details.


# Quantize model

In [None]:
MODEL_NAME = HF_MODEL_PATH.split('/')[-1]
GGML_VERSION = "gguf"

# Convert to fp16
fp16 = f"{MODEL_NAME}/{MODEL_NAME.lower()}.{GGML_VERSION}.fp16.bin"
!python llama.cpp/convert.py {MODEL_NAME} --outtype f16 --outfile {fp16}

Loading model file Vietnamese_llama2_7B_8K_SFT_General_domain/pytorch_model-00001-of-00002.bin
Loading model file Vietnamese_llama2_7B_8K_SFT_General_domain/pytorch_model-00001-of-00002.bin
Loading model file Vietnamese_llama2_7B_8K_SFT_General_domain/pytorch_model-00002-of-00002.bin
params = Params(n_vocab=32000, n_embd=4096, n_layer=32, n_ctx=8192, n_ff=11008, n_head=32, n_head_kv=32, f_norm_eps=1e-05, f_rope_freq_base=None, f_rope_scale=2.0, ftype=<GGMLFileType.MostlyF16: 1>, path_model=PosixPath('Vietnamese_llama2_7B_8K_SFT_General_domain'))
Loading vocab file 'Vietnamese_llama2_7B_8K_SFT_General_domain/tokenizer.model', type 'spm'
Permuting layer 0
Permuting layer 1
Permuting layer 2
Permuting layer 3
Permuting layer 4
Permuting layer 5
Permuting layer 6
Permuting layer 7
Permuting layer 8
Permuting layer 9
Permuting layer 10
Permuting layer 11
Permuting layer 12
Permuting layer 13
Permuting layer 14
Permuting layer 15
Permuting layer 16
Permuting layer 17
Permuting layer 18
Permu

In [None]:
QUANTIZATION_METHODS = ["q4_k_m", "q5_k_m"]

for method in QUANTIZATION_METHODS:
    qtype = f"{MODEL_NAME}/{MODEL_NAME.lower()}.{GGML_VERSION}.{method}.bin"
    !./llama.cpp/quantize {fp16} {qtype} {method}

ggml_init_cublas: found 1 CUDA devices:
  Device 0: Tesla T4, compute capability 7.5
main: build = 1279 (e519621)
main: built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
main: quantizing 'Vietnamese_llama2_7B_8K_SFT_General_domain/vietnamese_llama2_7b_8k_sft_general_domain.gguf.fp16.bin' to 'Vietnamese_llama2_7B_8K_SFT_General_domain/vietnamese_llama2_7b_8k_sft_general_domain.gguf.q4_k_m.bin' as Q4_K_M
llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from Vietnamese_llama2_7B_8K_SFT_General_domain/vietnamese_llama2_7b_8k_sft_general_domain.gguf.fp16.bin (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight f16      [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:              blk.0.attn_q.weight f16      [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    2:              blk.0.attn_k.weight f16      [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    3:    

# Testing

In [None]:
import os

model_list = [file for file in os.listdir(MODEL_NAME) if GGML_VERSION in file]
prompt = input("Enter your prompt: ")
chosen_method = input("Please specify the quantization method to run the model (options: " + ", ".join(model_list) + "): ")

# Verify the chosen method is in the list
if chosen_method not in model_list:
    print("Invalid method chosen!")
else:
    !./llama.cpp/main -m {qtype} -n 512 --color -ngl 35 -p "{prompt}"

Enter your prompt: a
Please specify the quantization method to run the model (options: vi-llama2-qlora.gguf.fp16.bin, vi-llama2-qlora.gguf.q5_k_m.bin, vi-llama2-qlora.gguf.q4_k_m.bin): vi-llama2-qlora.gguf.q4_k_m.bin
vi-llama2-qlora/vi-llama2-qlora.gguf.vi-llama2-qlora.gguf.q5_k_m.bin.bin
Log start
main: build = 1267 (bc9d3e3)
main: built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
main: seed  = 1695358865
ggml_init_cublas: found 1 CUDA devices:
  Device 0: Tesla T4, compute capability 7.5
error loading model: failed to open vi-llama2-qlora/vi-llama2-qlora.gguf.vi-llama2-qlora.gguf.q5_k_m.bin.bin: No such file or directory
llama_load_model_from_file: failed to load model
llama_init_from_gpt_params: error: failed to load model 'vi-llama2-qlora/vi-llama2-qlora.gguf.vi-llama2-qlora.gguf.q5_k_m.bin.bin'
main: error: unable to load model


In [None]:
# User question sent to the model.
prompts_input = "Ai là phó goat, Ronaldo de lima hay Ronaldo Cris"
# Full prompt: system instructions between <<SYS>> and <</SYS>>, followed by the
# user question, all wrapped in [INST] ... [/INST].
# Built from adjacent string literals instead of backslash continuations inside
# one literal: the continuations dropped the space at two line joins, producing
# "ensurethat" and "nature.If" in the rendered prompt.
system_prompt = (
    "<s>[INST] <<SYS>>\n You are a helpful assistant, respectful and honest assistant. "
    "Always answer as helpfully as possible, while being safe. "
    "Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. "
    "Please ensure that your responses are socially unbiased and positive in nature. "
    "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not "
    "correct. If you don't know the answer to a question, please response as language model you are not able to respone detailed to these kind of question."
    f"\n<</SYS>>\n\n {prompts_input} [/INST] "
)

In [None]:
qtype = "/content/Vietnamese_llama2_7B_8K_SFT_General_domain/vietnamese_llama2_7b_8k_sft_general_domain.gguf.q5_k_m.bin"
!./llama.cpp/main -m {qtype} -n 2048 --color -ngl 35 -p "{system_prompt}"

Log start
main: build = 1279 (e519621)
main: built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
main: seed  = 1695899192
ggml_init_cublas: found 1 CUDA devices:
  Device 0: Tesla T4, compute capability 7.5
llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from /content/Vietnamese_llama2_7B_8K_SFT_General_domain/vietnamese_llama2_7b_8k_sft_general_domain.gguf.q5_k_m.bin (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q5_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:              blk.0.attn_q.weight q5_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    2:              blk.0.attn_k.weight q5_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    3:              blk.0.attn_v.weight q6_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    4:         blk.0.attn_output.weight q5_K     [  4096,  4096,     1,     1 ]
llama_model_

# Python binding

In [None]:
!pip install -q huggingface_hub
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python

Collecting llama-cpp-python
  Downloading llama_cpp_python-0.2.7.tar.gz (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting diskcache>=5.6.1 (from llama-cpp-python)
  Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python (pyproject.toml) ... [?25l[?25hdone
  Created wheel for llama-cpp-python: filename=llama_cpp_python-0.2.7-cp310-cp310-manylinux_2_35_x86_64.whl size=6276865 sha256=4b3ff0fec0e7ad1d4d501185a20356b30759c6a79d56e25f012326f5b9bf8004
  Stored i

In [None]:
# Absolute path of the q5_k_m quantized model file.
GGML_MODEL_PATH = (
    "/content/Vietnamese_llama2_7B_8K_SFT_General_domain/"
    "vietnamese_llama2_7b_8k_sft_general_domain.gguf.q5_k_m.bin"
)

In [None]:
# User question sent to the model.
prompts_input = "Ai là phó goat, Ronaldo de lima hay Ronaldo Cris"
# Full prompt: system instructions between <<SYS>> and <</SYS>>, followed by the
# user question, all wrapped in [INST] ... [/INST].
# Built from adjacent string literals instead of backslash continuations inside
# one literal: the continuations dropped the space at two line joins, producing
# "ensurethat" and "nature.If" in the rendered prompt.
system_prompt = (
    "<s>[INST] <<SYS>>\n You are a helpful assistant, respectful and honest assistant. "
    "Always answer as helpfully as possible, while being safe. "
    "Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. "
    "Please ensure that your responses are socially unbiased and positive in nature. "
    "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not "
    "correct. If you don't know the answer to a question, please response as language model you are not able to respone detailed to these kind of question."
    f"\n<</SYS>>\n\n {prompts_input} [/INST] "
)

In [None]:
from llama_cpp import Llama

# Load the q4_k_m quantized file through the llama-cpp-python binding.
GGML_MODEL_PATH = (
    "/content/Vietnamese_llama2_7B_8K_SFT_General_domain/"
    "vietnamese_llama2_7b_8k_sft_general_domain.gguf.q4_k_m.bin"
)
# NOTE(review): n_gpu_layers=128 is larger than the n_layer=32 reported in the
# conversion log, so this presumably requests offloading every layer — confirm.
model = Llama(
    model_path=GGML_MODEL_PATH,
    n_ctx=2048,
    n_threads=4,
    seed=42,
    n_gpu_layers=128,
)

AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | 


In [None]:
# Print the fully formatted prompt so its line breaks and spacing can be inspected.
print(system_prompt)

<s>[INST] <<SYS>>
 You are a helpful assistant, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensurethat your responses are socially unbiased and positive in nature.If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please response as language model you are not able to respone detailed to these kind of question.
<</SYS>>

 Ai là phó goat, Ronaldo de lima hay Ronaldo Cris [/INST] 


In [None]:
# Generate a completion for the formatted prompt and print the text of the
# first choice. With echo=True the returned text begins with the prompt itself,
# as the recorded output shows.
completion = model(
    system_prompt,
    max_tokens=1024,
    temperature=0.8,
    repeat_penalty=1.1,
    echo=True,
)
print(completion["choices"][0]["text"])

Llama.generate: prefix-match hit


<s>[INST] <<SYS>>
 You are a helpful assistant, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensurethat your responses are socially unbiased and positive in nature.If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please response as language model you are not able to respone detailed to these kind of question.
<</SYS>>

 Ai là phó goat, Ronaldo de lima hay Ronaldo Cris [/INST]  Ronaldo Lima - còn được gọi là Ronaldinho hay Ronaldinho Luis de Oliveira - sinh ngày 22 tháng 9 năm 1980, là một cựu cầu thủ bóng đá chuyên nghiệp người Brazil. Từ năm 1998 đến năm 2013, anh đã thi đấu cho một số câu lạc bộ tại châu Âu và Mỹ. Trong hơn mười năm qua, Ronaldo là một trong những tiền đạo hàng đầu thế giới. Nổi tiếng với 