# GPU 확인

In [None]:
import torch

# Fail fast when the runtime has no CUDA device: later cells build with cuBLAS
# and launch the server with GPU-offloaded layers.
assert torch.cuda.is_available(), (
    "No CUDA GPU detected. In Colab choose Runtime > Change runtime type, "
    "select a GPU accelerator, then rerun this notebook."
)

In [None]:
# Mount Google Drive at /content/drive so the quantized model file can be read
# from, and backed up to, the Drive folder configured in `backup_path`.
from google.colab import drive

drive.mount('/content/drive')

# Install text-generation-webui

In [None]:
# Work from /content so the repositories cloned by the following cells land there.
%cd /content

In [None]:
# Force a from-source build of llama-cpp-python with the cuBLAS option turned on.
# NOTE(review): newer llama-cpp-python releases may expect a different CMake
# flag name for CUDA builds — confirm against the version pip resolves.
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python

In [None]:
# Fetch the text-generation-webui sources into ./text-generation-webui.
!git clone https://github.com/oobabooga/text-generation-webui.git

In [None]:
# Install the Python dependencies listed by the web UI checkout.
!pip install -r ./text-generation-webui/requirements.txt

# Load the model file onto the Colab local drive

In [None]:
# User settings — edit these to match your setup.

# Hugging Face repository id ("owner/name") of the model to download and quantize.
repo_name = "davidkim205/komt-mistral-7b-v1"
# Google Drive folder where the quantized model file is stored and looked up.
backup_path = "/content/drive/MyDrive/자료"
# Quantization type handed to llama.cpp's quantize tool.
qtype = "q8_0"


In [None]:
import os
from pathlib import Path
web_ui_path = Path("/content/text-generation-webui/")
model_file_name = repo_name.split("/")[-1]+"_"+qtype+".bin"
model_name = repo_name.split('/')[-1]
model_file_path = Path(backup_path) / model_file_name
dest_model_path = web_ui_path / "models" / model_file_name

# if there is no model file in google drive, download, quantize, copy and paste.
if not os.path.exists(model_file_path):
  !git clone https://github.com/ggerganov/llama.cpp.git
  %cd llama.cpp
  !git pull && make clean && LLAMA_CUBLAS=1 make
  !pip install -r requirements.txt
  %cd models
  !git clone https://huggingface.co/{repo_name}.git
  fp16 = f"{model_name}.gguf.fp16.bin"
  %cd ..
  !python convert.py models/{model_name} --outtype f16 --outfile models/{fp16} # convert to fp16
  !rm -rf models/{model_name}
  !./quantize models/{fp16} {dest_model_path} {qtype} # convert to q8
  !cp {dest_model_path} {model_file_path}
  !rm models/{fp16}

In [None]:
assert os.path.exists(model_file_path), model_file_path
if not os.path.exists(dest_model_path):
  !cp {model_file_path} {dest_model_path}
  print("model file has been loaded")
else:
  print("model file was loaded already")
%cd {web_ui_path}

# LLM Local Model Configuration

https://github.com/oobabooga/text-generation-webui/blob/main/settings-template.yaml

In [None]:
# Overrides merged on top of the values read from the web UI settings template.
# truncation_length is the same value as the n_ctx used when starting the server.
custom_config = dict(
    max_new_tokens=1024,
    truncation_length=5120,
    instruction_template="Mistral",
)

In [None]:
import yaml
from pprint import pprint

config_template_file_name = "settings-template.yaml"
config_template_path = web_ui_path / config_template_file_name
# Destination for the merged settings.
# NOTE(review): assumes this web UI version loads `settings.yaml` from its root
# directory at startup — confirm, or pass `--settings` to server.py explicitly.
config_path = web_ui_path / "settings.yaml"

with open(config_template_path, 'r', encoding="utf-8") as f:
  # An empty template parses to None; fall back to an empty mapping.
  config = yaml.safe_load(f) or {}

# Values in custom_config take precedence over the template's values.
config = {**config, **custom_config}
pprint(config)

# Persist the merged settings so the server can load them; printing alone does
# not change what the server uses.
with open(config_path, 'w', encoding="utf-8") as f:
  yaml.safe_dump(config, f, allow_unicode=True, sort_keys=False)

# API extension 설치

In [None]:
# Install the dependencies of the `api` and `openai` extensions.
# The paths are relative, so this relies on the working directory being the
# web UI checkout (set by the earlier `%cd {web_ui_path}`).
!pip install -r ./extensions/api/requirements.txt
!pip install -r ./extensions/openai/requirements.txt

# Server 구동

In [None]:
# Value passed to server.py as --n-gpu-layers.
n_gpu_layers = 35
# Value passed to server.py as --n_ctx; same number as
# custom_config["truncation_length"].
n_ctx = 5120

In [None]:
!python server.py --model {model_file_name} --loader llamacpp --n-gpu-layers {str(n_gpu_layers)} --n_ctx {str(n_ctx)} \
 --public-api --share

# Colab-SSH 설치 및 실행

In [None]:
# !pip install colab-ssh

In [None]:
# from colab_ssh import launch_ssh
# launch_ssh(NGROK_TOKEN, PASSWORD)

In [None]:
# import numpy as np

# I = np.eye(3)

# while True:
#   I = I@I