# GPU 확인

In [1]:
import torch

# Fail fast when no CUDA device is visible: the server is later started with
# GPU-offloaded layers, so running on a CPU-only runtime is pointless.
assert torch.cuda.is_available(), (
    "CUDA GPU not available; switch the Colab runtime type to GPU and re-run."
)

In [2]:
# Mount Google Drive under /content/drive so files stored on Drive are
# reachable from the Colab file system.
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Install text-generation web ui

In [3]:
# Work from /content; the relative paths in the following install cells are
# resolved against this directory.
%cd /content

/content


In [4]:
!git clone https://github.com/oobabooga/text-generation-webui.git

fatal: destination path 'text-generation-webui' already exists and is not an empty directory.


In [5]:
# Install the Python dependencies listed in the web UI's requirements file
# (path is relative to the current directory, /content).
!pip install -r ./text-generation-webui/requirements.txt

Ignoring exllamav2: markers 'platform_system != "Darwin" and platform_machine != "x86_64"' don't match your environment
Collecting git+https://github.com/oobabooga/torch-grammar.git (from -r ./text-generation-webui/requirements.txt (line 23))
  Cloning https://github.com/oobabooga/torch-grammar.git to /tmp/pip-req-build-4j3wrsjo
  Running command git clone --filter=blob:none --quiet https://github.com/oobabooga/torch-grammar.git /tmp/pip-req-build-4j3wrsjo
  Resolved https://github.com/oobabooga/torch-grammar.git to commit 82850b5383a629f3b0fa1fba7d8f2aba3185ddb2
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Ignoring bitsandbytes: markers 'platform_system == "Windows"' don't match your environment
Collecting llama-cpp-python==0.2.11 (from -r ./text-generation-webui/requirements.txt (line 30))
  Downloading https://github.com/abetlen/llama-cpp-python/releases/down

# Load the model file onto the Colab local drive

In [17]:
import os
from pathlib import Path
# Root of the text-generation-webui checkout.
web_ui_path = Path("/content/text-generation-webui/")
# File name used for the q8_0-quantized model.
model_file_name = "synatra-7b_q8_0.bin"
# Hugging Face repository that is cloned below to obtain the original weights.
repo_name = "maywell/Synatra-V0.1-7B-Instruct"
# Last component of the repository name; used as the model directory name.
model_name = repo_name.split('/')[-1]
# Location of the quantized model on the mounted Google Drive.
model_file_path = Path("/content/drive/MyDrive/자료") / model_file_name
# Location of the quantized model inside the web UI's models directory.
dest_model_path = web_ui_path / "models" / model_file_name

# If the quantized model is not on Google Drive yet: build llama.cpp, clone the
# original model, convert it to fp16, quantize it to q8_0 into the web UI's
# models directory, and copy the result to Google Drive.
# NOTE: llama.cpp is cloned relative to the current working directory, and the
# %cd calls below leave the working directory inside llama.cpp afterwards.
if not os.path.exists(model_file_path):
  # Fetch and build llama.cpp (built with LLAMA_CUBLAS=1), then install its
  # Python requirements.
  !git clone https://github.com/ggerganov/llama.cpp.git
  %cd llama.cpp
  !git pull && make clean && LLAMA_CUBLAS=1 make
  !pip install -r requirements.txt
  # Clone the original model repository into llama.cpp/models.
  %cd models
  !git clone https://huggingface.co/{repo_name}.git
  # Name of the intermediate fp16 file produced by convert.py.
  fp16 = f"{model_name}.gguf.fp16.bin"
  %cd ..
  !python convert.py models/{model_name} --outtype f16 --outfile models/{fp16} # convert to fp16
  # The cloned repository is no longer needed once the fp16 file exists.
  !rm -rf models/{model_name}
  !./quantize models/{fp16} {dest_model_path} q8_0 # convert to q8
  # Keep a copy on Google Drive so later sessions can skip this whole branch.
  !cp {dest_model_path} {model_file_path}
  # Remove the intermediate fp16 file.
  !rm models/{fp16}

In [30]:
assert os.path.exists(model_file_path), model_file_path
if not os.path.exists(dest_model_path):
  !cp {model_file_path} {dest_model_path}
  print("model file has been loaded")
else:
  print("model file was loaded already")
%cd {web_ui_path}

model file was loaded already
/content/text-generation-webui


# LLM Local Model Configuration

https://github.com/oobabooga/text-generation-webui/blob/main/settings-template.yaml

In [31]:
# Overrides that are merged on top of the web UI's settings template.
custom_config = {
    # Number of new tokens to generate.
    "max_new_tokens": 1024,
    # Same value as the n_ctx passed to the server at launch.
    "truncation_length": 5120,
    # Name of the instruction template to use.
    "instruction_template": "Mistral",
}

In [32]:
import yaml
from pprint import pprint

# Settings template shipped with text-generation-webui; used as the base.
config_template_file_name = "settings-template.yaml"
config_template_path = web_ui_path / config_template_file_name
# The server is launched from web_ui_path without --settings; according to the
# text-generation-webui docs a settings.yaml in that directory is loaded by
# default, so the merged settings are saved there to actually take effect.
config_file_path = web_ui_path / "settings.yaml"

with open(config_template_path, 'r', encoding="utf-8") as f:
  # safe_load returns None for an empty document; fall back to an empty dict.
  config = yaml.safe_load(f) or {}

# Entries of custom_config take precedence over the template defaults.
config = {**config, **custom_config}

# Persist the merged settings; previously they were only printed and never
# reached the server.
with open(config_file_path, 'w', encoding="utf-8") as f:
  yaml.safe_dump(config, f, allow_unicode=True, sort_keys=False)

pprint(config)

{'add_bos_token': True,
 'auto_max_new_tokens': False,
 'autoload_model': False,
 'ban_eos_token': False,
 'character': 'Assistant',
 'chat-instruct_command': 'Continue the chat dialogue below. Write a single '
                          'reply for the character "<|character|>".\n'
                          '\n'
                          '<|prompt|>',
 'chat_style': 'cai-chat',
 'custom_stopping_strings': '',
 'custom_token_bans': '',
 'dark_theme': True,
 'default_extensions': ['gallery'],
 'instruction_template': 'Mistral',
 'max_new_tokens': 1024,
 'max_new_tokens_max': 4096,
 'max_new_tokens_min': 1,
 'max_tokens_second': 0,
 'mode': 'chat',
 'name1': 'You',
 'negative_prompt': '',
 'preset': 'simple-1',
 'prompt-default': 'QA',
 'prompt-notebook': 'QA',
 'seed': -1,
 'show_controls': True,
 'skip_special_tokens': True,
 'start_with': '',
 'stream': True,
 'truncation_length': 5120,
 'truncation_length_max': 32768,
 'truncation_length_min': 0}


# API extension 설치

In [33]:
# Install the requirements of the web UI's API extension. The path is relative,
# so this relies on the working directory being the web UI checkout (set by the
# earlier `%cd {web_ui_path}`).
!pip install -r ./extensions/api/requirements.txt

Collecting flask_cloudflared==0.0.14 (from -r ./extensions/api/requirements.txt (line 1))
  Downloading flask_cloudflared-0.0.14-py3-none-any.whl (6.4 kB)
Collecting websockets==11.0.2 (from -r ./extensions/api/requirements.txt (line 2))
  Downloading websockets-11.0.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (129 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.9/129.9 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: websockets, flask_cloudflared
  Attempting uninstall: websockets
    Found existing installation: websockets 11.0.3
    Uninstalling websockets-11.0.3:
      Successfully uninstalled websockets-11.0.3
Successfully installed flask_cloudflared-0.0.14 websockets-11.0.2


# Server 구동

In [34]:
# Launch parameters interpolated into the server.py command line below.
n_ctx = 5120  # forwarded as --n_ctx
n_gpu_layers = 35  # forwarded as --n-gpu-layers

In [None]:
!python server.py --model {model_file_name} --loader llamacpp --n-gpu-layers {str(n_gpu_layers)} --n_ctx {str(n_ctx)} \
--api --public-api

2023-10-17 06:36:18 INFO:[32mLoading synatra-7b_q8_0.bin...[0m
2023-10-17 06:36:18 INFO:[32mllama.cpp weights detected: models/synatra-7b_q8_0.bin[0m
2023-10-17 06:36:18 INFO:[32mCache capacity is 0 bytes[0m
ggml_init_cublas: found 1 CUDA devices:
  Device 0: Tesla T4, compute capability 7.5
llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from models/synatra-7b_q8_0.bin (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q8_0     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:              blk.0.attn_q.weight q8_0     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    2:              blk.0.attn_k.weight q8_0     [  4096,  1024,     1,     1 ]
llama_model_loader: - tensor    3:              blk.0.attn_v.weight q8_0     [  4096,  1024,     1,     1 ]
llama_model_loader: - tensor    4:         blk.0.attn_output.weight q8_0     [  4096,  4096,     1,     1 ]
llama_model_loader: - ten

# Colab-SSH 설치 및 실행

In [None]:
# !pip install colab-ssh

In [None]:
# from colab_ssh import launch_ssh
# launch_ssh(NGROK_TOKEN, PASSWORD)

In [None]:
# import numpy as np

# I = np.eye(3)

# while True:
#   I = I@I