In [1]:
# Mount Google Drive at /content/drive so later cells can read files stored under MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# requirements

!pip install -r drive/MyDrive/requirements.txt

Collecting lightning@ git+https://github.com/Lightning-AI/lightning@master (from -r drive/MyDrive/requirements/chatbot.txt (line 2))
  Cloning https://github.com/Lightning-AI/lightning (to revision master) to /tmp/pip-install-ao4row8x/lightning_531e5e9e3b28438d992e880543273f20
  Running command git clone --filter=blob:none --quiet https://github.com/Lightning-AI/lightning /tmp/pip-install-ao4row8x/lightning_531e5e9e3b28438d992e880543273f20
  Resolved https://github.com/Lightning-AI/lightning to commit c5a731c3cdf1bc29cb9323511dda2528c3ab5835
  Running command git submodule update --init --recursive -q
  Encountered 22 file(s) that should have been pointers, but weren't:
        .notebooks/course_UvA-DL/01-introduction-to-pytorch.ipynb
        .notebooks/course_UvA-DL/02-activation-functions.ipynb
        .notebooks/course_UvA-DL/03-initialization-and-optimization.ipynb
        .notebooks/course_UvA-DL/04-inception-resnet-densenet.ipynb
        .notebooks/course_UvA-DL/05-transformers-a

In [3]:
# change dir

%cd drive/MyDrive/ChatBot

/content/drive/MyDrive/ChatBot


In [4]:
# external path

from pathlib import Path
import lightning as L
import torch

from lit_llama import LLaMA, Tokenizer
from lit_llama.utils import EmptyInitOnDevice

from fastapi import FastAPI
from fastapi.responses import JSONResponse
from pydantic import BaseModel

import nest_asyncio
from pyngrok import ngrok
import uvicorn

class ChatBot:
    """Single-turn Korean medical Q&A bot built on a LLaMA-style model.

    Each call to :meth:`ans` wraps the newest user message in a fixed prompt, samples a
    completion from the model and records the exchange in ``history``. Earlier turns are
    stored but are not fed back into the prompt.

    Attributes:
        model: Called as ``model(x, max_seq_length, input_pos)``; also provides
            ``config.block_size`` and ``reset_cache()``.
        tokenizer: Provides ``encode``, ``decode`` and ``eos_id``.
        fabric: Its ``device`` attribute is where the encoded prompt is placed.
        history: List of ``[user_message, response]`` pairs, oldest first. It grows by
            one entry per :meth:`ans` call and is never trimmed.
    """

    def __init__(self, model, tokenizer, fabric):
        self.model = model
        self.tokenizer = tokenizer
        self.fabric = fabric
        self.history = []

    def generate_prompt(self, example):
        """Build the prompt text for one example.

        Args:
            example: Mapping with the keys ``"instruction"`` and ``"input"``.

        Returns:
            The instruction/input template when ``example["input"]`` is truthy,
            otherwise the patient-inquiry template that uses only the instruction.
        """
        if example["input"]:
            return (
                "아래는 작업을 설명하는 명령어와 추가적 맥락을 제공하는 입력이 짝을 이루는 예제입니다.\n\n"
                "요청을 적절히 완료하는 응답을 작성하세요.\n\n"
                f"### 명령어:\n{example['instruction']}\n\n### 입력:\n{example['input']}\n\n### 응답:"
            )
        return (
            "환자가 의사에게 아픈 곳에 대해 문의합니다.\n\n"
            "환자의 문의 내용에 대해 답변하세요. 환자의 질병을 진단하고, 가능하면 처방을 하세요. \n\n"
            f"### 문의:\n{example['instruction']}\n\n### 응답:"
        )

    # default generation
    @torch.no_grad()
    def generate(
        self,
        idx,
        max_new_tokens,
        max_seq_length=None,
        temperature=0.8,
        top_k=None,
        eos_id=None,
        repetition_penalty=1.1,
        early_stopping=True,
    ):
        """Sample up to ``max_new_tokens`` tokens that continue the prompt ``idx``.

        Args:
            idx: 1-D tensor of prompt token ids.
            max_new_tokens: Maximum number of tokens to sample.
            max_seq_length: Value forwarded to the model on every call. Defaults to
                ``min(len(idx) + max_new_tokens, model.config.block_size)``.
            temperature: Divisor applied to the logits before the softmax.
            top_k: When given, only the ``top_k`` highest logits can be sampled.
            eos_id: Token id that stops generation as soon as it is sampled.
            repetition_penalty: Accepted but not used by this implementation.
            early_stopping: Accepted but not used by this implementation.

        Returns:
            A 1-D tensor holding the prompt followed by the sampled tokens. When
            ``eos_id`` is sampled the tensor ends just before it; otherwise it has
            ``len(idx) + max_new_tokens`` entries.
        """
        T = idx.size(0)
        T_new = T + max_new_tokens
        if max_seq_length is None:
            max_seq_length = min(T_new, self.model.config.block_size)

        device, dtype = idx.device, idx.dtype
        # create an empty tensor of the expected final shape and fill in the current tokens
        empty = torch.empty(T_new, dtype=dtype, device=device)
        empty[:T] = idx
        idx = empty
        # the first forward pass covers the whole prompt; later passes feed one position
        input_pos = torch.arange(0, T, device=device)

        if idx.device.type == "xla":
            import torch_xla.core.xla_model as xm

            xm.mark_step()

        # generate max_new_tokens tokens
        for _ in range(max_new_tokens):
            x = idx.index_select(0, input_pos).view(1, -1)

            # forward
            logits = self.model(x, max_seq_length, input_pos)
            logits = logits[0, -1] / temperature

            # optionally crop the logits to only the top k options
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits = torch.where(logits < v[[-1]], -float("Inf"), logits)

            probs = torch.nn.functional.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1).to(dtype=dtype)

            # advance
            input_pos = input_pos[-1:] + 1

            if idx.device.type == "xla":
                xm.mark_step()

            # concatenate the new generation
            idx = idx.index_copy(0, input_pos, idx_next)

            # if <eos> token is triggered, return the output (stop generation)
            if idx_next == eos_id:
                # The slice ends at the position the EOS token was just written to,
                # so the returned sequence does not contain the EOS token itself.
                return idx[:input_pos]

        return idx

    # LLM generation function
    def ans(self, user_message, max_new_tokens, top_k, temperature):
        """Generate a reply to ``user_message`` and record the exchange in ``history``.

        Args:
            user_message: Raw text from the user; surrounding whitespace is stripped
                before it is placed in the prompt.
            max_new_tokens: Maximum number of tokens to sample.
            top_k: Forwarded to :meth:`generate`.
            temperature: Forwarded to :meth:`generate`.

        Returns:
            The generated reply, cut at the first ``'응답:'`` marker the model itself
            emits (if any), stripped, and with U+FFFD replacement characters removed.
        """
        self.history.append([user_message, None])
        instruction = user_message.strip()
        sample = {"instruction": instruction, "input": None}
        prompt = self.generate_prompt(sample)
        encoded_prompt = self.tokenizer.encode(prompt, bos=True, eos=False, device=self.fabric.device)

        try:
            y = self.generate(
                idx=encoded_prompt,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                top_k=top_k,
                eos_id=self.tokenizer.eos_id
            )
        finally:
            # Reset even when generation fails so the next request never starts
            # from a cache left over from an aborted one.
            self.model.reset_cache()

        # Decode only the newly generated tokens. Splitting the full decoded text on
        # '응답:' would return part of the prompt whenever the user's own message
        # contains that marker.
        generated = y[encoded_prompt.size(0):]
        response = self.tokenizer.decode(generated)
        response = response.split('응답:')[0].strip().replace('�', '')

        # update history
        self.history[-1][1] = response
        return response

def load_model():
    """Load the 7B LLaMA weights and tokenizer for inference.

    Paths are relative to the current working directory.

    Returns:
        A ``(model, tokenizer, fabric)`` tuple: the model in eval mode after
        ``fabric.setup_module``, the tokenizer, and the single-device Fabric instance.
    """
    # Precision setting for float32 matmul operations. It's important for some CUDA devices.
    torch.set_float32_matmul_precision("high")

    weights_file = Path("checkpoints/lit-llama/7B/lit-llama.pth")
    tokenizer_file = Path("checkpoints/lit-llama/tokenizer.model")
    quantization = None  # "gptq.int4" or "llm.int8"

    fabric = L.Fabric(devices=1)

    # bfloat16 only on CUDA devices that report support for it; float32 everywhere else
    if fabric.device.type == "cuda" and torch.cuda.is_bf16_supported():
        param_dtype = torch.bfloat16
    else:
        param_dtype = torch.float32

    with EmptyInitOnDevice(device=fabric.device, dtype=param_dtype, quantization_mode=quantization):
        llama = LLaMA.from_name("7B")

    state_dict = torch.load(weights_file)
    llama.load_state_dict(state_dict)

    llama.eval()
    llama = fabric.setup_module(llama)

    return llama, Tokenizer(tokenizer_file), fabric

class DataItem(BaseModel):
    """Request body accepted by ``POST /post_data``: the user's chat message."""
    message: str

class CustomFastAPI(FastAPI):
    """FastAPI app that exposes a chatbot through a POST-then-GET endpoint pair.

    ``POST /post_data`` stores the submitted message on the app instance and
    ``GET /get_data`` runs the chatbot on whatever message is currently stored.
    The message lives in a single attribute shared by every request, so a later
    POST replaces the message an earlier client has not fetched an answer for yet.

    Args:
        chat_bot: Object whose ``ans(message, max_new_tokens, top_k, temperature)``
            method produces the reply.
        max_new_tokens: Generation length limit passed to ``chat_bot.ans``.
        top_k: Top-k sampling cutoff passed to ``chat_bot.ans``.
        temperature: Sampling temperature passed to ``chat_bot.ans``.
    """

    def __init__(self, chat_bot, max_new_tokens=512, top_k=200, temperature=0.5):
        super().__init__()
        self.input_msg = ""
        self.chat_bot = chat_bot
        self.max_new_tokens = max_new_tokens
        self.top_k = top_k
        self.temperature = temperature
        self.add_routes()

    def add_routes(self):
        """Register the ``/post_data`` and ``/get_data`` endpoints on this app."""

        @self.post("/post_data")
        async def post_data(data: DataItem):
            print("user input:", data.message)
            # remember the message for the next GET /get_data
            self.input_msg = data.message
            response = {'status' : 'success'}
            return response

        @self.get("/get_data")
        def get_data():
            ans = self.chat_bot.ans(
                self.input_msg, self.max_new_tokens, self.top_k, self.temperature
            )
            ans_dict = {'generated_ans' : ans}
            return ans_dict

def main(host="0.0.0.0", port=8000):
    """Load the model, build the chatbot API and serve it through an ngrok tunnel.

    Args:
        host: Interface uvicorn binds to.
        port: Local port served by uvicorn and exposed through the ngrok tunnel.

    Blocks until the uvicorn server stops; the tunnel is closed afterwards.
    """
    # load the model and tokenizer
    model, tokenizer, fabric = load_model()

    # create the chatbot object
    chat_bot = ChatBot(model, tokenizer, fabric)

    app = CustomFastAPI(chat_bot)

    # NOTE(review): presumably needed so uvicorn can start its event loop inside the
    # notebook's already-running loop — confirm if this is ever run as a plain script.
    nest_asyncio.apply()

    public_url = ngrok.connect(port)  # public tunnel to the local server
    print("FastAPI server is available at:", public_url)
    try:
        uvicorn.run(app, host=host, port=port)
    finally:
        # Close the tunnel once the server stops so repeated runs do not leave
        # stale tunnels open.
        ngrok.disconnect(public_url.public_url)

# Start the server only when this code is executed as the entry point, not on import.
if __name__ == "__main__":
    main()



INFO:     Started server process [1267]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


FastAPI server is available at: NgrokTunnel: "https://c578-34-143-163-154.ngrok.io" -> "http://localhost:8000"
user input: 두통이 너무 심해요.
INFO:     210.101.130.131:0 - "POST /post_data HTTP/1.1" 200 OK
INFO:     210.101.130.131:0 - "GET /get_data HTTP/1.1" 200 OK
user input: 복통이 너무 심해요.
INFO:     210.101.130.131:0 - "POST /post_data HTTP/1.1" 200 OK
INFO:     210.101.130.131:0 - "GET /get_data HTTP/1.1" 200 OK
user input: 두통이 너무 심해요.
INFO:     210.101.130.131:0 - "POST /post_data HTTP/1.1" 200 OK
INFO:     210.101.130.131:0 - "GET /get_data HTTP/1.1" 200 OK
user input: 배가 너무 아파요.
INFO:     210.101.130.131:0 - "POST /post_data HTTP/1.1" 200 OK
INFO:     210.101.130.131:0 - "GET /get_data HTTP/1.1" 200 OK
user input: 복통이 너무 심해요.
INFO:     210.101.130.131:0 - "POST /post_data HTTP/1.1" 200 OK
INFO:     210.101.130.131:0 - "GET /get_data HTTP/1.1" 200 OK
user input: 복통이 너무 심해요
INFO:     210.101.130.131:0 - "POST /post_data HTTP/1.1" 200 OK
INFO:     210.101.130.131:0 - "GET /get_data HTTP/1.1

INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [1267]


In [None]:
# evaluation

!pip install fire
!pip install peft

"""Run evaluation of medAlpaca models on the USMLE self assessment.
The questions can be downloaded at: https://huggingface.co/medalpaca/

Example evaluation:

Assuming you have downloaded the step files into the folder "usmle", you can now
evaluate a medalpaca model:

```bash
export HF_HOME=/path/to/hf_cache

python eval_usmle.py \
    --model_name 'medalpaca/medalpaca-lora-13b-8bit' \
    --prompt_template '../medalpaca/prompt_templates/medalpaca.json' \
    --base_model 'decapoda-research/llama-13b-hf' \
    --peft True \
    --load_in_8bit True \
    --path_to_exams 'data/test/'
```

This will create three new files in 'data/test', named stepX_MODELNAME.json.

The generation method is hardcoded in the `sampling` dict; feel free to adapt it.

"""
!python drive/MyDrive/eval_usmle.py \
    --model_name 'drive/MyDrive/model/finetuned/7B' \
    --prompt_template 'drive/MyDrive/templates/alpaca_kr.json' \
    --base_model 'decapoda-research/llama-7b-hf' \
    --peft True \
    --load_in_8bit True \
    --path_to_exams 'data/test/'