In [158]:
import os
from typing import Any, Callable, Dict, Optional, Sequence

import requests
from llama_index.core.base.llms.types import (
    ChatMessage,
    ChatResponse,
    ChatResponseGen,
    CompletionResponse,
    CompletionResponseGen,
    LLMMetadata,
)
from llama_index.core.bridge.pydantic import Field, PrivateAttr
from llama_index.core.callbacks import CallbackManager
from llama_index.core.constants import (
    DEFAULT_CONTEXT_WINDOW,
    DEFAULT_NUM_OUTPUTS,
    DEFAULT_TEMPERATURE,
)
from llama_index.core.llms.callbacks import llm_chat_callback, llm_completion_callback
from llama_index.core.llms.custom import CustomLLM
from llama_index.core.base.llms.generic_utils import (
    completion_response_to_chat_response,
    stream_completion_response_to_chat_response,
)
from llama_index.core.types import BaseOutputParser, PydanticProgramMode
from llama_index.core.utils import get_cache_dir
from tqdm import tqdm

from llama_cpp import Llama

DEFAULT_LLAMA_CPP_MODEL_VERBOSITY = True


class LlamaCPP(CustomLLM):
    """LlamaIndex ``CustomLLM`` backed by a local llama-cpp-python ``Llama`` model.

    Instances are created without weights. Call :meth:`set_model` to load a
    GGUF file before using ``complete``/``chat`` or their streaming variants.
    """

    # Context size reported by ``metadata`` while no model is loaded.
    context_window: int = 3900
    # Upper bound on generated tokens; sent to llama.cpp as ``max_tokens``.
    max_new_tokens: int = 512
    # Not read anywhere in this class; kept so existing constructor calls still validate.
    num_output: int = 256
    # Sampling temperature sent with every completion request.
    temperature: float = 0.2
    # Forwarded to ``Llama`` when the model is loaded.
    verbose: bool = True
    # Optional llama-cpp-python chat handler, forwarded to ``Llama``.
    chat_handler: Optional[Callable] = None
    model_name: str = "custom"
    # Not read anywhere in this class; kept so existing constructor calls still validate.
    dummy_response: str = "My response"

    # Declared as a pydantic private attribute so it can be assigned after
    # construction without being treated as a model field.
    _model: Any = PrivateAttr(default=None)

    def set_model(
        self,
        repo_id: str,
        filename: str,
        n_ctx: int = 2048,
        max_new_tokens: int = 512,
        temperature: float = 0.2,
        chat_handler: Optional[Callable] = None,
        verbose: bool = True,
    ) -> None:
        """Load a GGUF model through ``Llama.from_pretrained`` and store the settings.

        Note that the generation settings passed here (including their
        defaults) overwrite the values given to the constructor.

        Args:
            repo_id: Hugging Face repository id that hosts the GGUF file.
            filename: File name (or glob pattern) of the GGUF file in the repo.
            n_ctx: Context size requested from llama.cpp.
            max_new_tokens: Maximum number of tokens generated per request.
            temperature: Sampling temperature used for every request.
            chat_handler: Optional chat handler passed to ``Llama``.
            verbose: Whether llama.cpp should print its own log output.
        """
        self.max_new_tokens = max_new_tokens
        self.temperature = temperature
        self.verbose = verbose
        self.chat_handler = chat_handler

        # Extra keyword arguments of ``from_pretrained`` go to the ``Llama``
        # constructor, so the stored settings actually reach the model.
        self._model = Llama.from_pretrained(
            repo_id=repo_id,
            filename=filename,
            n_ctx=n_ctx,
            chat_handler=chat_handler,
            verbose=verbose,
        )

    @classmethod
    def class_name(cls) -> str:
        """Return the identifier LlamaIndex uses for this LLM class."""
        return "LlamaCPP_llm"

    @property
    def metadata(self) -> LLMMetadata:
        """LLM metadata.

        Reports the loaded model's context size, or the ``context_window``
        field when :meth:`set_model` has not been called yet.
        """
        if self._model is None:
            context_window = self.context_window
        else:
            context_window = self._model.context_params.n_ctx
        return LLMMetadata(
            context_window=context_window,
            num_output=self.max_new_tokens,
            model_name=self.model_name,
        )

    def _require_model(self) -> Any:
        """Return the loaded model or fail with an actionable error."""
        if self._model is None:
            raise RuntimeError(
                "No model loaded; call set_model() before generating text."
            )
        return self._model

    def _chat_completion(self, prompt: str, stream: bool) -> Any:
        """Send ``prompt`` as a single user turn using the configured sampling settings."""
        return self._require_model().create_chat_completion(
            messages=[{"role": "user", "content": prompt}],
            stream=stream,
            temperature=self.temperature,
            max_tokens=self.max_new_tokens,
        )

    @llm_chat_callback()
    def chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse:
        """Answer a chat history.

        The messages are flattened to one prompt string with
        ``messages_to_prompt`` and then handled by :meth:`complete`.
        """
        prompt = self.messages_to_prompt(messages)
        completion_response = self.complete(prompt, formatted=True, **kwargs)
        return completion_response_to_chat_response(completion_response)

    @llm_chat_callback()
    def stream_chat(
        self, messages: Sequence[ChatMessage], **kwargs: Any
    ) -> ChatResponseGen:
        """Streaming counterpart of :meth:`chat`, built on :meth:`stream_complete`."""
        prompt = self.messages_to_prompt(messages)
        completion_response = self.stream_complete(prompt, formatted=True, **kwargs)
        return stream_completion_response_to_chat_response(completion_response)

    @llm_completion_callback()
    def complete(
        self, prompt: str, formatted: bool = False, **kwargs: Any
    ) -> CompletionResponse:
        """Generate the full answer for ``prompt`` in one call.

        ``formatted`` and ``**kwargs`` are accepted for interface compatibility
        and are not forwarded to llama.cpp.

        Raises:
            RuntimeError: If no model has been loaded with :meth:`set_model`.
        """
        response = self._chat_completion(prompt, stream=False)
        # Guard against a missing content value so ``text`` is always a string.
        text = response["choices"][0]["message"]["content"] or ""
        return CompletionResponse(text=text, raw=response)

    @llm_completion_callback()
    def stream_complete(
        self, prompt: str, formatted: bool = False, **kwargs: Any
    ) -> CompletionResponseGen:
        """Generate the answer for ``prompt`` incrementally.

        Each yielded response carries the new fragment in ``delta`` and the
        text accumulated so far in ``text``.

        Raises:
            RuntimeError: If no model has been loaded with :meth:`set_model`.
        """
        response_iter = self._chat_completion(prompt, stream=True)

        def gen() -> CompletionResponseGen:
            text = ""
            for chunk in response_iter:
                delta = chunk["choices"][0]["delta"]
                # Chunks without text (e.g. the role announcement) yield an empty delta.
                text_delta = delta.get("content") or ""
                text += text_delta
                yield CompletionResponse(delta=text_delta, text=text, raw=chunk)

        return gen()





In [134]:
# loaded_model = Llama.from_pretrained(
#                     repo_id="hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF",
#                     filename="*q4_k_m.gguf",
#                     n_ctx=2048,
#                 )

In [150]:
# prompt = "Who is Messi?"
# response_iter = loaded_model.create_chat_completion(
#                 messages=[{"role": "user", "content": prompt}], stream=False
#             )

In [159]:
# response_iter['choices'][0]['message']['content']

In [136]:
# for chunk in response_iter:
#     delta = chunk["choices"][0]["delta"]
#     print(delta)
#     if "role" in delta:
#         print(delta["role"], end=": ")
#     elif "content" in delta:
#         print(delta["content"], end="")
#         # self.update_signal.emit(delta["content"])

In [161]:
# Create the wrapper first, then attach the quantized Llama 3.2 1B Instruct weights.
model = LlamaCPP(max_new_tokens=512)
model.set_model(
    repo_id="hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF",
    filename="*q4_k_m.gguf",
    n_ctx=2048,
)

llama_model_loader: loaded meta data with 30 key-value pairs and 147 tensors from /Users/namnguyenthe/.cache/huggingface/hub/models--hugging-quants--Llama-3.2-1B-Instruct-Q4_K_M-GGUF/snapshots/7d1f70022fcab2038000074bd0342e03e1d8b755/./llama-3.2-1b-instruct-q4_k_m.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Llama 3.2 1B Instruct
llama_model_loader: - kv   3:                           general.finetune str              = Instruct
llama_model_loader: - kv   4:                           general.basename str              = Llama-3.2
llama_model_loader: - kv   5:                         general.size_label str              = 

In [162]:
# Blocking completion: returns only once the whole answer has been generated.
question = "who is cristiano ronaldo?"
response = model.complete(question)


llama_print_timings:        load time =    2873.42 ms
llama_print_timings:      sample time =      35.63 ms /   421 runs   (    0.08 ms per token, 11814.89 tokens per second)
llama_print_timings: prompt eval time =    2872.78 ms /    18 tokens (  159.60 ms per token,     6.27 tokens per second)
llama_print_timings:        eval time =   57238.96 ms /   420 runs   (  136.28 ms per token,     7.34 tokens per second)
llama_print_timings:       total time =   60892.92 ms /   438 tokens


In [140]:
# Stream the answer and print each fragment as soon as it is produced.
response_iter = model.stream_complete("Hello, how are you?")
# The loop variable is named `chunk` so it does not overwrite the `response`
# produced by the earlier completion cell.
for chunk in response_iter:
    # `delta` holds only the newly generated text; `text` holds the running total.
    print(chunk.delta, end="", flush=True)

Llama.generate: 15 prefix-match hit, remaining 1 prompt tokens to eval


I'm just a language model, I don't have emotions or feelings, but thank you for asking! How can I assist you today?


llama_print_timings:        load time =    2652.80 ms
llama_print_timings:      sample time =       2.05 ms /    29 runs   (    0.07 ms per token, 14167.07 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     0 tokens (     nan ms per token,      nan tokens per second)
llama_print_timings:        eval time =    4799.83 ms /    29 runs   (  165.51 ms per token,     6.04 tokens per second)
llama_print_timings:       total time =    4862.26 ms /    29 tokens
