## Local (CPU-based) ChatBot

### Using the Hugging Face `transformers` package and Ollama

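A quick experiment in installing and running LLMs locally on a CPU: a small chat model is loaded through the Hugging Face `transformers` pipeline, with Ollama noted as an alternative local runtime.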

In [16]:
# Install the Ollama app from https://ollama.com/download
# Ollama runs downloaded LLMs (e.g. Llama 3) locally
# Install the Python packages: uv add transformers torch ollama
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from huggingface_hub import list_models
# import torch

# import ollama # if running directly
# terminal: ollama run llama3

In [15]:
# Query the Hub for text-generation models matching "chat", most-downloaded
# first, capped at ten results.
models = list_models(
    filter="text-generation",
    search="chat",
    sort="downloads",
    direction=-1,  # -1 requests descending order
    limit=10,
)

# Print one model id per line.
for model_info in models:
    print(model_info.modelId)


TinyLlama/TinyLlama-1.1B-Chat-v1.0
meta-llama/Llama-2-7b-chat-hf
deepseek-ai/DeepSeek-V2-Lite-Chat
deepseek-ai/deepseek-llm-7b-chat
Qwen/Qwen1.5-4B-Chat
Qwen/Qwen1.5-0.5B-Chat
meta-llama/Llama-2-13b-chat-hf
TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ
Qwen/Qwen-7B-Chat
TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ


In [9]:
# Hub id of the chat checkpoint; both the tokenizer and the model weights are
# loaded from it via from_pretrained(MODEL_NAME).
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

In [10]:
# Tokenizer for the chosen checkpoint. It works at the subword level: a word
# that is not in the model's vocabulary is broken into smaller known pieces
# (for illustration, "chatbot" could become "chat" + "bot").
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Model weights for the same checkpoint, placed in CPU memory.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="cpu",
)

In [11]:
# Build the text-generation pipeline (tokenize -> predict -> detokenize)
chatbot = pipeline(
    task="text-generation",
    model=model,          # causal LM loaded from MODEL_NAME
    tokenizer=tokenizer,  # tokenizer loaded from the same MODEL_NAME
)

Device set to use cpu


In [None]:
print("My CPU-based Chatbot (type 'exit' to quit)")

# Tunables for the chat session.
SYSTEM_PROMPT = "You are a super helpful assistant."
MAX_NEW_TOKENS = 150
# Temperature is the divisor applied inside the softmax: smaller values give
# more conservative output, larger values more creative output.
TEMPERATURE = 0.7
# Only the most recent messages are replayed to the model so the prompt stays
# bounded. Keep this even so the window always starts on a user turn.
MAX_HISTORY_MESSAGES = 20

# Prior turns as {"role": ..., "content": ...} messages (system prompt excluded).
history = []

while True:
    try:
        user_input = input("You: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupted kernel ends the chat like typing "exit".
        user_input = "exit"

    if user_input.lower() in ("exit", "quit"):
        print("Signing you off, Shepard")
        break
    if not user_input:
        # Nothing to send to the model; ask again.
        continue

    print(f'User: {user_input}\n')

    # Let the tokenizer's own chat template render the prompt so that role
    # markers and end-of-turn tokens match what the model was trained on,
    # instead of hand-assembling the tags.
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        *history,
        {"role": "user", "content": user_input},
    ]
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # do_sample=True is required for temperature to have any effect; without
    # it decoding is greedy. return_full_text=False yields only the newly
    # generated reply, so no string splitting on role tags is needed.
    outputs = chatbot(
        prompt,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=True,
        temperature=TEMPERATURE,
        return_full_text=False,
    )
    assistant_response = outputs[0]["generated_text"].strip()
    print("Assistant:", assistant_response)

    # Save the exchange for the next turn, then drop the oldest messages
    # beyond the window (a no-op while the history is still short).
    history.append({"role": "user", "content": user_input})
    history.append({"role": "assistant", "content": assistant_response})
    del history[:-MAX_HISTORY_MESSAGES]

In [None]:
# Alternative backend sketch (not executed): llama.cpp via the llama_cpp Python bindings
# from llama_cpp import Llama

# llm = Llama(model_path="/path/to/ggml-model-q4_0.bin")

# def ask(prompt):
#     resp = llm.create_completion(prompt=prompt, max_tokens=150, temperature=0.7)
#     return resp["choices"][0]["text"]

# # in your loop:
# response = ask(prompt)
# assistant_response = response.split("<|assistant|>")[-1].strip()