In [1]:
!pip install llama-cpp-python

Defaulting to user installation because normal site-packages is not writeable


In [6]:
import pandas as pd

# Load the chatbot test cases from a CSV file in the current working directory.
# The inference cells below read each row's "input" column as the user message.
test_df = pd.read_csv("./checklist_career_chatbot_test_cases.csv")

In [4]:
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Q4_K_M
# Resolve a local path to the Q4_K_M GGUF file hosted on the Hugging Face Hub.
model_path = hf_hub_download(
    filename="ft-q4_k_m-lora.gguf",
    repo_id="sharshar20/llama3.2_3B_instruct-GGUF-v7",
)

# Build the llama.cpp model object for this GGUF file.
llm_q4km_lora = Llama(
    verbose=False,
    n_threads=2,  # CPU threads; adjust for your machine
    n_ctx=1024,  # context window, in tokens
    model_path=model_path,
)

ft-q4_k_m-lora.gguf:   0%|          | 0.00/2.02G [00:00<?, ?B/s]

llama_context: n_ctx_per_seq (1024) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


In [8]:
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer

# System prompt placed in the "system" turn of every chat prompt.
instruction = (
    "You are a top-rated NTU career advisor chatbot.\n"
    "Be polite, concise, and helpful in providing career guidance responses."
)

# (Re)initialise the column that receives each generated reply.
test_df["model_output"] = ""

# Build the inference prompt with the model's own chat template
def apply_chat_template(user_input):
    """Format ``user_input`` as a chat prompt that is ready for generation.

    The prompt has two turns: a system turn holding the module-level
    ``instruction`` text and a user turn holding ``user_input``.

    Args:
        user_input: The user's message (the test loop passes the value of the
            ``input`` column).

    Returns:
        The untokenized prompt string produced by the tokenizer's chat
        template, ending with the assistant header so that the model starts
        writing the reply directly.
    """
    # Load the tokenizer once and keep it on the function object; reloading it
    # on every call repeated the same work for each test row.
    tokenizer = getattr(apply_chat_template, "_tokenizer", None)
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
        apply_chat_template._tokenizer = tokenizer

    row_json = [
        {"role": "system", "content": instruction},
        {"role": "user", "content": user_input},
    ]
    # add_generation_prompt=True appends the assistant header. Without it the
    # prompt stops after the user turn and the model has to produce the header
    # itself before answering.
    # NOTE(review): the rendered template may already begin with a BOS token;
    # confirm the llama.cpp call does not prepend a second one.
    prompt = tokenizer.apply_chat_template(
        row_json,
        tokenize=False,
        add_generation_prompt=True,
    )
    return prompt

# Generate one response per test case with the Q4_K_M model
for i, row in tqdm(test_df.iterrows(), total=len(test_df)):
    prompt = apply_chat_template(row["input"])

    try:
        output = llm_q4km_lora(
            prompt,
            max_tokens=128,
            temperature=0.7,
            top_p=0.9,
            stop=["<|eot_id|>", "</s>"],
            # Sampling at temperature 0.7 is stochastic; a fixed seed makes
            # the generated answers repeatable from run to run.
            seed=42,
        )
        response = output["choices"][0]["text"].strip()
    except Exception as e:
        # Best effort: report the failing row and record an empty answer so
        # that one bad generation does not abort the whole run.
        print(f"Error at row {i}: {e}")
        response = ""

    # `i` is the row's index label from iterrows(), which is what .at expects.
    test_df.at[i, "model_output"] = response

# Save results
test_df.to_csv("career_chatbot_test_results_q4km_lora_v7.csv", index=False)

100%|██████████| 70/70 [32:54<00:00, 28.21s/it]


NameError: name 'files' is not defined

In [5]:
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Q5_K_M
# Resolve a local path to the Q5_K_M GGUF file hosted on the Hugging Face Hub.
model_path = hf_hub_download(
    filename="ft-q5_k_m-lora.gguf",
    repo_id="sharshar20/llama3.2_3B_instruct-GGUF-v7",
)

# Build the llama.cpp model object for this GGUF file.
llm_q5km_lora = Llama(
    verbose=False,
    n_threads=2,  # CPU threads; adjust for your machine
    n_ctx=1024,  # context window, in tokens
    model_path=model_path,
)

ft-q5_k_m-lora.gguf:   0%|          | 0.00/2.32G [00:00<?, ?B/s]

llama_context: n_ctx_per_seq (1024) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


In [10]:
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer

# System prompt placed in the "system" turn of every chat prompt.
instruction = (
    "You are a top-rated NTU career advisor chatbot.\n"
    "Be polite, concise, and helpful in providing career guidance responses."
)

# (Re)initialise the column that receives each generated reply.
test_df["model_output"] = ""

# Build the inference prompt with the model's own chat template
def apply_chat_template(user_input):
    """Format ``user_input`` as a chat prompt that is ready for generation.

    The prompt has two turns: a system turn holding the module-level
    ``instruction`` text and a user turn holding ``user_input``.

    Args:
        user_input: The user's message (the test loop passes the value of the
            ``input`` column).

    Returns:
        The untokenized prompt string produced by the tokenizer's chat
        template, ending with the assistant header so that the model starts
        writing the reply directly.
    """
    # Load the tokenizer once and keep it on the function object; reloading it
    # on every call repeated the same work for each test row.
    tokenizer = getattr(apply_chat_template, "_tokenizer", None)
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
        apply_chat_template._tokenizer = tokenizer

    row_json = [
        {"role": "system", "content": instruction},
        {"role": "user", "content": user_input},
    ]
    # add_generation_prompt=True appends the assistant header. Without it the
    # prompt stops after the user turn and the model has to produce the header
    # itself before answering.
    # NOTE(review): the rendered template may already begin with a BOS token;
    # confirm the llama.cpp call does not prepend a second one.
    prompt = tokenizer.apply_chat_template(
        row_json,
        tokenize=False,
        add_generation_prompt=True,
    )
    return prompt

# Generate one response per test case with the Q5_K_M model
for i, row in tqdm(test_df.iterrows(), total=len(test_df)):
    prompt = apply_chat_template(row["input"])

    try:
        output = llm_q5km_lora(
            prompt,
            max_tokens=128,
            temperature=0.7,
            top_p=0.9,
            stop=["<|eot_id|>", "</s>"],
            # Sampling at temperature 0.7 is stochastic; a fixed seed makes
            # the generated answers repeatable from run to run.
            seed=42,
        )
        response = output["choices"][0]["text"].strip()
    except Exception as e:
        # Best effort: report the failing row and record an empty answer so
        # that one bad generation does not abort the whole run.
        print(f"Error at row {i}: {e}")
        response = ""

    # `i` is the row's index label from iterrows(), which is what .at expects.
    test_df.at[i, "model_output"] = response

# Save results
test_df.to_csv("career_chatbot_test_results_q5km_lora_v7.csv", index=False)

100%|██████████| 70/70 [56:18<00:00, 48.27s/it] 


In [11]:
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Q8_0
# Resolve a local path to the Q8_0 GGUF file hosted on the Hugging Face Hub.
model_path = hf_hub_download(
    filename="ft-q8_0-lora.gguf",
    repo_id="sharshar20/llama3.2_3B_instruct-GGUF-v7",
)

# Build the llama.cpp model object for this GGUF file.
llm_q8_0_lora = Llama(
    verbose=False,
    n_threads=2,  # CPU threads; adjust for your machine
    n_ctx=1024,  # context window, in tokens
    model_path=model_path,
)

ft-q8_0-lora.gguf:   0%|          | 0.00/3.42G [00:00<?, ?B/s]

llama_context: n_ctx_per_seq (1024) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


In [12]:
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer

# System prompt placed in the "system" turn of every chat prompt.
instruction = (
    "You are a top-rated NTU career advisor chatbot.\n"
    "Be polite, concise, and helpful in providing career guidance responses."
)

# (Re)initialise the column that receives each generated reply.
test_df["model_output"] = ""

# Build the inference prompt with the model's own chat template
def apply_chat_template(user_input):
    """Format ``user_input`` as a chat prompt that is ready for generation.

    The prompt has two turns: a system turn holding the module-level
    ``instruction`` text and a user turn holding ``user_input``.

    Args:
        user_input: The user's message (the test loop passes the value of the
            ``input`` column).

    Returns:
        The untokenized prompt string produced by the tokenizer's chat
        template, ending with the assistant header so that the model starts
        writing the reply directly.
    """
    # Load the tokenizer once and keep it on the function object; reloading it
    # on every call repeated the same work for each test row.
    tokenizer = getattr(apply_chat_template, "_tokenizer", None)
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
        apply_chat_template._tokenizer = tokenizer

    row_json = [
        {"role": "system", "content": instruction},
        {"role": "user", "content": user_input},
    ]
    # add_generation_prompt=True appends the assistant header. Without it the
    # prompt stops after the user turn and the model has to produce the header
    # itself before answering.
    # NOTE(review): the rendered template may already begin with a BOS token;
    # confirm the llama.cpp call does not prepend a second one.
    prompt = tokenizer.apply_chat_template(
        row_json,
        tokenize=False,
        add_generation_prompt=True,
    )
    return prompt

# Generate one response per test case with the Q8_0 model
for i, row in tqdm(test_df.iterrows(), total=len(test_df)):
    prompt = apply_chat_template(row["input"])

    try:
        output = llm_q8_0_lora(
            prompt,
            max_tokens=128,
            temperature=0.7,
            top_p=0.9,
            stop=["<|eot_id|>", "</s>"],
            # Sampling at temperature 0.7 is stochastic; a fixed seed makes
            # the generated answers repeatable from run to run.
            seed=42,
        )
        response = output["choices"][0]["text"].strip()
    except Exception as e:
        # Best effort: report the failing row and record an empty answer so
        # that one bad generation does not abort the whole run.
        print(f"Error at row {i}: {e}")
        response = ""

    # `i` is the row's index label from iterrows(), which is what .at expects.
    test_df.at[i, "model_output"] = response

# Save results
test_df.to_csv("career_chatbot_test_results_q8_0_lora_v7.csv", index=False)

100%|██████████| 70/70 [44:27<00:00, 38.11s/it]
