In [None]:
!pip install pypdf langchain langchain-community sentence-transformers faiss-cpu transformers torch -q
# -q for quiet installation

In [None]:
# notebooks/2_llm_testing_and_prompt_engineering.ipynb

import os
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch
from langchain.prompts import PromptTemplate
from langchain_community.llms import HuggingFacePipeline
from langchain_core.output_parsers import StrOutputParser

# --- Configuration ---
# Make sure you have enough VRAM/RAM for this model!
LLM_MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.2"

# Hugging Face access token.
# SECURITY: never hard-code a token in a notebook -- anything that gets committed
# or shared leaks the credential. A literal token was previously embedded on this
# line; treat it as compromised and revoke it in your Hugging Face account settings.
#
# The token is now read from the HF_TOKEN environment variable. Set it before
# starting the kernel (e.g. `export HF_TOKEN=hf_...`), or in Google Colab store it
# as a Colab Secret named HF_TOKEN (the key icon on the left).
# When the variable is unset or empty this is None, and the from_pretrained calls
# below receive token=None, leaving token discovery to the Hugging Face libraries
# (cached `huggingface-cli login`, and presumably Colab secrets -- verify for your
# installed huggingface_hub version).
HF_ACCESS_TOKEN = os.environ.get("HF_TOKEN") or None

# --- Load LLM (same logic as in src/generator.py) ---
# Tokenizer first; the access token is forwarded explicitly to from_pretrained.
print(f"Loading tokenizer: {LLM_MODEL_ID}")
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_ID, token=HF_ACCESS_TOKEN)
if tokenizer.pad_token is None:
    # No pad token defined: reuse EOS so a pad_token_id can be handed to the pipeline.
    tokenizer.pad_token = tokenizer.eos_token
    print("Set pad_token to eos_token for the tokenizer.")

print(f"Loading model: {LLM_MODEL_ID}. This may take some time and requires ~14GB VRAM/RAM.")

# Pick the weight precision from the available hardware:
#   no CUDA                              -> float32
#   CUDA, compute capability major >= 8  -> bfloat16 (Ampere or newer)
#   CUDA, older GPUs                     -> float16
device = "cuda" if torch.cuda.is_available() else "cpu"
if device != "cuda":
    torch_dtype = torch.float32
    print(f"Using {torch_dtype} on {device}.")
elif torch.cuda.get_device_capability()[0] >= 8:
    torch_dtype = torch.bfloat16
    print(f"Using bfloat16 on {device}.")
else:
    torch_dtype = torch.float16
    print(f"Using {torch_dtype} on {device}.")

# The token is passed here as well, exactly as for the tokenizer.
model = AutoModelForCausalLM.from_pretrained(
    LLM_MODEL_ID, torch_dtype=torch_dtype, device_map="auto", token=HF_ACCESS_TOKEN
)
model.eval()  # switch the module to evaluation mode

print("Setting up text-generation pipeline...")
# Generation settings for the pipeline; return_full_text=False asks for the
# completion only, without the prompt echoed back.
generation_kwargs = {
    "max_new_tokens": 512,
    "temperature": 0.1,
    "top_p": 0.9,
    "do_sample": True,
    "return_full_text": False,
    "pad_token_id": tokenizer.pad_token_id,
}
llm_pipeline = pipeline(
    "text-generation", model=model, tokenizer=tokenizer, **generation_kwargs
)
# Wrap the transformers pipeline so it can be composed into LangChain chains.
llm = HuggingFacePipeline(pipeline=llm_pipeline)
print("LLM and pipeline loaded.")

# --- Prompt Template Experimentation ---

# Version 1: Simple Prompt
print("\n--- Testing Simple Prompt ---")

# Test fixtures: a short factual passage and a question it can answer.
# context_for_test is reused by the instructive-prompt test further down.
context_for_test = """
The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris, France.
It was constructed from 1887 to 1889 as the entrance to the 1889 World's Fair.
"""
question_for_test = "When was the Eiffel Tower built and where is it located?"

# Bare-bones template: question and context inside [INST] ... [/INST] tags,
# with no behavioural instructions.
simple_template = """
[INST] Question: {question}
Context: {context} [/INST]
"""
simple_prompt = PromptTemplate.from_template(simple_template)

# Chain: fill the template -> run the LLM -> parse the output as a string.
simple_chain = simple_prompt | llm | StrOutputParser()

simple_inputs = {"question": question_for_test, "context": context_for_test}
response_simple = simple_chain.invoke(simple_inputs)
print(f"Question: {question_for_test}")
print(f"Response: {response_simple.strip()}")
print("-" * 50)

# Version 2: More Instructive Prompt (Similar to app.py/generator.py)
print("\n--- Testing Instructive Prompt ---")

# This template adds explicit rules: answer only from the supplied context and
# reply with a fixed refusal sentence when the context does not cover the question.
instructive_template = """
[INST] You are an intelligent AI assistant dedicated to providing accurate and concise answers based solely on the provided context.
If the question cannot be answered from the provided context, please state "I cannot answer this question based on the provided information." Do not use outside knowledge.

Question: {question}

Context: {context} [/INST]
"""
instructive_prompt = PromptTemplate.from_template(instructive_template)

# Chain: fill the template -> run the LLM -> parse the output as a string.
instructive_chain = instructive_prompt | llm | StrOutputParser()

# Pair a question with a context that does not mention its subject (the Eiffel
# Tower passage) to see whether the model follows the refusal instruction.
question_unanswerable = "What is the airspeed velocity of an unladen swallow?"
unanswerable_inputs = {"question": question_unanswerable, "context": context_for_test}
response_unanswerable = instructive_chain.invoke(unanswerable_inputs)
print(f"Question: {question_unanswerable}")
print(f"Response: {response_unanswerable.strip()}")
print("-" * 50)

# Another test with a different context
print("\n--- Testing another scenario with Instructive Prompt ---")

# A second passage (quantum computing) and a question that it does cover,
# run through the same instructive chain.
another_question = "Explain qubits in simple terms."
another_context = """
Quantum computing is a type of computing that uses quantum-mechanical phenomena, such as superposition and entanglement, to perform operations on data.
Unlike classical computers that use bits which can be either 0 or 1, quantum computers use qubits that can represent 0, 1, or both simultaneously.
"""
another_inputs = {"question": another_question, "context": another_context}
response_another = instructive_chain.invoke(another_inputs)
print(f"Question: {another_question}")
print(f"Response: {response_another.strip()}")
print("-" * 50)

print("\nPrompt engineering experimentation complete. You can modify the templates and contexts above to further refine your prompt.")