If you're not running in Saturn Cloud, you need to install these libraries:

Make sure you use the latest versions

```
pip install -U transformers accelerate bitsandbytes
```

In [1]:
!pip install -U -qqq transformers accelerate bitsandbytes

If you're running in Saturn Cloud, change the HuggingFace model folder to '/run/cache/' to ensure you have enough space. 

In [2]:
import os

os.environ["HF_HOME"] = "/run/cache/"

Download `minsearch.py` to create a document search engine 

In [3]:
!rm -f minsearch.py
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

--2024-07-07 14:13:18--  https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3832 (3.7K) [text/plain]
Saving to: ‘minsearch.py’


2024-07-07 14:13:19 (44.3 MB/s) - ‘minsearch.py’ saved [3832/3832]



In [4]:
from typing import Optional

from huggingface_hub import login
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    T5Tokenizer,
    T5ForConditionalGeneration,
    pipeline,
)
import torch
import requests

import minsearch

login(token=os.environ["HF_TOKEN"])

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /run/cache/token
Login successful


# Create a document search engine

In [5]:
docs_url = "https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1"
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course["course"]

    for doc in course["documents"]:
        doc["course"] = course_name
        documents.append(doc)

index = minsearch.Index(
    text_fields=["question", "text", "section"], keyword_fields=["course"]
)

index.fit(documents)

<minsearch.Index at 0x7f45741b3a90>

# Helper functions

In [6]:
def clear_gpu_memory() -> None:
    global tokenizer
    global model

    if "tokenizer" in globals():
        del tokenizer
    if "model" in globals():
        del model
    torch.cuda.empty_cache()


def get_tokenizer_model(model_name: str):
    clear_gpu_memory()  # Release allocated memory

    if model_name == "google/flan-t5-xl":
        tokenizer = T5Tokenizer.from_pretrained(model_name)
        model = T5ForConditionalGeneration.from_pretrained(
            model_name, device_map="auto"
        )
    elif model_name == "mistralai/Mistral-7B-v0.1":
        tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
        model = AutoModelForCausalLM.from_pretrained(
            model_name, device_map="auto", load_in_4bit=True
        )
    elif model_name == "microsoft/Phi-3-mini-128k-instruct":
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(
            model_name, device_map="cuda", torch_dtype="auto", trust_remote_code=True
        )
    else:
        raise ValueError("The model is not supported.")

    return tokenizer, model


def search_docs(
    query: str, course: str, num_results: int = 3, boost: Optional[dict] = None
):
    if not boost:
        boost = {"question": 3.0, "section": 0.5}

    docs = index.search(
        query=query,
        filter_dict={"course": course},
        boost_dict=boost,
        num_results=num_results,
    )

    return docs


def build_prompt(
    question: str, response_docs: list[dict[str, str]], model_name: str
) -> str:
    if model_name == "mistralai/Mistral-7B-v0.1":
        prompt_template = """
QUESTION: {question}

CONTEXT:
{context}

ANSWER:
""".strip()
        context = ""
        for doc in response_docs:
            context += f"{doc['question']}\n{doc['text']}\n\n"
    else:
        prompt_template = """
    You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database. Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
    """.strip()
        context = ""
        for doc in response_docs:
            context += f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"

    return prompt_template.format(question=question, context=context)


def llm(prompt: str, model_name: str, tokenizer, model, generation_params=None) -> str:
    if generation_params is None:
        generation_params = {}

    if model_name == "google/flan-t5-xl":
        input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")
        outputs = model.generate(
            input_ids,
            max_length=generation_params.get("max_length", 100),
            num_beams=generation_params.get("num_beams", 5),
            do_sample=generation_params.get("do_sample", False),
            temperature=generation_params.get("temperature", 1.0),
            top_k=generation_params.get("top_k", 50),
            top_p=generation_params.get("top_p", 0.95),
        )
        output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    elif model_name == "mistralai/Mistral-7B-v0.1":
        pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
        )
        outputs = pipe(prompt, **generation_params)
        output = outputs[0]["generated_text"][len(prompt) :].strip()
    elif model_name == "microsoft/Phi-3-mini-128k-instruct":
        messages = [
            {"role": "user", "content": prompt},
        ]
        pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
        )
        outputs = pipe(messages, **generation_params)
        output = outputs[0]["generated_text"].strip()
    else:
        raise ValueError("The model is not supported.")

    return output


def rag(
    query: str, course: str, model_name: str, tokenizer, model, generation_params=None
):
    search_results = search_docs(query, course=course)
    prompt = build_prompt(query, search_results, model_name)
    return llm(prompt, model_name, tokenizer, model, generation_params)

In [7]:
query = "I just discovered this course. Can I still join it?"
course = "data-engineering-zoomcamp"

# Model: google/flan-t5-xl

In [8]:
model_name = "google/flan-t5-xl"
tokenizer, model = get_tokenizer_model(model_name)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
rag(query, course, model_name, tokenizer, model)



"Yes, even if you don't register, you're still eligible to submit the homeworks. Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute."

# Model: microsoft/Phi-3-mini-128k-instruct

In [10]:
model_name = "microsoft/Phi-3-mini-128k-instruct"
tokenizer, model = get_tokenizer_model(model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
generation_params = {
    "max_new_tokens": 500,
    "return_full_text": False,
    "temperature": 0.0,
    "do_sample": False,
}

rag(query, course, model_name, tokenizer, model, generation_params)

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
You are not running the flash-attention implementation, expect numerical differences.


"Yes, you can still join the course even if you discover it after the start date. You are eligible to submit homeworks even if you don't register. However, there are deadlines for final projects, so it's important not to wait until the last minute."

# Model: mistralai/Mistral-7B-v0.1

In [12]:
model_name = "mistralai/Mistral-7B-v0.1"
tokenizer, model = get_tokenizer_model(model_name)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [13]:
generation_params = {
    "max_new_tokens": 500,
    "temperature": 0.7,
    "top_p": 0.95,
    "num_return_sequences": 1,
}

rag(query, course, model_name, tokenizer, model, generation_params)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


'Yes, you can still join the course.'