If you're not running in Saturn Cloud, you need to install these libraries.

Make sure you use the latest versions:

```
pip install -U transformers accelerate bitsandbytes sentencepiece
```

In [None]:
# Google Colab Notebook init cell
!pip install -U transformers accelerate bitsandbytes sentencepiece

In [2]:
# Google Colab Notebook init cell
# Delete any previous copy first so the fresh download is stored as minsearch.py.
!rm -f minsearch.py
# Fetch the minsearch module that is imported by a later cell.
!wget https://github.com/quickSilverShanks/DataWhiz-Chat/raw/main/01__Intro/minsearch.py

--2024-06-30 13:35:54--  https://github.com/quickSilverShanks/DataWhiz-Chat/raw/main/01__Intro/minsearch.py
Resolving github.com (github.com)... 140.82.113.3
Connecting to github.com (github.com)|140.82.113.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/quickSilverShanks/DataWhiz-Chat/main/01__Intro/minsearch.py [following]
--2024-06-30 13:35:54--  https://raw.githubusercontent.com/quickSilverShanks/DataWhiz-Chat/main/01__Intro/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3838 (3.7K) [text/plain]
Saving to: ‘minsearch.py’


2024-06-30 13:35:54 (66.6 MB/s) - ‘minsearch.py’ saved [3838/3838]



In [3]:
import os
import requests
import minsearch

from transformers import T5Tokenizer, T5ForConditionalGeneration

In [4]:
# If running locally on Windows, uncomment the lines below to change the Hugging Face model download location.
# (Raw string, so the backslashes in the path are not treated as escape sequences.)
# hf_cache_dir = r"D:\.cache\huggingface"
# os.environ['HF_HOME'] = hf_cache_dir

In [5]:
# Download the FAQ data: a list of courses, each carrying its own list of documents.
docs_url = "https://github.com/quickSilverShanks/DataWhiz-Chat/raw/main/01__Intro/documents_upd29jun.json"
docs_response = requests.get(docs_url, timeout=60)
# Stop here with a clear HTTP error rather than a confusing JSON decoding error below.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten into one list, tagging every document with the name of its course.
# A new dict is built per document so documents_raw itself is left unmodified.
documents = [
    {**doc, "course": course["course"]}
    for course in documents_raw
    for doc in course["documents"]
]

# "question", "text" and "section" are indexed as text fields; "course" is a keyword field.
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

# Kept as the last expression so the fitted index is shown as the cell output.
index.fit(documents)

<minsearch.Index at 0x7e09fe856830>

In [6]:
# When running locally on Windows, also pass cache_dir=hf_cache_dir to both from_pretrained calls below.
MODEL_NAME = "google/flan-t5-xl"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, device_map="auto")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


model.safetensors.index.json:   0%|          | 0.00/53.0k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.45G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [7]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Look up the FAQ documents that best match ``query``.

    Args:
        query: Free-text question passed to the module-level ``index``.
        course: Value the ``course`` field of a document must match.
            Defaults to the course that used to be hard-coded here.
        num_results: Maximum number of documents to request. Defaults to 5.

    Returns:
        The result of ``index.search`` for these arguments.
    """
    # Matches in "question" get a 3.0 boost, matches in "section" a 0.5 boost.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [8]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt from the question and the retrieved FAQ entries.

    Every entry in ``search_results`` must provide the keys ``section``,
    ``question`` and ``text``; the entries are rendered in order under
    ``CONTEXT``. Surrounding whitespace is stripped from the finished prompt.
    """
    rendered_entries = [
        f"section: {entry['section']}\nquestion: {entry['question']}\nanswer: {entry['text']}\n\n"
        for entry in search_results
    ]

    template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

    filled = template.format(question=query, context="".join(rendered_entries))
    return filled.strip()

In [9]:
def rag(query, llm_gen):
    """Answer ``query`` with retrieval-augmented generation.

    Retrieves matching FAQ entries with ``search``, turns them into a prompt
    with ``build_prompt``, and returns whatever ``llm_gen`` produces for that
    prompt. ``llm_gen`` is any callable that accepts the prompt string.
    """
    return llm_gen(build_prompt(query, search(query)))

In [13]:
def llm_basic(prompt):
    """Generate text for ``prompt`` with the model's default generation settings.

    Baseline generator: no generation arguments are passed to
    ``model.generate`` and the output is decoded without removing special
    tokens, so markers such as ``<pad>`` can appear in the returned string.

    Args:
        prompt: Full prompt text to feed to the model.

    Returns:
        The decoded first generated sequence.
    """
    # Move the inputs to the device the model was actually loaded on instead
    # of assuming a GPU is present (device_map="auto" may place it on CPU).
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
    outputs = model.generate(input_ids)
    return tokenizer.decode(outputs[0])

In [18]:
def llm(prompt, generate_params=None):
    """Generate an answer for ``prompt`` with configurable generation settings.

    Args:
        prompt: Full prompt text to feed to the model.
        generate_params: Optional dict of overrides. Recognised keys and
            their defaults: ``max_length`` (100), ``num_beams`` (5),
            ``do_sample`` (False), and, used only when ``do_sample`` is
            true, ``temperature`` (1.0), ``top_k`` (50), ``top_p`` (0.95).

    Returns:
        The decoded first generated sequence with special tokens removed.
    """
    if generate_params is None:
        generate_params = {}

    do_sample = generate_params.get("do_sample", False)
    generate_kwargs = {
        "max_length": generate_params.get("max_length", 100),
        "num_beams": generate_params.get("num_beams", 5),
        "do_sample": do_sample,
    }
    if do_sample:
        # Sampling-only settings: forwarding them while do_sample is False has
        # no effect on the output and makes transformers emit warnings.
        generate_kwargs["temperature"] = generate_params.get("temperature", 1.0)
        generate_kwargs["top_k"] = generate_params.get("top_k", 50)
        generate_kwargs["top_p"] = generate_params.get("top_p", 0.95)

    # Move the inputs to the device the model was actually loaded on instead
    # of assuming a GPU is present (device_map="auto" may place it on CPU).
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
    outputs = model.generate(input_ids, **generate_kwargs)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [15]:
# Baseline run: answer the question with the default-settings generator.
rag(query="I just discovered the course. Can I still join it?", llm_gen=llm_basic)



"<pad>Yes, even if you don't register, you're still eligible to submit the"

In [16]:
# Same question, answered with the generator that sets explicit generation parameters.
rag(query="I just discovered the course. Can I still join it?", llm_gen=llm)



"Yes, even if you don't register, you're still eligible to submit the homeworks. Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute."