<a href="https://colab.research.google.com/github/sajabdoli/car_manual/blob/main/RAG_car_handling_Llama.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Test a simple RAG (retrieval-augmented generation) pipeline that answers questions about a car manual using Llama 3.

In [None]:
!pip install sentence_transformers
!pip install faiss-gpu
!pip install -U bitsandbytes

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
from sentence_transformers import SentenceTransformer
import pandas as pd
import faiss
import os
from tqdm import tqdm
import numpy as np
import torch

In [None]:
# Authenticate this notebook session with the Hugging Face Hub.
# NOTE(review): presumably required so that the meta-llama checkpoint loaded
# further down can be downloaded — confirm the model's access requirements.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Read the Markdown file.
# Assumes the manual is UTF-8 encoded; being explicit keeps decoding
# independent of the platform's default locale.
with open('/content/sample_data/cleaned_car_manual.md', 'r', encoding='utf-8') as file:
    markdown_text = file.read()

# Split the text into sections on blank lines, dropping empty or
# whitespace-only chunks so they are never embedded or returned as context.
text_sections = [
    section for section in markdown_text.split('\n\n') if section.strip()
]
if not text_sections:
    raise ValueError("No text sections found in the manual; nothing to index.")

# Load Sentence Transformer model
st_model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode all sections in a single batched call (one row per section);
# the library draws its own progress bar.
embeddings = st_model.encode(text_sections, show_progress_bar=True)

# faiss expects a 2D float32 array
embeddings_array = np.asarray(embeddings, dtype='float32')

# Create a faiss index; row i of the index corresponds to text_sections[i]
dimension = embeddings_array.shape[1]
index = faiss.IndexFlatL2(dimension)  # L2 distance
index.add(embeddings_array)

100%|██████████| 2512/2512 [00:16<00:00, 148.51it/s]


In [None]:
# Retrieval function
def retrieve_docs(query, top_k=5):
    """Return the manual sections closest to ``query`` in embedding space.

    Args:
        query: The user's question as a string.
        top_k: Maximum number of sections to return.

    Returns:
        A list of at most ``top_k`` strings taken from ``text_sections``,
        ordered as returned by the faiss search. Empty when the index holds
        no vectors or ``top_k`` is not positive.
    """
    # Never ask faiss for more neighbours than the index contains.
    k = min(top_k, index.ntotal)
    if k <= 0:
        return []

    query_embedding = np.asarray(st_model.encode([query]), dtype='float32')

    _, indices = index.search(query_embedding, k)

    # faiss uses -1 as a "no result" label; skip it so that it is not
    # interpreted as a negative Python index (i.e. the last section).
    return [text_sections[i] for i in indices[0] if i >= 0]

In [None]:
# Load the text generation model
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# 4-bit quantization is requested through a BitsAndBytesConfig object; passing
# `load_in_4bit` directly as a model kwarg is deprecated.
quantization_config = transformers.BitsAndBytesConfig(load_in_4bit=True)

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    tokenizer=tokenizer,  # reuse the tokenizer loaded above
    pad_token_id=tokenizer.eos_token_id,
    model_kwargs={
        "torch_dtype": torch.bfloat16,
        "quantization_config": quantization_config,
    },
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
def generate_response(query, top_k=5, max_new_tokens=256):
    """Answer ``query`` using retrieved manual sections as context.

    The sections returned by ``retrieve_docs`` are joined into a single
    context string, placed in a system message, rendered with the tokenizer's
    chat template and passed to the text-generation pipeline.

    Args:
        query: The user's question.
        top_k: Number of manual sections to retrieve as context.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated text with the prompt removed.
    """
    # Retrieve relevant documents and create context
    docs = retrieve_docs(query, top_k=top_k)
    context = " ".join(docs)

    # Prepare messages with context for chat template
    messages = [
        {"role": "system", "content": "You are a helpful assitant for the user of Toyota verena"},
        {"role": "system", "content": f"Context: {context}"},
        {"role": "user", "content": query},
    ]

    tok = pipeline.tokenizer

    # Render the conversation to a prompt string ending with the assistant header
    prompt = tok.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # Stop on the tokenizer's EOS token and, when the vocabulary has it, on the
    # end-of-turn token "<|eot_id|>" so generation ends with the assistant's turn.
    terminators = [tok.eos_token_id]
    eot_id = tok.convert_tokens_to_ids("<|eot_id|>")
    if eot_id is not None and eot_id != tok.unk_token_id and eot_id not in terminators:
        terminators.append(eot_id)

    outputs = pipeline(
        prompt,
        max_new_tokens=max_new_tokens,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )

    # The pipeline returns prompt + completion; keep only the completion
    generated_text = outputs[0]['generated_text'][len(prompt):]
    return generated_text

In [None]:
# Simple interactive chat loop; type "exit" to stop.
while True:
    try:
        user_input = input("User: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or the prompt was interrupted: end the chat quietly
        break
    if not user_input:
        # Nothing typed; ask again instead of sending an empty query to the model
        continue
    if user_input.lower() == 'exit':
        break
    response = generate_response(user_input)
    print("Assistant:", response)

User: Hi, Introduce yourself please.
Assistant: Nice to meet you! I'm your helpful assistant for the Toyota Verona. I'm here to assist you with any questions, concerns, or tasks related to your Toyota Verona. Whether you're looking for information on maintenance, troubleshooting, or simply want to know more about your vehicle's features, I'm here to help. What can I assist you with today?
