This document is inspired by Madhav Thaker's [tutorial](https://medium.com/@thakermadhav/build-your-own-rag-with-mistral-7b-and-langchain-97d0c92fa146)

First, we need to change the environment to use the GPU:

1. Entorno de ejecucción
2. Cambiar entorno de ejecución
3. Seleccionar T4 GPU





## Load dependencies

In [2]:
!pip install -q -U torch datasets transformers tensorflow langchain playwright html2text sentence_transformers faiss-cpu
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 trl==0.4.7

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m670.2/670.2 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m475.2/475.2 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m794.4/794.4 kB[0m [31m48.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.2/37.2 MB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m44.7 

In [3]:
!pip install git+https://github.com/huggingface/transformers

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-85bjfash
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-85bjfash
  Resolved https://github.com/huggingface/transformers to commit fa21ead73db473d88f8eca1ec244aba776fd9047
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-4.37.0.dev0-py3-none-any.whl size=8281619 sha256=2a75ddec47f71a69106cdcd62b532c8752e74d7894232c9d23a2a48b06e11199
  Stored in directory: /tmp/pip-ephem-wheel-cache-id0r5bo4/wheels/c0/14/d6/6c9a5582d2ac191ec0a483be151a4495fe1eb2a6706ca49f1b
Successfully built transformers

## Load a quantized Mistral-7B Model

In [1]:
# Imports
import os
import torch
import nest_asyncio

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline
)
from datasets import load_dataset
from peft import LoraConfig, PeftModel

from langchain.text_splitter import CharacterTextSplitter
from langchain.document_transformers import Html2TextTransformer
from langchain.document_loaders import AsyncChromiumLoader

from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.llms import HuggingFacePipeline
from langchain.chains import LLMChain

In [3]:
#################################################################
# Tokenizer
#################################################################

model_name='mistralai/Mistral-7B-Instruct-v0.1'
# model_name = 'mistralai/Mixtral-8x7B-Instruct-v0.1'

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [4]:
#################################################################
# bitsandbytes parameters
#################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

In [5]:
# Set up quantization config
#################################################################
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

In [6]:
#################################################################
# Load pre-trained config
#################################################################
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config
)

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

### Testing the model

In [7]:
inputs_not_chat = tokenizer.encode_plus("[INST] Tell me about fantasy football? [/INST]", return_tensors="pt")['input_ids'].to('cuda')

generated_ids = model.generate(inputs_not_chat,
                               max_new_tokens=1000,
                               do_sample=True)
decoded = tokenizer.batch_decode(generated_ids)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [8]:
print(decoded[0])

<s> [INST] Tell me about fantasy football? [/INST] Fantasy football is a popular game in which players imagine themselves as general managers of a fictional football team, and compete against other teams in a virtual league. In fantasy football, players select real-life football players to be on their team, and use their performance data in the actual games to score points based on certain criteria, such as rushing yards, receiving yards, passing yards, and touchdowns. The team with the most points at the end of the season, or the most points in a specific week, wins. Fantasy football leagues can be competitive, as teams must use strategy and teamwork to pick the best players, and can involve a lot of research and analysis to find the best players. Additionally, many fantasy football games include special features such as trades, waiver claims, and rookie drafts, providing players with more flexibility to build their teams.</s>


## Creating a RAG

### Create vector database

In [9]:
from langchain.document_loaders import TextLoader

FILE_PATH = 'ANEXO_14_Vol._1-29-47.txt'
with open(FILE_PATH, 'r+', encoding="utf-8") as file:
    # Step 2: Read the contents
    doc = file.read()

loader = TextLoader(FILE_PATH, encoding="utf-8")
documents = loader.load()

# Chunk text
text_splitter = CharacterTextSplitter(chunk_size=100,
                                      chunk_overlap=0)

chunked_documents = text_splitter.split_documents(documents)

# Load chunked documents into the FAISS index
db = FAISS.from_documents(chunked_documents,
                          HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2'))


# Connect query to FAISS index using a retriever
retriever = db.as_retriever(
    search_type="similarity",
    search_kwargs={'k': 4}
)



.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [10]:
query = 'What is the recommended longitudinal slope on a runway for code number 3?'
docs = db.similarity_search(query)
print(docs[1].page_content)

Recommendation.— The slope computed by dividing the difference between the maximum and minimum elevation along the runway centre line by the runway length should not exceed:
1 per cent where the code number is 3 or 4; and
2 per cent where the code number is 1 or 2.


## Create LLM Chain

In [11]:
text_generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    temperature=0.0,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=300,
)

prompt_template = """
### [INST]
Instruction: As an aerospace engineer, you are tasked with assisting users in addressing inquiries related to airport design. Here is context to help:

{context}

### QUESTION:
{question}

[/INST]
 """

mistral_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

# Create prompt from prompt template
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

# Create llm chain
llm_chain = LLMChain(llm=mistral_llm, prompt=prompt)

In [12]:
llm_chain.invoke({"context":"",
                  "question": "What is the recommended longitudinal slope on a runway for code number 3?"})

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


{'context': '',
 'question': 'What is the recommended longitudinal slope on a runway for code number 3?',
 'text': "\nThe recommended longitudinal slope on a runway for code number 3 is typically between 2% and 5%. The specific slope will depend on various factors such as the type of aircraft that will be using the runway, the terrain of the area, and local regulations. It's important to consult with a qualified airport designer or engineer to determine the appropriate slope for your specific situation."}

### Create RAG chain

import streamlit as st

In [13]:
query = 'What is the recommended longitudinal slope on a runway for code number 3?'
retriever = db.as_retriever()

rag_chain = (
 {"context": retriever, "question": RunnablePassthrough()}
    | llm_chain
)

result = rag_chain.invoke(query)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [14]:
result

{'context': [Document(page_content='1.25 per cent where the code number is 4, except that for the first and last quarter of the length of the runway the longitudinal slope should not exceed 0.8 per cent;', metadata={'source': 'ANEXO_14_Vol._1-29-47.txt'}),
  Document(page_content='Recommendation.— The slope computed by dividing the difference between the maximum and minimum elevation along the runway centre line by the runway length should not exceed:\n1 per cent where the code number is 3 or 4; and\n2 per cent where the code number is 1 or 2.', metadata={'source': 'ANEXO_14_Vol._1-29-47.txt'}),
  Document(page_content='1.5 per cent where the code number is 3, except that for the first and last quarter of the length of a precision approach runway category II or III the longitudinal slope should not exceed 0.8 per cent; and\n2 per cent where the code number is 1 or 2.', metadata={'source': 'ANEXO_14_Vol._1-29-47.txt'}),
  Document(page_content='Recommendation.— Along no portion of a run

In [15]:
result['context']

[Document(page_content='1.25 per cent where the code number is 4, except that for the first and last quarter of the length of the runway the longitudinal slope should not exceed 0.8 per cent;', metadata={'source': 'ANEXO_14_Vol._1-29-47.txt'}),
 Document(page_content='Recommendation.— The slope computed by dividing the difference between the maximum and minimum elevation along the runway centre line by the runway length should not exceed:\n1 per cent where the code number is 3 or 4; and\n2 per cent where the code number is 1 or 2.', metadata={'source': 'ANEXO_14_Vol._1-29-47.txt'}),
 Document(page_content='1.5 per cent where the code number is 3, except that for the first and last quarter of the length of a precision approach runway category II or III the longitudinal slope should not exceed 0.8 per cent; and\n2 per cent where the code number is 1 or 2.', metadata={'source': 'ANEXO_14_Vol._1-29-47.txt'}),
 Document(page_content='Recommendation.— Along no portion of a runway should the 

In [16]:
result['question']

'What is the recommended longitudinal slope on a runway for code number 3?'

In [17]:
result['text']

'\nThe recommended longitudinal slope on a runway for code number 3 is 1%.'

In [18]:
rag_chain.invoke('What are the recommendations for the radio altimeter operating area?')['text']

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


' The recommendations for the radio altimeter operating area are as follows:\n\n  - A radio altimeter operating area should extend before the threshold for a distance of at least 300 m.\n  - A radio altimeter operating area should be established in the pre-threshold area of a precision approach runway.\n\n  These recommendations can be found in ANEXO_14_Vol._1-29-47.txt. It is important to note that guidance on radio altimeter operating area is also provided in Attachment A, Section 4.3 and in the Manual of All-Weather Operations, (Doc 9365), Section 5.2. Additionally, guidance on the use of radio altimeter is given in the PANS-OPS, Volume II, Part III, Chapter 21.'