# Rag V2 Framework

In [None]:
from RAG.RAG import RAG
# Build the RAG pipeline. Every entry of `data_models` names one data source
# and states the LLM and embedding model configured for it.
rag = RAG(
    data_models=[
        # Disabled "travel" source, kept for reference:
        # {
        #     "name": "travel",
        #     "type": "db",
        #     "data": ["flights", "accommodations", "activities", "services", "insurance"],
        #     "LLM_model": "gemini-pro-ultra",
        #     "embedding_model": "models/embedding-001",
        # },
        {
            "name": "islam",
            "type": "db",
            "data": ["hadith", "quran"],
            "LLM_model": "gemini-pro",
            "embedding_model": "models/embedding-001",
        },
    ],
)

result = rag.query(
    # Alternative travel-planning query, kept for reference:
    # query_text="plan me a travel package: i want replicate the experience of Moses travelling from Egypt to sinai, with a side quest with Al-khidr next month",
    query_text="Tell me the narration on Al-Khidr and Musa (Moses) in the Hadith that supplements the Quran",
    # Other source combinations noted by the author: ["islam", "travel", "finance"]
    data_sources=["islam"],
    instruction_mapper={
        # The list form is meant to allow a sequential hybrid approach for
        # complex queries, combining the techniques listed below.
        "models": [
            # Rule-based NLP extraction, favored for efficiency.
            {"type": "NLP", "model": "spaCy"},
            # Disabled LLM stage that would be relied on to interpret subtleties:
            # {"type": "LLM", "model": "gemini-pro-ultra"},
        ],
    },
)

print(result)


## Ingestion

No Hadith found containing 'Al-Khidr'.


No Hadiths found in Hisn al-Muslim section.


In [7]:
from openai import api_key, Completion  # Assuming you're using the OpenAI API
def construct_prompt(instruction, text_chunk):
    """Assemble the prompt sent to the LLM for a single document chunk.

    Args:
        instruction: Parsing instruction inserted after "Instruction:".
        text_chunk: Chunk of document text inserted after "Document Text:".

    Returns:
        A three-line prompt string ending in "Answer:" so that the model
        continues with its answer.
    """
    prompt_template = "Instruction: {} \nDocument Text: {} \nAnswer:"
    return prompt_template.format(instruction, text_chunk)

def RagParse(parsing_instruction, result_type="markdown", data_path="./policy.pdf"):
    """
    Run an LLM-driven parsing instruction over a document and format the results.

    The file at ``data_path`` is read as text and split into chunks with
    ``split_document``. For every chunk a prompt is built with
    ``construct_prompt`` and sent through ``Completion.create``; the strings
    that ``extract_benefits_strings`` pulls out of each response are collected
    and finally handed to ``format_output``. Those helpers and ``Completion``
    are expected to be available in the enclosing module.

    Args:
        parsing_instruction: Instruction text placed in every per-chunk prompt.
        result_type: Output format identifier forwarded to ``format_output``.
        data_path: Path of the document to parse. Defaults to "./policy.pdf",
            the location that used to be hard-coded.

    Returns:
        Whatever ``format_output`` returns for the collected strings. When the
        file does not exist, an error is printed and ``format_output`` is
        called with an empty list instead of failing with ``KeyError``.
    """
    # ... (Setup your API Key if needed) ...

    def load_data(path):
        """Read the file at ``path``; return {} (after printing) if it is missing."""
        # NOTE(review): the file is opened in text mode, so a binary PDF will
        # probably not decode cleanly - confirm the input is plain text or add
        # a PDF text-extraction step.
        try:
            with open(path, "r") as f:
                document_text = f.read()
            return {"document_text": document_text}
        except FileNotFoundError:
            print(f"Error: File not found at {path}")
            return {}

    def process_document(document):
        """Send every chunk of ``document`` to the LLM and format what is extracted."""
        benefits_strings = []

        # A failed load yields an empty dict; indexing it used to raise
        # KeyError right after the "file not found" message was printed.
        if "document_text" not in document:
            return format_output(benefits_strings, result_type)

        # (1) Split document into potential sections or chunks
        document_chunks = split_document(document["document_text"])

        for chunk in document_chunks:
            # (2) LLM Prompt Construction for Each Chunk
            prompt = construct_prompt(parsing_instruction, chunk)

            # (3) Call the completion API
            # NOTE(review): the engine name refers to Gemini while the client
            # is the OpenAI `Completion` class - confirm this pairing works.
            response = Completion.create(
                engine="gemini-pro-ultra-0225",  # Or your preferred engine
                prompt=prompt,
                max_tokens=100,  # Adjust as needed
                # ... other API parameters as needed ...
            )

            # (4) Extract and Process Response
            extracted_strings = extract_benefits_strings(response.choices[0].text)
            benefits_strings.extend(extracted_strings)

        return format_output(benefits_strings, result_type)

    return process_document(load_data(data_path))


[]

# GCP AI Platform 
GCP AI Platform Notebooks come with the Rag V2 framework pre-installed. This notebook demonstrates how to use the framework to fine-tune a Rag model on a custom dataset stored in a Google Cloud Storage bucket.
- The dataset should be a JSONL file (one JSON object per line) in the following format:
```
{"text": "question", "meta": {"name": "document_name", "section": "document_section"}}
{"text": "answer", "meta": {"name": "document_name", "section": "document_section"}}
```

- The model is fine-tuned on the dataset using the `rag_v2` framework.
- The fine-tuned model is saved to a Google Cloud Storage bucket.


# Steps
1. Load the dataset from a Google Cloud Storage bucket.
2. Fine-tune the Rag model on the dataset.
3. Save the fine-tuned model to a Google Cloud Storage bucket.
4. Test the fine-tuned model.
5. Inference using the fine-tuned model + database of documents.

    a) Load the documents from a Google Cloud Storage bucket.

    b) Index the documents using an embedding model (LLM/NLP).

        i) Generate embeddings for the documents.

        ii) Index the embeddings

    c) Query the fine-tuned model with a question.

        i) Retrieve the top-k documents.

        ii) Extract the answer from the retrieved top-k documents.

    d) Retrieve the answer from the indexed documents.
    

## 1) GCP API
Test

In [1]:
import gspread

# Authenticate with the service-account key file and list the title of every
# spreadsheet returned by `openall()`.
# Alternate key file: './gdrive/lunar-landing-389714-369d3f1b2a09.json'
gc = gspread.service_account(
    filename='./gdrive/phrasal-ability-419201-d527372ace3b.json',
)
sheets = gc.openall()
print([spreadsheet.title for spreadsheet in sheets])

['Master Database']


In [5]:
from gdrive.gdrive_handler import GspreadHandler
# Load the "partners" worksheet of the "Master Database" spreadsheet through
# the project's GspreadHandler wrapper.
gspread_handler = GspreadHandler(
    credentials_filepath='./gdrive/phrasal-ability-419201-d527372ace3b.json',
)
df = gspread_handler.get_sheet_as_df(
    sheet_name="Master Database",
    worksheet_name="partners",
)


In [6]:
# Display the frame loaded from the "partners" worksheet of "Master Database".
df

Unnamed: 0,meta,data
0,Wang Gunung Trail Run 2.0,WANG GUNUNG\nTRAIL RUN 2.0\nCONQUER THE AGRO ...
1,Nakawan Ultra,NAKAWAUN ULTRA\nTHE THRILL OF NORTHERN TRAILS...
2,Perlis Eco X Venture,PERLIS\nECO X VENTURE \nLETS TRAVEL WITH OUR ...
3,Perlis EcoXVenture Experiences,Visit Perlis 2024 - 2025\nPerlis Indahnya Mem...
4,ClubRock Perlis 3D2N,MARK HISHAM\nGENERAL MANAGER\n\nm +6012 899 0...
5,Perlis Smart Leisure Travels,Penang MATTA Fair Special\n30.09 to 01.10.23\...
6,Perlis Eco Adventure Tours,**unicastanaholidays**\ns/bnd (KPK/LN: 1031)\...
7,Perlis Nature & Homestay Packages,PERLIS NATURE & HOMESTAY PACKAGES\n\n**Perlis...
8,Smart Leisure Travels Jom Discover Perlis,Explore \nPerlis\n<center> _Inilah Masanya_\n...
9,2D1N Harumanis Tour,MATTA FAIR\n\nSMART LEISURE TRAVELS\nSEE TRAV...
