# Introduction

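Retrieval-Augmented Generation (RAG) is a pattern that combines a pretrained Large Language Model (LLM) with your own data: passages relevant to a question are retrieved from your documents and passed to the model as context for generating the response.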


# Imports

In [1]:
# install required libraries

! pip install --quiet faiss-cpu ipywidgets langchain sentence_transformers pypdf

In [2]:
import os
from typing import Any, Dict, Iterable, List, Optional

import langchain
import requests
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from sentence_transformers import SentenceTransformer

In [3]:
# Credentials and project selection for the text-generation service.
# Values are taken from the environment when available so that a real key never
# has to be saved inside the notebook; the literals below are only fallbacks.
# Set WATSONX_APIKEY / WATSONX_PROJECT_ID before starting the kernel to override.
API_KEY = os.environ.get("WATSONX_APIKEY", "xxx")
PROJECT_ID = os.environ.get("WATSONX_PROJECT_ID", "72890882-3979-4af3-9f58-1dc597899546")

# RAG Steps

In [7]:
! wget -qq -O ED-e-KYC-2023.pdf https://raw.githubusercontent.com/randyphoa/watsonx/main/data/ED-e-KYC-2023.pdf
! wget -qq -O PD-RMiT-June2023.pdf https://raw.githubusercontent.com/randyphoa/watsonx/main/data/PD-RMiT-June2023.pdf
! wget -qq -O Happy-Hunt-T-Cs-Final-1.pdf https://raw.githubusercontent.com/randyphoa/watsonx/main/data/Happy-Hunt-T-Cs-Final-1.pdf

## 1. Chunking

Documents are split into smaller chunks so that the retrieved text fits within the model's token limit.

In [8]:
# Read the campaign terms-and-conditions PDF into LangChain documents.
loader = PyPDFLoader("Happy-Hunt-T-Cs-Final-1.pdf")
documents = loader.load()

# Break the documents into overlapping chunks (chunk_size=1000, chunk_overlap=10).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
texts = text_splitter.split_documents(documents)

texts

[Document(page_content='UEM  Sunrise  ‘Happy  Hunt’ Campaign  \n \n1. Introduction  \n1.1. These terms and conditions (“ Terms and Conditions ”) shall govern the UEM \nSunrise ‘Happy Hunt’ Campaign (“Campaign ”) organised by UEM Sunrise Berhad \n(“Organiser ”). \n \n1.2. The Organiser reserves the right at any time to  change, amend or delete the Terms \nand Conditions and other rules and regulations including changing of the mechanism \nfor the Campaign at its sole discretion.  \n \n1.3. The Organiser may terminate  or suspend the Campaign at any  time at its absolute \ndiscretio n, in which case,  the Organiser may elect not to award any giveaway(s). Such \ntermination or suspension will not give rise to any claim by the participants. If the \nCampaign is resumed by the Organiser, the participants shall abide by the Organiser’s \ndecisio n regarding the resumption of the Campaign and disposition of the giveaways. \nThe Campaign will be held during the Campaign Period as defined below

## 2. Tokenization and embeddings

In [9]:
class MiniLML6V2EmbeddingFunctionLangchain(langchain.embeddings.openai.Embeddings):
    """LangChain embeddings adapter backed by the ``all-MiniLM-L6-v2`` sentence-transformer."""

    # Instantiated once, when the class body executes, and shared by all instances.
    MODEL = SentenceTransformer("all-MiniLM-L6-v2")

    def embed_documents(self, texts):
        """Encode ``texts`` with the shared model and return the vectors as nested lists."""
        vectors = MiniLML6V2EmbeddingFunctionLangchain.MODEL.encode(texts)
        return vectors.tolist()

    def embed_query(self, query):
        """Encode a single ``query`` and return its embedding as a flat list."""
        # Encode the query as a one-element batch and unwrap its only row.
        return self.embed_documents([query])[0]

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Text is converted into embeddings: numeric vectors that capture its semantic meaning, so that passages with similar meaning lie close together in the vector space.

In [10]:
# Sanity check: embed one short sentence and display the resulting vector.
embedding = MiniLML6V2EmbeddingFunctionLangchain()
sample_vector = embedding.embed_query("Hello how are you?")
sample_vector

[0.0028318187687546015,
 0.03901984915137291,
 0.08902653306722641,
 0.07300358265638351,
 -0.02613072283565998,
 -0.0733506977558136,
 0.05532264709472656,
 -0.010837370529770851,
 -0.08899176865816116,
 0.017466692253947258,
 -0.0024859572295099497,
 -0.005507407709956169,
 -0.025227883830666542,
 -0.02106671966612339,
 0.07685411721467972,
 -0.034460268914699554,
 0.09116453677415848,
 -0.09042548388242722,
 -0.1149013340473175,
 0.04448229447007179,
 -0.0616748109459877,
 0.028660694137215614,
 0.03157806769013405,
 0.06953569501638412,
 -0.05539025366306305,
 -0.05994395166635513,
 0.030942391604185104,
 0.031256284564733505,
 0.037836797535419464,
 -0.08306162059307098,
 -0.05675555020570755,
 0.06849905848503113,
 -0.014742941595613956,
 -0.003990241792052984,
 -0.025798775255680084,
 0.05499803274869919,
 -0.02753622829914093,
 -0.11885888874530792,
 0.0049217501655220985,
 -0.014162867330014706,
 0.010688817128539085,
 -0.05209549516439438,
 -0.018154142424464226,
 -0.03590457

In [11]:
# Build a FAISS vector store from the chunks, embedding them with the MiniLM adapter.
db = FAISS.from_documents(
    documents=texts,
    embedding=MiniLML6V2EmbeddingFunctionLangchain(),
)

In [12]:
# Show the chunks the store considers most similar to a sample question.
db.similarity_search(query="What is the campaign period?")

[Document(page_content='1.4. For the avoidance  of doubt,  in the event of the termination, suspension  or \ncancellation  of the Campaign,  the participants  or the  winners  shall not be entitled to \nclaim compensation from the Organiser for any losses or damages suffered or incurred \nby the participants or the winners as a direct or indirect result of the act of termination, \nsuspension or cancellation.  \n \n1.5. The Campaign consists of:  \na. Happy Hunt Grand Lucky Draw; and  \nb. Happy Hunt Monthly Giveaway.  \n \n \n2. Campaign Period  \n2.1. The Campaign will  be held from 1 June  2023  to 31 December 2023.   \n \n2.2. The Organiser reserves the right to vary, postpone or re-schedule the dates of the \nCampaign or extend the Campaign Period at its sole discretion.  \n \n3. Eligibility  \n3.1 The Campaign  is open  to: \na. New registrants of UEM Sunrise’s properties;  \nb. New purchasers of UEM Sunrise’s properties;  \nc. Aged 18 and  above; and  \nd. Followe rs of UEM Sunr

## 3. Prompting

In [13]:
def make_prompt(context, question):
    """Build the question-answering prompt sent to the language model.

    The prompt has four parts, each separated by a blank line: the instruction,
    the retrieved context, the fallback rule for unanswerable questions, and the
    question itself.

    Args:
        context: Text retrieved from the documents that the answer must rely on.
        question: The user's question.

    Returns:
        str: The assembled prompt.
    """
    # Each section ends with "\n\n" so that sentences from adjacent sections do
    # not run together (e.g. "provided.Context:"), and the context is inserted
    # verbatim, without a trailing colon.
    return (
        "Answer the question using the context provided.\n\n"
        f"Context:\n\n{context}\n\n"
        'If the question is unanswerable, say "unanswerable".\n\n'
        f"Question: {question}"
    )



In [14]:
# Retrieve the five chunks most similar to the question and merge them into one
# context string for the prompt.
question = "What is the duration of the Campaign?"
docs = db.similarity_search(question, k=5)
context = " ".join(doc.page_content for doc in docs)
prompt = make_prompt(context, question)

## 4. Generation

In [15]:
def access_token():
    """Exchange ``API_KEY`` for an IAM access token.

    Returns:
        str: The ``access_token`` field of the IAM token response.

    Raises:
        requests.HTTPError: If the IAM endpoint responds with a 4xx/5xx status.
        requests.RequestException: If the request fails or times out.
    """
    url = "https://iam.cloud.ibm.com/identity/token"
    headers = {"Content-Type": "application/x-www-form-urlencoded"}
    # A dict body is form-encoded by requests, so an API key containing
    # reserved characters (&, =, +, ...) is transmitted intact.
    data = {
        "apikey": API_KEY,
        "grant_type": "urn:ibm:params:oauth:grant-type:apikey",
    }
    # The timeout keeps the cell from hanging indefinitely on a stalled connection.
    response = requests.post(url, headers=headers, data=data, timeout=30)
    # Report a rejected key as an HTTP error rather than a KeyError on the lookup below.
    response.raise_for_status()
    return response.json()["access_token"]

# access_token()

In [16]:
# Send the assembled prompt to the text-generation endpoint and display the answer.
url = "https://us-south.ml.cloud.ibm.com/ml/v1-beta/generation/text?version=2023-05-29"
headers = {
    "Content-Type": "application/json",
    "Accept": "application/json",
    "Authorization": f"Bearer {access_token()}",
}
payload = {
    "model_id": "google/flan-ul2",
    "input": prompt,
    "parameters": {
        # NOTE(review): "sample" combined with temperature 0.0 looks contradictory;
        # confirm whether "greedy" decoding was intended.
        "decoding_method": "sample",
        "max_new_tokens": 100,
        "min_new_tokens": 1,
        "random_seed": 12345,
        "stop_sequences": [],
        "temperature": 0.0,
        "top_k": 50,
        "top_p": 1,
        "repetition_penalty": 1,
    },
    "project_id": PROJECT_ID,
}

# The timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.post(url, headers=headers, json=payload, timeout=60)
# Fail with the HTTP status instead of a KeyError when the service rejects the request.
response.raise_for_status()
response.json()["results"][0]["generated_text"]

'from 1 June 2023 to 31 December 2023'