# Lesson 2 - Retrieval Augmented Generation (RAG)

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pwd
%cd /content/drive/MyDrive/Coursera_new/Building-Applications-with-Vector-Databases/02_Retrieval-Augmented-Generation/
!ls -al

/content
/content/drive/MyDrive/Coursera_new/Building-Applications-with-Vector-Databases/02_Retrieval-Augmented-Generation
total 57
-rw------- 1 root root   748 Mar  4 06:25 DLAIUtils.py
drwx------ 2 root root  4096 Mar  4 06:23 images
-rw------- 1 root root 52907 Mar  4 07:00 Lesson_2_Retrieval_Augmented_Generation.ipynb


![Project Description](./images/2_retrieval_augmented_generation.png)

In [3]:
# !pip install -r /content/drive/MyDrive/Coursera_new/Building-Applications-with-Vector-Databases/requirements.txt

In [4]:
!pip install python-dotenv
!pip install numpy
!pip install pandas
!pip install scikit-learn
!pip install sentence-transformers
!pip install matplotlib
!pip install torch
!pip install langchain
!pip install openai
!pip install pinecone-client
!pip install pinecone-datasets
!pip install pinecone-text
!pip install tiktoken
!pip install tqdm
!pip install datasets
!pip install deepface

Collecting pinecone-text
  Using cached pinecone_text-0.8.0-py3-none-any.whl (23 kB)
Collecting mmh3<4.0.0,>=3.1.0 (from pinecone-text)
  Using cached mmh3-3.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38 kB)
Collecting types-requests<3.0.0,>=2.25.0 (from pinecone-text)
  Using cached types_requests-2.31.0.20240218-py3-none-any.whl (14 kB)
Collecting wget<4.0,>=3.2 (from pinecone-text)
  Using cached wget-3.2-py3-none-any.whl
Collecting urllib3<3,>=1.21.1 (from requests<3.0.0,>=2.25.0->pinecone-text)
  Using cached urllib3-2.2.1-py3-none-any.whl (121 kB)
Installing collected packages: wget, mmh3, urllib3, types-requests, pinecone-text
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.26.18
    Uninstalling urllib3-1.26.18:
      Successfully uninstalled urllib3-1.26.18
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the so

### Import  the Needed Packages

In [5]:
import warnings
warnings.filterwarnings('ignore')

In [6]:
from datasets import load_dataset
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from tqdm.auto import tqdm
from DLAIUtils import Utils

import ast
import os
import pandas as pd

### Setup Pinecone

In [7]:
# get api key
utils = Utils()
# PINECONE_API_KEY = utils.get_pinecone_api_key()   # Commented

# Below has been added
from google.colab import userdata
PINECONE_API_KEY = userdata.get('PINECONE_API_KEY')

# print(f"PINECONE_API_KEY: {PINECONE_API_KEY}")

In [8]:
pinecone = Pinecone(api_key=PINECONE_API_KEY)

INDEX_NAME = utils.create_dlai_index_name('dl-ai')

if INDEX_NAME in [index.name for index in pinecone.list_indexes()]:
  pinecone.delete_index(INDEX_NAME)

pinecone.create_index(name=INDEX_NAME,
                      dimension=1536,
                      metric='cosine',
                      spec=ServerlessSpec(cloud='aws', region='us-west-2'))

index = pinecone.Index(INDEX_NAME)

ForbiddenException: (403)
Reason: Forbidden
HTTP response headers: HTTPHeaderDict({'content-type': 'text/plain; charset=utf-8', 'access-control-allow-origin': '*', 'vary': 'origin,access-control-request-method,access-control-request-headers', 'access-control-expose-headers': '*', 'X-Cloud-Trace-Context': '9c0148567b55c9cdefde46a586209f50', 'Date': 'Mon, 04 Mar 2024 07:04:37 GMT', 'Server': 'Google Frontend', 'Content-Length': '136', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"error":{"code":"FORBIDDEN","message":"Index creation failed. To create serverless indexes, you must upgrade your plan."},"status":403}


### Load the Dataset

**Note:** To access the dataset outside of this course, just copy the following two lines of code and run it (remember to uncomment them first before executing):

####!wget -q -O lesson2-wiki.csv.zip "https://www.dropbox.com/scl/fi/yxzmsrv2sgl249zcspeqb/lesson2-wiki.csv.zip?rlkey=paehnoxjl3s5x53d1bedt4pmc&dl=0"

####!unzip lesson2-wiki.csv.zip

<p style="background-color:#fff1d7; padding:15px; "> <b>(Note: <code>max_articles_num = 500</code>):</b> To achieve a more comprehensive context for the Language Learning Model, a larger number of articles is generally more beneficial. In this lab, we've initially set <code>max_articles_num</code> to 500 for speedier results, allowing you to observe the outcomes faster. Once you've done an initial run, consider increasing this value to 750 or 1,000. You'll likely notice that the context provided to the LLM becomes richer and better. You can experiment by gradually raising this variable for different queries to observe the improvements in the LLM's contextual understanding.</p>

In [9]:
max_articles_num = 500

df = pd.read_csv('../data/wiki.csv', nrows=max_articles_num)
df.head()

Unnamed: 0,id,metadata,values
1,1-0,"{'chunk': 0, 'source': 'https://simple.wikiped...","[-0.011254455894231796, -0.01698738895356655, ..."
2,1-1,"{'chunk': 1, 'source': 'https://simple.wikiped...","[-0.0015197008615359664, -0.007858820259571075..."
3,1-2,"{'chunk': 2, 'source': 'https://simple.wikiped...","[-0.009930099360644817, -0.012211072258651257,..."
4,1-3,"{'chunk': 3, 'source': 'https://simple.wikiped...","[-0.011600767262279987, -0.012608098797500134,..."
5,1-4,"{'chunk': 4, 'source': 'https://simple.wikiped...","[-0.026462381705641747, -0.016362832859158516,..."


### Prepare the Embeddings and Upsert to Pinecone

In [10]:
prepped = []

for i, row in tqdm(df.iterrows(), total=df.shape[0]):
    meta = ast.literal_eval(row['metadata'])
    prepped.append({'id':row['id'],
                    'values':ast.literal_eval(row['values']),
                    'metadata':meta})
    if len(prepped) >= 250:
        index.upsert(prepped)
        prepped = []

  0%|          | 0/500 [00:00<?, ?it/s]

NameError: name 'index' is not defined

In [None]:
index.describe_index_stats()

### Connect to OpenAI

In [11]:
# OPENAI_API_KEY = utils.get_openai_api_key()   # Commented

OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')

# print(f"OPENAI_API_KEY: {OPENAI_API_KEY}")

openai_client = OpenAI(api_key=OPENAI_API_KEY)

In [12]:
def get_embeddings(articles, model="text-embedding-ada-002"):
   return openai_client.embeddings.create(input = articles, model=model)

### Run Your Query

In [14]:
query = "what is the berlin wall?"
embed = get_embeddings([query])
print(f"embed: {embed}")

res = index.query(vector=embed.data[0].embedding,
                  top_k=3,
                  include_metadata=True)
text = [r['metadata']['text'] for r in res['matches']]
print('\n'.join(text))

embed: CreateEmbeddingResponse(data=[Embedding(embedding=[-0.0017607873305678368, -0.009881123900413513, -0.018201706930994987, -0.016130192205309868, -0.01189739815890789, -0.004433040972799063, -0.03518812730908394, -0.01212526485323906, -0.0045642368495464325, -0.046374306082725525, -0.002818986074998975, -0.00528236199170351, -0.0112759442999959, 0.015577789396047592, -0.006307761650532484, 0.008154862560331821, 0.030354592949151993, -0.00317804841324687, -0.015412067994475365, -0.009950174950063229, -0.015867801383137703, -0.012705288827419281, 0.019610337913036346, -0.01781502552330494, 0.0010090002324432135, -0.013299123384058475, 0.02578345127403736, -0.0369834378361702, -0.009446105919778347, -0.014334880746901035, -0.011628101579844952, 0.00015795297804288566, -0.03654151409864426, 0.01138642430305481, -0.008969658054411411, 0.001470775343477726, 0.0010728718480095267, -0.011973354034125805, 0.018671249970793724, 0.013388888910412788, 0.02927740477025509, 0.012988395988941193

NameError: name 'index' is not defined

### Build the Prompt

In [None]:
query = "write an article titled: what is the berlin wall?"
embed = get_embeddings([query])

res = index.query(vector=embed.data[0].embedding,
                  top_k=3,
                  include_metadata=True)

contexts = [
    x['metadata']['text'] for x in res['matches']
]

prompt_start = (
    "Answer the question based on the context below.\n\n"+
    "Context:\n"
)

prompt_end = (
    f"\n\nQuestion: {query}\nAnswer:"
)

prompt = (
    prompt_start + "\n\n---\n\n".join(contexts) +
    prompt_end
)

print(prompt)

### Get the Summary

In [None]:
res = openai_client.completions.create(
    model="gpt-3.5-turbo-instruct",
    prompt=prompt,
    temperature=0,
    max_tokens=636,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
    stop=None
)
print('-' * 80)
print(res.choices[0].text)