### **Setup**

In [1]:
! pip install --upgrade --quiet llama-index llama-index-llms-gemini llama-index-embeddings-gemini llama-index-vector-stores-pinecone pinecone-client

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m215.5/215.5 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.4/15.4 MB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.4/137.4 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m215.9/215.9 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m598.7/598.7 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.6/320.6 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━

In [2]:
import os
from google.colab import userdata

# Copy the credentials stored under the Colab `userdata` keys into environment
# variables. Note the rename: the value stored as "GEMINI_API_KEY" is exported
# as GOOGLE_API_KEY.
# NOTE(review): presumably the Gemini and Pinecone clients created later read
# these variables, since none of them is given an explicit api_key — confirm.
os.environ["GOOGLE_API_KEY"] = userdata.get("GEMINI_API_KEY")
os.environ["PINECONE_API_KEY"] = userdata.get("PINECONE_API_KEY")

### **Embeddings**

In [3]:
from llama_index.embeddings.gemini import GeminiEmbedding

# Embedding model handed to VectorStoreIndex.from_vector_store further down.
# The model id is passed as `model_name`, the keyword GeminiEmbedding declares;
# the previous `model=` keyword was not a declared parameter, so the id was
# only in effect because it matched the constructor's default.
embed_model = GeminiEmbedding(
    model_name="models/embedding-001",
    title="Oppenheimer movie wikipedia",
    embed_batch_size=16,
)

### **Pinecone Vector Store**

In [4]:
from pinecone import Pinecone

# Create the Pinecone client. No arguments are passed; presumably it picks up
# PINECONE_API_KEY from the environment set in the setup cell — TODO confirm.
# NOTE(review): the variable has the same name as the `pinecone` package, so a
# later `import pinecone` would rebind it.
pinecone = Pinecone()
# Bare expression so the notebook displays the client's repr.
pinecone

<pinecone.control.pinecone.Pinecone at 0x7e54550354b0>

In [5]:
# Name of the Pinecone index to open.
# NOTE(review): nothing in this notebook creates or upserts into the index, so
# it is assumed to already exist and be populated — confirm.
INDEX_NAME = "rag"

# Handle to that index, obtained from the client created in the previous cell.
pinecone_index = pinecone.Index(INDEX_NAME)

In [6]:
from llama_index.vector_stores.pinecone import PineconeVectorStore
# NOTE(review): VectorStoreIndex is not used in this cell and is imported again
# (from llama_index.core) in the next cell.
from llama_index.core.indices import VectorStoreIndex

# Wrap the Pinecone index handle in a LlamaIndex PineconeVectorStore.
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)

In [7]:
from llama_index.core import VectorStoreIndex

# Build the index object on top of the existing vector store, passing the
# Gemini embedding model configured in the Embeddings section.
index_loaded = VectorStoreIndex.from_vector_store(
    vector_store=vector_store,
    embed_model=embed_model
)

### **Query Engine**

In [8]:
from google.generativeai.types import HarmCategory, HarmBlockThreshold

# Map every harm category listed below to BLOCK_NONE.
# (The note that used to sit here read "BLOCK_ONLY_HIGH" — presumably an
# alternative threshold that was considered.)
safety_settings = {
  category: HarmBlockThreshold.BLOCK_NONE
  for category in (
    HarmCategory.HARM_CATEGORY_HATE_SPEECH,
    HarmCategory.HARM_CATEGORY_HARASSMENT,
    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
  )
}

In [9]:
from llama_index.llms.gemini import Gemini

# Gemini LLM instance passed to the query engine below. It is configured with
# temperature 0, a max_tokens value of 256, and the safety_settings mapping
# defined in the previous cell.
llm = Gemini(
    model_name="models/gemini-pro",
    temperature=0,
    max_tokens=256,
    safety_settings=safety_settings
)

In [10]:
# Query engine over the loaded index, answering with the Gemini LLM above.
# similarity_top_k=3: presumably the number of retrieved chunks per query — confirm.
query_engine = index_loaded.as_query_engine(
    llm=llm,
    similarity_top_k=3,
)

In [11]:
# Quick check of the query engine with one sample question.
response = query_engine.query("What's the name of the actor that played Lewis Strauss?")
# Bare expression so the notebook displays the answer text.
response.response

'Robert Downey Jr.'

### **Gradio Demo**

In [12]:
! pip install --quiet gradio

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.9/315.9 kB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m142.5/142.5 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.7/8.7 MB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.2/47.2 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.8/60.8 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.9/129.9 kB[0m [31m12.8 MB

In [15]:
import gradio as gr

def generate(query):
  """Answer a question using the notebook's ``query_engine``.

  Args:
    query: The question text (the value of the "Question" textbox).

  Returns:
    The ``response`` attribute of the object returned by
    ``query_engine.query(query)``, or a short prompt asking for input when
    ``query`` is empty, ``None`` or whitespace-only.
  """
  # Nothing to ask: skip the retrieval + LLM round trip entirely.
  if not query or not query.strip():
    return "Please enter a question."

  return query_engine.query(query).response

# Gradio page: intro text, a question/answer pair of textboxes, Submit and
# Clear buttons, and a list of clickable example questions.
with gr.Blocks() as demo:
  gr.Markdown("""
  # Retrieval Augmented Generation with Gemini Pro and Pinecone: Question Answering demo
  ### This demo uses the Gemini Pro LLM and Pinecone Vector Search for fast and performant Retrieval Augmented Generation (RAG).
  ### The context is the new Oppenheimer movie's entire wikipedia page. The movie came out very recently in July, 2023, so the Gemini Pro model is not aware of it.
  Retrieval Augmented Generation (RAG) enables us to retrieve just the few small chunks of the document that are relevant to our query and inject them into our prompt.
  The model is then able to answer questions by incorporating knowledge from the newly provided document. RAG can be used with thousands of documents, but this demo is limited to just one txt file.
  """)

  gr.Markdown("## Enter your question")
  # Row 1: editable question box on the left, read-only answer box on the right.
  with gr.Row():
    with gr.Column():
      ques = gr.Textbox(label="Question", placeholder="Enter text here", lines=2)
    with gr.Column():
      ans = gr.Textbox(label="Answer", lines=4, interactive=False)
  # Row 2: Submit on the left; the Clear button is given both textboxes.
  with gr.Row():
    with gr.Column():
      btn = gr.Button("Submit")
    with gr.Column():
      clear = gr.ClearButton([ques, ans])

  # Clicking Submit calls generate() with the question and writes the result to the answer box.
  btn.click(fn=generate, inputs=[ques], outputs=[ans])
  # Example questions wired to the question textbox.
  examples = gr.Examples(
    examples=[
      "Who portrayed J. Robert Oppenheimer in the new Oppenheimer movie?",
      "In the plot of the movie, why did Lewis Strauss resent Robert Oppenheimer?",
      "What happened while Oppenheimer was a student at the University of Cambridge?",
      "How much money did the Oppenheimer movie make at the US and global box office?",
      "What score did the Oppenheimer movie get on Rotten Tomatoes and Metacritic?"
    ],
    inputs=[ques],
  )

# Turn on request queuing for the app, then start the server.
# With debug=True the cell keeps running in Colab so errors and logs stay
# visible; interrupt the cell to shut the server down.
demo.queue()
demo.launch(debug=True)

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://98a59190f15b904377.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://98a59190f15b904377.gradio.live


