### **Setup**

In [39]:
! pip install --upgrade --quiet llama-index llama-index-llms-gemini llama-index-embeddings-gemini llama-index-vector-stores-pinecone pinecone-client

In [40]:
! wget https://huggingface.co/spaces/rasyosef/RAG-with-Phi-2-and-LangChain/raw/main/Oppenheimer-movie-wiki.txt -P ./data

--2024-06-01 00:49:01--  https://huggingface.co/spaces/rasyosef/RAG-with-Phi-2-and-LangChain/raw/main/Oppenheimer-movie-wiki.txt
Resolving huggingface.co (huggingface.co)... 18.238.49.117, 18.238.49.70, 18.238.49.10, ...
Connecting to huggingface.co (huggingface.co)|18.238.49.117|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 51987 (51K) [text/plain]
Saving to: ‘./data/Oppenheimer-movie-wiki.txt.1’


2024-06-01 00:49:01 (679 KB/s) - ‘./data/Oppenheimer-movie-wiki.txt.1’ saved [51987/51987]



In [41]:
import os
from google.colab import userdata

# Copy the secrets stored in Colab's `userdata` into environment variables.
# Each pair is (environment variable to set, name of the Colab secret to read).
for env_var, secret_name in (
  ("GOOGLE_API_KEY", "GEMINI_API_KEY"),
  ("PINECONE_API_KEY", "PINECONE_API_KEY"),
):
  os.environ[env_var] = userdata.get(secret_name)

### **Load Data**

In [42]:
from llama_index.core import SimpleDirectoryReader

# Load everything found under ./data as llama-index documents.
reader = SimpleDirectoryReader(input_dir="./data")
documents = reader.load_data()

# Report how many documents were loaded.
document_count = len(documents)
print("Number of Documents:", document_count)

Number of Documents: 2


In [43]:
from llama_index.core.node_parser import SentenceSplitter

# Break the loaded documents into nodes (chunks) with a SentenceSplitter configured for
# chunk_size=512 and chunk_overlap=32.
node_parser = SentenceSplitter(chunk_size=512, chunk_overlap=32)
nodes = node_parser.get_nodes_from_documents(documents)

# Report how many nodes the splitter produced.
node_count = len(nodes)
print("Number of Nodes:", node_count)

Number of Nodes: 50


### **Embeddings**

In [44]:
from llama_index.embeddings.gemini import GeminiEmbedding

# Gemini embedding model used when the nodes are indexed.
# The model is selected with the `model_name` keyword; `model=` is not a constructor
# parameter of GeminiEmbedding, so a value passed that way does not choose the model.
# NOTE(review): "models/text-embedding-004" is expected to return 768-dimensional vectors,
# which is the dimension the Pinecone index in this notebook is created with — confirm.
embed_model = GeminiEmbedding(
    model_name="models/text-embedding-004",
    title="wikipedia page of the 'Oppenheimer' movie",
    embed_batch_size=16
)

### **Pinecone Vector Store**

In [45]:
from pinecone import Pinecone, ServerlessSpec

# Create the Pinecone client. No API key is passed explicitly here.
# NOTE(review): presumably the client picks up the PINECONE_API_KEY environment variable
# set earlier in the notebook — confirm against the Pinecone client docs.
pinecone = Pinecone()
# Bare expression: display the client object's repr as the cell output.
pinecone

<pinecone.control.pinecone.Pinecone at 0x7ca554db3f70>

In [46]:
INDEX_NAME = "rag"

# Start from a clean slate: if an index with this name already exists, delete it so the
# index created below contains only vectors from this run.
existing_index_names = pinecone.list_indexes().names()
if INDEX_NAME in existing_index_names:
  pinecone.delete_index(INDEX_NAME)

# Create a serverless index on AWS us-east-1 that scores by dot product.
# The dimension (768) has to equal the length of the vectors stored in the index.
serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
pinecone.create_index(
    name=INDEX_NAME,
    dimension=768,
    metric="dotproduct",
    spec=serverless_spec,
)

In [47]:
# Get a handle to the index named INDEX_NAME from the Pinecone client; this handle is
# what the vector store is built on.
pinecone_index = pinecone.Index(INDEX_NAME)

In [48]:
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.core.indices import VectorStoreIndex
from llama_index.core import StorageContext

# Wrap the Pinecone index handle in a llama-index vector store ...
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
# ... and build a storage context that uses it as the vector store.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [49]:
# Build the vector index over the parsed nodes, using the Pinecone-backed storage context
# and the Gemini embedding model. The recorded output shows vectors being upserted.
index = VectorStoreIndex(nodes=nodes, storage_context=storage_context, embed_model=embed_model)

Upserted vectors:   0%|          | 0/50 [00:00<?, ?it/s]

### **Query Engine**

In [50]:
from google.generativeai.types import HarmCategory, HarmBlockThreshold

# Safety settings passed to the Gemini LLM: every listed harm category is set to
# BLOCK_NONE. (BLOCK_ONLY_HIGH was noted by the author as an alternative threshold.)
safety_settings = {
  category: HarmBlockThreshold.BLOCK_NONE
  for category in (
    HarmCategory.HARM_CATEGORY_HATE_SPEECH,
    HarmCategory.HARM_CATEGORY_HARASSMENT,
    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
  )
}

In [51]:
from llama_index.llms.gemini import Gemini

# Gemini LLM used to write the answers: temperature 0, at most 256 output tokens, and the
# safety settings defined for this notebook.
llm_options = dict(
    model_name="models/gemini-pro",
    temperature=0,
    max_tokens=256,
    safety_settings=safety_settings,
)
llm = Gemini(**llm_options)

In [52]:
# Baseline query engine: retrieve the 3 most similar nodes and let the Gemini LLM answer
# from them (no reranking step).
query_engine = index.as_query_engine(llm=llm, similarity_top_k=3)

In [53]:
# Ask the baseline query engine a cast question.
response = query_engine.query("What's the name of the actor that played Lewis Strauss?")
# Bare expression: show the answer text as the cell output.
response.response

'Robert Downey Jr.'

In [54]:
# Ask the baseline query engine about a character.
response = query_engine.query("Who is Boris Pash?")
# Bare expression: show the answer text as the cell output.
response.response

'Boris Pash is a U.S. Army military intelligence officer and commander of the Alsos Mission.'

In [55]:
# Ask the baseline query engine for box-office figures.
response = query_engine.query("How much money did the Oppenheimer movie make at the US and global box office?")
# Bare expression: show the answer text as the cell output.
response.response

'In the United States and Canada, Oppenheimer grossed $326.4 million, while in other territories, it grossed $628.9 million, for a worldwide total of $955.3 million.'

In [56]:
# Ask the baseline query engine for the aggregated review scores.
response = query_engine.query("What score did the Oppenheimer movie get on Rotten Tomatoes and Metacritic?")
# Bare expression: show the answer text as the cell output.
response.response

'On Rotten Tomatoes, Oppenheimer received a score of 8.6/10 based on 495 critic reviews, with 93% of the reviews being positive. On Metacritic, the film received a score of 89 out of 100 based on 69 critic reviews, indicating "universal acclaim".'

In [57]:
# Ask the baseline query engine a plot question.
response = query_engine.query("In the plot of the Oppenheimer movie, why did Lewis Strauss have a grudge against J. Robert Oppenheimer?")
# Bare expression: show the answer text as the cell output.
response.response

'Strauss resented Oppenheimer for having publicly humiliated him by dismissing his concerns about exporting radioisotopes and for recommending negotiations with the Soviet Union after they successfully detonated their own bomb. He also believes that Oppenheimer denigrated him during a conversation Oppenheimer had with Einstein in 1947.'

In [58]:
# Ask the baseline query engine another plot question.
response = query_engine.query("What happened while Oppenheimer was a student at the University of Cambridge?")
# Bare expression: show the answer text as the cell output.
response.response

'While studying at the University of Cambridge, Oppenheimer grappled with anxiety and homesickness. He left a poisoned apple for his professor, Patrick Blackett, but later retrieved it. Visiting scientist Niels Bohr recommended that Oppenheimer study theoretical physics at the University of Göttingen instead.'

In [59]:
# Cast question for a supporting actor. In the recorded run the baseline engine (top-3
# retrieval) answered that the provided context does not mention this.
response = query_engine.query("Which character did Matthias Schweighöfer play in the movie?")
# Bare expression: show the answer text as the cell output.
response.response

'The provided context does not mention which character Matthias Schweighöfer played in the movie.'

In [60]:
# Cast question for another supporting actor. In the recorded run the baseline engine
# (top-3 retrieval) answered that the provided context does not mention this.
response = query_engine.query("Which character did Casey Affleck portray in the movie?")
# Bare expression: show the answer text as the cell output.
response.response

'The provided context does not mention Casey Affleck or the character he portrayed in the movie.'

In [61]:
# Inspect what was retrieved for the most recent `response`: print each source node's
# text followed by a dashed separator, then show how many source nodes there were.
sources = response.source_nodes
for node in sources:
  print(node.text, "\n\n------------------------------------------\n\n", sep="\n")

len(sources)

"[200]

Despite praising the film's themes and performances, CNN's Brian Lowry believed that "Nolan juggles a lot, in a way that somewhat works to the movie's detriment".[201] While praising how the film acknowledges the contribution of "American scientists and American enterprise", Brett Mason complained it omits the crucial contributions of non-Americans that ensured the work was able to commence as early as December 1941: "Nolan completely ignores the crucial role that British science and Australian physicist Mark Oliphant played in jump-starting the quest."[202] Writing for the Los Angeles Times, Justin Chang staunchly defended Nolan's accurate depiction of how Oppenheimer could not see the true victims of his work. Chang wrote that instead of satisfying "representational completists" by detouring to Hiroshima and Nagasaki, "Nolan treats them instead as a profound absence, an indictment by silence".[177]

For IndieWire's annual critics poll, of which 158 critics and journalists fro

3

### **Reranking with ColBERT**

In [None]:
! pip install --quiet llama-index-postprocessor-colbert-rerank

In [None]:
from llama_index.postprocessor.colbert_rerank import ColbertRerank

# ColBERT reranker: model and tokenizer both come from "colbert-ir/colbertv2.0"; it keeps
# the top 3 nodes and is configured with keep_retrieval_score=True.
colbert_reranker = ColbertRerank(
    model="colbert-ir/colbertv2.0",
    tokenizer="colbert-ir/colbertv2.0",
    top_n=3,
    keep_retrieval_score=True,
)

In [None]:
# Query engine with reranking: retrieve a wider pool of 16 nodes, then pass them through
# the ColBERT reranker as a node postprocessor before the LLM answers.
reranked_query_engine = index.as_query_engine(
    node_postprocessors=[colbert_reranker],
    similarity_top_k=16,
    llm=llm,
)

In [None]:
# Repeat the Matthias Schweighöfer cast question, this time through the reranked engine
# (16 retrieved nodes passed through the ColBERT reranker).
response = reranked_query_engine.query("Which character did Matthias Schweighöfer play in the movie?")
# Bare expression: show the answer text as the cell output.
response.response

'Werner Heisenberg'

In [None]:
# Repeat the Casey Affleck cast question, this time through the reranked engine
# (16 retrieved nodes passed through the ColBERT reranker).
response = reranked_query_engine.query("Which character did Casey Affleck portray in the movie?")
# Bare expression: show the answer text as the cell output.
response.response

'Boris Pash'

In [None]:
# Inspect the nodes behind the most recent reranked `response`: print each source node's
# text followed by a dashed separator, then show how many source nodes there were.
sources = response.source_nodes
for node in sources:
  print(node.text, "\n\n------------------------------------------\n\n", sep="\n")

len(sources)

Oppenheimer is a 2023 epic biographical thriller film written and directed by Christopher Nolan. It stars Cillian Murphy as J. Robert Oppenheimer, the American theoretical physicist credited with being the "father of the atomic bomb" for his role in the Manhattan Project—the World War II undertaking that developed the first nuclear weapons. Based on the 2005 biography American Prometheus by Kai Bird and Martin J. Sherwin, the film chronicles the career of Oppenheimer, with the story predominantly focusing on his studies, his direction of the Manhattan Project during World War II, and his eventual fall from grace due to his 1954 security hearing. The film also stars Emily Blunt as Oppenheimer's wife "Kitty", Matt Damon as head of the Manhattan Project Leslie Groves, Robert Downey Jr. as United States Atomic Energy Commission member Lewis Strauss, and Florence Pugh as Oppenheimer's communist lover Jean Tatlock. The ensemble supporting cast includes Josh Hartnett, Casey Affleck, Rami Male

3