### **Setup**

In [None]:
# Install LlamaIndex, the PyMongo driver and the MongoDB Atlas vector-store
# integration into the runtime. NOTE(review): versions are unpinned, so a fresh
# run may pull newer releases than the ones this notebook was written against.
! pip install --quiet llama-index pymongo llama-index-vector-stores-mongodb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m676.9/676.9 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.4/15.4 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.7/307.7 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m267.1/267.1 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m136.0/136.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m38.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━

In [None]:
# Download the source text into ./data. -nc (no-clobber) skips the download when
# the file is already present, so re-running this cell does not leave a second
# copy ("...txt.1") in ./data that would later be loaded as an extra document.
! wget -nc https://huggingface.co/spaces/rasyosef/RAG-with-Phi-2-and-LangChain/raw/main/Oppenheimer-movie-wiki.txt -P ./data

--2024-04-10 00:36:29--  https://huggingface.co/spaces/rasyosef/RAG-with-Phi-2-and-LangChain/raw/main/Oppenheimer-movie-wiki.txt
Resolving huggingface.co (huggingface.co)... 18.164.174.17, 18.164.174.55, 18.164.174.23, ...
Connecting to huggingface.co (huggingface.co)|18.164.174.17|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 51987 (51K) [text/plain]
Saving to: ‘./data/Oppenheimer-movie-wiki.txt’


2024-04-10 00:36:29 (785 KB/s) - ‘./data/Oppenheimer-movie-wiki.txt’ saved [51987/51987]



In [None]:
import os
from google.colab import userdata

# Credentials come from the Colab secret store, never from literals in the notebook.
# The OpenAI key is exported through the environment; the Atlas connection string
# is kept in a notebook-level constant for the MongoDB client created later.
os.environ.update(OPENAI_API_KEY=userdata.get("openai-api-key"))
MONGO_URI = userdata.get("MONGO_URI")

### **Load Data**

In [None]:
from llama_index.core import SimpleDirectoryReader

# Load the downloaded file(s) from ./data and report how many documents resulted.
reader = SimpleDirectoryReader(input_dir="./data")
documents = reader.load_data()
print("Number of Documents:", len(documents))

Number of Documents: 1


In [None]:
from llama_index.core.node_parser import SentenceSplitter

# Chunk the documents into nodes: chunk_size 512 with a chunk_overlap of 32
# between neighbouring chunks, then report how many nodes were produced.
node_parser = SentenceSplitter(
    chunk_overlap=32,
    chunk_size=512,
)
nodes = node_parser.get_nodes_from_documents(documents)
print("Number of Nodes:", len(nodes))

Number of Nodes: 25


### **Embeddings**

In [None]:
from llama_index.embeddings.openai import OpenAIEmbedding

# OpenAI embedding model used for both the stored chunks and the raw vector-search
# queries further down; embed_batch_size is set to 16.
embed_model = OpenAIEmbedding(embed_batch_size=16, model="text-embedding-3-small")

### **MongoDB Atlas Vector Store**

In [None]:
import pymongo
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
from llama_index.core.indices import VectorStoreIndex
from llama_index.core import StorageContext

# Connect to the cluster identified by the MONGO_URI secret.
mongodb_client = pymongo.MongoClient(MONGO_URI)

db = mongodb_client["oppenheimer"]
collection = db["oppenheimer_wiki_chunks"]

# Empty the collection so that re-running the notebook does not stack duplicate
# chunks on top of earlier runs. Only the deleted count is shown: the raw
# DeleteResult repr would dump cluster internals (clusterTime signature, keyId)
# into the saved notebook output.
delete_result = collection.delete_many({})
print("Number of Deleted Records:", delete_result.deleted_count)

DeleteResult({'n': 100, 'electionId': ObjectId('7fffffff0000000000000064'), 'opTime': {'ts': Timestamp(1712709399, 92), 't': 100}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1712709399, 101), 'signature': {'hash': b'O\x0fe\x7fv#\xb12-T%\x0b\xa7VK\xbfP\x88-\xdb', 'keyId': 7314014067373376376}}, 'operationTime': Timestamp(1712709399, 92)}, acknowledged=True)

In [None]:
# Bind the vector store to the same database/collection handles created above.
# The names are read from the handles instead of being repeated as literals, so
# the collection that gets cleared and the one that gets written can never
# drift apart.
store = MongoDBAtlasVectorSearch(
    mongodb_client=mongodb_client,
    db_name=db.name,
    collection_name=collection.name,
    index_name="vector_index",  # name of the search index passed to the store
    embedding_key="embedding",  # document field that holds the vector
)
storage_context = StorageContext.from_defaults(vector_store=store)

In [None]:
# Build the index from the parsed nodes, using the OpenAI embedding model and
# the MongoDB-backed storage context configured above.
index = VectorStoreIndex(
    nodes=nodes,
    embed_model=embed_model,
    storage_context=storage_context,
)

### **Query Engine**

In [None]:
from llama_index.llms.openai import OpenAI

# Chat model used to answer questions: temperature 0 to minimise sampling
# randomness, with max_tokens set to 256.
llm = OpenAI(model="gpt-3.5-turbo-0125", max_tokens=256, temperature=0)

In [None]:
# Query engine over the index: streaming responses, similarity_top_k of 3.
query_engine = index.as_query_engine(
    llm=llm,
    similarity_top_k=3,
    streaming=True,
)

In [None]:
# Factual lookup: which actor played a given character. The engine was created
# with streaming=True, so the response is printed via print_response_stream().
streaming_response = query_engine.query("What's the name of the actor that played Lewis Strauss?")
streaming_response.print_response_stream()

Robert Downey Jr.

In [None]:
# Reverse lookup: which character a given actor played.
streaming_response = query_engine.query("Which character did Matthias Schweighöfer play in the movie?")
streaming_response.print_response_stream()

Matthias Schweighöfer played the character Werner Heisenberg in the movie.

In [None]:
# Open-ended question about a single person.
streaming_response = query_engine.query("Who is Boris Pash?")
streaming_response.print_response_stream()

Boris Pash is a U.S. Army military intelligence officer and commander of the Alsos Mission.

In [None]:
# Another actor-to-character lookup.
streaming_response = query_engine.query("Which character did Casey Affleck play in the movie?")
streaming_response.print_response_stream()

Boris Pash

In [None]:
# Numeric question: US and global box-office figures in a single query.
streaming_response = query_engine.query("How much money did the Oppenheimer movie make at the US and global box office?")
streaming_response.print_response_stream()

Oppenheimer made $326.4 million in the United States and Canada, and $628.9 million in other territories, resulting in a worldwide total of $955.3 million.

In [None]:
# Numeric question that asks for two review-aggregator scores at once.
streaming_response = query_engine.query("What score did the Oppenheimer movie get on Rotten Tomatoes and Metacritic?")
streaming_response.print_response_stream()

The Oppenheimer movie received a score of 93% on Rotten Tomatoes and a score of 89 out of 100 on Metacritic.

In [None]:
# "Why" question about a character's motivation rather than a single fact.
streaming_response = query_engine.query("Why did Lewis Strauss have a grudge against J. Robert Oppenheimer?")
streaming_response.print_response_stream()

Lewis Strauss had a grudge against J. Robert Oppenheimer because Oppenheimer publicly humiliated him by dismissing his concerns about exporting radioisotopes and recommending negotiations with the Soviet Union after they successfully detonated their own bomb. Additionally, Strauss believed that Oppenheimer denigrated him during a conversation Oppenheimer had with Einstein in 1947.

In [None]:
# Plot question. The response object stays bound to `streaming_response`, whose
# source_nodes are inspected in the following cell.
streaming_response = query_engine.query("What happened while Oppenheimer was a student at the University of Cambridge?")
streaming_response.print_response_stream()

While Oppenheimer was a student at the University of Cambridge, he grappled with anxiety and homesickness while studying under experimental physicist Patrick Blackett. Upset with Blackett's attitude, Oppenheimer left him a poisoned apple but later retrieved it. Niels Bohr recommended that Oppenheimer study theoretical physics at the University of Göttingen.

In [None]:
# Show the text of every chunk attached to the most recent answer, each followed
# by a divider, then display how many chunks there were.
sources = streaming_response.source_nodes
for node in sources:
  # A single print emits the chunk text and the divider on separate lines.
  print(node.text, "\n\n------------------------------------------\n\n", sep="\n")

len(sources)

Dane DeHaan as Maj Gen. Kenneth Nichols, a U.S. Army officer and the deputy district engineer of the Manhattan Project.
Alden Ehrenreich as a Senate aide to Lewis Strauss, a fictional character who is an aide during Strauss's nomination for United States Secretary of Commerce.
Tony Goldwyn as Gordon Gray, a government official and chairman of the committee deciding the revoking of Oppenheimer security clearance.
Jefferson Hall as Haakon Chevalier, a Berkeley professor who became friends with Oppenheimer at university.
David Krumholtz as Isidor Isaac Rabi, a Nobel Prize-winning physicist who worked as a consultant on the Manhattan Project.
Matthew Modine as Vannevar Bush, head of the Office of Scientific Research and Development.
Scott Grimes as Counsel to Lewis Strauss.
Kurt Koehler as Thomas A. Morgan, an industrialist and former chairman of the board of the Sperry Corporation who was one of the panel members at Oppenheimer's security clearance hearing.
John Gowans as Ward V. Evans, a

3

### **Vector Search**

In [None]:
def query_atlas(query, limit=3, num_candidates=128):
  """Run a raw Atlas ``$vectorSearch`` aggregation for ``query``.

  The query text is embedded with the notebook-level ``embed_model`` and searched
  against the ``embedding`` field of the notebook-level ``collection`` through
  the ``vector_index`` search index.

  Args:
    query: Text to search for.
    limit: Maximum number of matches to return. Defaults to 3.
    num_candidates: Value sent as ``numCandidates``. Defaults to 128. Atlas does
      not accept a value below ``limit``, so it is raised to ``limit`` when
      necessary.

  Returns:
    A list of dicts with the keys ``text`` and ``score`` (the
    ``vectorSearchScore`` metadata), in the order the aggregation yields them.

  Raises:
    ValueError: If ``limit`` is smaller than 1.
  """
  if limit < 1:
    raise ValueError("limit must be a positive integer")

  query_embedding = embed_model.get_query_embedding(query)

  vector_search_stage = {
    "$vectorSearch": {
      "index": "vector_index",
      "path": "embedding",
      "queryVector": query_embedding,
      # numCandidates may not be smaller than the number of documents requested.
      "numCandidates": max(num_candidates, limit),
      "limit": limit,
    }
  }
  # Keep only the chunk text and the similarity score; drop Mongo's _id.
  projection_stage = {
    "$project": {
      "_id": 0,
      "text": 1,
      "score": {"$meta": "vectorSearchScore"},
    }
  }

  results = collection.aggregate([vector_search_stage, projection_stage])
  # aggregate() hands back its results lazily; materialise them for the caller.
  return list(results)

In [None]:
# Run the raw vector search with a question that was also put to the query
# engine earlier, to see which chunks (and scores) Atlas returns for it.
query_atlas("Who is Boris Pash?")

[{'text': 'In 1963, President Lyndon B. Johnson presents Oppenheimer with the Enrico Fermi Award as a gesture of political rehabilitation. A flashback reveals Oppenheimer and Einstein\'s 1947 conversation never mentioned Strauss. Oppenheimer instead expressed his belief that they had indeed started a chain reaction—a nuclear arms race—that would one day destroy the world.\n\nCast\nCillian Murphy as J. Robert Oppenheimer, a theoretical physicist and director of the Los Alamos National Laboratory.\nEmily Blunt as Katherine "Kitty" Oppenheimer, Robert Oppenheimer\'s wife and a former Communist Party USA member.\nMatt Damon as Gen. Leslie Groves, a United States Army Corps of Engineers (USACE) officer and director of the Manhattan Project.\nRobert Downey Jr. as Rear Admiral Lewis Strauss, a retired Naval officer and high-ranking member of the U.S. Atomic Energy Commission (AEC).\nFlorence Pugh as Jean Tatlock, a psychiatrist, Communist Party USA member, and Robert Oppenheimer\'s romantic i