In [84]:
!pip install llama-index openai python-dotenv
!pip install -r mongodb-demo/requirements.txt

In [206]:
import os
import time
import openai
from llama_index import (
    VectorStoreIndex,
    SummaryIndex,
    SimpleKeywordTableIndex,
    SimpleDirectoryReader,
    ServiceContext,
)
from llama_index.schema import IndexNode
from llama_index.tools import QueryEngineTool, ToolMetadata
from llama_index.llms import OpenAI
from dotenv import load_dotenv
from tqdm.auto import tqdm

In [194]:
# Pull variables from a local .env file into the process environment.
load_dotenv()

# Fail early with a readable message: writing None into os.environ would
# otherwise raise an opaque TypeError when the key is missing.
openai_api_key = os.getenv('OPENAI_API_KEY')
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to your .env file or environment."
    )

os.environ["OPENAI_API_KEY"] = openai_api_key
openai.api_key = openai_api_key

## Load and store data

In [195]:
# Wikipedia page titles to download; later cells also use each title as a
# dictionary key and as part of file and index names.
wiki_titles = ["Microsoft", "Google"]

In [198]:
from pathlib import Path

import requests

# Create the output directory once, up front, rather than re-checking it on
# every iteration of the loop.
data_path = Path("data")
data_path.mkdir(exist_ok=True)

for title in wiki_titles:
    # Request the plain-text extract of the page from the MediaWiki API.
    http_response = requests.get(
        "https://en.wikipedia.org/w/api.php",
        params={
            "action": "query",
            "format": "json",
            "titles": title,
            "prop": "extracts",
            # 'exintro': True,
            "explaintext": True,
        },
        timeout=30,  # do not hang the notebook forever on a stalled connection
    )
    # Surface HTTP errors here instead of failing later on a malformed payload.
    http_response.raise_for_status()
    response = http_response.json()

    # The result is keyed by page id; exactly one title was requested, so take
    # the first (only) entry.
    page = next(iter(response["query"]["pages"].values()))
    if "extract" not in page:
        raise ValueError(f"Wikipedia returned no text extract for {title!r}")
    wiki_text = page["extract"]

    # Write as UTF-8 explicitly: the article text contains non-ASCII characters
    # and the platform default encoding may not be able to represent them.
    with open(data_path / f"{title}.txt", "w", encoding="utf-8") as fp:
        fp.write(wiki_text)

In [199]:
# Read every downloaded article back from disk, keyed by its wiki title.
# (The name `city_docs` is historical; the documents are company articles.)
city_docs = {
    wiki_title: SimpleDirectoryReader(
        input_files=[f"data/{wiki_title}.txt"]
    ).load_data()
    for wiki_title in wiki_titles
}

In [200]:
import json

# One record per company, holding the text of the first loaded document in
# the "full_text" field.
mongodb_company_text = [
    {"full_text": docs[0].text} for docs in city_docs.values()
]

# Write to the path the loading cell reads ('mongodb-demo/mongodb_company.json').
# The original wrote to the working directory instead, so the freshly generated
# data was never the data that got loaded.
with open('mongodb-demo/mongodb_company.json', 'w') as file:
    json.dump(mongodb_company_text, file)

In [201]:
import os
import json
import pymongo
# from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

json_file = 'mongodb-demo/mongodb_company.json'

# Load the company records from the local JSON file.
with open(json_file, 'r') as f:
    company_data = json.load(f)

# Create a new client and connect to the server.
# The connection string is read from the MONGODB_URI environment variable
# (e.g. via .env) so that credentials never live in the notebook.
# NOTE(review): the password that used to be embedded here has been exposed
# and should be rotated.
client = pymongo.MongoClient(os.environ["MONGODB_URI"])
db = client[os.getenv("MONGODB_DATABASE")]
collection = db[os.getenv("MONGODB_COLLECTION")]

# Nothing is inserted here; the agent-building loop inserts the records one at
# a time.
# collection.insert_one(company_data[0])
# collection.insert_many(company_data)

In [202]:
from llama_index.readers.mongo import SimpleMongoReader
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch

In [154]:
# Empty filter: no restriction is placed on which documents are read.
query_dict = {}

# Connection string comes from the environment so that credentials are not
# stored in the notebook.
reader = SimpleMongoReader(uri=os.environ["MONGODB_URI"])

# Only the "full_text" field of each stored record is requested.
documents = reader.load_data(
    "wiki_company_db",
    "wiki_company_collection",
    field_names=["full_text"],
    query_dict=query_dict,
)

In [157]:
# Create a new client and connect to the server.
# `MongoClient` is never imported on its own in this notebook, so go through
# the `pymongo` module; the URI is read from the environment rather than being
# hard-coded with credentials.
client = pymongo.MongoClient(os.environ["MONGODB_URI"], server_api=ServerApi('1'))

# create Atlas as a vector store
store = MongoDBAtlasVectorSearch(
    client,
    db_name="wiki_company_db",
    collection_name="wiki_company_vectors",
    index_name="wiki_company_vector_index"
)

In [158]:
# StorageContext is otherwise only imported in the "Inference" section, which
# makes this cell fail with a NameError on a fresh kernel; import it here.
from llama_index import StorageContext

# Route the index's vector storage to the Atlas-backed store, then build the
# index from the documents loaded from MongoDB.
storage_context = StorageContext.from_defaults(vector_store=store)
index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context,
    show_progress=True
)

Parsing documents into nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/16 [00:00<?, ?it/s]

In [159]:
# Smoke-test the Atlas-backed index with a one-word query.
# similarity_top_k=3 presumably limits retrieval to the three closest matches.
query_engine = index.as_query_engine(similarity_top_k=3)
response = query_engine.query("Microsoft")
# Prints the response object's string form.
print(response)

Microsoft is an American multinational technology corporation that is headquartered in Redmond, Washington. It is known for its software products such as the Windows operating system, the Microsoft 365 suite of productivity applications, and the Edge web browser. Microsoft also produces hardware products like the Xbox video game consoles and the Microsoft Surface lineup of touchscreen personal computers. It is considered one of the Big Five American information technology companies and is ranked No. 14 in the 2022 Fortune 500 rankings. Microsoft was founded in 1975 by Bill Gates and Paul Allen and has since diversified its offerings and made several corporate acquisitions.


In [203]:
# LLM configuration (gpt-3.5-turbo at temperature 0) wrapped in a
# ServiceContext; it is handed to the RetrieverQueryEngine built further down.
data_llm = OpenAI(temperature=0, model="gpt-3.5-turbo")
service_context = ServiceContext.from_defaults(llm=data_llm)

In [207]:
# StorageContext is otherwise only imported in the "Inference" section; import
# it here so this cell survives Restart & Run All.
from llama_index import StorageContext
from llama_index.agent import OpenAIAgent

# Pauses, in seconds, carried over unchanged from the original loop.
# NOTE(review): presumably these give Atlas time to pick up the inserted record
# and the freshly written vectors -- confirm, and shorten if possible.
INSERT_WAIT_SECONDS = 30
INDEX_WAIT_SECONDS = 180
CLEANUP_WAIT_SECONDS = 30

# One client serves every iteration. The connection string is read from the
# environment so that credentials are not stored in the notebook.
client = pymongo.MongoClient(os.environ["MONGODB_URI"])
db = client[os.getenv("MONGODB_DATABASE")]
collection = db[os.getenv("MONGODB_COLLECTION")]

# Build agents dictionary
agents = {}

# zip() has no length, so give tqdm an explicit total to get a real progress bar.
# Titles and records are paired purely by position.
title_record_pairs = zip(wiki_titles, company_data)
pair_count = min(len(wiki_titles), len(company_data))

for wiki_title, cd in tqdm(title_record_pairs, total=pair_count):

    # Stage this company's record in the source collection.
    collection.insert_one(cd)
    time.sleep(INSERT_WAIT_SECONDS)
    c_index = "wiki_"+wiki_title+"_vector_index"

    # Every company shares one vectors collection but names its own index.
    store = MongoDBAtlasVectorSearch(
        client,
        db_name="wiki_company_db",
        collection_name="wiki_company_vectors",
        index_name=c_index
    )

    storage_context = StorageContext.from_defaults(vector_store=store)
    vector_index = VectorStoreIndex.from_documents(
            city_docs[wiki_title], storage_context=storage_context)

    time.sleep(INDEX_WAIT_SECONDS)
    # define query engines
    vector_query_engine = vector_index.as_query_engine()
    # define tools
    query_engine_tools = [
        QueryEngineTool(
            query_engine=vector_query_engine,
            metadata=ToolMetadata(
                name="vector_tool",
                # f-string: the original literal lacked the `f` prefix, so the
                # agent was shown the raw text "{wiki_title}".
                description=(
                    f"Useful for retrieving specific context from {wiki_title}"
                ),
            ),
        )
    ]

    # build agent
    function_llm = OpenAI(model="gpt-3.5-turbo-0613")
    agent = OpenAIAgent.from_tools(
        query_engine_tools,
        llm=function_llm,
        verbose=True,
    )

    agents[wiki_title] = agent
    # Empty the source collection again before the next company is staged.
    collection.delete_many({})
    time.sleep(CLEANUP_WAIT_SECONDS)

0it [00:00, ?it/s]

In [208]:
# define top-level nodes
nodes = []
for wiki_title in wiki_titles:
    # define index node that links to these agents; index_id is the wiki title,
    # which is also the key under which the matching agent is stored.
    # The closing sentence used to say "multiple cities" (a leftover from
    # another example); these articles are about companies.
    wiki_summary = (
        f"This content contains Wikipedia articles about {wiki_title}. Use"
        " this index if you need to lookup specific facts about"
        f" {wiki_title}.\nDo not use this index if you want to analyze"
        " multiple companies."
    )
    node = IndexNode(text=wiki_summary, index_id=wiki_title)
    nodes.append(node)

In [209]:
# define top-level retriever
# Index the summary nodes themselves. Note that `vector_index` is rebound here:
# inside the agent-building loop the same name held a per-company index.
vector_index = VectorStoreIndex(nodes)
vector_retriever = vector_index.as_retriever(similarity_top_k=10)

In [210]:
# Persist the top-level index to the local "top_index" directory; the
# inference cells reload it from that same directory.
vector_index.storage_context.persist("top_index")

In [211]:
# define recursive retriever
from llama_index.retrievers import RecursiveRetriever
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.response_synthesizers import get_response_synthesizer

# Start at the retriever registered under "vector"; `agents` maps each wiki
# title (the index_id of an IndexNode) to the agent for that company.
recursive_retriever = RecursiveRetriever(
    "vector",
    retriever_dict={"vector": vector_retriever},
    query_engine_dict=agents,
    verbose=True,
)

In [212]:
# Final query engine: the recursive retriever fetches the context and a
# "compact"-mode synthesizer turns it into the answer.
response_synthesizer = get_response_synthesizer(response_mode="compact")
query_engine = RetrieverQueryEngine.from_args(
    recursive_retriever,
    service_context=service_context,
    response_synthesizer=response_synthesizer,
)

In [213]:
# Run a one-word query end to end through the recursive retriever and agents.
response = query_engine.query("Microsoft")

[1;3;34mRetrieving with query id None: Microsoft
[0m[1;3;38;5;200mRetrieved node with id, entering: Microsoft
[0m[1;3;34mRetrieving with query id Microsoft: Microsoft
[0m=== Calling Function ===
Calling function: vector_tool with args: {
  "input": "Microsoft"
}
Got output: Microsoft offers technical references for developers and articles for various Microsoft magazines through the Microsoft Developer Network (MSDN). They also have community sites like Channel 9 and On10.net that provide resources and forums for developers and users. Microsoft provides free technical support through online Usenet newsgroups. They also have a program called Microsoft Most Valuable Professional (MVP) that recognizes individuals with special social status and possibilities for awards and other benefits. Microsoft is known for its internal lexicon and uses phrases like "eating your own dog food" to describe the policy of using pre-release and beta versions of products. They are also an outspoken oppo

## Inference

In [214]:
from llama_index import StorageContext, load_index_from_storage
import os
import openai
from llama_index import (
    VectorStoreIndex,
    SummaryIndex,
    SimpleKeywordTableIndex,
    SimpleDirectoryReader,
    ServiceContext,
)
from llama_index.schema import IndexNode
from llama_index.tools import QueryEngineTool, ToolMetadata
from llama_index.llms import OpenAI
from dotenv import load_dotenv

# Pull variables from a local .env file into the process environment.
load_dotenv()

# Fail early with a readable message: writing None into os.environ would
# otherwise raise an opaque TypeError when the key is missing.
openai_api_key = os.getenv('OPENAI_API_KEY')
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to your .env file or environment."
    )

os.environ["OPENAI_API_KEY"] = openai_api_key
openai.api_key = openai_api_key

# LLM configuration (gpt-3.5-turbo at temperature 0) wrapped in a ServiceContext.
data_llm = OpenAI(temperature=0, model="gpt-3.5-turbo")
service_context = ServiceContext.from_defaults(llm=data_llm)

In [215]:
# Titles whose Atlas vector indexes are queried; each one also becomes a key
# of the agents dictionary built in the next cell.
wiki_titles = ["Microsoft", "Google"]

In [216]:
# This section's import cell does not bring in pymongo or the Atlas vector
# store, so import them here to let the cell run on a fresh kernel.
import pymongo
from llama_index.agent import OpenAIAgent
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch

# One client serves every title. The connection string is read from the
# environment so that credentials are not stored in the notebook.
client = pymongo.MongoClient(os.environ["MONGODB_URI"])

# Build agents dictionary
new_agents = {}

for wiki_title in wiki_titles:
    c_index = "wiki_"+wiki_title+"_vector_index"

    # Point at the vectors already stored in Atlas for this title.
    store = MongoDBAtlasVectorSearch(
        client,
        db_name="wiki_company_db",
        collection_name="wiki_company_vectors",
        index_name=c_index
    )

    # An empty node list is passed, so no new documents are supplied here; the
    # index is built purely on top of the existing vector store.
    storage_context = StorageContext.from_defaults(vector_store=store)
    new_vector_index = VectorStoreIndex([], storage_context=storage_context)
    # define query engines
    vector_query_engine = new_vector_index.as_query_engine()

    # define tools
    query_engine_tools = [
        QueryEngineTool(
            query_engine=vector_query_engine,
            metadata=ToolMetadata(
                name="vector_tool",
                # f-string: the original literal lacked the `f` prefix, so the
                # agent was shown the raw text "{wiki_title}".
                description=(
                    f"Useful for retrieving specific context from {wiki_title}"
                ),
            ),
        )
    ]

    # build agent
    function_llm = OpenAI(model="gpt-3.5-turbo-0613")
    agent = OpenAIAgent.from_tools(
        query_engine_tools,
        llm=function_llm,
        verbose=True,
    )

    new_agents[wiki_title] = agent

In [217]:
from llama_index import StorageContext, load_index_from_storage

# Reload the top-level index from the "top_index" directory on disk and expose
# it as a retriever.
storage_context = StorageContext.from_defaults(persist_dir="top_index")
new_index = load_index_from_storage(storage_context)
new_vector_retriever = new_index.as_retriever(similarity_top_k=10)

In [218]:
# define recursive retriever
from llama_index.retrievers import RecursiveRetriever
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.response_synthesizers import get_response_synthesizer

# Use the agents rebuilt in this section (`new_agents`). The original passed
# `agents` from the build section, which is undefined on a fresh kernel and
# otherwise silently reuses the old agents.
new_recursive_retriever = RecursiveRetriever(
    "vector",
    retriever_dict={"vector": new_vector_retriever},
    query_engine_dict=new_agents,
    verbose=True,
)

In [219]:
# Build a query engine on top of the retriever reloaded in this section. The
# original line queried `query_engine` from the build section, so the reloaded
# index and retriever were never actually exercised.
new_query_engine = RetrieverQueryEngine.from_args(
    new_recursive_retriever,
    response_synthesizer=get_response_synthesizer(response_mode="compact"),
    service_context=service_context,
)
response = new_query_engine.query("Environmental/sustainability effort of Microsoft")

[1;3;34mRetrieving with query id None: Environmental/sustainability effort of Microsoft
[0m[1;3;38;5;200mRetrieved node with id, entering: Microsoft
[0m[1;3;34mRetrieving with query id Microsoft: Environmental/sustainability effort of Microsoft
[0m[1;3;32mGot response: Microsoft has made significant efforts towards environmental sustainability. Here are some key initiatives and actions taken by Microsoft:

1. Carbon Neutrality: Microsoft has been carbon neutral since 2012, meaning they have achieved net-zero carbon emissions by reducing and offsetting their greenhouse gas emissions.

2. Renewable Energy: Microsoft is committed to powering their data centers and operations with renewable energy. They have set a goal to be 100% powered by renewable energy by 2025.

3. Water Conservation: Microsoft is focused on water conservation and has implemented various measures to reduce water consumption in their facilities. They also invest in water replenishment projects to restore water i

In [220]:
# Display the `response` attribute of the result from the previous query.
response.response

"Microsoft has made significant efforts towards environmental sustainability. They have achieved carbon neutrality since 2012 by reducing their own emissions and investing in renewable energy projects and carbon offset programs. Microsoft is committed to powering its operations with renewable energy and aims to be 100% powered by renewable energy by 2025. They also focus on water conservation, waste reduction, and sustainable packaging. Microsoft's AI for Earth program uses artificial intelligence to address environmental challenges, and they promote a circular economy by designing products for durability, repairability, and recyclability. The company publishes an annual Environmental Sustainability Report to provide transparency on their environmental performance. Microsoft continues to invest in research, innovation, and partnerships to drive positive environmental impact."

## Putting it all together

In [None]:
from llama_index import StorageContext, load_index_from_storage
import os
import openai
from llama_index import (
    VectorStoreIndex,
    SummaryIndex,
    SimpleKeywordTableIndex,
    SimpleDirectoryReader,
    ServiceContext,
)
from llama_index.schema import IndexNode
from llama_index.tools import QueryEngineTool, ToolMetadata
from llama_index.llms import OpenAI
from dotenv import load_dotenv
from llama_index.agent import OpenAIAgent
from llama_index import StorageContext, load_index_from_storage
# define recursive retriever
from llama_index.retrievers import RecursiveRetriever
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.response_synthesizers import get_response_synthesizer

load_dotenv()

In [227]:
def data_layer(query="Environmental/sustainability effort of Microsoft"):
    """Answer ``query`` using the persisted top-level index and per-company agents.

    For each company title an agent is built on top of that company's Atlas
    vector index. The top-level index is reloaded from the ``top_index``
    directory, wrapped in a recursive retriever that routes to those agents,
    and queried through a query engine built inside this function.

    Args:
        query: Question to ask.

    Returns:
        The ``response`` attribute of the query result.

    Raises:
        KeyError: If the ``MONGODB_URI`` environment variable is not set.
    """
    # This section's import cell does not bring these in, so import locally to
    # keep the function usable on a fresh kernel.
    import pymongo
    from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch

    wiki_titles = ["Microsoft", "Google"]

    # The connection string is read from the environment so that credentials
    # are not stored in the notebook; the client is closed when the block exits.
    with pymongo.MongoClient(os.environ["MONGODB_URI"]) as client:
        # Build agents dictionary
        new_agents = {}
        for wiki_title in wiki_titles:
            c_index = "wiki_"+wiki_title+"_vector_index"

            store = MongoDBAtlasVectorSearch(
                client,
                db_name="wiki_company_db",
                collection_name="wiki_company_vectors",
                index_name=c_index
            )

            # An empty node list is passed, so no new documents are supplied;
            # the index is built purely on top of the existing vector store.
            storage_context = StorageContext.from_defaults(vector_store=store)
            new_vector_index = VectorStoreIndex([], storage_context=storage_context)
            vector_query_engine = new_vector_index.as_query_engine()
            query_engine_tools = [
                QueryEngineTool(
                    query_engine=vector_query_engine,
                    metadata=ToolMetadata(
                        name="vector_tool",
                        # f-string: the original literal lacked the `f` prefix,
                        # so the agent was shown the raw text "{wiki_title}".
                        description=(
                            f"Useful for retrieving specific context from {wiki_title}"
                        ),
                    ),
                )
            ]

            # build agent
            function_llm = OpenAI(model="gpt-3.5-turbo-0613")
            agent = OpenAIAgent.from_tools(
                query_engine_tools,
                llm=function_llm,
                verbose=True,
            )

            new_agents[wiki_title] = agent

        # Reload the top-level index that routes a query to the right company.
        storage_context = StorageContext.from_defaults(persist_dir="top_index")
        new_index = load_index_from_storage(storage_context)
        new_vector_retriever = new_index.as_retriever(similarity_top_k=10)

        # Route to the agents built above. The original passed the global
        # `agents`, which ignored everything constructed in this function.
        new_recursive_retriever = RecursiveRetriever(
            "vector",
            retriever_dict={"vector": new_vector_retriever},
            query_engine_dict=new_agents,
            verbose=True,
        )

        # Build the query engine locally rather than relying on a global
        # `query_engine` left over from earlier cells.
        service_context = ServiceContext.from_defaults(
            llm=OpenAI(temperature=0, model="gpt-3.5-turbo")
        )
        query_engine = RetrieverQueryEngine.from_args(
            new_recursive_retriever,
            response_synthesizer=get_response_synthesizer(response_mode="compact"),
            service_context=service_context,
        )

        response = query_engine.query(query)
        return response.response

In [228]:
# Run the pipeline with its default query; the returned value is displayed as
# the cell output.
data_layer()

[1;3;34mRetrieving with query id None: Environmental/sustainability effort of Microsoft
[0m[1;3;38;5;200mRetrieved node with id, entering: Microsoft
[0m[1;3;34mRetrieving with query id Microsoft: Environmental/sustainability effort of Microsoft
[0m[1;3;32mGot response: Microsoft has made significant efforts towards environmental sustainability. Here are some key initiatives and actions taken by Microsoft:

1. Carbon Neutrality: Microsoft has been carbon neutral since 2012. They have achieved this by reducing their carbon emissions through energy efficiency measures, investing in renewable energy, and purchasing carbon offsets.

2. Renewable Energy: Microsoft is committed to using 100% renewable energy for its operations. They have invested in renewable energy projects and have signed power purchase agreements (PPAs) to support the development of wind, solar, and hydroelectric projects.

3. Water Conservation: Microsoft is focused on water conservation and has implemented various

'Microsoft has made significant efforts towards environmental sustainability. They have achieved carbon neutrality since 2012 by reducing their carbon emissions through energy efficiency measures, investing in renewable energy, and purchasing carbon offsets. They are committed to using 100% renewable energy for their operations and have invested in renewable energy projects. Microsoft is focused on water conservation and has implemented measures to reduce water consumption in their data centers and facilities. They also aim to minimize waste generation and increase recycling through waste management programs. Microsoft is working towards using sustainable packaging materials and has launched the AI for Earth program, which uses artificial intelligence to address environmental challenges. They are committed to promoting a circular economy by designing products that are durable, repairable, and recyclable. Microsoft publishes an annual Environmental Sustainability Report to provide trans