In [41]:
import os
import openai
import time

os.environ["OPENAI_API_KEY"] = ""
openai.api_key = os.environ["OPENAI_API_KEY"]

In [2]:
import nest_asyncio
nest_asyncio.apply()

In [3]:
from llama_index.core import SimpleDirectoryReader, get_response_synthesizer
from llama_index.core import DocumentSummaryIndex
from llama_index.llms.openai import OpenAI
from llama_index.core.node_parser import SentenceSplitter

### Loading Data

In [4]:
## Ingesting documents from Local


data_root = "./ritika_data/"

filenames = ["Decarbonizing-the-Built-World-A-Call-to-Action-2023-03-07",
             "Digital-Twin-Capabilities-Periodic-Table-User-Guide",
             "Digital-Twin-System-Interoperability-Framework-12072021",
             "DTC-Reality-Capture-Industry-User-Guide-for-Tenant-Improvement-Projects-2023-06-07",
             "Infrastructure-Digital-Twin-Maturity-Model",
             "Platform-Stack-Architectural-Framework",
             "Reality-Capture-A-Digital-Twin-Foundation",
             "SMM-Digital-Twin-Profile-2022-06-20",
             "User-Guide-1-Why-and-What-2023-07-18",
             "User-Guide-2-Identifying-and-Aligning",
             "User-Guide-3_A-Whole_Systems_Approach",
             "CV_Ritika Sehgal_14th Feb"]

docs = []

for filename in filenames:
    doc = SimpleDirectoryReader(input_files=[f"{data_root}/{filename}.pdf"]).load_data()
    doc[0].doc_id = filename.replace(".pdf","")
    docs.extend(doc)

### Building Document Summary index

In [5]:
from llama_index.core.indices.service_context import ServiceContext

In [6]:
llm = OpenAI(model="gpt-3.5-turbo-0125", temperature=0)
text_splitter = SentenceSplitter(chunk_size=3000)
embed_model = OpenAI(model="text-embedding-3-large")

In [7]:
## Ingesting documents; Generating summaries, creating embeddings of document nodes
start_time = time.time()

response_synthesizer = get_response_synthesizer(
    response_mode = "tree_summarize", use_async=True
)

doc_summary_index = DocumentSummaryIndex.from_documents(
    docs,
    llm=llm,
    transformations=[text_splitter],
    response_synthesizer=response_synthesizer,
    show_progress=True)

print("--- %s seconds ---" % (time.time() - start_time))

Parsing nodes:   0%|          | 0/257 [00:00<?, ?it/s]

Summarizing documents:   0%|          | 0/257 [00:00<?, ?it/s]

current doc id: Decarbonizing-the-Built-World-A-Call-to-Action-2023-03-07
current doc id: 01e62995-0ba7-473a-b4b8-ee5aa1409797
current doc id: 99eab8c3-0d7b-48c4-8f01-f69084e81da4
current doc id: 4df4354b-0311-4345-8da0-6b6009621f77
current doc id: 68ce58be-40d4-483a-98ef-ece579a09bcf
current doc id: 8806fd2d-cdf2-44fc-9611-1f1dcf2f7f20
current doc id: 60337c0a-ab62-43db-b2fb-1db811449000
current doc id: d837134a-5f85-4497-8367-c9c83689d9b7
current doc id: 8bc0f6ed-08e1-4d2a-8a12-9b39e963d194
current doc id: 05f7f0bc-b4d7-4229-ae4f-9e0c167bb920
current doc id: 331b8e64-5eac-4244-8cab-331193444a2b
current doc id: 48ef52a7-7057-40e0-a25c-09b1f9ba81a6
current doc id: 1ebc7a8c-21ab-4bcd-ba4a-16e621b66299
current doc id: 4836b878-97c9-4dae-bcd9-b501ff3b3a02
current doc id: 204fadc0-07c5-4d47-99ee-cd8bb06e0d20
current doc id: Digital-Twin-Capabilities-Periodic-Table-User-Guide
current doc id: 9c2ee515-099a-4aa5-a1f7-850dac20a700
current doc id: ab5de785-4433-49ec-86c8-302b2956780c
current do

Generating embeddings:   0%|          | 0/257 [00:00<?, ?it/s]

--- 633.0902972221375 seconds ---


In [9]:
doc_summary_index.get_document_summary("b934b5e7-609d-4e90-bcc8-c75046bc7db6")

"The text discusses the importance of embedding performance evaluation throughout all stages of a building's lifecycle to achieve decarbonization in the built environment. It emphasizes the role of a digital twin in providing data-driven information to uncover energy, carbon, capital, and operational savings. The text also highlights the benefits of using robust physics-based modeling from the conceptual design stages to building operation, such as improving energy efficiency, indoor comfort, environmental design, and future adaptability of buildings.\n\nSome questions that this text can answer include:\n- How can performance evaluation be integrated into all stages of a building's lifecycle?\n- What is the role of a digital twin in decarbonizing the built environment?\n- What are the benefits of using physics-based modeling in building design and operation?\n- How can discussions about key performance metrics be initiated in the early stages of a building project?\n- Why is it importa

In [10]:
### Question Extraction ###

'''import re
input_text = doc_summary_index.get_document_summary("644a1344-e8b0-46f2-b81e-792e3124dca2")
question_pattern1 = r'\n\n- (.*[?])'
question_pattern2 = r'\n\n- (.*?)(?:\n\n|$)'
if re.findall(question_pattern1, input_text):
    sample_qs = re.findall(question_pattern1, input_text)
else:
    sample_qs = re.findall(question_pattern2, input_text)
sample_qs'''    

'import re\ninput_text = doc_summary_index.get_document_summary("644a1344-e8b0-46f2-b81e-792e3124dca2")\nquestion_pattern1 = r\'\n\n- (.*[?])\'\nquestion_pattern2 = r\'\n\n- (.*?)(?:\n\n|$)\'\nif re.findall(question_pattern1, input_text):\n    sample_qs = re.findall(question_pattern1, input_text)\nelse:\n    sample_qs = re.findall(question_pattern2, input_text)\nsample_qs'

In [14]:
## Storing local vector store

from llama_index.core import load_index_from_storage
from llama_index.core import StorageContext

doc_summary_index.storage_context.persist("ritika_vectorstore")
storage_context = StorageContext.from_defaults(persist_dir="ritika_vectorstore")
doc_summary_index = load_index_from_storage(storage_context)

### Embeddings retreiver

In [20]:
#from llama_index.core.prompts.base import Prompt
#from llama_index.core.prompts.prompt_type import PromptType
from llama_index.core import PromptTemplate
from IPython.display import Markdown, display

def display_prompt_dict(prompts_dict):
    for k, p in prompts_dict.items():
        text_md = f"**Prompt Key**: {k}<br>" f"**Text:** <br>"
        display(Markdown(text_md))
        print(p.get_template())
        display(Markdown("<br><br>"))

new_summary_tmpl_str = (
    "Context information is below:\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information and not prior knowledge, "
    "answer the query in the style of a consultant at McKinsey who specializes in digital transformation strategies."
    "Your goal is to help business users succeed in their digital transformation journeys."
    "Do not use any context outside of these documents."
    "You should maintain a friendly yet professional tone."
    "Use crisp bullet points whenever relevant.\n"
    "Query: {query_str}\n"
    "Answer: "
)

new_summary_tmpl = PromptTemplate(new_summary_tmpl_str)

In [21]:
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.indices.document_summary import (
    DocumentSummaryIndexEmbeddingRetriever,
)

# Configuring response synthesizer
response_synthesizer = get_response_synthesizer(response_mode="tree_summarize")

## Creating Retreiver object
retriever = DocumentSummaryIndexEmbeddingRetriever(
    doc_summary_index,
    choice_batch_size=10,
    choice_top_k=5
)

# Assembling query engine
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer
)

## Checking default query prompt:
prompts_dict = query_engine.get_prompts()
display_prompt_dict(prompts_dict)

## Modifying query prompt

query_engine.update_prompts(
    {"response_synthesizer:summary_template": new_summary_tmpl}
)

## Checking modified query prompt:
prompts_dict = query_engine.get_prompts()
display_prompt_dict(prompts_dict)



**Prompt Key**: response_synthesizer:summary_template<br>**Text:** <br>

Context information from multiple sources is below.
---------------------
{context_str}
---------------------
Given the information from multiple sources and not prior knowledge, answer the query.
Query: {query_str}
Answer: 


<br><br>

**Prompt Key**: response_synthesizer:summary_template<br>**Text:** <br>

Context information is below:
---------------------
{context_str}
---------------------
Given the context information and not prior knowledge, answer the query in the style of a consultant at McKinsey who specializes in digital transformation strategies.Your goal is to help business users succeed in their digital transformation journeys.Do not use any context outside of these documents.You should maintain a friendly yet professional tone.Use crisp bullet points whenever relevant.
Query: {query_str}
Answer: 


<br><br>

In [40]:
### Generating response ###

start_time = time.time()
# query
response = query_engine.query("Detail the elements of Platform Stack Architectural Framework?")
print(response)

print("--- %s seconds ---" % (time.time() - start_time))

- The Platform Stack Architectural Framework consists of various layers that work together to support digital transformation efforts:
  - Infrastructure Layer: This includes the hardware and software components that form the foundation of the platform.
  - Platform Layer: This layer provides the core services and capabilities that enable applications to run on top of the infrastructure.
  - Application Layer: Here, specific applications and services are built and deployed to meet business needs.
- Each layer in the framework plays a crucial role in enabling digital transformation:
  - The Infrastructure Layer ensures the reliability and scalability of the platform.
  - The Platform Layer provides the necessary tools and services for developers to build and deploy applications efficiently.
  - The Application Layer delivers the end-user experience and functionality that drive business value.
- By understanding and optimizing each layer of the Platform Stack Architectural Framework, busi

In [None]:
### Checking the nodes rretreived to compare with source
'''retrieved_nodes = retriever.retrieve("What is continuous Model-Based Commissioning?")
print(len(retrieved_nodes))
print(retrieved_nodes[0].score)
print(retrieved_nodes[0].node.get_text())'''

In [None]:
## Fin ##