### Package dependencies

## Document Ingestion Pipeline

In [None]:
### ID of SME for data labeling

sme_id = 'sme_001'

In [None]:
import os
import openai
import time

os.environ["OPENAI_API_KEY"] = ""
openai.api_key = os.environ["OPENAI_API_KEY"]

In [None]:
import nest_asyncio
nest_asyncio.apply()

In [None]:
from llama_index.core import SimpleDirectoryReader, get_response_synthesizer, StorageContext
from llama_index.core import DocumentSummaryIndex
from llama_index.llms.openai import OpenAI
from llama_index.core.node_parser import SentenceSplitter

#### Parsing files and loading documents

In [None]:
data_root = "./ritika_data/"

In [None]:
## Ingesting documents from Local

filenames = ["Decarbonizing-the-Built-World-A-Call-to-Action-2023-03-07",
             "Digital-Twin-Capabilities-Periodic-Table-User-Guide",
             "Digital-Twin-System-Interoperability-Framework-12072021",
             "DTC-Reality-Capture-Industry-User-Guide-for-Tenant-Improvement-Projects-2023-06-07",
             "Infrastructure-Digital-Twin-Maturity-Model"]
             #"Platform-Stack-Architectural-Framework",
             #"Reality-Capture-A-Digital-Twin-Foundation",
             #"SMM-Digital-Twin-Profile-2022-06-20",
             #"User-Guide-1-Why-and-What-2023-07-18",
             #"User-Guide-2-Identifying-and-Aligning",
             #"User-Guide-3_A-Whole_Systems_Approach"]

docs = []

for filename in filenames:
    doc = SimpleDirectoryReader(input_files=[f"{data_root}/{filename}.pdf"]).load_data()
    doc[0].doc_id = filename.replace(".pdf","")
    docs.extend(doc)

#### Redis-Based Ingestion Pipeline for Doc store and Vector store

In [None]:
from llama_index.storage.kvstore.redis import RedisKVStore as RedisCache
from llama_index.storage.docstore.redis import RedisDocumentStore
from llama_index.core.node_parser import SentenceSplitter
from llama_index.vector_stores.redis import RedisVectorStore
from llama_index.storage.index_store.redis import RedisIndexStore

In [None]:
llm = OpenAI(model="gpt-3.5-turbo-0125", temperature=0)
text_splitter = SentenceSplitter(chunk_size=3000)
embed_model = OpenAI(model="text-embedding-3-small")
text_splitter = SentenceSplitter(chunk_size=1024)

#### Setting up document store, index store and vector store

In [None]:
## Setting up connection to Redis two ways:

## Through Python Redis client:

import redis
redis_client = redis.Redis(
  host='',
  port=,
  password='')

## Through host and port:

#REDIS_HOST = os.getenv("REDIS_HOST", "127.0.0.1")
#REDIS_PORT = os.getenv("REDIS_PORT", 6379)

In [None]:
## Defining PGVector database:

from sqlalchemy import make_url
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.postgres import PGVectorStore

url = make_url("postgres://postgres.<username>:<password>@aws-0-us-west-1.pooler.supabase.com:5432/postgres")
db_name="postgres"
vector_store = PGVectorStore.from_params(
    database=url.database,
    host=url.host,
    password=url.password,
    port=url.port,
    user=url.username,
    table_name=sme_id,
    embed_dim=1536,  # openai embedding dimension
)

#### Creating storage context

In [None]:
## Through Redis client:

from llama_index.core import SimpleDirectoryReader

storage_context = StorageContext.from_defaults(
    docstore=RedisDocumentStore.from_redis_client(
        redis_client=redis_client, namespace=sme_id
    ),
    
    index_store=RedisIndexStore.from_redis_client(
        redis_client=redis_client, namespace=sme_id
    ),
    vector_store = vector_store
)

response_synthesizer = get_response_synthesizer(
    response_mode = "tree_summarize", use_async=True
)

In [None]:
## Through Redis host and port:

storage_context = StorageContext.from_defaults(
    docstore=RedisDocumentStore.from_redis_client(
        host=REDIS_HOST, port=REDIS_PORT, namespace="llama_index"
    ),
    
    index_store=RedisIndexStore.from_redis_client(
        host=REDIS_HOST, port=REDIS_PORT, namespace="llama_index"
    ),
    vector_store = vector_store
)

response_synthesizer = get_response_synthesizer(
    response_mode = "tree_summarize", use_async=True
)

#### Creating Document Summary Index and storing in respective db locations

In [None]:
doc_summary_index = DocumentSummaryIndex.from_documents(docs,
    llm=llm,
    transformations=[text_splitter],
    response_synthesizer=response_synthesizer,
    show_progress=True,
    storage_context = storage_context)

In [None]:
### Fin ###

In [None]:
### Restart kernel to test Pipeline below:

## Document Retreival Pipeline

In [None]:
### ID of SME for data labeling

sme_id = 'sme_001'

In [None]:
import os
import openai
import time

os.environ["OPENAI_API_KEY"] = ""
openai.api_key = os.environ["OPENAI_API_KEY"]

In [None]:
import nest_asyncio
nest_asyncio.apply()

In [None]:
from llama_index.core import SimpleDirectoryReader, get_response_synthesizer, StorageContext
from llama_index.core import DocumentSummaryIndex
from llama_index.llms.openai import OpenAI
from llama_index.core.node_parser import SentenceSplitter

#### Recreating storage context object using same DB connections from ingestion pipeline

In [None]:
from llama_index.storage.kvstore.redis import RedisKVStore as RedisCache
from llama_index.storage.docstore.redis import RedisDocumentStore
from llama_index.core.node_parser import SentenceSplitter
from llama_index.vector_stores.redis import RedisVectorStore
from llama_index.storage.index_store.redis import RedisIndexStore

In [None]:
llm = OpenAI(model="gpt-3.5-turbo-0125", temperature=0)
text_splitter = SentenceSplitter(chunk_size=3000)
embed_model = OpenAI(model="text-embedding-3-small")
text_splitter = SentenceSplitter(chunk_size=1024)

In [None]:
## Connecting to Redis DB through Python Redis Client (Refer to ingestion pipeline above for Redis Host-Port access)

import redis
redis_client = redis.Redis(
  host='',
  port=,
  password='')

## Setting up PGVector database

from sqlalchemy import make_url
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.postgres import PGVectorStore

url = make_url("postgres://postgres.<username>:<password>@aws-0-us-west-1.pooler.supabase.com:5432/postgres")
db_name="postgres"
vector_store = PGVectorStore.from_params(
    database=url.database,
    host=url.host,
    password=url.password,
    port=url.port,
    user=url.username,
    table_name=sme_id,
    embed_dim=1536,  # openai embedding dimension
)


In [None]:
## Re-creating Storage context through Redis client (Refer to ingestion pipeline above for Redis Host-Port access):

from llama_index.core import SimpleDirectoryReader

storage_context = StorageContext.from_defaults(
    docstore=RedisDocumentStore.from_redis_client(
        redis_client=redis_client, namespace=sme_id
    ),
    index_store=RedisIndexStore.from_redis_client(
        redis_client=redis_client, namespace=sme_id
    ),
    vector_store = vector_store
)

response_synthesizer = get_response_synthesizer(
    response_mode = "tree_summarize", use_async=True
)

#### Creating steps to final response:

In [None]:
## Creating prompt template

from llama_index.core import PromptTemplate
from IPython.display import Markdown, display

def display_prompt_dict(prompts_dict):
    for k, p in prompts_dict.items():
        text_md = f"**Prompt Key**: {k}<br>" f"**Text:** <br>"
        display(Markdown(text_md))
        print(p.get_template())
        display(Markdown("<br><br>"))

new_summary_tmpl_str = (
    "Context information is below:\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information and not prior knowledge,"
    "answer the query in the style of a McKinsey, Bain or BCG consultant who specializes in digital transformation strategies."
    "Your goal is to help business users succeed in their digital transformation journeys."
    "Do not use any context outside of these documents."
    "If a question is outside of your area of expertise, politely refuse to answer and suggest alternative topics of discussion from the context provided."
    "You should maintain a friendly yet professional tone."
    "Use detailed bullet points whenever relevant.\n"
    "Query: {query_str}\n"
    "Answer: "
)

new_summary_tmpl = PromptTemplate(new_summary_tmpl_str)

#### Loading document summary index using storage context:

In [None]:
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.indices.document_summary import (
    DocumentSummaryIndexEmbeddingRetriever,
)

from llama_index.core import load_index_from_storage

doc_summary_index = load_index_from_storage(
    storage_context=storage_context
)

# Configuring response synthesizer
response_synthesizer = get_response_synthesizer(streaming=True, response_mode="tree_summarize")

## Creating Retreiver object
retriever = DocumentSummaryIndexEmbeddingRetriever(
    doc_summary_index,
    choice_batch_size=10,
    choice_top_k=5
)

# Assembling query engine
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer
)

## Checking default query prompt:
prompts_dict = query_engine.get_prompts()
display_prompt_dict(prompts_dict)

## Modifying query prompt

query_engine.update_prompts(
    {"response_synthesizer:summary_template": new_summary_tmpl}
)

## Checking modified query prompt:
prompts_dict = query_engine.get_prompts()
display_prompt_dict(prompts_dict)

### Implementing embeddings Retirever

In [None]:
start_time = time.time()
# query
response = query_engine.query("What are the most important considerations when implementing a digital twin?")
response.print_response_stream()

## Use streaming response in block:
'''for text in streaming_response.response_gen:
    # do something with text as they arrive.
    pass
'''

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
### Fin ###

In [None]:
### Restart kernel to test Pipeline below:

## SME Data Insertion-Deletion (updation) pipeline

In [16]:
### ID of SME for data labeling

sme_id = 'sme_001'

In [17]:
import os
import openai
import time

os.environ["OPENAI_API_KEY"] = ""
openai.api_key = os.environ["OPENAI_API_KEY"]

In [18]:
import nest_asyncio
nest_asyncio.apply()

In [19]:
from llama_index.core import SimpleDirectoryReader, get_response_synthesizer, StorageContext
from llama_index.core import DocumentSummaryIndex
from llama_index.llms.openai import OpenAI
from llama_index.core.node_parser import SentenceSplitter

#### Parsing new files

In [20]:
data_root = "./ritika_data/"

In [21]:
filenames = [#"Decarbonizing-the-Built-World-A-Call-to-Action-2023-03-07",
             #"Digital-Twin-Capabilities-Periodic-Table-User-Guide",
             #"Digital-Twin-System-Interoperability-Framework-12072021",
             #"DTC-Reality-Capture-Industry-User-Guide-for-Tenant-Improvement-Projects-2023-06-07",
             #"Infrastructure-Digital-Twin-Maturity-Model"]
             "Platform-Stack-Architectural-Framework",
             "Reality-Capture-A-Digital-Twin-Foundation",
             "SMM-Digital-Twin-Profile-2022-06-20",
             "User-Guide-1-Why-and-What-2023-07-18",
             "User-Guide-2-Identifying-and-Aligning",
             "User-Guide-3_A-Whole_Systems_Approach"]

new_docs = []

for filename in filenames:
    doc = SimpleDirectoryReader(input_files=[f"{data_root}/{filename}.pdf"]).load_data()
    doc[0].doc_id = filename.replace(".pdf","")
    new_docs.extend(doc)

#### Redis-Based Ingestion Pipeline for Doc store and Vector store

In [22]:
from llama_index.storage.kvstore.redis import RedisKVStore as RedisCache
from llama_index.storage.docstore.redis import RedisDocumentStore
from llama_index.core.node_parser import SentenceSplitter
from llama_index.vector_stores.redis import RedisVectorStore
from llama_index.storage.index_store.redis import RedisIndexStore

In [23]:
llm = OpenAI(model="gpt-3.5-turbo-0125", temperature=0)
text_splitter = SentenceSplitter(chunk_size=3000)
embed_model = OpenAI(model="text-embedding-3-small")
text_splitter = SentenceSplitter(chunk_size=1024)

#### Setting up document store, index store and vector store

In [24]:
## Setting up connection to Redis two ways:

## Through Python Redis client:

import redis
redis_client = redis.Redis(
  host='',
  port=,
  password='')

In [25]:
## Defining PGVector database:

from sqlalchemy import make_url
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.postgres import PGVectorStore

url = make_url("postgres://postgres.<username>:<password>@aws-0-us-west-1.pooler.supabase.com:5432/postgres")
db_name="postgres"
vector_store = PGVectorStore.from_params(
    database=url.database,
    host=url.host,
    password=url.password,
    port=url.port,
    user=url.username,
    table_name=sme_id,
    embed_dim=1536,  # openai embedding dimension
)

#### Loading Index from storage

In [26]:
## Re-creating Storage context through Redis client (Refer to ingestion pipeline above for Redis Host-Port access):

from llama_index.core import SimpleDirectoryReader

storage_context = StorageContext.from_defaults(
    docstore=RedisDocumentStore.from_redis_client(
        redis_client=redis_client, namespace=sme_id
    ),
    index_store=RedisIndexStore.from_redis_client(
        redis_client=redis_client, namespace=sme_id
    ),
    vector_store = vector_store
)

response_synthesizer = get_response_synthesizer(
    response_mode = "tree_summarize", use_async=True
)

In [27]:
from llama_index.core import load_index_from_storage

doc_summary_index = load_index_from_storage(
    storage_context=storage_context
)

In [28]:
doc_summary_index.refresh(new_docs)

current doc id: Platform-Stack-Architectural-Framework
current doc id: 3995e90f-10a5-4ab0-bd41-1c493ea06d48
current doc id: 2a04a3ad-e4f1-4372-9462-ac718d1b1dd6
current doc id: 29f72723-a6ea-48d9-a9ea-d3cbd57244d8
current doc id: c3dda7fb-ab2b-440d-b341-4db9adb08bcf
current doc id: b577acb1-0bb4-4ff2-958a-33195f6bd4c2
current doc id: 830e0fa8-d46c-43e2-ac7c-85934ee21ea4
current doc id: 7b55aec5-630d-464e-b945-af640049ba9b
current doc id: 9fed0941-8cee-4202-8b64-2d6ba83c678f
current doc id: ad698a3d-e128-474a-ab4c-a1a3c6643a34
current doc id: 79b035e5-7721-46cc-a25d-ff2525d2110d
current doc id: 38e51a78-df7d-4ef3-ab95-fdb3cfed5a7a
current doc id: f6921b4f-d892-40c9-8566-1d176113b08d
current doc id: ca2563e3-ad96-4ae9-90e8-031c76915d25
current doc id: 42315617-a4a5-4d96-bb08-8be63c21b798
current doc id: 5ccc2b82-c456-471f-83da-be49726ee223
current doc id: fe8522a1-825b-4421-9de8-92accd7a3b29
current doc id: 8fc521d3-e508-4f36-b957-9b37b103f665
current doc id: 4f886d51-d39f-4e0f-a305-86b7

[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,

### Checking response after adding new data

In [31]:
## Creating prompt template

from llama_index.core import PromptTemplate
from IPython.display import Markdown, display

def display_prompt_dict(prompts_dict):
    for k, p in prompts_dict.items():
        text_md = f"**Prompt Key**: {k}<br>" f"**Text:** <br>"
        display(Markdown(text_md))
        print(p.get_template())
        display(Markdown("<br><br>"))

new_summary_tmpl_str = (
    "Context information is below:\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information and not prior knowledge,"
    "answer the query in the style of a McKinsey, Bain or BCG consultant who specializes in digital transformation strategies."
    "Your goal is to help business users succeed in their digital transformation journeys."
    "Do not use any context outside of these documents."
    "If a question is outside of your area of expertise, politely refuse to answer and suggest alternative topics of discussion from the context provided."
    "You should maintain a friendly yet professional tone."
    "Use detailed bullet points whenever relevant.\n"
    "Query: {query_str}\n"
    "Answer: "
)

new_summary_tmpl = PromptTemplate(new_summary_tmpl_str)

In [32]:
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.indices.document_summary import (
    DocumentSummaryIndexEmbeddingRetriever,
)

from llama_index.core import load_index_from_storage

#doc_summary_index = load_index_from_storage(
#    storage_context=storage_context
#)

# Configuring response synthesizer
response_synthesizer = get_response_synthesizer(streaming=True, response_mode="tree_summarize")

## Creating Retreiver object
retriever = DocumentSummaryIndexEmbeddingRetriever(
    doc_summary_index,
    choice_batch_size=10,
    choice_top_k=5
)

# Assembling query engine
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer
)

## Checking default query prompt:
prompts_dict = query_engine.get_prompts()
display_prompt_dict(prompts_dict)

## Modifying query prompt

query_engine.update_prompts(
    {"response_synthesizer:summary_template": new_summary_tmpl}
)

## Checking modified query prompt:
prompts_dict = query_engine.get_prompts()
display_prompt_dict(prompts_dict)

**Prompt Key**: response_synthesizer:summary_template<br>**Text:** <br>

Context information from multiple sources is below.
---------------------
{context_str}
---------------------
Given the information from multiple sources and not prior knowledge, answer the query.
Query: {query_str}
Answer: 


<br><br>

**Prompt Key**: response_synthesizer:summary_template<br>**Text:** <br>

Context information is below:
---------------------
{context_str}
---------------------
Given the context information and not prior knowledge,answer the query in the style of a McKinsey, Bain or BCG consultant who specializes in digital transformation strategies.Your goal is to help business users succeed in their digital transformation journeys.Do not use any context outside of these documents.If a question is outside of your area of expertise, politely refuse to answer and suggest alternative topics of discussion from the context provided.You should maintain a friendly yet professional tone.Use detailed bullet points whenever relevant.
Query: {query_str}
Answer: 


<br><br>

### Implementing embeddings Retirever

In [34]:
start_time = time.time()
# query
response = query_engine.query("What is the capital of south sudan?")
response.print_response_stream()

## Use streaming response in block:
'''for text in streaming_response.response_gen:
    # do something with text as they arrive.
    pass
'''

print("--- %s seconds ---" % (time.time() - start_time))

I'm sorry, but the question about the capital of South Sudan is outside the scope of the context provided, which focuses on decarbonization across the building lifecycle. 

However, I can provide insights and recommendations related to sustainability assessment methods and certification tools for building projects. Would you like to discuss how to choose the best sustainability assessment method for your project or how to track key performance metrics across the design lifecycle?--- 1.9458940029144287 seconds ---
