In [1]:
# https://medium.com/@aminajavaid30/building-a-rag-system-synthesis-67f36efa7c35

# Data Ingestion & Retrieval
import bs4
import re
import os
import getpass
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.document_loaders import RecursiveUrlLoader
from langchain_community.document_loaders import GithubFileLoader
from bs4 import BeautifulSoup



In [2]:
# Load a github repo

if not os.environ.get("GITHUB_PAT"):
  os.environ["GITHUB_PAT"] = getpass.getpass("Enter Github PAT: ")

async def github_repo(repo_name: str) -> list[Document]:
    loader = GithubFileLoader(
        repo=repo_name, # the repo name
        access_token= os.environ["GITHUB_PAT"],
        github_api_url="https://api.github.com",
        file_filter=lambda file_path: file_path.endswith(
            ".md"
        ),  # load all markdowns files.
    )
    github_docs = []
    async for doc in loader.alazy_load():
        github_docs.append(doc)
    return github_docs

In [3]:
# Load a website reccusivly and split it into chunks
def bs4_extractor(html: str) -> str:
    soup = BeautifulSoup(html, "lxml")
    return re.sub(r"\n\n+", "\n\n", soup.text).strip()

async def website_parser(base_url: str) -> list[Document]:
    loader = RecursiveUrlLoader(
        base_url,
        extractor=bs4_extractor,
        max_depth=2,
        prevent_outside=True,
        # use_async=False,
        # extractor=None,
        # metadata_extractor=None,
        # exclude_dirs=(),
        # timeout=10,
        # check_response_status=True,
        # continue_on_failure=True,
        # 
        # base_url=None,
        # ...
    )
    site_docs = []
    async for doc in loader.alazy_load():
        site_docs.append(doc)
    return site_docs

In [4]:
# Initialize the OpenAI embedding model
import getpass
import os

from langchain_openai import AzureOpenAIEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore

if not os.environ.get("AZURE_OPENAI_API_KEY"):
  os.environ["AZURE_OPENAI_API_KEY"] = getpass.getpass("Enter API key for Azure: ")

azureEmbeddings = AzureOpenAIEmbeddings(
    azure_endpoint="https://oai-bim42-test-fr-ai.openai.azure.com",
    azure_deployment="text-embedding-3-small",
    openai_api_version="2025-01-01-preview",
)

# Initialize the recursive character text splitter
text_splitter = RecursiveCharacterTextSplitter(              
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True
)

# Initialize with an embedding model
vector_store = InMemoryVectorStore(embedding=azureEmbeddings)

In [5]:
# Define a list of site to parse
urls = ["https://docs.thatopen.com/", "https://thatopen.com/"]

# Iterate over each urls in the list
for url in urls:
    # Retrive the documents
    website_docs = await website_parser(url)
    website_docs
    print(f"{len(website_docs)} docs in {url}")
    # Split the documents into chunks
    chunks = text_splitter.split_documents(website_docs)
    print(f"{len(chunks)} chunks in {url}")
    # Add embeddings into the vector store
    vector_store.add_documents(documents=chunks)



Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.




  soup = BeautifulSoup(html, "lxml")

Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.




  soup = BeautifulSoup(raw_html, "html.parser")


2 docs in https://docs.thatopen.com/
2 chunks in https://docs.thatopen.com/
4 docs in https://thatopen.com/
42 chunks in https://thatopen.com/


In [6]:
# Define a list of github repo to parse
repos = ["ThatOpen/engine_web-ifc", "ThatOpen/engine_components","ThatOpen/engine_ui-components"]

# Iterate over each urls in the list
for repo in repos:
    # Retrive the documents
    github_docs = await github_repo(repo)
    print(f"{len(github_docs)} docs in {repo}")
    # Split the documents into chunks
    chunks = text_splitter.split_documents(github_docs)
    print(f"{len(chunks)} chunks in {repo}")
    # Add embeddings into the vector store
    vector_store.add_documents(documents=chunks)

3 docs in ThatOpen/engine_web-ifc
34 chunks in ThatOpen/engine_web-ifc
4 docs in ThatOpen/engine_components
93 chunks in ThatOpen/engine_components
4 docs in ThatOpen/engine_ui-components
31 chunks in ThatOpen/engine_ui-components


In [7]:
# Retrieve relevant information using similarity search
retriever = vector_store.as_retriever() # uses similarity search by default

In [8]:
# Q/A Retrieval Chain

from langchain_openai import AzureChatOpenAI
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# Initialize the Azure AI LLM with the specific model
llm = AzureChatOpenAI(
    azure_endpoint="https://oai-bim42-test-fr-ai.openai.azure.com",
    azure_deployment="gpt-4o-mini",
    openai_api_version="2025-01-01-preview",
)

# Define the system prompt that instructs the LLM how to answer questions based on retrieved context
system_prompt = (
    "You are a specialist of Building Information modeling looking for a web-based building model viewer"    
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question."
    "Add sources like links to your answer."
    "If you don't know the answer, say that you don't know."
    "Use four sentences maximum and keep the answer concise."
    "\n\n"
    "{context}"  # Placeholder for the retrieved context
)

# Create a chat prompt template with a system message and human message
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),  # System message contains instructions and context
        ("human", "{input}"),  # Human message includes the user's input question
    ]
)

# Create a document chain that combines the LLM with the prompt
question_answer_chain = create_stuff_documents_chain(llm, prompt)

# Combine the retrieval chain with the question-answering chain 
# The retrieval chain retrieves relevant documents and feeds them into the question-answering chain
retrieval_chain = create_retrieval_chain(retriever, question_answer_chain)

In [9]:
# Invoke the chain with a question, and the retriever will provide context for the LLM to generate an answer
response = retrieval_chain.invoke({"input": "Please describe the software editor behind this solution (name, description, history, "})
print(response['answer'])

The software editor is not explicitly named in the provided context, but it appears to be associated with an open-source BIM technology that focuses on the development of 3D applications and integrates BIM editing capabilities into existing applications. It offers functionalities for creating both 2D and 3D graphical environments intended to solve real problems in the AECO (Architecture, Engineering, Construction, and Operations) industry. The development history includes multiple updates and improvements, particularly within the context of enhancing coordination logic, visual highlighters, and geometry streaming. For more details, you can explore the updates on GitHub [here](https://github.com/ThatOpen/engine_components).


In [10]:
response = retrieval_chain.invoke({"input": "What is the price or licencing model of the solution?"})
print(response['answer'])

The retrieved context does not provide specific information regarding the price or licensing model of the web-based building model viewer solution. You may need to check the product's official website or contact the provider directly for accurate pricing details.


In [11]:
response = retrieval_chain.invoke({"input": "Can you list all the input format accepted by the solution?"})
print(response['answer'])

I don't know.


In [12]:
response = retrieval_chain.invoke({"input": "What is the format used in the web viewer ?"})
print(response['answer'])

The web viewer utilizes the IFC (Industry Foundation Classes) format, which is a standard for building information modeling (BIM). This format allows for the representation of various aspects of building design and construction data. You can find more information about working with IFC files through the web-ifc JavaScript library, which enables reading and writing of IFC files at native speeds. For further details, you can visit the [web-ifc documentation](https://thatopen.github.io/engine_web-ifc/docs).


In [13]:
response = retrieval_chain.invoke({"input": "What is the WexBIM data format?"})
print(response['answer'])

The WexBIM data format is a web-based model format designed for efficient visualization and interaction with 3D building models in web applications. It enables the integration of BIM data with web technologies, optimizing performance for rendering complex models. WexBIM supports various features such as collaboration and access to metadata within the model. For more information, you can visit the WexBIM official page at [WexBIM](https://www.wexbim.com).


In [14]:
response = retrieval_chain.invoke({"input": "What are the main programming language used in the solution?"})
print(response['answer'])

The main programming languages used in the web-based building model viewer solutions include HTML, CSS, JavaScript, and TypeScript. Additionally, technologies like Three.js, and frameworks such as React are also utilized for developing 3D applications and creating functionalities in graphical environments. For further information, you might explore the resources provided by That Open Engine and related platforms.


In [16]:
response = retrieval_chain.invoke({"input": "What are the licences for this solution?"})
print(response['answer'])

The licenses for this solution include a world-wide, royalty-free, non-exclusive license under intellectual property rights to use, reproduce, modify, display, and distribute contributions from the contributors. However, the licenses granted do not provide rights for any code removed from the software or for modifications made by you or third parties that infringe on patents. Additionally, there are limitations regarding patents and trademarks, as no patent license is granted for infringements caused by specific modifications or combinations of software. For more detailed information, please refer to the applicable license documentation.
