In [1]:
# https://medium.com/@aminajavaid30/building-a-rag-system-synthesis-67f36efa7c35

# Data Ingestion & Retrieval
import getpass
import os
import re

import bs4
from bs4 import BeautifulSoup
from langchain_community.document_loaders import GithubFileLoader
from langchain_community.document_loaders import RecursiveUrlLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter



In [None]:
# Load a GitHub repo

# Ask for the GitHub personal access token only when the GITHUB_PAT environment
# variable is missing or empty, so re-running this cell does not prompt again.
# `os` and `getpass` come from the notebook's import cell.
if not os.environ.get("GITHUB_PAT"):
    os.environ["GITHUB_PAT"] = getpass.getpass("Enter Github PAT: ")
  
async def github_repo(repo_name: str, branch: str = "master") -> list[Document]:
    """Load the Markdown files of a GitHub repository as LangChain documents.

    Args:
        repo_name: Repository identifier in ``owner/name`` form, for example
            ``"xBimTeam/XbimEssentials"``.
        branch: Branch to read files from. Defaults to ``"master"``, the value
            that used to be hard-coded, so existing calls behave as before.

    Returns:
        The documents yielded by ``GithubFileLoader`` for the files whose path
        ends with ``.md``.

    Raises:
        KeyError: If the ``GITHUB_PAT`` environment variable is not set.
    """
    loader = GithubFileLoader(
        repo=repo_name,
        access_token=os.environ["GITHUB_PAT"],
        github_api_url="https://api.github.com",
        branch=branch,
        # Keep Markdown files only.
        file_filter=lambda file_path: file_path.endswith(".md"),
    )
    # Drain the loader's async iterator into a list.
    return [doc async for doc in loader.alazy_load()]

githubDoc = await github_repo("xBimTeam/XbimEssentials")
print(len(githubDoc))

8


In [6]:
githubDoc2 = await github_repo("xBimTeam/XbimWebUI")
print(len(githubDoc2))

4


In [7]:
# Load a website recursively and extract its text (splitting into chunks happens in a later cell)
def bs4_extractor(html: str) -> str:
    """Return the text of an HTML page with runs of blank lines collapsed.

    The markup is parsed with BeautifulSoup's ``lxml`` parser; every run of two
    or more newlines in the extracted text is reduced to exactly two, and
    leading/trailing whitespace is stripped.
    """
    page_text = BeautifulSoup(html, "lxml").text
    collapsed = re.sub(r"\n\n+", "\n\n", page_text)
    return collapsed.strip()

async def website_parser(base_url: str, max_depth: int = 2) -> list[Document]:
    """Crawl a website starting at ``base_url`` and return its pages as documents.

    Args:
        base_url: URL the recursive crawl starts from.
        max_depth: Value forwarded to ``RecursiveUrlLoader(max_depth=...)``.
            Defaults to 2, the value that used to be hard-coded, so existing
            calls behave as before.

    Returns:
        The documents yielded by ``RecursiveUrlLoader``, with each page's HTML
        converted to text by ``bs4_extractor``.
    """
    loader = RecursiveUrlLoader(
        base_url,
        extractor=bs4_extractor,
        max_depth=max_depth,
        prevent_outside=True,
        # Other loader options (use_async, metadata_extractor, exclude_dirs,
        # timeout, check_response_status, continue_on_failure, base_url) are
        # intentionally left at the library defaults.
    )
    # Drain the loader's async iterator into a list.
    return [doc async for doc in loader.alazy_load()]

website_docs = await website_parser("https://docs.xbim.net/")
print(len(website_docs))

32


In [9]:
# Configure the recursive character splitter (chunk_size=1000, chunk_overlap=200).
text_splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,
    chunk_overlap=200,
    chunk_size=1000,
)

# Gather every loaded document (website pages first, then both GitHub repos)
# and split them; print the document count followed by the chunk count.
documents = [*website_docs, *githubDoc, *githubDoc2]
chunks = text_splitter.split_documents(documents)
print(len(documents), len(chunks), sep="\n")

44
460


In [10]:
# Initialize the Azure OpenAI embedding model
import getpass
import os

from langchain_openai import AzureOpenAIEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore

# Ask for the Azure OpenAI key only when AZURE_OPENAI_API_KEY is missing or
# empty, mirroring the GITHUB_PAT handling, so re-running does not prompt again.
if not os.environ.get("AZURE_OPENAI_API_KEY"):
    os.environ["AZURE_OPENAI_API_KEY"] = getpass.getpass("Enter API key for Azure: ")

# Embedding model served by the Azure OpenAI deployment
embeddings = AzureOpenAIEmbeddings(
    azure_endpoint="https://oai-bim42-test-fr-ai.openai.azure.com",
    azure_deployment="text-embedding-3-small",
    openai_api_version="2025-01-01-preview",
)

# Embed every chunk and keep the vectors in an in-memory store
vector_store = InMemoryVectorStore.from_documents(
    documents=chunks,
    embedding=embeddings,
)

# Retrieve relevant information using similarity search
retriever = vector_store.as_retriever()  # uses similarity search by default

Enter API key for Azure:  ········


In [11]:
# Q/A Retrieval Chain

from langchain_openai import AzureChatOpenAI
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# Chat model used to generate answers: the "gpt-4o-mini" deployment on the
# same Azure OpenAI endpoint as the embedding model.
llm = AzureChatOpenAI(
    azure_deployment="gpt-4o-mini",
    azure_endpoint="https://oai-bim42-test-fr-ai.openai.azure.com",
    openai_api_version="2025-01-01-preview",
)

# Define the system prompt that instructs the LLM how to answer questions based on retrieved context.
# The pieces are adjacent string literals, which Python joins with no separator,
# so each sentence carries its own trailing space to keep them from running together.
system_prompt = (
    "You are a specialist of Building Information modeling looking for a web-based building model viewer. "
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "Add sources like links to your answer. "
    "If you don't know the answer, say that you don't know. "
    "Use four sentences maximum and keep the answer concise."
    "\n\n"
    "{context}"  # Placeholder for the retrieved context
)

# Chat prompt: the system message carries the instructions plus the retrieved
# context, the human message carries the user's question.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

# Chain that stuffs the retrieved documents into the prompt and calls the LLM
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

# Full pipeline: the retriever fetches relevant documents, then the
# question-answering chain produces the answer from them.
retrieval_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)

In [12]:
# Invoke the chain with a question; the retriever supplies the context the LLM answers from.
# The question previously ended in a dangling "history, " with an unclosed
# parenthesis; the list is now closed so the model receives a complete sentence.
response = retrieval_chain.invoke(
    {"input": "Please describe the software editor behind this solution (name, description, history)."}
)
print(response["answer"])

The software editor behind this solution is the Xbim toolkit. It is an open-source library designed for building information modeling (BIM) and enables developers to create, edit, and visualize IFC (Industry Foundation Classes) models. Over the years, Xbim has matured significantly, becoming a stable and comprehensive framework for BIM applications. It provides essential tools for managing geometry and data manipulation, making it a valuable resource for architects and engineers in their BIM workflows. For more information, you can visit [Xbim's official website](http://docs.xbim.net/).


In [13]:
# Ask about pricing / licensing and print only the generated answer.
response = retrieval_chain.invoke(
    {"input": "What is the price or licencing model of the solution?"}
)
print(response["answer"])

The xbim toolkit is an open-source software development toolkit, which means it is available for free under its associated licenses. There are no mentioned fees or pricing models for using the xbim toolkit to read, create, or view Building Information Models. You can find more information regarding licenses on their official website [here](https://xbimteam.github.io/).


In [14]:
# Ask which input formats are supported and print only the generated answer.
response = retrieval_chain.invoke(
    {"input": "Can you list all the input format accepted by the solution?"}
)
print(response["answer"])

The Xbim Toolkit supports the following input formats: STEP, IfcXml, and IfcZip. It enables developers to read and write the full schema of IFC2x3, as well as handle BuildingSmart Building Collaboration Format (BCF), COBie Export, and BuildingSmart mvdXML. For more information, you can visit the [Xbim documentation site](http://docs.xbim.net/).


In [16]:
# Ask which format the web viewer consumes and print only the generated answer.
response = retrieval_chain.invoke(
    {"input": "What is the format used in the web viewer ?"}
)
print(response["answer"])

The web viewer uses the WexBIM data format as its input, which is a custom binary data format produced using core xBIM Libraries. For more details on creating WexBIM files, you can refer to the documentation [here](http://docs.xbim.net/examples/creating-wexbim-file.html).


In [17]:
# Ask for a description of the WexBIM format and print only the generated answer.
response = retrieval_chain.invoke(
    {"input": "What is the WexBIM data format?"}
)
print(response["answer"])

WexBIM is a data format used for web presentation of building models, specifically optimized for use with the xBIM toolkit. It contains geometry data converted from IFC (Industry Foundation Classes) files, allowing for efficient 3D visualization in web-based applications. The WexBIM format enables the visualization of building information models in a lightweight manner suitable for online environments. For more information, you can visit the xBIM website: [xBIM](http://docs.xbim.net).


In [18]:
# Ask about the programming languages involved and print only the generated answer.
response = retrieval_chain.invoke(
    {"input": "What are the main programming language used in the solution?"}
)
print(response["answer"])

The main programming languages used in the xbim solution are C# for backend code generation and JavaScript for the web-based visualization using WebGL. Additionally, T4 templates and GPPG/GPLEX parser generator are utilized for code generation purposes. The toolkit is designed to be integrated and utilized in .NET environments.


In [19]:
# Ask about the toolkit's licences and print only the generated answer.
response = retrieval_chain.invoke(
    {"input": "What are the licences for Xbim Toolkit?"}
)
print(response["answer"])

The Xbim Toolkit is made available under the CDDL Open Source license. This license supports commercial usage of the XBIM system within a 'Larger Work', as long as the license agreements are honored. Additionally, the toolkit uses various third-party software packages, each under their respective licenses, including the New BSD License, OPEN CASCADE Public License, MS Permissive License, Apache 2.0 License, and MIT License. For more details, you can refer to the official documentation [here](https://github.com/xBimTeam/XbimWindowsUI).
