In [1]:
# https://medium.com/@aminajavaid30/building-a-rag-system-synthesis-67f36efa7c35

# Data Ingestion & Retrieval
import bs4
import re
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.document_loaders import RecursiveUrlLoader
from langchain_community.document_loaders import GithubFileLoader
from bs4 import BeautifulSoup



In [2]:
# Load a github repo

if not os.environ.get("GITHUB_PAT"):
  os.environ["GITHUB_PAT"] = getpass.getpass("Enter Github PAT: ")

async def github_repo(repo_name: str) -> list[Document]:
    loader = GithubFileLoader(
        repo=repo_name, # the repo name
        access_token= os.environ["GITHUB_PAT"],
        github_api_url="https://api.github.com",
        branch ="master", 
        file_filter=lambda file_path: file_path.endswith(
            ".md"
        ),  # load all markdowns files.
    )
    github_docs = []
    async for doc in loader.alazy_load():
        github_docs.append(doc)
    return github_docs

In [3]:
# Load a website reccusivly and split it into chunks
def bs4_extractor(html: str) -> str:
    soup = BeautifulSoup(html, "lxml")
    return re.sub(r"\n\n+", "\n\n", soup.text).strip()

async def website_parser(base_url: str) -> list[Document]:
    loader = RecursiveUrlLoader(
        base_url,
        extractor=bs4_extractor,
        max_depth=2,
        prevent_outside=True,
        # use_async=False,
        # extractor=None,
        # metadata_extractor=None,
        # exclude_dirs=(),
        # timeout=10,
        # check_response_status=True,
        # continue_on_failure=True,
        # 
        # base_url=None,
        # ...
    )
    site_docs = []
    async for doc in loader.alazy_load():
        site_docs.append(doc)
    return site_docs

In [4]:
# Initialize the OpenAI embedding model
import getpass
import os

from langchain_openai import AzureOpenAIEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore

if not os.environ.get("AZURE_OPENAI_API_KEY"):
  os.environ["AZURE_OPENAI_API_KEY"] = getpass.getpass("Enter API key for Azure: ")

azureEmbeddings = AzureOpenAIEmbeddings(
    azure_endpoint="https://oai-bim42-test-fr-ai.openai.azure.com",
    azure_deployment="text-embedding-3-small",
    openai_api_version="2025-01-01-preview",
)

# Initialize the recursive character text splitter
text_splitter = RecursiveCharacterTextSplitter(              
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True
)

# Initialize with an embedding model
vector_store = InMemoryVectorStore(embedding=azureEmbeddings)

In [None]:
# Define a list of site to parse
urls = ["https://bimdata.io/", "https://developers.bimdata.io/"]

# Iterate over each urls in the list
for url in urls:
    # Retrive the documents
    website_docs = await website_parser(url)
    website_docs
    print(f"{len(website_docs)} docs in {url}")
    # Split the documents into chunks
    chunks = text_splitter.split_documents(website_docs)
    print(f"{len(chunks)} chunks in {url}")
    # Add embeddings into the vector store
    vector_store.add_documents(documents=chunks)



Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.




  soup = BeautifulSoup(html, "lxml")

Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.




  soup = BeautifulSoup(raw_html, "html.parser")


32 docs in https://bimdata.io/
893 chunks in https://bimdata.io/


CancelledError: 

In [50]:
# Define a list of github repo to parse
repos = ["bimdata/platform", "bimdata/platform-back"]

# Iterate over each urls in the list
for repo in repos:
    # Retrive the documents
    github_docs = await github_repo(repo)
    print(f"{len(github_docs)} docs in {repo}")
    # Split the documents into chunks
    chunks = text_splitter.split_documents(github_docs)
    print(f"{len(chunks)} chunks in {repo}")
    # Add embeddings into the vector store
    vector_store.add_documents(documents=chunks)

3 docs in bimdata/platform
140 chunks in bimdata/platform
1 docs in bimdata/platform-back
3 chunks in bimdata/platform-back


In [51]:
# Retrieve relevant information using similarity search
retriever = vector_store.as_retriever() # uses similarity search by default

In [52]:
# Q/A Retrieval Chain

from langchain_openai import AzureChatOpenAI
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# Initialize the Azure AI LLM with the specific model
llm = AzureChatOpenAI(
    azure_endpoint="https://oai-bim42-test-fr-ai.openai.azure.com",
    azure_deployment="gpt-4o-mini",
    openai_api_version="2025-01-01-preview",
)

# Define the system prompt that instructs the LLM how to answer questions based on retrieved context
system_prompt = (
    "You are a specialist of Building Information modeling looking for a web-based building model viewer"    
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question."
    "Add sources like links to your answer."
    "If you don't know the answer, say that you don't know."
    "Use four sentences maximum and keep the answer concise."
    "\n\n"
    "{context}"  # Placeholder for the retrieved context
)

# Create a chat prompt template with a system message and human message
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),  # System message contains instructions and context
        ("human", "{input}"),  # Human message includes the user's input question
    ]
)

# Create a document chain that combines the LLM with the prompt
question_answer_chain = create_stuff_documents_chain(llm, prompt)

# Combine the retrieval chain with the question-answering chain 
# The retrieval chain retrieves relevant documents and feeds them into the question-answering chain
retrieval_chain = create_retrieval_chain(retriever, question_answer_chain)

In [53]:
# Invoke the chain with a question, and the retriever will provide context for the LLM to generate an answer
response = retrieval_chain.invoke({"input": "Please describe the software editor behind this solution (name, description, history, "})
print(response['answer'])

I don't have specific information about the software editor behind the BIM viewer solution mentioned. The retrieved context primarily focuses on features and functionalities of the viewer rather than providing details about its development or history. Therefore, I'm unable to provide the name, description, or history of the software editor.


In [54]:
response = retrieval_chain.invoke({"input": "What is the price or licencing model of the solution?"})
print(response['answer'])

The licensing model for the BIMData solution includes several tiers: the Starter plan is free (0€ per year), the Professional plan starts at 540€ per year, and the Enterprise plan begins at 2500€ per year. For the on-premise option, pricing is provided upon request. Each plan comes with different storage capacities and features tailored to various project sizes and needs. More details can be found on their website: [BIMData Pricing](https://bimdata.com).


In [55]:
response = retrieval_chain.invoke({"input": "Can you list all the input format accepted by the solution?"})
print(response['answer'])

The solution accepts a variety of input formats including number, password, range, search, telephone, text, time, URL, week, select dropdowns, and textarea. Additionally, it supports date, datetime, datetime-local, email, and month inputs. For each format, there are also read-only and disabled states. Unfortunately, specific solutions available for Building Information Modeling were not outlined in the retrieved context.


In [56]:
response = retrieval_chain.invoke({"input": "What is the format used in the web viewer ?"})
print(response['answer'])

The specific format used in a web-based building model viewer typically includes standards like IFC (Industry Foundation Classes) or other formats like COLLADA, FBX, or proprietary formats optimized for visualization. However, the exact format may vary depending on the viewer's implementation. For more detailed information, it would be best to refer to the documentation of the specific viewer SDK you are using. Unfortunately, I don't have the specific information on the format used in the web viewer without additional context.


In [57]:
response = retrieval_chain.invoke({"input": "What is the WexBIM data format?"})
print(response['answer'])

The WexBIM data format is a proprietary format developed for exchanging building information models (BIM) with a focus on interoperability. It is designed to facilitate the sharing and collaboration of BIM data among various software applications and stakeholders in the construction industry. For more detailed information, you might refer to the official documentation or respective resources related to WexBIM. Unfortunately, I don't have specific links to provide.


In [58]:
response = retrieval_chain.invoke({"input": "What are the main programming language used in the solution?"})
print(response['answer'])

The retrieved context does not specify the main programming languages used in the BIMData.io solution. You may need to contact their support or refer to their documentation for detailed technical information. If you need further assistance, please let me know!


In [59]:
response = retrieval_chain.invoke({"input": "What are the licences for BIMData?"})
print(response['answer'])

I don't have specific information on the licenses for BIMData. You may need to check their official website or contact them directly for detailed licensing information. You can visit [BIMData](https://bimdata.io) for more resources.
