In [5]:
# https://medium.com/@aminajavaid30/building-a-rag-system-synthesis-67f36efa7c35

# Data Ingestion & Retrieval
import bs4
import re
import os
import getpass
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.document_loaders import RecursiveUrlLoader
from langchain_community.document_loaders import GithubFileLoader
from bs4 import BeautifulSoup



In [11]:
# Load the Markdown files of a GitHub repository

# Ask for the GitHub personal access token interactively unless the
# environment already holds a non-empty value for it.
if not os.getenv("GITHUB_PAT"):
    os.environ["GITHUB_PAT"] = getpass.getpass("Enter Github PAT: ")

async def github_repo(repo_name: str) -> list[Document]:
    """Collect the Markdown files of a GitHub repository as documents.

    Args:
        repo_name: Repository identifier passed to ``GithubFileLoader`` as
            ``repo`` (this notebook uses ``"owner/name"`` strings).

    Returns:
        Every document yielded by the loader's ``alazy_load()``, in the
        order the loader produced them.

    Raises:
        KeyError: If the ``GITHUB_PAT`` environment variable is not set.
    """

    def is_markdown(file_path: str) -> bool:
        # Only paths ending in ".md" are loaded.
        return file_path.endswith(".md")

    markdown_loader = GithubFileLoader(
        repo=repo_name,
        access_token=os.environ["GITHUB_PAT"],
        github_api_url="https://api.github.com",
        file_filter=is_markdown,
    )
    return [document async for document in markdown_loader.alazy_load()]

In [7]:
# Load a website recursively (the pages are split into chunks in a later cell)
def bs4_extractor(html: str) -> str:
    """Reduce an HTML page to its text content.

    The markup is parsed with BeautifulSoup's ``lxml`` parser. In the
    extracted text, every run of two or more newlines is collapsed to exactly
    two, and leading/trailing whitespace is stripped.

    Args:
        html: Raw HTML of a page.

    Returns:
        The cleaned text of the page.
    """
    page_text = BeautifulSoup(html, "lxml").text
    collapsed = re.sub(r"\n\n+", "\n\n", page_text)
    return collapsed.strip()

async def website_parser(base_url: str, max_depth: int = 2) -> list[Document]:
    """Crawl a website starting at ``base_url`` and return its pages as documents.

    Args:
        base_url: Root URL handed to ``RecursiveUrlLoader``.
        max_depth: Crawl depth forwarded to the loader. Defaults to 2, the
            value this function always used before it became a parameter.

    Returns:
        Every document yielded by the loader's ``alazy_load()``; each page's
        HTML is converted to text by ``bs4_extractor``.
    """
    loader = RecursiveUrlLoader(
        base_url,
        extractor=bs4_extractor,
        max_depth=max_depth,
        # Per the loader's documentation, this keeps the crawl from following
        # links that lead outside the root URL.
        prevent_outside=True,
    )
    return [doc async for doc in loader.alazy_load()]

In [8]:
# Initialize the Azure OpenAI embedding model, the text splitter and the vector store
import getpass
import os

from langchain_openai import AzureOpenAIEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore

# Prompt for the Azure OpenAI key only when the environment does not already
# provide a non-empty one.
if not os.getenv("AZURE_OPENAI_API_KEY"):
    os.environ["AZURE_OPENAI_API_KEY"] = getpass.getpass("Enter API key for Azure: ")

# Splitter shared by every ingestion cell below.
text_splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,
    chunk_overlap=200,
    chunk_size=1000,
)

# Embedding client for the "text-embedding-3-small" Azure deployment.
azureEmbeddings = AzureOpenAIEmbeddings(
    openai_api_version="2025-01-01-preview",
    azure_deployment="text-embedding-3-small",
    azure_endpoint="https://oai-bim42-test-fr-ai.openai.azure.com",
)

# In-memory vector store that embeds documents with the client above.
vector_store = InMemoryVectorStore(embedding=azureEmbeddings)

In [9]:
# Define a list of site to parse
urls = ["https://speckle.guide/","https://www.speckle.systems/"]

# Iterate over each urls in the list
for url in urls:
    # Retrive the documents
    website_docs = await website_parser(url)
    website_docs
    print(f"{len(website_docs)} docs in {url}")
    # Split the documents into chunks
    chunks = text_splitter.split_documents(website_docs)
    print(f"{len(chunks)} chunks in {url}")
    # Add embeddings into the vector store
    vector_store.add_documents(documents=chunks)


33 docs in https://speckle.guide/
398 chunks in https://speckle.guide/
23 docs in https://www.speckle.systems/
441 chunks in https://www.speckle.systems/


In [12]:
# Define a list of github repo to parse
repos = ["specklesystems/speckle-server"]

# Iterate over each urls in the list
for repo in repos:
    # Retrive the documents
    github_docs = await github_repo(repo)
    print(f"{len(github_docs)} docs in {repo}")
    # Split the documents into chunks
    chunks = text_splitter.split_documents(github_docs)
    print(f"{len(chunks)} chunks in {repo}")
    # Add embeddings into the vector store
    vector_store.add_documents(documents=chunks)

28 docs in specklesystems/speckle-server
95 chunks in specklesystems/speckle-server


In [13]:
# Expose the vector store as a retriever. "similarity" is the default search
# type of as_retriever(); it is spelled out here for readability.
retriever = vector_store.as_retriever(search_type="similarity")

In [14]:
# Q/A Retrieval Chain

from langchain_openai import AzureChatOpenAI
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# Chat model served by the "gpt-4o-mini" Azure OpenAI deployment.
llm = AzureChatOpenAI(
    openai_api_version="2025-01-01-preview",
    azure_deployment="gpt-4o-mini",
    azure_endpoint="https://oai-bim42-test-fr-ai.openai.azure.com",
)

# System prompt telling the LLM how to answer from the retrieved context.
# The pieces are adjacent string literals, which Python concatenates with no
# separator, so every sentence carries its own trailing space to keep the
# instructions from running together.
system_prompt = (
    "You are a specialist of Building Information modeling looking for a web-based building model viewer. "
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "Add sources like links to your answer. "
    "If you don't know the answer, say that you don't know. "
    "Use four sentences maximum and keep the answer concise."
    "\n\n"
    "{context}"  # Placeholder for the retrieved context
)

# Chat prompt made of two messages: the system instructions (which carry the
# "{context}" placeholder) and the user's question ("{input}").
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

# Document chain: pairs the LLM with the prompt above.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

# Full pipeline: the retriever fetches the relevant documents and passes them
# on to the question-answering chain.
retrieval_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)

In [15]:
# Invoke the chain with a question; the retriever supplies the context the LLM
# answers from. Only the "answer" entry of the response is printed.
response = retrieval_chain.invoke(
    {"input": "Please describe the software editor behind this solution (name, description, history)."}
)
print(response["answer"])

The software editor behind this solution is Speckle. Speckle is a cloud-based platform designed to facilitate collaboration and data management in architecture, engineering, and construction (AEC) projects. It allows teams to automate processes, capture real-time updates, and integrate with a variety of other applications through its support for over 25 plugins. Speckle has gained recognition for enhancing collaboration and decision-making in complex design projects within the AEC sector.


In [16]:
# Ask about pricing / licensing and print only the generated answer.
response = retrieval_chain.invoke(
    {"input": "What is the price or licencing model of the solution?"}
)
print(response["answer"])

The pricing and licensing for Autodesk services typically involve a token expenditure model, where you might pay approximately $30 for converting models, like 20 Revit models, which can accumulate to around $450 per month if used frequently. Additionally, fees are applicable for usage exceeding agreed limits and are invoiced for payment within 30 days. In contrast, Speckle offers free options for educational use and provides tiered pricing for personal and professional use. For precise pricing details, please refer to the official websites of these services.


In [17]:
# Ask which input formats are accepted and print only the generated answer.
response = retrieval_chain.invoke(
    {"input": "Can you list all the input format accepted by the solution?"}
)
print(response["answer"])

The input formats accepted by the Speckle solution include various building modeling applications as follows: Blender, Unity, ETABS, SAP2000, CSiBridge, SAFE, SketchUp, MicroStation, OpenRoads, OpenRail, OpenBuildings, Tekla Structures, Archicad, and Navisworks. Each application may have different levels of support for object conversions. For specific details about unsupported elements and limitations, you may refer to the official documentation.


In [18]:
# Ask about the web viewer's data format and print only the generated answer.
response = retrieval_chain.invoke(
    {"input": "What is the format used in the web viewer ?"}
)
print(response["answer"])

The web viewer, specifically the Speckle viewer, primarily uses a custom format for 3D models that is efficient for rendering and manipulation in web browsers. This format leverages a data tree structure specific to Speckle's API for handling complex model data. For more detailed information about the supported data structures, you can refer to the full documentation [here](https://speckle.guide/viewer/).


In [19]:
# Ask about the WexBIM data format and print only the generated answer.
response = retrieval_chain.invoke(
    {"input": "What is the WexBIM data format?"}
)
print(response["answer"])

I don't have information about the WexBIM data format.


In [20]:
# Ask about the programming languages used and print only the generated answer.
response = retrieval_chain.invoke(
    {"input": "What are the main programming language used in the solution?"}
)
print(response["answer"])

The main programming languages used in Speckle, a web-based building model viewer, include C#, JavaScript, and TypeScript. These languages facilitate the integration and customization of the platform through its API and user interface elements. Additionally, Speckle's ability to support over 25 plugins indicates a versatile use of various programming languages depending on the plugins developed. For more detailed information, you can visit [Speckle's website](https://speckle.systems/).


In [21]:
# Ask about BIMData licences and print only the generated answer.
response = retrieval_chain.invoke(
    {"input": "What are the licences for BIMData?"}
)
print(response["answer"])

I'm sorry, but I don't have information on the licenses for BIMData. You may need to consult the official BIMData website or their documentation for accurate licensing details.
