In [25]:
# https://medium.com/@aminajavaid30/building-a-rag-system-synthesis-67f36efa7c35

# Data Ingestion & Retrieval
import getpass
import os
import re

import bs4
from bs4 import BeautifulSoup
from langchain_community.document_loaders import GithubFileLoader
from langchain_community.document_loaders import RecursiveUrlLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter



In [None]:
# Load a github repo
# Prompt for the GitHub personal access token only when the environment does not
# already hold a non-empty one, so the secret is never written into the notebook.
# Relies on `os` and `getpass` being imported in the notebook's import cell.
if not os.environ.get("GITHUB_PAT"):
    os.environ["GITHUB_PAT"] = getpass.getpass("Enter Github PAT: ")

async def github_repo(repo_name: str) -> list[Document]:
    """Load the Markdown files of a GitHub repository as documents.

    Args:
        repo_name: Repository identifier handed to the loader as ``repo``
            (the notebook calls this with an ``owner/name`` string).

    Returns:
        Every document yielded by the loader, in the order it produced them.

    Raises:
        KeyError: If the ``GITHUB_PAT`` environment variable is not set.
    """

    def is_markdown(file_path):
        # Keep only files whose path ends with the Markdown extension.
        return file_path.endswith(".md")

    markdown_loader = GithubFileLoader(
        repo=repo_name,
        access_token=os.environ["GITHUB_PAT"],
        github_api_url="https://api.github.com",
        file_filter=is_markdown,
    )
    # Drain the loader's async iterator into a plain list.
    return [doc async for doc in markdown_loader.alazy_load()]

githubDoc = await github_repo("specklesystems/speckle-server")
print(len(githubDoc))

28


In [28]:
# Load a website recursively; the pages are split into chunks in a later cell
def bs4_extractor(html: str) -> str:
    """Turn an HTML page into plain text with tidy paragraph spacing.

    The markup is parsed with BeautifulSoup's "lxml" parser, the text content
    is taken from the parsed tree, every run of two or more newlines is
    reduced to exactly two, and surrounding whitespace is stripped.

    Args:
        html: Raw HTML markup of a page.

    Returns:
        The page text, whitespace-trimmed, with blank-line runs collapsed.
    """
    page_text = BeautifulSoup(html, "lxml").text
    collapsed = re.sub(r"\n\n+", "\n\n", page_text)
    return collapsed.strip()

async def website_parser(base_url: str, max_depth: int = 2) -> list[Document]:
    """Crawl a website from ``base_url`` and return its pages as text documents.

    Each fetched page is converted to plain text by ``bs4_extractor``.

    Args:
        base_url: URL where the crawl starts.
        max_depth: Value forwarded to ``RecursiveUrlLoader(max_depth=...)``.
            Defaults to 2, the depth this notebook previously hard-coded.

    Returns:
        Every document yielded by the loader, in the order it produced them.
    """
    loader = RecursiveUrlLoader(
        base_url,
        extractor=bs4_extractor,
        max_depth=max_depth,
        # Per the LangChain docs this keeps the crawl on URLs under the root;
        # see the RecursiveUrlLoader reference for the other available options.
        prevent_outside=True,
    )
    # Drain the loader's async iterator into a plain list.
    return [doc async for doc in loader.alazy_load()]

website_docs = await website_parser("https://www.speckle.systems/")
print(len(website_docs))

23


In [48]:
help_docs = await website_parser("https://speckle.guide/")
print(len(help_docs))

33


In [49]:
# Gather every loaded source into one list: website pages, GitHub Markdown, user-guide pages
documents = [*website_docs, *githubDoc, *help_docs]

# Recursive character splitter configured with chunk_size=1000 and chunk_overlap=200;
# add_start_index=True asks the splitter to keep each chunk's start offset
text_splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,
    chunk_overlap=200,
    chunk_size=1000,
)
chunks = text_splitter.split_documents(documents)

# Report how many documents went in and how many chunks came out
print(len(documents))
print(len(chunks))

84
934


In [50]:
# Initialize the Azure OpenAI embedding model
import getpass
import os

from langchain_openai import AzureOpenAIEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore

# Ask for the Azure OpenAI key only when the environment does not already hold a
# non-empty one (same pattern as the GITHUB_PAT prompt), so re-running this cell
# does not prompt again. Presumably the Azure clients read the key from this
# variable, since it is not passed to them explicitly -- see the LangChain docs.
if not os.environ.get("AZURE_OPENAI_API_KEY"):
    os.environ["AZURE_OPENAI_API_KEY"] = getpass.getpass("Enter API key for Azure: ")

# Embedding model served by the "text-embedding-3-small" Azure deployment
embeddings = AzureOpenAIEmbeddings(
    azure_endpoint="https://oai-bim42-test-fr-ai.openai.azure.com",
    azure_deployment="text-embedding-3-small",
    openai_api_version="2025-01-01-preview",
)

# Embed every chunk and keep the vectors in an in-memory store
# (nothing is persisted, so this cell must be re-run after a kernel restart)
vector_store = InMemoryVectorStore.from_documents(
    documents=chunks,
    embedding=embeddings,
)

# Retrieve relevant information using similarity search
retriever = vector_store.as_retriever()  # uses similarity search by default

Enter API key for Azure:  ········


In [51]:
# Q/A Retrieval Chain

from langchain_openai import AzureChatOpenAI
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# Chat model that writes the answers: the "gpt-4o-mini" deployment on the Azure OpenAI resource
llm = AzureChatOpenAI(
    openai_api_version="2025-01-01-preview",
    azure_deployment="gpt-4o-mini",
    azure_endpoint="https://oai-bim42-test-fr-ai.openai.azure.com",
)

# System prompt telling the LLM how to answer from the retrieved context.
# The pieces below are joined by implicit string concatenation, so each sentence
# must carry its own trailing space; otherwise sentences run together.
system_prompt = (
    "You are a specialist of Building Information modeling looking for a web-based building model viewer. "
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "Add sources like links to your answer. "
    "If you don't know the answer, say that you don't know. "
    "Use four sentences maximum and keep the answer concise."
    "\n\n"
    "{context}"  # Placeholder for the retrieved context
)

# Chat prompt: the system message carries the instructions plus the retrieved context,
# the human message carries the user's question
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])

# Document chain: the LLM combined with the prompt
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

# Full pipeline: the retriever fetches relevant documents and hands them to the
# question-answering chain
retrieval_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)

In [52]:
# Ask a question; the retriever supplies the context the LLM answers from.
# NOTE(review): the question text ends mid-list ("history, ") and looks truncated;
# it is kept verbatim because the recorded answer was generated from it.
response = retrieval_chain.invoke(
    {"input": "Please describe the software editor behind this solution (name, description, history, "}
)
print(response["answer"])

The software editor behind this solution is Speckle. Speckle is a cloud-based platform designed to enhance collaboration and automation in Architecture, Engineering, and Construction (AEC) projects. It allows users to connect various applications and automate processes to boost productivity and ensure real-time updates in building models. While the detailed history of Speckle isn't provided in the retrieved context, it has become significant in the AEC industry for its ability to streamline workflows and improve data flexibility. For more information, you can visit their official website at [Speckle](https://speckle.systems).


In [53]:
# Question about pricing / licensing of the solution
response = retrieval_chain.invoke(
    {"input": "What is the price or licencing model of the solution?"}
)
print(response["answer"])

The pricing model for Autodesk's solutions includes a token expenditure system where converting models costs a certain number of tokens, with 10 tokens estimated to cost approximately $30, and options for monthly plans, such as $450 for converting 20 models 15 days a month. On the other hand, Speckle offers a free, open-source platform option with distinctions between personal, professional, and educational use cases, and provides free unlimited server storage for educators and students. Further specific pricing details will depend on the exact product and usage needs. You can find more information directly on their respective websites, such as Autodesk and Speckle.


In [54]:
# Question about the input formats the solution accepts
response = retrieval_chain.invoke(
    {"input": "Can you list all the input format accepted by the solution?"}
)
print(response["answer"])

The supported input formats include Blender, Unity, ETABS, SAP2000, CSiBridge, SAFE, SketchUp, MicroStation, OpenRoads, OpenRail, OpenBuildings, Tekla Structures, Archicad, and Navisworks. Each of these applications has specific limitations regarding object conversions that are outlined in the Speckle documentation. For the most up-to-date details, you can refer to the Speckle user guide linked here: [Speckle Documentation](https://speckle.guide).


In [55]:
# Question about the threejs extension
response = retrieval_chain.invoke(
    {"input": "What is the threejs extension?"}
)
print(response["answer"])

The threejs extension for Speckle allows you to display 3D data in a web browser using the Speckle viewer, which is built on top of Three.js. This extension enables rapid rendering of large 3D models directly in your browser, making it an effective tool for visualizing building information models. You can find more details and access the package on its [npm page](https://www.npmjs.com/package/@speckle/viewer).


In [56]:
# Question about the programming languages used by the solution
response = retrieval_chain.invoke(
    {"input": "What are the main programming language used in the solution?"}
)
print(response["answer"])

The provided context does not specify the main programming languages used in the solution. Therefore, I can't provide an answer regarding the programming languages used.


In [58]:
# Question about Speckle's licences
response = retrieval_chain.invoke(
    {"input": "What are the licences for Speckle?"}
)
print(response["answer"])

Speckle is licensed under the Apache License 2.0. This license allows users to study, modify, redistribute, and commercialize any parts of the software. As a result, you can deploy your own Speckle server independently, without relying on the original creators. For more information, you can check their official documentation.
