In [46]:
# https://medium.com/@aminajavaid30/building-a-rag-system-synthesis-67f36efa7c35

# Data Ingestion & Retrieval
import bs4
import re
import os
import getpass
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.document_loaders import RecursiveUrlLoader
from langchain_community.document_loaders import GithubFileLoader
from langchain_openai import AzureOpenAIEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from bs4 import BeautifulSoup
from dataclasses import dataclass


In [18]:
# Initialize the recursive character text splitter
text_splitter = RecursiveCharacterTextSplitter(              
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True
)

In [19]:
if not os.environ.get("AZURE_OPENAI_API_KEY"):
  os.environ["AZURE_OPENAI_API_KEY"] = getpass.getpass("Enter API key for Azure: ")

azureEmbeddings = AzureOpenAIEmbeddings(
    azure_endpoint="https://oai-bim42-test-fr-ai.openai.azure.com",
    azure_deployment="text-embedding-3-small",
    openai_api_version="2025-01-01-preview",
)

In [None]:
# Load a github repo

if not os.environ.get("GITHUB_PAT"):
  os.environ["GITHUB_PAT"] = getpass.getpass("Enter Github PAT: ")



In [None]:
@dataclass
class WebViewer:
    name: str
    repos: list[str]
    websites: list[str]

    async def github_repo(self) -> list[Document]:
        github_docs = []
        for repo_name in self.repos:
            loader = GithubFileLoader(
                repo=repo_name, # the repo name
                access_token= os.environ["GITHUB_PAT"],
                github_api_url="https://api.github.com",
                file_filter=lambda file_path: file_path.endswith(
                    ".md"
                ),  # load all markdowns files.
            )   
            async for doc in loader.alazy_load():
                github_docs.append(doc)
        return github_docs

In [50]:
# Define a solution to parse
viewer = WebViewer(
    name='ThatOpen',
    repos=["ThatOpen/engine_web-ifc", "ThatOpen/engine_components","ThatOpen/engine_ui-components"],
    websites =["ThatOpen/engine_web-ifc", "ThatOpen/engine_components","ThatOpen/engine_ui-components"]
    )

docs = []
all_chunks = []

# Retrive the documents
github_docs = await viewer.github_repo()
print(f"{len(github_docs)} docs in {repo}")
docs = docs + github_docs
# Split the documents into chunks
chunks = text_splitter.split_documents(github_docs)
for chunk in chunks:
    metadata={
        'path':chunk.metadata['path'],
        'sha':chunk.metadata['sha'],
        'source':chunk.metadata['source'],
        'start_index':chunk.metadata['start_index'],
        'chunk_id':chunk.metadata['source'] + str(chunk.metadata['start_index'])}
    new_chunk = Document(page_content= chunk.page_content, metadata = metadata)
    all_chunks.append(new_chunk)
print(f"{len(chunks)} chunks in {repo}")

11 docs in ThatOpen/engine_ui-components
158 chunks in ThatOpen/engine_ui-components


In [51]:
print(all_chunks[1])

page_content='1.5. "Incompatible With Secondary Licenses"
    means

    (a) that the initial Contributor has attached the notice described
        in Exhibit B to the Covered Software; or

    (b) that the Covered Software was made available under the terms of
        version 1.1 or earlier of the License, but not also under the
        terms of a Secondary License.

1.6. "Executable Form"
    means any form of the work other than Source Code Form.

1.7. "Larger Work"
    means a work that combines Covered Software with other material, in
    a separate file or files, that is not Covered Software.

1.8. "License"
    means this document.

1.9. "Licensable"
    means having the right to grant, to the maximum extent possible,
    whether at the time of the initial grant or subsequently, any and
    all of the rights conveyed by this License.

1.10. "Modifications"
    means any of the following:' metadata={'path': 'LICENSE.md', 'sha': 'a612ad9813b006ce81d1ee438dd784da99a54007', 'source'

In [22]:
# Initialize FAISS vector store
index = faiss.IndexFlatL2(len(azureEmbeddings.embed_query("hello world")))

vector_store = FAISS(
    embedding_function=azureEmbeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [None]:
# Add documents to the vector store

# Create a list of ids
ids = [chunk.metadata['chunk_id'] for chunk in all_chunks]

vector_store.add_documents(documents=all_chunks, ids=ids)


https://api.github.com/ThatOpen/engine_web-ifc/blob/main/LICENSE.md0


['https://api.github.com/ThatOpen/engine_web-ifc/blob/main/LICENSE.md0',
 'https://api.github.com/ThatOpen/engine_web-ifc/blob/main/LICENSE.md751',
 'https://api.github.com/ThatOpen/engine_web-ifc/blob/main/LICENSE.md1591',
 'https://api.github.com/ThatOpen/engine_web-ifc/blob/main/LICENSE.md2540',
 'https://api.github.com/ThatOpen/engine_web-ifc/blob/main/LICENSE.md3169',
 'https://api.github.com/ThatOpen/engine_web-ifc/blob/main/LICENSE.md4033',
 'https://api.github.com/ThatOpen/engine_web-ifc/blob/main/LICENSE.md4955',
 'https://api.github.com/ThatOpen/engine_web-ifc/blob/main/LICENSE.md5616',
 'https://api.github.com/ThatOpen/engine_web-ifc/blob/main/LICENSE.md6224',
 'https://api.github.com/ThatOpen/engine_web-ifc/blob/main/LICENSE.md6944',
 'https://api.github.com/ThatOpen/engine_web-ifc/blob/main/LICENSE.md7621',
 'https://api.github.com/ThatOpen/engine_web-ifc/blob/main/LICENSE.md7971',
 'https://api.github.com/ThatOpen/engine_web-ifc/blob/main/LICENSE.md8657',
 'https://api.gi

In [44]:
results = vector_store.similarity_search(
    "web-ifc is available here",
    k=2,
    filter={"path": "README.md"}
)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

* <p align="center">
  <a href="https://thatopen.com/">TOC</a>
  |
  <a href="https://thatopen.github.io/engine_web-ifc/docs">web-ifc documentation</a>
  |
  <a href="https://docs.thatopen.com/intro"> platform documentation</a>
  |
  <a href="https://thatopen.github.io/engine_web-ifc/demo">demo</a>
  |
  <a href="https://people.thatopen.com/">community</a>
  |
  <a href="https://www.npmjs.com/package/web-ifc">npm package</a>
</p>

![cover](banner.png)

<h1>Web IFC <img src="https://thatopen.github.io/engine_components/resources/favicon.ico" width="32"/></h1>

[![NPM Package][npm]][npm-url]
[![NPM Package][npm-downloads]][npm-url]

**web-ifc** is a javascript library to read and write ifc files, at native speeds. **web-ifc** is part of the [That Open Company](https://thatopen.com) project, which aims to lower the threshold for developing open BIM applications.

## Install

`npm install web-ifc`

## Quick setup

```JavaScript
const WebIFC = require("web-ifc/web-ifc-api.js"); [{'path': 'R