In [1]:
# https://medium.com/@aminajavaid30/building-a-rag-system-synthesis-67f36efa7c35

# Data Ingestion & Retrieval
import bs4
import re
import os
import getpass
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.document_loaders import RecursiveUrlLoader
from langchain_community.document_loaders import GithubFileLoader
from bs4 import BeautifulSoup



In [9]:
# Load a website recursively and extract the plain text of each page
def bs4_extractor(html: str) -> str:
    """Return the text content of an HTML page with blank-line runs collapsed.

    The markup is parsed by BeautifulSoup with the ``lxml`` parser. Every run
    of two or more consecutive newlines in the extracted text is reduced to
    exactly two, then leading and trailing whitespace is stripped.
    """
    page_text = BeautifulSoup(html, "lxml").text
    squeezed = re.sub(r"\n\n+", "\n\n", page_text)
    return squeezed.strip()

async def website_parser(base_url: str, max_depth: int = 2) -> list[Document]:
    """Crawl a website starting at ``base_url`` and collect the loaded pages.

    Args:
        base_url: URL where the recursive crawl starts. It is passed to
            ``RecursiveUrlLoader`` together with ``prevent_outside=True``.
        max_depth: Maximum recursion depth handed to the loader. Defaults to
            2, the value that was previously hard-coded.

    Returns:
        Every document yielded by the loader, in the order it produced them.
        Page text is extracted with ``bs4_extractor``.
    """
    loader = RecursiveUrlLoader(
        base_url,
        max_depth=max_depth,
        extractor=bs4_extractor,
        prevent_outside=True,
    )
    # Drain the async iterator into a list in one expression.
    return [doc async for doc in loader.alazy_load()]

In [10]:
# Initialize the Azure OpenAI embedding model and the text splitter
import getpass
import os

from langchain_openai import AzureOpenAIEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore, VectorStoreRetriever

# Ask for the Azure OpenAI key interactively only when the environment does
# not already hold a non-empty value, so the secret is never written in the notebook.
_API_KEY_VAR = "AZURE_OPENAI_API_KEY"
if not os.environ.get(_API_KEY_VAR):
    os.environ[_API_KEY_VAR] = getpass.getpass("Enter API key for Azure: ")

# Embedding client used by the vector store to embed the document chunks.
azureEmbeddings = AzureOpenAIEmbeddings(
    openai_api_version="2025-01-01-preview",
    azure_deployment="text-embedding-3-small",
    azure_endpoint="https://oai-bim42-test-fr-ai.openai.azure.com",
)

# Shared splitter for all crawled documents: chunk_size=1000 with an overlap
# of 200 between consecutive chunks; add_start_index=True is requested so the
# splitter keeps track of where each chunk starts in its source document.
text_splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,
    chunk_overlap=200,
    chunk_size=1000,
)



In [11]:

# Define a function that builds an in-memory vector store from a URL and returns a retriever over it
async def vector_store_create(url: str) -> VectorStoreRetriever:
    """Crawl ``url``, index its content in memory and return a retriever.

    The pages are loaded with ``website_parser``, split with the module-level
    ``text_splitter``, embedded with ``azureEmbeddings`` into a fresh
    ``InMemoryVectorStore``, and exposed through ``as_retriever()``.
    The number of loaded documents and of resulting chunks is printed.

    Args:
        url: Start URL of the site to crawl.

    Returns:
        A retriever over the newly built vector store.
    """
    # Retrieve the documents
    website_docs = await website_parser(url)
    print(f"{len(website_docs)} docs in {url}")
    # Split the documents into chunks
    chunks = text_splitter.split_documents(website_docs)
    print(f"{len(chunks)} chunks in {url}")
    # Initialize the store with the embedding model, then embed and add the chunks
    vector_store = InMemoryVectorStore(embedding=azureEmbeddings)
    vector_store.add_documents(documents=chunks)
    # as_retriever() is called without arguments, i.e. with the default search settings
    return vector_store.as_retriever()


In [12]:
# Q/A Retrieval Chain

from langchain_openai import AzureChatOpenAI
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# Chat model served by the Azure OpenAI deployment; it generates the final answers
llm = AzureChatOpenAI(
    openai_api_version="2025-01-01-preview",
    azure_deployment="gpt-4o-mini",
    azure_endpoint="https://oai-bim42-test-fr-ai.openai.azure.com",
)

# Define the system prompt that instructs the LLM how to answer questions based on retrieved context.
# Adjacent string literals are concatenated verbatim, so every sentence fragment
# carries its own trailing space; otherwise the sentences run into each other.
system_prompt = (
    "Vous êtes un spécialiste de la gestion comptable des entreprises. "
    "Dans le cadre de la généralisation de la facturation électronique au 1er septembre 2026 "
    "qui obligera pour les entreprises établies en France d'émettre et de recevoir des factures électroniques, "
    "tu étudies les différentes plateformes de dématérialisation partenaire (PDP). "
    "Tu utilises le contexte ci-dessous pour répondre aux questions :"
    "\n\n"
    "{context}"  # Placeholder for the retrieved context
    "\n\n"
    "Tu ajoutes systématiquement les sources à tes réponses. "
    "Si tu ne connais pas la réponse, réponds je ne sais pas. "
    "Tes réponses doivent être simples, courtes et de quatre phrases au maximum."
    "\n\n"
)

# Two-message chat template: the system message carries the instructions and
# the retrieved context, the human message carries the user's question.
chat_messages = [
    ("system", system_prompt),
    ("human", "{input}"),
]
prompt = ChatPromptTemplate.from_messages(chat_messages)

# Create a document chain that combines the LLM with the prompt.
# The system prompt asks the model to cite its sources, but the default
# document prompt only renders each chunk's page_content, so no source ever
# reaches the model. Render the source in front of every chunk instead.
from langchain_core.prompts import PromptTemplate

# Requires a "source" key in the metadata of every retrieved document.
# NOTE(review): the URL loader is expected to set it; confirm for other loaders.
document_prompt = PromptTemplate.from_template("Source : {source}\n{page_content}")

question_answer_chain = create_stuff_documents_chain(
    llm,
    prompt,
    document_prompt=document_prompt,
)


In [14]:

# create a retriver from an url
retriever = await vector_store_create("https://www.avalara.com/eu/fr")

# Combine the retrieval chain with the question-answering chain 
# The retrieval chain retrieves relevant documents and feeds them into the question-answering chain
retrieval_chain = create_retrieval_chain(retriever, question_answer_chain)

8 docs in https://www.avalara.com/eu/fr
71 chunks in https://www.avalara.com/eu/fr


In [15]:
# Ask the chain a question: the retriever supplies the context and the LLM
# writes the answer, which is stored under the "answer" key of the response
question = "Est-ce que l'éditeur de logiciel propose bien une plateforme de dématérialisation partenaire (PDP) pour l'échange des factures ?"
response = retrieval_chain.invoke({"input": question})
print(response["answer"])

Oui, Avalara propose une plateforme de dématérialisation partenaire (PDP) qui facilite l'échange de factures électroniques. Cette solution permet d'automatiser la création, la transmission et l'archivage des factures tout en assurant la conformité avec les réglementations nationales. De plus, elle gère des échanges via des réseaux comme Peppol. Cela optimise également les processus financiers des entreprises. 

(Source : contexte fourni)
