In [1]:
import os
from dotenv import load_dotenv


def load_config():
    """Build the configuration dictionary used throughout the notebook.

    ``load_dotenv()`` is called first; the function then combines fixed
    relative file paths and the chunk size with the Azure OpenAI / LangChain
    settings read through ``os.getenv``. A variable that is not set in the
    environment is stored as ``None``.

    Returns:
        dict: configuration values keyed by setting name.
    """
    load_dotenv()

    data_dir = "../datas"
    models_dir = "../models"
    settings = {
        "CSV_INPUT_PATH": os.path.join(data_dir, "gutenberg.csv"),
        "CSV_CLEANED_PATH": os.path.join(data_dir, "gutenberg2.csv"),
        "EMBEDDINGS_FILE": os.path.join(models_dir, "embeddings.csv"),
        "DOCS_FILE": os.path.join(models_dir, "docs.csv"),
        "FAISS_INDEX": os.path.join(models_dir, "faiss_index.index"),
        "CHUNK_SIZE": 500,
    }

    # Every remaining setting is stored under the exact name of the
    # environment variable it is read from.
    env_names = (
        "AZURE_OPENAI_API_KEY",
        "AZURE_OPENAI_API_BASE",
        "AZURE_OPENAI_API_ENDPOINT",
        "AZURE_DEPLOYMENT_NAME",
        "AZURE_API_VERSION",
        "AZURE_DEPLOYEMENT",
        "LANGCHAIN_ENDPOINT",
        "LANGCHAIN_API_KEY",
    )
    settings.update((env_name, os.getenv(env_name)) for env_name in env_names)
    return settings

In [3]:
from langchain_openai.embeddings import AzureOpenAIEmbeddings


# Pull the settings the embedding client needs out of the configuration
# dictionary and expose them as module-level constants.
config = load_config()
(
    AZURE_OPENAI_API_BASE,
    AZURE_OPENAI_API_KEY,
    AZURE_DEPLOYEMENT,
    CHUNK_SIZE,
) = (
    config[setting_name]
    for setting_name in (
        "AZURE_OPENAI_API_BASE",
        "AZURE_OPENAI_API_KEY",
        "AZURE_DEPLOYEMENT",
        "CHUNK_SIZE",
    )
)

class EmbeddingModel:
    """Wrapper that builds and holds an ``AzureOpenAIEmbeddings`` client.

    The client is configured from the module-level constants
    ``AZURE_OPENAI_API_BASE``, ``AZURE_OPENAI_API_KEY``, ``AZURE_DEPLOYEMENT``
    and ``CHUNK_SIZE``.
    """

    def __init__(self):
        client_options = {
            "azure_endpoint": AZURE_OPENAI_API_BASE,
            "openai_api_key": AZURE_OPENAI_API_KEY,
            "azure_deployment": AZURE_DEPLOYEMENT,
            "chunk_size": CHUNK_SIZE,
        }
        self.embedding_model = AzureOpenAIEmbeddings(**client_options)

    def get_embedding_model(self):
        """Return the underlying embeddings client created in ``__init__``."""
        return self.embedding_model

In [None]:
# Smoke check: build the embedding client and print its text representation.
print(EmbeddingModel().get_embedding_model())

In [5]:
import os
from langchain_openai import AzureChatOpenAI

# Pull the settings the chat client needs out of the configuration dictionary
# and expose them as module-level constants.
config = load_config()
(
    AZURE_OPENAI_API_ENDPOINT,
    AZURE_DEPLOYMENT_NAME,
    AZURE_OPENAI_API_KEY,
    AZURE_API_VERSION,
) = (
    config[setting_name]
    for setting_name in (
        "AZURE_OPENAI_API_ENDPOINT",
        "AZURE_DEPLOYMENT_NAME",
        "AZURE_OPENAI_API_KEY",
        "AZURE_API_VERSION",
    )
)

class LanguageModel:
    """Wrapper that builds and holds an ``AzureChatOpenAI`` client.

    The client is configured from the module-level constants
    ``AZURE_OPENAI_API_ENDPOINT``, ``AZURE_DEPLOYMENT_NAME``,
    ``AZURE_OPENAI_API_KEY`` and ``AZURE_API_VERSION``, with the temperature
    fixed at 0.0.
    """

    def __init__(self):
        chat_options = {
            "azure_endpoint": AZURE_OPENAI_API_ENDPOINT,
            "azure_deployment": AZURE_DEPLOYMENT_NAME,
            "openai_api_key": AZURE_OPENAI_API_KEY,
            "api_version": AZURE_API_VERSION,
            "temperature": 0.0,
        }
        self.llm = AzureChatOpenAI(**chat_options)

    def get_language_model(self):
        """Return the chat client created in ``__init__``."""
        return self.llm

In [None]:
# Smoke check: build the chat client and print its text representation.
print(LanguageModel().get_language_model())

In [7]:
import os
import pandas as pd
import logging
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
import faiss

# Locations of the persisted artefacts (embedding vectors, FAISS index and
# document texts), all inside the "../models" directory.
EMBEDDINGS_FILE, FAISS_INDEX, DOCS_FILE = (
    os.path.join("../models", file_name)
    for file_name in ("embeddings.csv", "faiss_index.index", "docs.csv")
)

# Logger shared by the persistence helpers defined in this cell.
logger = logging.getLogger(__name__)

def save_embeddings(embeddings):
    """Write embedding vectors to ``EMBEDDINGS_FILE`` as CSV.

    The input is passed to ``pandas.DataFrame`` unchanged and written without
    an index column and without a header row.

    Args:
        embeddings: data accepted by ``pandas.DataFrame``; the caller in this
            notebook passes a list of embedding vectors.
    """
    pd.DataFrame(embeddings).to_csv(EMBEDDINGS_FILE, header=False, index=False)

def load_embeddings():
    """Load stored embedding vectors, or fall back to a new ``EmbeddingModel``.

    Returns:
        list: the rows of ``EMBEDDINGS_FILE`` (read without a header row) as
            nested lists, when the file exists.
        EmbeddingModel: a newly constructed wrapper instance when the file is
            missing. Note that this is the wrapper object, not a list of
            vectors, so callers must check the type of the result.
    """
    if not os.path.exists(EMBEDDINGS_FILE):
        logger.warning("Fichier d'embeddings non trouvé.")
        return EmbeddingModel()

    stored_vectors = pd.read_csv(EMBEDDINGS_FILE, header=None)
    logger.info(f"Modèle d'embedding chargé depuis {EMBEDDINGS_FILE}.")
    return stored_vectors.values.tolist()

def save_vector_store(vector_store):
    """Write the FAISS index held by ``vector_store`` to ``FAISS_INDEX``.

    Only ``vector_store.index`` is written by this function; the docstore and
    the index-to-document-id mapping are not part of the written file.

    Args:
        vector_store: object exposing the FAISS index as ``.index``.
    """
    faiss_index = vector_store.index
    faiss.write_index(faiss_index, FAISS_INDEX)
    logger.info(f"Magasin de vecteurs sauvegardé dans {FAISS_INDEX}.")

def load_vector_store(embedding_model):
    """Reload the FAISS vector store persisted by a previous run.

    The FAISS index file only contains vectors. A store rebuilt with an empty
    docstore and an empty id mapping cannot resolve any search hit back to a
    document, so the docstore and the position -> id mapping are reconstructed
    here from ``DOCS_FILE``. That file is written in insertion order, so row
    ``i`` corresponds to position ``i`` of the index.

    Only ``id`` and ``content`` are stored in ``DOCS_FILE``; document metadata
    is therefore not restored.

    Args:
        embedding_model: object exposing ``embed_query(text)``.

    Returns:
        The reconstructed FAISS vector store, or ``None`` when the index file
        is missing, the documents file is missing, or the two are out of sync
        (different number of entries). ``None`` signals the caller to rebuild.
    """
    # Local import so this function does not depend on another cell's imports.
    from langchain_community.docstore.document import Document

    if not os.path.exists(FAISS_INDEX):
        logger.warning("Répertoire de sauvegarde non trouvé.")
        return None

    index = faiss.read_index(FAISS_INDEX)

    if not os.path.exists(DOCS_FILE):
        logger.warning(
            f"Index FAISS trouvé mais {DOCS_FILE} est absent ; le magasin doit être reconstruit."
        )
        return None

    # Read everything as text and keep empty cells as "" instead of NaN so
    # that every page_content is a string.
    stored_docs = pd.read_csv(DOCS_FILE, dtype=str, keep_default_na=False)
    if len(stored_docs) != index.ntotal:
        logger.warning(
            f"Index FAISS ({index.ntotal} vecteurs) et {DOCS_FILE} "
            f"({len(stored_docs)} documents) incohérents ; le magasin doit être reconstruit."
        )
        return None

    doc_ids = stored_docs["id"].tolist()
    docstore = InMemoryDocstore(
        {
            doc_id: Document(page_content=content)
            for doc_id, content in zip(doc_ids, stored_docs["content"])
        }
    )
    # FAISS looks documents up by vector position, hence position -> id.
    index_to_docstore_id = dict(enumerate(doc_ids))

    vector_store = FAISS(
        embedding_function=embedding_model.embed_query,
        index=index,
        docstore=docstore,
        index_to_docstore_id=index_to_docstore_id
    )
    logger.info(f"Magasin de vecteurs chargé depuis {FAISS_INDEX}.")
    return vector_store

def save_docs(docs):
    """Write the documents to ``DOCS_FILE`` as a CSV with an ``id,content`` header.

    Args:
        docs: data accepted by ``pandas.DataFrame``; the caller in this
            notebook passes a list of ``{'id': ..., 'content': ...}`` dicts.
    """
    column_names = ['id', 'content']
    pd.DataFrame(docs).to_csv(DOCS_FILE, header=column_names, index=False)
    logger.info(f"Documents sauvegardés dans {DOCS_FILE}.")

def load_docs():
    """Read the documents stored in ``DOCS_FILE``.

    Returns:
        list[dict] | None: one dict per CSV row, keyed by column name, or
        ``None`` when the file does not exist.
    """
    if not os.path.exists(DOCS_FILE):
        logger.warning("Fichier de documents non trouvé.")
        return None

    stored_docs = pd.read_csv(DOCS_FILE)
    logger.info(f"Documents chargés depuis {DOCS_FILE}.")
    return stored_docs.to_dict(orient='records')

In [None]:
import logging
from uuid import uuid4
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
import faiss

# Configure root logging at INFO level and (re)bind the logger used by the
# pipeline functions defined in this cell.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def load_data(csv_path):
    """Load the CSV file at ``csv_path`` through ``CSVLoader`` (UTF-8).

    Args:
        csv_path: path of the CSV file to read.

    Returns:
        Whatever ``CSVLoader.load()`` returns for that file.
    """
    return CSVLoader(file_path=csv_path, encoding='utf-8').load()

def split_documents(data):
    """Split documents with a ``RecursiveCharacterTextSplitter``.

    The splitter is configured with ``chunk_size=CHUNK_SIZE`` (module-level
    constant) and no overlap between chunks.

    Args:
        data: documents to split, as accepted by ``split_documents``.

    Returns:
        The chunks produced by the splitter.
    """
    splitter_options = {"chunk_size": CHUNK_SIZE, "chunk_overlap": 0}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(data)

def create_vector_store(all_splits, embedding_model):
    """Embed every split, index it in a new FAISS store and persist the results.

    Each split is embedded exactly once with ``embedding_model.embed_query``.
    The vector is collected for ``save_embeddings`` and handed to
    ``FAISS.add_embeddings``, which adds it to the index and maintains the
    position -> document-id mapping itself. The mapping dict must not be
    modified by hand: FAISS derives the next position from its length, so
    extra entries shift every later mapping and break lookups at search time.

    After indexing, the embeddings, the FAISS index and the documents
    (id + content) are written through ``save_embeddings``,
    ``save_vector_store`` and ``save_docs``.

    Args:
        all_splits: non-empty sequence of documents exposing ``page_content``
            and ``metadata``.
        embedding_model: object exposing ``embed_query(text)``.

    Returns:
        The populated FAISS vector store.

    Raises:
        ValueError: if ``all_splits`` is empty (the index dimension cannot be
            determined without at least one embedding).
    """
    if not all_splits:
        raise ValueError("all_splits is empty: cannot build a vector store.")

    total_docs = len(all_splits)
    embeddings = []
    docs = []
    vectorstore = None
    for i, split in enumerate(all_splits):
        embedding = embedding_model.embed_query(split.page_content)
        if vectorstore is None:
            # The first embedding gives the vector dimension, so no separate
            # probe request is needed to size the index.
            vectorstore = FAISS(
                embedding_function=embedding_model.embed_query,
                index=faiss.IndexFlatL2(len(embedding)),
                docstore=InMemoryDocstore(),
                index_to_docstore_id={}
            )
        embeddings.append(embedding)
        doc_id = str(uuid4())
        # Reuse the vector computed above instead of letting the store embed
        # the same text a second time.
        vectorstore.add_embeddings(
            [(split.page_content, embedding)],
            metadatas=[split.metadata],
            ids=[doc_id],
        )
        docs.append({'id': doc_id, 'content': split.page_content})
        logger.info(f"{i + 1}/{total_docs}")
    save_embeddings(embeddings)
    save_vector_store(vectorstore)
    save_docs(docs)
    return vectorstore

def configure_qa_chain(llm_model, vectorstore):
    """Build a ``RetrievalQA`` chain on top of the given model and store.

    Args:
        llm_model: language model passed to the chain as ``llm``.
        vectorstore: store whose ``as_retriever()`` result is used as the
            chain's retriever.

    Returns:
        The chain returned by ``RetrievalQA.from_chain_type``.
    """
    retriever = vectorstore.as_retriever()
    return RetrievalQA.from_chain_type(llm=llm_model, retriever=retriever)

def run_chat_loop(qa_chain, docs):
    """Interactive question/answer loop on standard input.

    Prompts until the user types ``exit``. Inputs shorter than five
    characters (after stripping) are rejected with a hint. Every other input
    is sent to ``qa_chain.invoke`` and the answer is printed.

    Args:
        qa_chain: object exposing ``invoke(dict) -> dict``. A ``RetrievalQA``
            chain returns its answer under the ``"result"`` key; ``"output"``
            is still accepted as a fallback for other chain types.
        docs: optional list of ``{'content': ...}`` dicts. Their contents are
            joined and forwarded under the ``"context"`` key, as before.
    """
    # The joined context does not depend on the question, so build it once
    # instead of re-joining every document on each iteration.
    context = "\n".join(doc['content'] for doc in docs) if docs else ""

    while True:
        user_input = input("Votre question (ou tapez 'exit' pour quitter) : ")
        if user_input.strip().lower() == 'exit':
            break
        if len(user_input.strip()) < 5:
            print("Veuillez poser une question plus détaillée.")
            continue
        try:
            response = qa_chain.invoke({"query": user_input, "context": context})
            # RetrievalQA stores the answer under "result"; reading only
            # "output" made the fallback text appear for every question.
            answer = response.get('result', response.get('output', "Aucune réponse générée."))
            print("Réponse :", answer)
        except Exception:
            # Keep the loop alive for the user, but record the traceback
            # instead of discarding the error silently.
            logger.exception("Erreur lors du traitement de la question.")
            print("Désolé, une erreur s'est produite lors du traitement de votre question.")

def main():
    """Run the whole pipeline: prepare the vector store, then start the chat.

    Steps:
      1. Obtain an embedding model (an object exposing ``embed_query``).
      2. Try to reload the persisted vector store; if that is not possible,
         load and split the CSV and build a new store.
      3. Load the stored documents, build the QA chain and enter the
         interactive loop.

    As before, calling this function (re)binds the module-level names
    ``CSV_CLEANED_PATH`` and ``CHUNK_SIZE``.
    """
    global CSV_CLEANED_PATH, CHUNK_SIZE
    CSV_CLEANED_PATH = os.path.join("../datas", "gutenberg2.csv")
    CHUNK_SIZE = int(500)

    embeddings = load_embeddings()
    if isinstance(embeddings, list):
        logger.info("Embeddings chargés avec succès.")
        embedding_model = EmbeddingModel().get_embedding_model()
    else:
        logger.info("Aucun modèle d'embeddings trouvé, création d'un nouveau modèle.")
        # load_embeddings() falls back to an EmbeddingModel *wrapper*, which
        # has no embed_query(); unwrap it to get the actual embeddings client.
        if hasattr(embeddings, "get_embedding_model"):
            embedding_model = embeddings.get_embedding_model()
        else:
            embedding_model = embeddings

    vectorstore = load_vector_store(embedding_model)
    if vectorstore is None:
        logger.info("Aucun magasin de vecteurs trouvé, création d'un nouveau magasin.")
        # Reading and splitting the CSV is only needed when the store has to
        # be built, so it is skipped when a persisted store was reloaded.
        data = load_data(CSV_CLEANED_PATH)
        all_splits = split_documents(data)
        vectorstore = create_vector_store(all_splits, embedding_model)
    else:
        logger.info("Magasin de vecteurs chargé avec succès.")

    docs = load_docs()
    if docs is None or len(docs) == 0:
        logger.warning("Aucun document trouvé. Certaines fonctionnalités peuvent ne pas être disponibles.")

    llm_model = LanguageModel().get_language_model()
    qa_chain = configure_qa_chain(llm_model, vectorstore)
    run_chat_loop(qa_chain, docs)

# Entry point: run the pipeline when this code is executed as the main module.
if __name__ == "__main__":
    main()
