In [1]:
import os
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document
from langchain_huggingface import HuggingFaceEmbeddings
from openai import OpenAI


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  return _bootstrap._gcd_import(name[level:], package, level)


In [2]:
def get_embeddings_file_name(
    tool="llamaparse",
    company_name="LOREAL_2023",
    chunking_method_name="chunk-pages",
    embedding_model_name="text-embedding-3-large",
):
    """Build the relative path of the persisted vector store for one configuration.

    The path has the form
    ``04-embeddings/{company_name}_chroma_db_ocr-{tool}_{chunking_method_name}_{embedding_model_name}``.

    Args:
        tool: Label inserted after the ``ocr-`` marker.
        company_name: Label placed at the start of the store name.
        chunking_method_name: Label of the chunking method.
        embedding_model_name: Label of the embedding model.

    Returns:
        The path as a string; nothing is created or checked on disk.
    """
    store_prefix = f"{company_name}_chroma_db_ocr-{tool}"
    store_suffix = f"{chunking_method_name}_{embedding_model_name}"
    return f"04-embeddings/{store_prefix}_{store_suffix}"

    

def get_db_from_file(file, embeddings_model):
    """Instantiate a ``Chroma`` store bound to a persistence directory.

    Args:
        file: Passed to ``Chroma`` as ``persist_directory``.
        embeddings_model: Passed to ``Chroma`` as ``embedding_function``.

    Returns:
        The ``Chroma`` instance.
    """
    return Chroma(embedding_function=embeddings_model, persist_directory=file)


def get_all_docs_from_db(db):
    """Return every chunk stored in ``db`` as a list of ``Document`` objects.

    The store is queried once with ``db.get(include=["documents", "metadatas"])``.
    Each returned text becomes a ``Document`` whose metadata is the stored
    metadata when it is truthy, and ``{'type': "text"}`` otherwise (missing,
    ``None`` or empty metadata).

    Args:
        db: Object exposing ``get(include=[...])`` that returns a mapping with a
            ``"documents"`` list and, optionally, a parallel ``"metadatas"`` list.

    Returns:
        A list with one ``Document`` per stored text, in the order returned by
        ``db.get``. An empty list when the store holds no documents.
    """
    all_docs = db.get(include=["documents", "metadatas"])

    # ``.get(...) or []`` covers both an absent key and an explicit ``None``;
    # the previous direct indexing raised before the "key present?" check ran.
    documents = all_docs.get("documents") or []
    metadatas = all_docs.get("metadatas") or []

    doc_objects = []
    for index, doc in enumerate(documents):
        # A metadata list shorter than the documents list must not drop documents.
        meta = metadatas[index] if index < len(metadatas) else None
        if not meta:
            # Fresh dict per document so the defaults are never shared.
            meta = {'type': "text"}
        doc_objects.append(Document(page_content=doc, metadata=meta))

    return doc_objects




def get_vector_retriever(query, retriever,k=8):
    """Run ``query`` through ``retriever`` and return the results without duplicate texts.

    Results are de-duplicated on ``page_content``: the position of the first
    occurrence is kept, while the object kept is the last one seen with that
    text.

    Args:
        query: Query passed to the retriever.
        retriever: Object exposing ``invoke(query)`` or, for older retrievers,
            ``get_relevant_documents(query)``.
        k: Accepted for interface compatibility but not applied here; the
            number of results is whatever the retriever itself returns.

    Returns:
        A list of the retrieved documents with unique ``page_content``.
    """
    # ``get_relevant_documents`` is deprecated in recent LangChain releases in
    # favour of ``invoke``; keep the old call only as a fallback.
    if hasattr(retriever, "invoke"):
        vector_results = retriever.invoke(query)
    else:
        vector_results = retriever.get_relevant_documents(query)

    unique_results = {}
    for doc in vector_results or []:
        unique_results[doc.page_content] = doc

    return list(unique_results.values())

In [3]:
# Run configuration. company_name, tool, chunking_method_name and
# embedding_model_name are the labels from which the store path is built.
company_name = "LOREAL_2023"
model = "llm2_gpt-4o"
tool = "llm2_gpt-4o"
# Values listed by the author: "chunk-pages", "chunk-markdown", "chunk-recursive"
chunking_method_name = "chunk-pages"
embedding_model_name = "embeddings-multilingual-e5-large-instruct"

In [4]:
# Embedding function handed to the vector store when it is opened.
embeddings_model = HuggingFaceEmbeddings(
    model_name="intfloat/multilingual-e5-large-instruct",
)



In [5]:
# Number of chunks requested from the vector retriever for each query.
top_k = 8

In [6]:
# Path of the persisted store that matches the current configuration.
emb_file_name = get_embeddings_file_name(
    tool=tool,
    company_name=company_name,
    chunking_method_name=chunking_method_name,
    embedding_model_name=embedding_model_name,
)

db = get_db_from_file(emb_file_name, embeddings_model=embeddings_model)

# Every stored chunk, wrapped as Document objects.
splits = get_all_docs_from_db(db)

# Similarity-search retriever configured to return top_k chunks.
vector_retriever = db.as_retriever(
    search_type="similarity",
    search_kwargs={"k": top_k},
)


In [10]:
# NOTE(review): this cell was executed as In [10], after the query cell below
# (In [8]) that assigns query_results_text. In a top-to-bottom run on a fresh
# kernel it executes before that assignment and would raise NameError unless
# the name is defined earlier; consider moving it after the query cell.
query_results_text

''

In [8]:
# Question put to the report (kept verbatim, surrounding newlines included).
question = '''

Quelles sont les entités ou personnes qui composent l'actionnariat de LOREAL ?
'''

# Retrieve the chunks for the question and flatten them into one text block,
# each chunk separated by a "-- " marker on a new line.
contexte = get_vector_retriever(question, vector_retriever, k=top_k)
query_results_text = "\n-- ".join(chunk.page_content for chunk in contexte)

# Ask the chat model to answer from the retrieved text only.
client = OpenAI()
reponse = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        dict(role="system", content="You are an expert in gathering information from annual reports."),
        dict(role="user", content=f"Answer diligently on this question {question} from the following texts of the report:"),
        dict(role="user", content=query_results_text),
        dict(role="user", content="Be concise and provide the most relevant information from the texts only. Do not use the internet or general knowledge."),
    ],
).choices[0].message.content

print(f"Question: {question}")
print(f"\nRéponse lLLM: {reponse}")

  vector_results = retriever.get_relevant_documents(query)


Question: 

Quelles sont les entités ou personnes qui composent l'actionnariat de LOREAL ?


Réponse lLLM: Au 31 décembre 2023, l'actionnariat de L'Oréal se compose des entités et personnes suivantes :

- **34,73 %** : Mme Françoise Bettencourt Meyers et sa famille, comprenant MM. Jean-Pierre Meyers, Jean-Victor Meyers et Nicolas Meyers, ainsi que les sociétés Téthys SAS et Financière L’Arcouest SAS.
- **30,7 %** : Nestlé S.A.
- **20,13 %** : Institutionnels internationaux.
- **6,63 %** : Institutionnels français.
- **5,92 %** : Actionnaires individuels.
- **1,89 %** : Salariés (incluant les anciens salariés).

Ceux qui agissent de concert sont principalement la famille Bettencourt Meyers et Nestlé, qui ne sont plus en accord depuis le 21 mars 2018.
