### 1. Base de documents

In [13]:
documents = [
    "La capitale du Mali est Bamako.",
    "Le deep learning est une branche du machine learning.",
    "Odoo est un ERP open-source écrit en Python.",
    "Le fleuve Niger traverse le Mali."
]

query = "Quelle est la capitale du Mali ?"


### 2. Bi-encoder (dense retrieval)

In [14]:
from sentence_transformers import SentenceTransformer, util

bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

doc_embeddings = bi_encoder.encode(documents, convert_to_tensor=True)
query_embedding = bi_encoder.encode(query, convert_to_tensor=True)

hits = util.semantic_search(query_embedding, doc_embeddings, top_k=3)[0]

print("\n=== Résultats Bi-encoder ===")
for hit in hits:
    print(f"{documents[hit['corpus_id']]} | Score: {hit['score']:.4f}")



=== Résultats Bi-encoder ===
La capitale du Mali est Bamako. | Score: 0.8888
Le fleuve Niger traverse le Mali. | Score: 0.4748
Le deep learning est une branche du machine learning. | Score: 0.2293


### 3. Cross-encoder (re-ranking)

In [15]:
from sentence_transformers import CrossEncoder

cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
cross_inp = [[query, documents[hit['corpus_id']]] for hit in hits]
cross_scores = cross_encoder.predict(cross_inp)

reranked = sorted(zip(cross_scores, cross_inp), key=lambda x: x[0], reverse=True)

print("\n=== Résultats Cross-encoder Re-ranker ===")
for score, pair in reranked:
    print(f"{pair[1]} | Score: {score:.4f}")


=== Résultats Cross-encoder Re-ranker ===
La capitale du Mali est Bamako. | Score: 6.1318
Le fleuve Niger traverse le Mali. | Score: -5.1832
Le deep learning est une branche du machine learning. | Score: -11.0533


### 4. Hybrid search (BM25 + dense)

In [16]:
from rank_bm25 import BM25Okapi

# Tokenisation simple (minuscule + split par espaces)
tokenized_corpus = [doc.lower().split() for doc in documents]
bm25 = BM25Okapi(tokenized_corpus)

# Scores BM25 pour la query
bm25_scores = bm25.get_scores(query.lower().split())

print("\n=== BM25 scores ===")
for i in range(len(documents)):
    print(f"{documents[i]} | Tokens: {tokenized_corpus[i]} | Score: {bm25_scores[i]:.4f}")



=== BM25 scores ===
La capitale du Mali est Bamako. | Tokens: ['la', 'capitale', 'du', 'mali', 'est', 'bamako.'] | Score: 2.9471
Le deep learning est une branche du machine learning. | Tokens: ['le', 'deep', 'learning', 'est', 'une', 'branche', 'du', 'machine', 'learning.'] | Score: 0.1592
Odoo est un ERP open-source écrit en Python. | Tokens: ['odoo', 'est', 'un', 'erp', 'open-source', 'écrit', 'en', 'python.'] | Score: 0.1687
Le fleuve Niger traverse le Mali. | Tokens: ['le', 'fleuve', 'niger', 'traverse', 'le', 'mali.'] | Score: 0.0000


##### Fusion BM25 + bi-encoder (score normalisé simple)

In [17]:
import numpy as np

dense_scores = np.array([util.cos_sim(query_embedding, doc_embeddings[i]).item() for i in range(len(documents))])
bm25_scores = np.array(bm25_scores)

hybrid_scores = 0.5 * (dense_scores / dense_scores.max()) + 0.5 * (bm25_scores / bm25_scores.max())

sorted_docs = sorted(zip(hybrid_scores, documents), key=lambda x: x[0], reverse=True)

print("\n=== Résultats Hybrid (BM25 + Dense) ===")
for score, doc in sorted_docs:
    print(f"{doc} | Score: {score:.4f}")


=== Résultats Hybrid (BM25 + Dense) ===
La capitale du Mali est Bamako. | Score: 1.0000
Le fleuve Niger traverse le Mali. | Score: 0.2671
Le deep learning est une branche du machine learning. | Score: 0.1560
Odoo est un ERP open-source écrit en Python. | Score: 0.0879
