### Imports & Config

In [1]:

# Imports des biblioth√®ques n√©cessaires
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm
from pathlib import Path

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
import scipy.sparse as sp

# Cr√©ation directories
DATA_DIR = Path("data")
MODEL_DIR = Path("models")
RESULTS_DIR = Path("results")

for directory in [DATA_DIR / "raw", 
                  DATA_DIR / "processed", 
                  DATA_DIR / "relish",
                  MODEL_DIR, 
                  RESULTS_DIR]:
    directory.mkdir(parents=True, exist_ok=True)

print("‚úì Structure cr√©√©e")

# Hyperparam√®tres globaux du projet :
# - TFIDF_MAX_FEATURES : taille maximale du vocabulaire TF-IDF
# - W2V_DIM : dimension des vecteurs Word2Vec
# - W2V_WINDOW : taille de la fen√™tre contextuelle Word2Vec
# - ALPHA_TFIDF / BETA_W2V : pond√©rations pour la fusion TF-IDF + Word2Vec
# - TOP_K : nombre de documents retourn√©s lors du retrievalTFIDF_MAX_FEATURES = 50000
W2V_DIM = 200
W2V_WINDOW = 5
ALPHA_TFIDF = 0.6
BETA_W2V = 0.4
TOP_K = 100

‚úì Structure cr√©√©e


### Data

Fichier 1 : Corpus (163K documents)   
wget https://zenodo.org/records/14801641/files/relish_documents.tsv  

Contient ~163 000 documents scientifiques.  
Chaque entr√©e correspond √† un article (texte, identifiant, m√©tadonn√©es) qui constitue l‚Äôespace de recherche du syst√®me de retrieval.
 
Fichier 2 : Ground truth (189K paires)    
wget https://zenodo.org/records/14801641/files/relevance_matrix.tsv

Ce fichier sert de r√©f√©rence d‚Äô√©valuation (gold standard) pour mesurer la qualit√© du syst√®me de retrieval (pr√©cision, rappel, ranking).


In [4]:
# Ce bloc sert √† s√©curiser le chargement des donn√©es RELISH avant toute analyse.
from pathlib import Path

DATA_DIR = Path("data")
RELISH_DIR = DATA_DIR / "relish"

# Cr√©ation structure
RELISH_DIR.mkdir(parents=True, exist_ok=True)

# V√©rification pr√©sence fichiers
corpus_file = RELISH_DIR / "relish_documents.tsv"
qrels_file = RELISH_DIR / "relevance_matrix.tsv"

if corpus_file.exists():
    size_mb = corpus_file.stat().st_size / (1024**2)
    print(f"‚úì relish_documents.tsv trouv√© ({size_mb:.1f} MB)")
else:
    print(f"    relish_documents.tsv manquant")
    print(f"   T√©l√©charge depuis : https://zenodo.org/records/14801641")
    print(f"   Place dans : {corpus_file}")

if qrels_file.exists():
    size_mb = qrels_file.stat().st_size / (1024**2)
    print(f"‚úì relevance_matrix.tsv trouv√© ({size_mb:.1f} MB)")
else:
    print(f"   relevance_matrix.tsv manquant")
    print(f"   T√©l√©charge depuis : https://zenodo.org/records/14801641")
    print(f"   Place dans : {qrels_file}")

‚úì relish_documents.tsv trouv√© (264.6 MB)
‚úì relevance_matrix.tsv trouv√© (4.5 MB)


In [5]:
# pr√©-traitement complet de RELISH : chargement, nettoyage, structuration

DATA_DIR = Path("data")
RELISH_DIR = DATA_DIR / "relish"

# V√©rification fichiers
corpus_file = RELISH_DIR / "relish_documents.tsv"
qrels_file = RELISH_DIR / "relevance_matrix.tsv"

if not corpus_file.exists():
    raise FileNotFoundError(
        f"Missing file : {corpus_file}\n"
        "Download from https://zenodo.org/records/14801641\n"
        "Place in data/relish/"
    )

if not qrels_file.exists():
    raise FileNotFoundError(
        f"Missing file  : {qrels_file}\n"
        "Download from https://zenodo.org/records/14801641\n"
        "Place in data/relish/"
    )

# CHARGEMENT CORPUS

print("Chargement corpus...")
corpus_df = pd.read_csv(corpus_file, sep='\t')

# Inspection structure
print(f"‚úì Corpus : {len(corpus_df)} documents")
print(f"Colonnes : {corpus_df.columns.tolist()}")

# Standardisation noms colonnes si n√©cessaire
if 'PMID' in corpus_df.columns:
    corpus_df.rename(columns={'PMID': 'pmid', 'Title': 'title', 'Abstract': 'abstract'}, 
                     inplace=True)

# Nettoyage
corpus_df = corpus_df.dropna(subset=['abstract'])  # Supprime docs sans abstract
corpus_df['pmid'] = corpus_df['pmid'].astype(str)

print(f"Apr√®s nettoyage : {len(corpus_df)} documents")
print(f"\nExemple :")
print(corpus_df.iloc[0][['pmid', 'title']])
print(f"Abstract (100 chars) : {corpus_df.iloc[0]['abstract'][:100]}...")

# CHARGEMENT QRELS


print("\nChargement ground truth...")
qrels_df = pd.read_csv(qrels_file, sep='\t')

print(f"‚úì Qrels : {len(qrels_df)} paires")
print(f"Colonnes : {qrels_df.columns.tolist()}")

# Standardisation
if 'PMID1' in qrels_df.columns:
    qrels_df.rename(columns={'PMID1': 'pmid1', 'PMID2': 'pmid2', 'Relevance': 'relevance'}, 
                    inplace=True)

qrels_df['pmid1'] = qrels_df['pmid1'].astype(str)
qrels_df['pmid2'] = qrels_df['pmid2'].astype(str)

# Distribution relevance
print("\nDistribution scores relevance :")
print(qrels_df['relevance'].value_counts().sort_index())
"""
Attendu :
0 (irrelevant) : ~55K paires
1 (partially relevant) : ~65K paires
2 (completely relevant) : ~68K paires
"""

# ============================================
# CR√âATION MAPPING QUERY -> RELEVANT DOCS
# ============================================

print("\nCr√©ation mapping query‚Üídocuments pertinents...")

query_relevant = {}
for _, row in qrels_df.iterrows():
    query_pmid = row['pmid1']
    doc_pmid = row['pmid2']
    relevance = row['relevance']
    
    if query_pmid not in query_relevant:
        query_relevant[query_pmid] = {'relevant': [], 'irrelevant': []}
    
    if relevance > 0:  # Partially ou completely relevant
        query_relevant[query_pmid]['relevant'].append({
            'pmid': doc_pmid,
            'score': relevance
        })
    else:
        query_relevant[query_pmid]['irrelevant'].append(doc_pmid)

print(f"‚úì {len(query_relevant)} queries uniques")

# Statistiques
n_relevant = [len(v['relevant']) for v in query_relevant.values()]
print(f"Moyenne docs pertinents/query : {np.mean(n_relevant):.1f}")
print(f"Min : {np.min(n_relevant)}, Max : {np.max(n_relevant)}")

# SAUVEGARDE FORMAT PICKLE

processed_dir = DATA_DIR / "processed"
processed_dir.mkdir(exist_ok=True)

corpus_df.to_pickle(processed_dir / "corpus.pkl")

import pickle
with open(processed_dir / "query_relevant.pkl", 'wb') as f:
    pickle.dump(query_relevant, f)

print(f"\n‚úì Donn√©es pr√©par√©es sauvegard√©es dans {processed_dir}")

Chargement corpus...
‚úì Corpus : 163189 documents
Colonnes : ['PMID', 'title', 'abstract']
Apr√®s nettoyage : 163189 documents

Exemple :
pmid                                              24013020
title    Potentially harmful elements (PHEs) in scalp h...
Name: 0, dtype: object
Abstract (100 chars) : Internationally publicized impacts upon human health associated with potentially harmful element (PH...

Chargement ground truth...
‚úì Qrels : 189634 paires
Colonnes : ['PMID1', 'PMID2', 'Relevance', 'Cosine Similarity']

Distribution scores relevance :
relevance
0    55749
1    65406
2    68479
Name: count, dtype: int64

Cr√©ation mapping query‚Üídocuments pertinents...
‚úì 3190 queries uniques
Moyenne docs pertinents/query : 42.0
Min : 1, Max : 60

‚úì Donn√©es pr√©par√©es sauvegard√©es dans data/processed


### Preprocessing

In [7]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm

# Configuration tqdm pour pandas
tqdm.pandas()

# T√©l√©chargements NLTK
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Tokenization + lemmatization + stopword removal"""
    if pd.isna(text):
        return []
    
    # Lowercase + suppression ponctuation/chiffres
    text = re.sub(r'[^a-z\s]', '', text.lower())
    
    # Tokenization
    tokens = text.split()
    
    # Lemmatization + stopwords + longueur minimale
    tokens = [lemmatizer.lemmatize(t) for t in tokens 
              if t not in stop_words and len(t) > 2]
    
    return tokens

# Application corpus avec progress bar
print("Preprocessing abstracts...")
corpus_df['tokens'] = corpus_df['abstract'].progress_apply(preprocess_text)
corpus_df['text_clean'] = corpus_df['tokens'].apply(lambda x: ' '.join(x))

# Statistiques
lengths = corpus_df['tokens'].apply(len)
print(f"\n‚úì Preprocessing termin√©")
print(f"Longueur moyenne : {lengths.mean():.0f} tokens")
print(f"Min : {lengths.min()}, Max : {lengths.max()}")

# Filtrage documents trop courts
min_length = 20
corpus_df = corpus_df[corpus_df['tokens'].apply(len) >= min_length]
print(f"\nApr√®s filtrage (‚â•{min_length} tokens) : {len(corpus_df)} documents")

# Sauvegarde
corpus_df.to_pickle(DATA_DIR / "processed/corpus_preprocessed.pkl")
print("‚úì Corpus preprocessed sauvegard√©")

Preprocessing abstracts...


  0%|          | 0/163189 [00:00<?, ?it/s]

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 163189/163189 [02:01<00:00, 1345.98it/s]



‚úì Preprocessing termin√©
Longueur moyenne : 138 tokens
Min : 3, Max : 1515

Apr√®s filtrage (‚â•20 tokens) : 162863 documents
‚úì Corpus preprocessed sauvegard√©


### Index TF-IDF

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse as sp
import pickle

print("Entra√Ænement TF-IDF...")

vectorizer = TfidfVectorizer(
    max_features=50000,      # Top 50K termes
    ngram_range=(1, 2),      # Unigrams + bigrams
    min_df=5,                # Ignore termes <5 docs
    max_df=0.8,              # Ignore termes >80% docs
    sublinear_tf=True        # Log scaling TF
)

tfidf_matrix = vectorizer.fit_transform(corpus_df['text_clean'])

print(f"\n‚úì TF-IDF shape : {tfidf_matrix.shape}")
print(f"Sparsity : {tfidf_matrix.nnz / (tfidf_matrix.shape[0] * tfidf_matrix.shape[1]):.4f}")
print(f"Vocabulaire : {len(vectorizer.vocabulary_)} termes")

# Top termes par IDF
feature_names = vectorizer.get_feature_names_out()
idfs = vectorizer.idf_
top_idfs = np.argsort(idfs)[-20:]
print(f"\nTop 20 termes IDF (rares/sp√©cifiques) :")
for idx in top_idfs:
    print(f"  {feature_names[idx]} : {idfs[idx]:.2f}")

# Sauvegarde
MODEL_DIR = Path("models")
MODEL_DIR.mkdir(exist_ok=True)

pickle.dump(vectorizer, open(MODEL_DIR / "tfidf_vectorizer.pkl", "wb"))
sp.save_npz(MODEL_DIR / "tfidf_matrix.npz", tfidf_matrix)
print(f"\n‚úì Mod√®les sauvegard√©s dans {MODEL_DIR}")

Entra√Ænement TF-IDF...

‚úì TF-IDF shape : (162863, 50000)
Sparsity : 0.0023
Vocabulaire : 50000 termes

Top 20 termes IDF (rares/sp√©cifiques) :
  oxt : 10.52
  hfmd : 10.52
  cisnats : 10.52
  alri : 10.52
  townsend : 10.52
  desferrioxamine : 10.52
  ucmscs : 10.52
  rtw : 10.52
  ffdm : 10.60
  apremilast : 10.60
  rfsh : 10.60
  fma : 10.70
  rsph : 10.70
  vmws : 10.70
  usuv : 10.80
  deferiprone : 10.80
  neph : 10.80
  irisin : 11.05
  tachinidae : 11.05
  icps : 11.21

‚úì Mod√®les sauvegard√©s dans models


### Entra√Ænement Word2Vec

In [10]:
from gensim.models import Word2Vec

print("Entra√Ænement Word2Vec...")
print(f"Corpus : {len(corpus_df)} documents")

sentences = corpus_df['tokens'].tolist()

# Hyperparam√®tres
W2V_DIM = 200
W2V_WINDOW = 5
W2V_MIN_COUNT = 10

w2v_model = Word2Vec(
    sentences=sentences,
    vector_size=W2V_DIM,
    window=W2V_WINDOW,
    min_count=W2V_MIN_COUNT,
    workers=4,
    epochs=10,
    sg=1,              # Skip-gram (meilleur que CBOW pour petit corpus)
    negative=5,        # Negative sampling
    seed=42
)

print(f"\n‚úì Entra√Ænement termin√©")
print(f"Vocabulaire : {len(w2v_model.wv)} termes")

# Tests similarit√©s
test_terms = ['cancer', 'tumor', 'treatment', 'patient', 'cell']
print("\nExemples similarit√©s :")
for term in test_terms:
    if term in w2v_model.wv:
        similar = w2v_model.wv.most_similar(term, topn=5)
        print(f"\n{term} :")
        for word, score in similar:
            print(f"  {word} : {score:.3f}")

# Sauvegarde
w2v_model.save(str(MODEL_DIR / "word2vec.model"))
print(f"\n‚úì Word2Vec sauvegard√© dans {MODEL_DIR}")

Entra√Ænement Word2Vec...
Corpus : 162863 documents

‚úì Entra√Ænement termin√©
Vocabulaire : 56557 termes

Exemples similarit√©s :

cancer :
  colorectal : 0.728
  breast : 0.725
  crc : 0.684
  carcinoma : 0.677
  adenocarcinoma : 0.676

tumor :
  tumour : 0.891
  metastasis : 0.672
  tumoral : 0.634
  xenograft : 0.624
  glioma : 0.620

treatment :
  therapy : 0.694
  treating : 0.693
  treated : 0.650
  treat : 0.582
  polychemotherapy : 0.554

patient :
  enrolled : 0.628
  dialysisdependent : 0.618
  diagnosed : 0.613
  electively : 0.612
  subject : 0.584

cell :
  proliferation : 0.654
  cdpos : 0.643
  aldhpositive : 0.627
  mdambs : 0.602
  bmecs : 0.602

‚úì Word2Vec sauvegard√© dans models


### Pr√©-calcul embeddings documents

In [11]:
print("Calcul embeddings documents...")

doc_embeddings = np.zeros((len(corpus_df), W2V_DIM))

for i, tokens in tqdm(enumerate(corpus_df['tokens']), total=len(corpus_df)):
    # Vecteurs termes pr√©sents dans vocabulaire
    vecs = [w2v_model.wv[t] for t in tokens if t in w2v_model.wv]
    
    if vecs:
        # Moyenne vecteurs
        doc_embeddings[i] = np.mean(vecs, axis=0)
    # Sinon reste vecteur z√©ro

# Statistiques
non_zero = np.count_nonzero(doc_embeddings.sum(axis=1))
print(f"\n‚úì Embeddings calcul√©s")
print(f"Documents avec embedding : {non_zero}/{len(corpus_df)} ({100*non_zero/len(corpus_df):.1f}%)")

# Sauvegarde
np.save(MODEL_DIR / "doc_embeddings.npy", doc_embeddings)
print(f"‚úì Sauvegard√© dans {MODEL_DIR}/doc_embeddings.npy")

Calcul embeddings documents...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 162863/162863 [00:59<00:00, 2754.91it/s]



‚úì Embeddings calcul√©s
Documents avec embedding : 162863/162863 (100.0%)
‚úì Sauvegard√© dans models/doc_embeddings.npy


### Classe 

Cette cellule d√©finit et initialise le **retriever hybride**. Il combine TF-IDF et Word2Vec pour scorer les documents par similarit√© lexicale et s√©mantique, fusionne les scores, puis retourne les documents les plus pertinents pour une requ√™te donn√©e.


In [12]:
from sklearn.metrics.pairwise import cosine_similarity

class HybridRetriever:
    def __init__(self, corpus, tfidf_vec, tfidf_mat, w2v_model, doc_embs, alpha=0.6, beta=0.4):
        self.corpus = corpus.reset_index(drop=True)
        self.vectorizer = tfidf_vec
        self.tfidf_matrix = tfidf_mat
        self.w2v = w2v_model
        self.doc_embeddings = doc_embs
        self.alpha = alpha
        self.beta = beta
    
    def retrieve(self, query_text, top_k=10, return_scores=False):
        """Retrieval hybride TF-IDF + Word2Vec"""
        
        # Preprocessing query
        query_tokens = preprocess_text(query_text)
        query_clean = ' '.join(query_tokens)
        
        # TF-IDF scoring
        query_vec_tfidf = self.vectorizer.transform([query_clean])
        tfidf_scores = cosine_similarity(query_vec_tfidf, self.tfidf_matrix).flatten()
        
        # Word2Vec scoring
        w2v_scores = self._compute_w2v_scores(query_tokens)
        
        # Normalisation [0,1] avant hybridation
        tfidf_scores = (tfidf_scores - tfidf_scores.min()) / (tfidf_scores.max() - tfidf_scores.min() + 1e-10)
        w2v_scores = (w2v_scores - w2v_scores.min()) / (w2v_scores.max() - w2v_scores.min() + 1e-10)
        
        # Hybridation
        final_scores = self.alpha * tfidf_scores + self.beta * w2v_scores
        
        # Top-K
        top_indices = np.argsort(final_scores)[::-1][:top_k]
        
        results = []
        for idx in top_indices:
            result = {
                "rank": len(results) + 1,
                "pmid": self.corpus.iloc[idx]['pmid'],
                "title": self.corpus.iloc[idx]['title'],
                "score": final_scores[idx],
            }
            if return_scores:
                result["tfidf_score"] = tfidf_scores[idx]
                result["w2v_score"] = w2v_scores[idx]
            results.append(result)
        
        return pd.DataFrame(results)
    
    def _compute_w2v_scores(self, query_tokens):
        """Calcul similarit√© Word2Vec query-documents"""
        # Embedding query
        query_vecs = [self.w2v.wv[t] for t in query_tokens if t in self.w2v.wv]
        
        if not query_vecs:
            return np.zeros(len(self.corpus))
        
        query_emb = np.mean(query_vecs, axis=0)
        
        # Similarit√© vectoris√©e avec tous documents
        scores = cosine_similarity([query_emb], self.doc_embeddings)[0]
        
        return scores

# Instanciation
print("Chargement mod√®les...")
corpus_df = pd.read_pickle(DATA_DIR / "processed/corpus_preprocessed.pkl")
vectorizer = pickle.load(open(MODEL_DIR / "tfidf_vectorizer.pkl", "rb"))
tfidf_matrix = sp.load_npz(MODEL_DIR / "tfidf_matrix.npz")
w2v_model = Word2Vec.load(str(MODEL_DIR / "word2vec.model"))
doc_embeddings = np.load(MODEL_DIR / "doc_embeddings.npy")

retriever = HybridRetriever(
    corpus_df, vectorizer, tfidf_matrix, w2v_model, doc_embeddings,
    alpha=0.6, beta=0.4
)

print("‚úì Retriever initialis√©")

Chargement mod√®les...
‚úì Retriever initialis√©


### Test rapide

In [13]:
# Cell 8 - Test interactif

# Exemple query : utilise abstract d'un document
test_pmid = corpus_df.iloc[0]['pmid']
test_query = corpus_df.iloc[0]['abstract'][:200]  # 200 premiers chars

print(f"Query test (PMID {test_pmid}) :")
print(test_query)
print("\n" + "="*80 + "\n")

results = retriever.retrieve(test_query, top_k=10, return_scores=True)
print(results[['rank', 'pmid', 'title', 'score', 'tfidf_score', 'w2v_score']])

Query test (PMID 24013020) :
Internationally publicized impacts upon human health associated with potentially harmful element (PHE) exposure have been reported amongst internally displaced populations (IDPs) in Mitrovica, Kosovo,


   rank      pmid                                              title  \
0     1  24013020  Potentially harmful elements (PHEs) in scalp h...   
1     2  28558285  Enhancing the interpretation of in vitro bioac...   
2     3  28162042  The Syrian public health and humanitarian cris...   
3     4  21813430  Prevalence of war-related mental health condit...   
4     5  27411086  Epidemiology of Substance Use among Forced Mig...   
5     6  29747641  The Syrian conflict: a case study of the chall...   
6     7  23503989  Oceans and Human Health (OHH): a European pers...   
7     8  26286804  COST action TD1407: network on technology-crit...   
8     9  29359236                  Role of Plastics on Human Health.   
9    10  25023995  Syria: health in a country un

### √âvaluation compl√®te

Cette cellule lance une **√©valuation automatique** du retriever avec le gold standard. Elle charge le mapping `query ‚Üí documents pertinents`, d√©finit une fonction qui calcule des m√©triques classiques (P@10, AP/MAP, NDCG@10, Recall@10), puis teste le syst√®me sur un sous-√©chantillon de requ√™tes. Pour chaque requ√™te, elle prend l‚Äôabstract comme texte de recherche, r√©cup√®re les 100 meilleurs documents, compare aux documents pertinents attendus, stocke les scores, affiche les moyennes globales et sauvegarde le tout dans un CSV.


In [14]:
# Chargement query mapping
with open(DATA_DIR / "processed/query_relevant.pkl", 'rb') as f:
    query_relevant = pickle.load(f)

def calculate_metrics(retrieved_pmids, relevant_pmids, k=10):
    """Calcul P@K, AP, NDCG@K"""
    retrieved_k = retrieved_pmids[:k]
    relevant_set = set(relevant_pmids)
    
    # Precision@K
    p_at_k = len(set(retrieved_k) & relevant_set) / k if k > 0 else 0
    
    # Average Precision
    ap = 0
    num_relevant = 0
    for i, pmid in enumerate(retrieved_pmids, 1):
        if pmid in relevant_set:
            num_relevant += 1
            ap += num_relevant / i
    ap = ap / len(relevant_pmids) if relevant_pmids else 0
    
    # NDCG@K
    dcg = sum([1 / np.log2(i + 2) for i, pmid in enumerate(retrieved_k) 
               if pmid in relevant_set])
    idcg = sum([1 / np.log2(i + 2) for i in range(min(len(relevant_pmids), k))])
    ndcg = dcg / idcg if idcg > 0 else 0
    
    # Recall@K
    recall = len(set(retrieved_k) & relevant_set) / len(relevant_pmids) if relevant_pmids else 0
    
    return {"P@10": p_at_k, "AP": ap, "NDCG@10": ndcg, "Recall@10": recall}

# Sous-√©chantillon queries (100 pour test rapide, puis augmenter)
sample_size = 100
sample_queries = list(query_relevant.keys())[:sample_size]

print(f"√âvaluation sur {len(sample_queries)} queries...")

all_results = []

for query_pmid in tqdm(sample_queries):
    # R√©cup√©ration document query
    query_doc = corpus_df[corpus_df['pmid'] == query_pmid]
    if query_doc.empty:
        continue
    
    query_text = query_doc.iloc[0]['abstract']
    
    # Retrieval
    try:
        retrieved = retriever.retrieve(query_text, top_k=100)
        retrieved_pmids = retrieved['pmid'].tolist()
    except:
        continue
    
    # Ground truth
    relevant_pmids = [d['pmid'] for d in query_relevant[query_pmid]['relevant']]
    
    if not relevant_pmids:
        continue
    
    # M√©triques
    metrics = calculate_metrics(retrieved_pmids, relevant_pmids, k=10)
    metrics['query_pmid'] = query_pmid
    metrics['n_relevant'] = len(relevant_pmids)
    
    all_results.append(metrics)

results_df = pd.DataFrame(all_results)

# R√©sultats globaux
print("\n" + "="*80)
print("R√âSULTATS GLOBAUX")
print("="*80)
print(f"MAP       : {results_df['AP'].mean():.4f}")
print(f"NDCG@10   : {results_df['NDCG@10'].mean():.4f}")
print(f"P@10      : {results_df['P@10'].mean():.4f}")
print(f"Recall@10 : {results_df['Recall@10'].mean():.4f}")

# Sauvegarde
results_df.to_csv(RESULTS_DIR / f"evaluation_{sample_size}queries.csv", index=False)
print(f"\n‚úì R√©sultats sauvegard√©s dans {RESULTS_DIR}")


√âvaluation sur 100 queries...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [01:30<00:00,  1.10it/s]


R√âSULTATS GLOBAUX
MAP       : 0.5430
NDCG@10   : 0.6064
P@10      : 0.6859
Recall@10 : 0.1752

‚úì R√©sultats sauvegard√©s dans results





### Interface chatbot interactive

In [16]:
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output

# Widgets interface
query_input = widgets.Textarea(
    placeholder='Enter your search query or paste an abstract...',
    description='Query:',
    layout=widgets.Layout(width='90%', height='100px')
)

top_k_slider = widgets.IntSlider(
    value=10, 
    min=5, 
    max=50, 
    step=5,
    description='Top-K:',
    layout=widgets.Layout(width='300px')
)

alpha_slider = widgets.FloatSlider(
    value=0.6,
    min=0.0,
    max=1.0,
    step=0.1,
    description='Œ± (TF-IDF):',
    layout=widgets.Layout(width='300px')
)

search_button = widgets.Button(
    description='üîç Search',
    button_style='primary',
    layout=widgets.Layout(width='150px', height='40px')
)

clear_button = widgets.Button(
    description='Clear',
    button_style='',
    layout=widgets.Layout(width='150px', height='40px')
)

output_area = widgets.Output()

# Exemples queries
example_queries = {
    "Breast cancer immunotherapy": "immunotherapy checkpoint inhibitors PD-1 PD-L1 breast cancer treatment response",
    "EGFR mutations lung cancer": "epidermal growth factor receptor EGFR mutation lung cancer targeted therapy",
    "Melanoma BRAF inhibitors": "melanoma BRAF V600E mutation targeted therapy vemurafenib dabrafenib",
    "Colorectal cancer biomarkers": "colorectal cancer biomarkers KRAS mutation microsatellite instability prognosis"
}

example_buttons = [widgets.Button(description=name, layout=widgets.Layout(width='200px')) 
                   for name in example_queries.keys()]

def on_search(b):
    with output_area:
        clear_output()
        
        query = query_input.value.strip()
        k = top_k_slider.value
        alpha = alpha_slider.value
        beta = 1.0 - alpha
        
        if not query:
            print("‚ö†Ô∏è Enter a query")
            return
        
        print(f"üîç Searching for: {query[:100]}...")
        print(f"Parameters: Top-{k}, Œ±={alpha:.1f} (TF-IDF), Œ≤={beta:.1f} (Word2Vec)\n")
        
        # Update retriever weights
        retriever.alpha = alpha
        retriever.beta = beta
        
        try:
            results = retriever.retrieve(query, top_k=k, return_scores=True)
            
            # Affichage HTML format√©
            html = f"""
            <style>
                .result-card {{
                    margin-bottom: 20px;
                    padding: 15px;
                    border-left: 4px solid #007acc;
                    background-color: #f8f9fa;
                    border-radius: 4px;
                }}
                .result-rank {{
                    font-weight: bold;
                    color: #007acc;
                    font-size: 18px;
                }}
                .result-title {{
                    font-size: 16px;
                    color: #333;
                    margin: 8px 0;
                    font-weight: 500;
                }}
                .result-meta {{
                    font-size: 12px;
                    color: #666;
                    margin-top: 8px;
                }}
                .score-bar {{
                    display: inline-block;
                    height: 10px;
                    background-color: #007acc;
                    margin-left: 5px;
                    vertical-align: middle;
                }}
            </style>
            <h3>üìÑ Top {len(results)} Results</h3>
            """
            
            for _, row in results.iterrows():
                score_width = int(row['score'] * 200)
                tfidf_width = int(row['tfidf_score'] * 100)
                w2v_width = int(row['w2v_score'] * 100)
                
                html += f"""
                <div class='result-card'>
                    <span class='result-rank'>#{row['rank']}</span>
                    <div class='result-title'>{row['title']}</div>
                    <div class='result-meta'>
                        PMID: {row['pmid']} | 
                        Combined: {row['score']:.3f} <span class='score-bar' style='width:{score_width}px;'></span><br>
                        TF-IDF: {row['tfidf_score']:.3f} <span class='score-bar' style='width:{tfidf_width}px; background-color:#28a745;'></span> | 
                        Word2Vec: {row['w2v_score']:.3f} <span class='score-bar' style='width:{w2v_width}px; background-color:#ffc107;'></span>
                    </div>
                </div>
                """
            
            display(HTML(html))
            
        except Exception as e:
            print(f"  Error: {e}")

def on_clear(b):
    query_input.value = ""
    with output_area:
        clear_output()

def on_example_click(button):
    query_input.value = example_queries[button.description]

# Connexions √©v√©nements
search_button.on_click(on_search)
clear_button.on_click(on_clear)
for btn in example_buttons:
    btn.on_click(on_example_click)

# Layout interface
print("="*80)
print(" SCIENTIFIC LITERATURE EXPLORER - Interactive Search")
print("="*80)

display(widgets.VBox([
    widgets.HTML("<h4>Example Queries (click to use):</h4>"),
    widgets.HBox(example_buttons[:2]),
    widgets.HBox(example_buttons[2:]),
    widgets.HTML("<br><h4>Custom Query:</h4>"),
    query_input,
    widgets.HTML("<br><h4>Search Parameters:</h4>"),
    widgets.HBox([top_k_slider, alpha_slider]),
    widgets.HTML("<br>"),
    widgets.HBox([search_button, clear_button]),
    widgets.HTML("<br>"),
    output_area
]))

ü§ñ SCIENTIFIC LITERATURE EXPLORER - Interactive Search


VBox(children=(HTML(value='<h4>Example Queries (click to use):</h4>'), HBox(children=(Button(description='Brea‚Ä¶

In [17]:
test_cases = {
    "NORMAL_CASES": {
        "Standard biomedical": "breast cancer treatment outcomes",
        "With biomarker": "EGFR mutation lung cancer prognosis",
        "Treatment specific": "pembrolizumab immunotherapy melanoma response",
        "Multi-concept": "colorectal cancer microsatellite instability chemotherapy resistance"
    },
    
    "AMBIGUOUS_TERMS": {
        "Polysemy 1": "cold",  # Maladie vs temp√©rature
        "Polysemy 2": "culture",  # Culture cellulaire vs culture sociale
        "Polysemy 3": "positive",  # Test positif vs affect positif
        "Acronym confusion": "AML"  # Acute Myeloid Leukemia vs autres AML
    },
    
    "OUT_OF_DOMAIN": {
        "Physics": "quantum entanglement particle physics",
        "Computer science": "neural network backpropagation algorithm",
        "History": "French revolution Napoleon Bonaparte",
        "Cooking": "chocolate cake recipe baking temperature"
    },
    
    "NONSENSE": {
        "Random words": "purple elephant dancing quantum spaghetti",
        "Gibberish": "xkjhfds qwerty asdfgh zxcvbn",
        "Numbers only": "123 456 789",
        "Special chars": "@@@ ### $$$ %%% &&&"
    },
    
    "TOO_SPECIFIC": {
        "Hyper-specific 1": "BRAF V600E mutation codon 600 valine glutamic acid substitution vemurafenib resistance mechanism ATP binding pocket conformational change",
        "Rare condition": "hereditary angioedema C1 esterase inhibitor deficiency bradykinin",
        "Ultra-rare gene": "LMNA R482W mutation progeria syndrome",
        "Specific trial": "CheckMate 067 nivolumab ipilimumab combination melanoma phase III results"
    },
    
    "TOO_GENERAL": {
        "Single word": "cancer",
        "Vague": "treatment",
        "Generic": "patient outcome study results",
        "Empty semantic": "medical research health disease"
    },
    
    "CONTRADICTORY": {
        "Opposite terms": "breast cancer prostate cancer",
        "Conflicting age": "pediatric elderly geriatric neonatal",
        "Mixed outcomes": "survival mortality improvement deterioration"
    },
    
    "TYPOS": {
        "Misspelling 1": "brest canser treatmant",
        "Misspelling 2": "melanomma immunotherappy",
        "Transposition": "ERFG mutaiton lugn cancre"
    },
    
    "SYNONYMS_TEST": {
        "Medical synonyms": "myocardial infarction heart attack",
        "Drug names": "acetaminophen paracetamol tylenol",
        "Anatomy": "colon large intestine bowel"
    }
}

# Fonction test avec analyse
def test_query_detailed(query, expected_behavior=""):
    """Test query avec analyse r√©sultats"""
    print(f"\n{'='*80}")
    print(f"Query: '{query}'")
    if expected_behavior:
        print(f"Expected: {expected_behavior}")
    print('-'*80)
    
    try:
        results = retriever.retrieve(query, top_k=5, return_scores=True)
        
        if len(results) == 0:
            print("  NO RESULTS")
            return
        
        avg_score = results['score'].mean()
        max_score = results['score'].max()
        
        print(f" Avg Score: {avg_score:.4f} | Max Score: {max_score:.4f}")
        
        if max_score < 0.05:
            print("‚ö†Ô∏è VERY LOW SCORES - Likely irrelevant")
        elif max_score < 0.15:
            print("‚ö†Ô∏è LOW SCORES - Weak matches")
        elif max_score < 0.30:
            print("‚úì MODERATE SCORES - Acceptable matches")
        else:
            print("‚úì‚úì GOOD SCORES - Strong matches")
        
        print("\nTop 3 results:")
        for _, row in results.head(3).iterrows():
            print(f"  {row['rank']}. [{row['score']:.3f}] {row['title'][:70]}...")
        
        # Analyse diversit√© r√©sultats
        if len(results) >= 5:
            top5_titles = results.head(5)['title'].tolist()
            # Check si titres tr√®s similaires (signe de bon clustering)
            print(f"\n  Result diversity: {'High' if len(set([t[:30] for t in top5_titles])) == 5 else 'Low (clustered)'}")
        
    except Exception as e:
        print(f"  ERROR: {e}")

# Ex√©cution tests par cat√©gorie
for category, queries in test_cases.items():
    print(f"\n\n{'#'*80}")
    print(f"# CATEGORY: {category}")
    print(f"{'#'*80}")
    
    for name, query in queries.items():
        test_query_detailed(query, expected_behavior=name)



################################################################################
# CATEGORY: NORMAL_CASES
################################################################################

Query: 'breast cancer treatment outcomes'
Expected: Standard biomedical
--------------------------------------------------------------------------------
üìä Avg Score: 0.8943 | Max Score: 0.9835
‚úì‚úì GOOD SCORES - Strong matches

Top 3 results:
  1. [0.984] Nanomedicine applications in the treatment of breast cancer: current s...
  2. [0.924] Patient Navigation in Breast Cancer Treatment and Survivorship: A Syst...
  3. [0.896] Breast cancer in the personal genomics era....

üí° Result diversity: High

Query: 'EGFR mutation lung cancer prognosis'
Expected: With biomarker
--------------------------------------------------------------------------------
üìä Avg Score: 0.9434 | Max Score: 0.9962
‚úì‚úì GOOD SCORES - Strong matches

Top 3 results:
  1. [0.996] Epidermal growth factor receptor gene a