In [225]:
import requests
import re


def extract_terms(text):
    # Utilise une expression régulière pour rechercher le terme préféré
    pref_t = None
    n_pref_term = []

    # Utilise une expression régulière pour rechercher le terme préféré
    match = re.search(r"^1: (.+)", text, re.MULTILINE)
    if match:
        pref_t = match.group(1).strip()
        # Si le terme préféré est suivi de "[Supplementary Concept]"
        if "[Supplementary Concept]" in pref_t:
            # Extrait le terme sans le suffixe
            pref_t = pref_t.split("[Supplementary Concept]")[0].strip()

    # Utilise des expressions régulières pour trouver les entrées des termes préférés
    entry_terms_match = re.search(r"Entry Terms:\s*(.*?)\n\n", text, re.DOTALL)
    if entry_terms_match:
        entry_terms = entry_terms_match.group(1).strip().split("\n")
        # Les termes préférés se trouvent après les entrées de termes
        n_pref_term = [term.strip() for term in entry_terms if term.strip()]

    return pref_t, n_pref_term


def search_mesh(query):
    # Paramètres de la requête
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    search_url = base_url + "esearch.fcgi"
    params = {
        "db": "mesh",  # Base de données MeSH
        "term": query,  # Terme de recherche
        "retmode": "json",  # Format de sortie JSON
    }

    # Effectuer la requête HTTP
    response = requests.get(search_url, params=params)

    # Vérifier le code de statut de la réponse
    if response.status_code == 200:
        mesh_ids = []
        # Extraire les identifiants des concepts MeSH pertinents depuis la réponse JSON
        try:
            mesh_ids = response.json()["esearchresult"]["idlist"]
        except KeyError:
            try:
                mesh_ids = response.json()["esearchresult"]["IdList"]
            except KeyError:
                mesh_ids = []

        return mesh_ids
    else:
        print("Erreur lors de la requête HTTP:", response.status_code)
        return []


# Fonction pour récupérer les informations détaillées sur un concept MeSH
def get_mesh_details(mesh_id):
    # Paramètres de la requête
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    detail_url = base_url + "efetch.fcgi"
    params = {
        "db": "mesh",  # Base de données MeSH
        "id": mesh_id,  # Identifiant du concept MeSH
        "retmode": "xml",  # Format de sortie JSON
    }

    # Effectuer la requête HTTP
    response = requests.get(detail_url, params=params)

    # Vérifier le code de statut de la réponse
    if response.status_code == 200:
        return response
    else:
        print("Erreur lors de la requête HTTP:", response.status_code)
        return None


def mesh_concept(term):
    mesh_ids = search_mesh(term)
    if len(mesh_ids) == 0:
        return "None"
    id = mesh_ids[0]
    mesh_details = get_mesh_details(id)
    if mesh_details:
        p, np = extract_terms(mesh_details.text)
        return [p, *np], id
    else:
        return "None"

In [226]:
concept, _id = mesh_concept("confinement")
print(concept)

['Quantum Dots', 'Dot, Quantum', 'Dots, Quantum', 'Quantum Dot', 'Semiconductor Nanoparticles', 'Nanoparticle, Semiconductor', 'Nanoparticles, Semiconductor', 'Semiconductor Nanoparticle', 'Semiconductor Nanocrystals', 'Nanocrystal, Semiconductor', 'Semiconductor Nanocrystal', 'Nanocrystals, Semiconductor']


``` markdown
**Pretraitement**
```

In [2]:
import re
import string
from nltk.stem import PorterStemmer
import unicodedata
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# nltk.download("punkt")


def preprocess_text(
    text,
    lowercase=True,
    remove_punctuation=True,
    remove_digits=True,
    remove_stopwords=True,
    lemma=False,
    stem=False,
):

    # Tokenize the sentence into words
    words = word_tokenize(text)

    processed_words = []

    for word in words:
        # Supprimer la ponctuation
        if remove_punctuation:
            p = string.punctuation
            p += "\n\r\t"  # Ajouter les retours chariot, tabulation
            word = word.translate(str.maketrans(p, " " * len(p)))
            clean_word = re.sub(r"\s[a-z]\s", " ", word)
            word = clean_word.strip()
            word = (
                unicodedata.normalize("NFD", word)
                .encode("ascii", "ignore")
                .decode("utf-8")
            )

        # Convertir en minuscules
        if lowercase:
            word = word.lower()

        # Supprimer les chiffres
        if remove_digits:
            word = re.sub(
                "[0-9]+", "", word
            )  # Remplacer une séquence de chiffres par rien

        # Supprimer les stopwords
        if remove_stopwords:
            stop_words = set(stopwords.words("english"))
            if word in stop_words:
                continue

        # Lemmatisation
        if lemma:
            lem = WordNetLemmatizer()
            word = lem.lemmatize(word)

        if stem:
            ps = PorterStemmer()
            word = ps.stem(word)

        processed_words.append(word)

    # # Join the processed words back into a sentence
    processed_text = " ".join(processed_words).strip()

    return processed_text


# Exemple d'utilisation
text = "[(]The quick-brown-5fox jumps over the lazy dog! 123"
processed_text = preprocess_text(text, lemma=True)
print(processed_text.split(" "))

['quick', 'brown', 'fox', 'jump', 'lazy', 'dog']


In [3]:
import pyterrier as pt
import os
import pandas as pd
import numpy as np

In [227]:
os.environ["JAVA_HOME"] = "C:\Program Files\Java\jdk-22"
if not pt.started():
    pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])

PyTerrier 0.10.0 has loaded Terrier 5.9 (built by craigm on 2024-05-02 17:40) and terrier-helper 0.0.8



In [228]:
dataset = pt.datasets.get_dataset("irds:cord19/trec-covid")
indexer = pt.index.IterDictIndexer(
    "C:\\Users\\Saliou\\OneDrive\\Documents\\ALL TP\\RI\\projet\\cord19-index",
    overwrite=True,
)

### Baseline BM25

In [229]:
indexref = indexer.index(dataset.get_corpus_iter(), fields=("title", "abstract"))
index = pt.IndexFactory.of(indexref)

cord19/trec-covid documents:   0%|          | 0/192509 [00:00<?, ?it/s]



cord19/trec-covid documents:   1%|          | 2149/192509 [00:03<04:02, 784.23it/s] 



cord19/trec-covid documents: 100%|██████████| 192509/192509 [01:31<00:00, 2104.04it/s]


18:20:58.663 [main] ERROR org.terrier.structures.indexing.Indexer - Could not finish MetaIndexBuilder: 
java.io.IOException: Key 8lqzfj2e is not unique: 37597,11755
For MetaIndex, to suppress, set metaindex.compressed.reverse.allow.duplicates=true
	at org.terrier.structures.collections.FSOrderedMapFile$MultiFSOMapWriter.mergeTwo(FSOrderedMapFile.java:1374)
	at org.terrier.structures.collections.FSOrderedMapFile$MultiFSOMapWriter.close(FSOrderedMapFile.java:1308)
	at org.terrier.structures.indexing.BaseMetaIndexBuilder.close(BaseMetaIndexBuilder.java:321)
	at org.terrier.structures.indexing.classical.BasicIndexer.indexDocuments(BasicIndexer.java:270)
	at org.terrier.structures.indexing.classical.BasicIndexer.createDirectIndex(BasicIndexer.java:388)
	at org.terrier.structures.indexing.Indexer.index(Indexer.java:377)
18:21:05.000 [main] WARN org.terrier.structures.indexing.Indexer - Indexed 60 empty documents
18:21:05.008 [main] ERROR org.terrier.structures.indexing.Indexer - Could not re

In [230]:
# statistiques de la collection
print(index.getCollectionStatistics().toString())

Number of documents: 192509
Number of terms: 158515
Number of postings: 12290426
Number of fields: 2
Number of tokens: 19603234
Field names: [title, abstract]
Positions:   false



## Document Expansion

In [231]:
bm25 = pt.BatchRetrieve(index, wmodel="BM25")

In [232]:
res = bm25.search("coronavirus")
res

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,78615,2gvvwvvg,0,3.691090,coronavirus
1,1,73872,azir1gvm,1,3.661167,coronavirus
2,1,13737,t7sn9ffh,2,3.645867,coronavirus
3,1,185382,xykob69g,3,3.639548,coronavirus
4,1,82198,7972ps41,4,3.628132,coronavirus
...,...,...,...,...,...,...
995,1,75601,5w39q98c,995,3.224067,coronavirus
996,1,81443,knr07yy1,996,3.224067,coronavirus
997,1,82053,bhfgu5wg,997,3.224067,coronavirus
998,1,87342,qrg6nrrm,998,3.224067,coronavirus


In [4]:
def similarite(bm25_doc, index_concept, docno_doc, docno_conc, concept):
    cj = [0]  # poids des termes du concept
    dj = [0]  # poids des termes du document

    bm25 = pt.BatchRetrieve(index_concept, wmodel="BM25")

    # parcourir chaque term du concept et verifier s'il est dans le document
    concept = concept.split(" ")
    for term in concept:
        # bm25 pour les documents
        res1 = bm25_doc.search(term)  # chercher le terme dans le document
        res1 = res1["score"][
            res1["docno"] == docno_doc
        ]  # chercher le score du terme dans le document

        if len(res1) == 0:
            continue

        # bm25 pour les concepts
        res2 = bm25.search(term)  # chercher le terme dans le concept
        res2 = res2["score"][res2["docno"] == docno_conc]

        if len(res2) == 0:
            continue

        dj.append(res1)
        cj.append(res2)

    # calculer la similarité
    numerateur = sum([c * d for c, d in zip(cj, dj)])
    denominateur = np.sqrt(sum([c**2 for c in cj])) * np.sqrt(sum([d**2 for d in dj]))

    if denominateur == 0:
        return 0

    return numerateur / denominateur


def average_position(occurrences):
    return sum(occurrences) / len(occurrences)


def word_positions(words):
    positions = {}
    for index, word in enumerate(words):
        if word in positions:
            positions[word].append(index)
        else:
            positions[word] = [index]
    avg_positions = {word: average_position(pos) for word, pos in positions.items()}
    avg_positions = sorted(avg_positions.items(), key=lambda x: x[1])
    list_words = [word for word, _ in avg_positions]
    return list_words


def normalize_document(words, concept_words):

    start, end = -1, -1
    for i, word in enumerate(words):
        if word in concept_words:
            if start == -1:
                start = i
            end = i
    if start == -1 or end == -1:
        return []
    return words[start : end + 1]


import scipy.stats


def spearman_rank_correlation(normalize_document, concept_positions):

    common_words = set(normalize_document).intersection(concept_positions)

    rank_concept = {word: i + 1 for i, word in enumerate(concept_positions)}
    rank_doc = {word: i + 1 for i, word in enumerate(normalize_document)}

    # print(rank_concept)
    # print(rank_doc)
    # print("Mots communs:", common_words)

    T = len(common_words)
    if T == 0 or T == 1:
        return 0  # Pas de mots communs

    # Générer les rangs pour les mots communs
    doc_ranks = []
    concept_ranks = []
    for i, word in enumerate(common_words):
        doc_ranks.append(rank_doc[word])
        concept_ranks.append(rank_concept[word])

    # print("Rangs du document:", doc_ranks)
    # print("Rangs du concept:", concept_ranks)

    return scipy.stats.spearmanr(doc_ranks, concept_ranks).correlation


def rel(sim, corr):
    return (1 + sim) * (1 + corr)

In [237]:
doc_iter = dataset.get_corpus_iter()

# Initialize a list to hold the abstracts
N_rank = 5

# Iterate over the documents
for i, doc in enumerate(doc_iter):

    df_concepts = pd.DataFrame(
        columns=["docno", "terms_mesh"]
    )  # dataframe des concepts
    dic_concepts = {}  # dictionnaire des concepts preferés

    # Get the abstract of the document
    title = doc.get("title", None)
    # abstract = doc.get("abstract", None)
    abstract = ""

    # document ID
    docno = doc.get("docno", None)
    if docno is None:
        print("erreur sur le docno")
        break

    # If the document has an abstract, add it to the list
    if abstract or title:
        text = preprocess_text(title + " " + abstract, stem=True).split(" ")

    # for each term in the text we will search for the mesh concept
    for term in text:
        concept_id = mesh_concept(term)
        if concept_id == "None":
            continue
        concept, id_term = concept_id

        dic_concepts[id_term] = concept  # dictionnaire des concepts preferés
        concept_pr = preprocess_text(" ".join(concept), stem=True).split(" ")

        new_row = pd.DataFrame(
            {"docno": [id_term], "terms_mesh": [" ".join(concept_pr)]}
        )
        df_concepts = pd.concat([df_concepts, new_row], ignore_index=True)

    # indexer les concepts dans le dataframe
    # indexer_tmp = pt.index.DFIndexer(
    #     "C:\\Users\\Saliou\\OneDrive\\Documents\\ALL TP\\RI\\projet\\tmp",
    #     overwrite=True,
    # )

    display(df_concepts)
    break

    index_ref_tmp = indexer_tmp.index(df_concepts["terms_mesh"], df_concepts["docno"])
    print(index_ref_tmp)
    index_tmp = pt.IndexFactory.of(index_ref_tmp)
    print(index_tmp.getCollectionStatistics().toString())

    # 1. Calculer la similarité entre le document et chaque concept
    # parcourir chaque term du concept et verifier s'il est dans le document
    dic_sim = {}
    for i, c in enumerate(df_concepts):
        entry_terms = c["terms_mesh"]
        id_concept = c["docno"]
        score = similarite(bm25, index_tmp, docno, id_term, entry_terms)
        dic_sim[id_concept] = score

    # trier les documents par ordre décroissant
    dic_sim = dict(sorted(dic_sim.items(), key=lambda item: item[1], reverse=True))

    # 2. Calculer la corrélation entre le document et le concept
    dic_rel = {}

    for id_term, sim in dic_sim.items():
        Entry_terms = dic_concepts[id_term]

        corr_entry = []
        for entry in Entry_terms:
            Entry_term = preprocess_text(entry, stem=True).split(" ")

            # vecteurs de positions moyennes
            vect_entry = word_positions(Entry_term)
            vect_doc = word_positions(text)

            # fenêtre du document
            vect_doc_norm = normalize_document(vect_doc, vect_entry)

            # calculer la corrélation
            corr_entry.append(spearman_rank_correlation(vect_doc_norm, vect_entry))

        # 3. ρ(C, D) = M axE∈Entries(C) ρ(E, D)
        corr = max(corr_entry)

        # 4. Selecting the candidate concepts for document expansion.
        sim = dic_sim[id_term]
        rel_score = rel(sim, corr)
        dic_rel[id_term] = rel_score

    # Expension du document
    dic_rel = dict(sorted(dic_rel.items(), key=lambda item: item[1], reverse=True))

    # Selecting the top N concepts for document expansion only if the rel score is greater than the threshold
    threshold = 2.5
    top_concepts = []
    for i, (id_term, score) in enumerate(dic_rel.items()):
        if i == N_rank:
            break
        if score >= threshold:
            top_concepts.append(id_term)

    # 5. Expanding the document with the selected concepts
    expanded_text = abstract
    if len(top_concepts) == 0:
        continue

    # Ajouter les termes préférés des concepts sélectionnés
    for id_term in top_concepts:
        preferred_term = dic_concepts[id_term]
        expanded_text = expanded_text + " " + preferred_term

    # modifier le document original
    dataset.get_corpus_iter().update(docno, {"abstract": expanded_text})

    break

# Reindexer le corpus
# indexer = pt.index.IterDictIndexer(
#     "C:\\Users\\Saliou\\OneDrive\\Documents\\ALL TP\\RI\\projet\\cord19-index",
#     overwrite=True,
# )

cord19/trec-covid documents:   0%|          | 0/192509 [00:00<?, ?it/s]

Unnamed: 0,docno,terms_mesh
0,68064876,safety net provid provid safety net provid s...
1,82003470,cultur media pharmacolog action
2,68018805,sepsi bloodstream infect bloodstream infect in...
3,68045805,mycoplasma synovia
4,68061387,chlamydi pneumonia chlamydi pneumonia pneumoni...
5,68053490,mink enter viru mink enter virus enter virus ...
6,68059705,polar bodi bodi polar bodi polar polar bodi ...
7,68065207,middl east respiratori syndrom coronaviru mer ...
8,68012529,saudi arabia kingdom saudi arabia


In [224]:
vect_doc = [
    "quick",
    "banjoule",
    "dog",
    "quicke",
    "banjoulde",
    "lazy",
    "the",
    "dog6",
    "nop",
    "mat",
]
vect_concept = ["mat", "dog", "quick"]

vec = word_positions(vect_doc)
print(vec)
x = normalize_document(vec, vect_concept)
print(x)
conc = word_positions(vect_concept)
print(conc)
spearman_rank_correlation(x, conc)

['quick', 'banjoule', 'dog', 'quicke', 'banjoulde', 'lazy', 'the', 'dog6', 'nop', 'mat']
['quick', 'banjoule', 'dog', 'quicke', 'banjoulde', 'lazy', 'the', 'dog6', 'nop', 'mat']
['mat', 'dog', 'quick']
{'mat': 1, 'dog': 2, 'quick': 3}
{'quick': 1, 'banjoule': 2, 'dog': 3, 'quicke': 4, 'banjoulde': 5, 'lazy': 6, 'the': 7, 'dog6': 8, 'nop': 9, 'mat': 10}
Mots communs: {'dog', 'mat', 'quick'}
Rangs du document: [3, 10, 1]
Rangs du concept: [2, 1, 3]


-1.0

### BaseLine BM25

In [288]:
bm25 = pt.BatchRetrieve(index, wmodel="BM25")
pt.Experiment(
    [bm25],
    dataset.get_topics(variant="title"),
    dataset.get_qrels(),
    eval_metrics=["map", "P_10", "P_20", "P_30"],
)

Unnamed: 0,name,map,P_10,P_20,P_30
0,BR(BM25),0.207121,0.678,0.62,0.597333


### QE.

In [289]:
# Configurer le modèle BM25
bm25 = pt.BatchRetrieve(index, wmodel="BM25")

# Initialisation de la liste pour stocker les résultats
results = []

# Définir les paramètres pour l'expansion de requête
for num_terms in range(100, 101, 5):
    for num_docs in [100]:

        qe = pt.rewrite.Bo1QueryExpansion(index, fb_terms=num_terms, fb_docs=num_docs)
        pipeline = bm25 >> qe >> bm25
        res = pt.Experiment(
            [pipeline],
            dataset.get_topics(variant="title"),
            dataset.get_qrels(),
            eval_metrics=["map"],
        )
        # Ajouter les résultats à la liste
        results.append(
            {
                "num_terms": num_terms,
                "num_docs": num_docs,
                "map": res["map"][0],
            }
        )
# Convertir les résultats en DataFrame
df_results = pd.DataFrame(results)

# Pivoter le DataFrame pour que num_terms soit les colonnes et num_docs les lignes
df_pivot = df_results.pivot(index="num_docs", columns="num_terms", values="map")

In [296]:
qe = pt.rewrite.Bo1QueryExpansion(index, fb_terms=20, fb_docs=20)
pipeline = bm25 >> qe >> bm25
res = pt.Experiment(
    [pipeline],
    dataset.get_topics(variant="title"),
    dataset.get_qrels(),
    names=[f"BM25+Bo1QE"],
    eval_metrics=["map", "P_10", "P_20", "P_30"],
)
res

Unnamed: 0,name,map,P_10,P_20,P_30
0,BM25+Bo1QE,0.233246,0.658,0.643,0.622


### DEcombination

In [None]:
# Reindexer le corpus
# indexer = pt.index.IterDictIndexer(
#     "C:\\Users\\Saliou\\OneDrive\\Documents\\ALL TP\\RI\\projet\\cord19-index",
#     overwrite=True,
# )
index_aug = "a definir "

bm25 = pt.BatchRetrieve(index_aug, wmodel="BM25")
pt.Experiment(
    [bm25],
    dataset.get_topics(variant="title"),
    dataset.get_qrels(),
    eval_metrics=["map", "P_10", "P_20", "P_30"],
)

### QE+DEcombination


In [None]:
bm25 = pt.BatchRetrieve(index_aug, wmodel="BM25")
qe = pt.rewrite.Bo1QueryExpansion(index, fb_terms=20, fb_docs=20)
pipeline = bm25 >> qe >> bm25
res = pt.Experiment(
    [pipeline],
    dataset.get_topics(variant="title"),
    dataset.get_qrels(),
    names=[f"BM25+Bo1QE"],
    eval_metrics=["map", "P_10", "P_20", "P_30"],
)