In [10]:
# Read each input file into memory as a list of raw lines (UTF-8).

# Document collection
with open("dev.docs", mode="r", encoding="utf-8") as f:
    docs_content = list(f)

# Query set
with open("dev.all.queries", mode="r", encoding="utf-8") as f:
    queries_content = list(f)

# Relevance judgments (qrel)
with open("dev.2-1-0.qrel", mode="r", encoding="utf-8") as f:
    relevance_content = list(f)

Based on the samples provided:

  Documents (dev.docs):
        Each line starts with a unique identifier, followed by the content of the document.
        The content seems to be related to medical and health topics.

  Queries (dev.all.queries):
        Each line starts with a unique identifier, followed by the query content and some associated tags.
        The queries seem to be short phrases or questions related to medical and health topics.

  Relevance Scores (dev.2-1-0.qrel):
        Each line contains a query identifier, followed by some constant (which seems to be always 0 in the sample), a document identifier, and a relevance score.
        The relevance score seems to be an integer (e.g., 2), possibly indicating the degree of relevance between the query and the document.

In [11]:
# Turn the raw lines into lookup tables

# Documents: "<doc_id>\t<text>" lines -> {doc_id: text}
docs = {}
for line in docs_content:
    parts = line.split("\t", 1)
    if len(parts) != 2:
        continue  # no tab separator on this line: skip it
    doc_id, content = parts
    docs[doc_id] = content.strip()

# Queries: "<query_id>\t<text>" lines -> {query_id: text}
queries = {}
for line in queries_content:
    parts = line.split("\t", 1)
    if len(parts) != 2:
        continue  # no tab separator on this line: skip it
    query_id, content = parts
    queries[query_id] = content.strip()

# Judgments: four tab-separated fields -> {query_id: {doc_id: score}}
relevance = {}
for line in relevance_content:
    parts = line.split("\t")
    if len(parts) != 4:
        continue  # only lines with exactly four fields are kept
    query_id, _, doc_id, score = parts
    relevance.setdefault(query_id, {})
    relevance[query_id][doc_id] = int(score)

# Cell output: number of documents, queries, and queries that have judgments
len(docs), len(queries), len(relevance)

(3193, 325, 324)

In [12]:
#relevance

The dataset has been structured as follows:

  Documents: 3,193 entries

  Queries: 325 entries
  
  Relevance Scores: judgments for 324 queries (one of the 325 queries has no entry in the relevance data)

In [13]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


In [14]:
# Fetch the WordNet corpus (needed by the WordNetLemmatizer imported above)
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sraps\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [15]:
# Initialise the lemmatizer and fetch the stop-word and tokenizer resources

lemmatizer = WordNetLemmatizer()

# Download necessary NLTK resources
nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sraps\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sraps\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
# Define a function to preprocess text
def preprocess(text):
    """Lowercase, tokenize and strip English stop words from ``text``.

    Args:
        text: Raw document or query string.

    Returns:
        List of lowercase tokens, in their original order, excluding every
        token that appears in NLTK's English stop-word list.
    """
    # Build the stop-word set once per call. Evaluating
    # stopwords.words('english') inside the filter would reload the list and
    # scan it linearly for every single token.
    english_stopwords = set(stopwords.words('english'))
    # Convert to lowercase, then tokenize
    tokens = word_tokenize(text.lower())
    # Remove stopwords
    return [token for token in tokens if token not in english_stopwords]

# Text preprocessing with lemmatization
def preprocess_text(text):
    """Tokenize ``text`` and return lemmatized, stop-word-free tokens.

    Args:
        text: Raw document or query string.

    Returns:
        List of lemmatized lowercase tokens, in their original order. Tokens
        found in NLTK's English stop-word list, or with two characters or
        fewer, are dropped before lemmatization.
    """
    # Build the stop-word set once per call. Evaluating
    # stopwords.words('english') inside the filter would reload the list and
    # scan it linearly for every single token.
    english_stopwords = set(stopwords.words('english'))
    # Tokenization (on the lowercased text)
    word_tokens = word_tokenize(text.lower())
    # Lemmatization combined with stop-word and short-token removal
    return [
        lemmatizer.lemmatize(w)
        for w in word_tokens
        if w not in english_stopwords and len(w) > 2
    ]


# Token lists without lemmatization, keyed by document / query id
preprocessed_docs = {
    key: preprocess(text)
    for key, text in docs.items()
}
preprocessed_queries = {
    key: preprocess(text)
    for key, text in queries.items()
}

# Same collections, run through the lemmatizing variant
preprocessed_docs_lemmatisation = {
    key: preprocess_text(text)
    for key, text in docs.items()
}
preprocessed_queries_lemmatisation = {
    key: preprocess_text(text)
    for key, text in queries.items()
}

# Display sample preprocessed data
#list(preprocessed_docs.items())[:2], list(preprocessed_queries.items())[:2]

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [17]:
# The vectorizer works on strings, so join each token list back with spaces
docs_string = list(map(" ".join, preprocessed_docs.values()))
queries_string = list(map(" ".join, preprocessed_queries.values()))

# Fit the TF-IDF model on the documents only
vectorizer = TfidfVectorizer()
docs_tfidf = vectorizer.fit_transform(docs_string)

# Map the queries into the same vector space as the documents
queries_tfidf = vectorizer.transform(queries_string)

# Similarity matrix: one row per query, one column per document
cosine_similarities = cosine_similarity(queries_tfidf, docs_tfidf)

# Cell output: (number of queries, number of documents)
cosine_similarities.shape

(325, 3193)

In [18]:
# Inspect the query-by-document similarity matrix computed in the previous cell
cosine_similarities

array([[0.10791079, 0.03017106, 0.01821751, ..., 0.01822421, 0.05981088,
        0.07504941],
       [0.0988361 , 0.00767112, 0.        , ..., 0.00181114, 0.04042952,
        0.        ],
       [0.05368846, 0.03522577, 0.01830408, ..., 0.0237221 , 0.06460136,
        0.06977015],
       ...,
       [0.01793928, 0.00487686, 0.00206799, ..., 0.0093412 , 0.        ,
        0.        ],
       [0.00624186, 0.        , 0.00675299, ..., 0.        , 0.        ,
        0.        ],
       [0.00420271, 0.        , 0.        , ..., 0.00156988, 0.        ,
        0.        ]])

In [19]:
from sklearn.metrics import ndcg_score

In [20]:
import numpy as np

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Fit a TF-IDF model on the space-joined document tokens
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(
    list(map(' '.join, preprocessed_docs.values()))
)

# Create a function to retrieve the top-k documents for a query using TF-IDF
def retrieve_top_documents(query, k=5):
    """Return the ``k`` documents most similar to ``query`` under TF-IDF.

    Args:
        query: Query text as a single string (the evaluation loop passes the
            preprocessed tokens joined with spaces).
        k: Number of results to keep, applied as a slice bound (default 5).

    Returns:
        List of ``(doc_id, score)`` tuples ordered by decreasing score.
    """
    query_tfidf = tfidf_vectorizer.transform([query])
    # Dot product of the query vector with every document row; with the
    # vectorizer's default L2 normalisation this is the cosine similarity.
    cosine_similarities = linear_kernel(query_tfidf, tfidf_matrix).flatten()
    # Cut to the top k *before* building result tuples, so only k entries are
    # materialised instead of one per document in the collection.
    top_indices = cosine_similarities.argsort()[::-1][:k]
    # Build the id list once per call; rows of tfidf_matrix follow the
    # iteration order of preprocessed_docs, which is what it was fitted on.
    doc_ids = list(preprocessed_docs)
    return [(doc_ids[i], cosine_similarities[i]) for i in top_indices]


In [36]:
def _dcg(scores):
    """Return the discounted cumulative gain of ``scores`` given in rank order."""
    if not scores:
        return 0.0
    # Rank 1 has discount log2(2) == 1, so the first gain is added as-is;
    # the gain at 0-based position i is divided by log2(i + 2).
    return scores[0] + sum(scores[i] / np.log2(i + 2) for i in range(1, len(scores)))


def calculate_ndcg(query_id, retrieved_docs, relevance_data):
    """Compute NDCG for one query at a cut-off equal to ``len(retrieved_docs)``.

    Args:
        query_id: Key of the query in ``relevance_data``.
        retrieved_docs: Ranked list of ``(doc_id, score)`` pairs; only the
            document ids and their order are used.
        relevance_data: Mapping ``{query_id: {doc_id: relevance}}``. Documents
            missing from a query's judgments count as relevance 0.

    Returns:
        ``DCG / IDCG`` for the ranking, or ``0.0`` when the query has no
        judgments, nothing was retrieved, or the ideal DCG is zero.
    """
    # Handle missing query IDs by returning a default NDCG score of 0
    if query_id not in relevance_data:
        return 0.0

    judgments = relevance_data[query_id]

    # Extract relevance scores for retrieved documents
    retrieved_scores = [judgments.get(doc_id, 0) for doc_id, _ in retrieved_docs]
    dcg = _dcg(retrieved_scores)

    # Ideal ranking: best judged scores first, truncated to the same depth as
    # the retrieved list. Empty rankings or judgments give 0 instead of
    # raising IndexError.
    ideal_scores = sorted(judgments.values(), reverse=True)
    idcg = _dcg(ideal_scores[:len(retrieved_scores)])

    # Calculate NDCG
    if idcg == 0:
        return 0.0
    return dcg / idcg


In [47]:
# Evaluation cut-off: number of ranked documents scored per query
N_docs = 5

# Per-query NDCG values, in query iteration order
ndcg_scores = []

for query_id, query_tokens in preprocessed_queries.items():
    # The retriever takes the query as one space-joined string
    query = ' '.join(query_tokens)
    retrieved_docs = retrieve_top_documents(query, k=N_docs)

    # Nothing came back for this query: report it and move on
    if not retrieved_docs:
        print(f"No relevant documents found for query {query_id}.")
        continue

    # Score the ranking against the judgments and record the result
    ndcg = calculate_ndcg(query_id, retrieved_docs, relevance)
    ndcg_scores.append(ndcg)

# Arithmetic mean over every scored query
average_ndcg = sum(ndcg_scores) / len(ndcg_scores)

# Report the mean with four decimals
print(f"Average NDCG Score for all queries: {average_ndcg:.4f}")


Average NDCG Score for all queries: 0.3551
