In [1]:
import os
from tqdm.notebook import tqdm
from joblib import Parallel, delayed

In [2]:
#read txt files to list
directory = "C:/Users/邵一泉/OneDrive - Queen Mary, University of London/Information/Assignment 3/small_dataset/half_preprocessed_dataset"

def load_document(filename):
    with open(filename, "r", encoding="utf-8", errors="ignore") as f:
        doc = f.read()  # Read text file
        doc_id = os.path.splitext(os.path.basename(filename))[0]  # Get filename without extension
        return doc_id, doc

files = [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith(".txt")]
results = Parallel(n_jobs=-1, verbose=10)(
    delayed(load_document)(f) for f in tqdm(files, desc="Loading documents")
)

doc_ids, docs = zip(*results)

Loading documents:   0%|          | 0/20519 [00:00<?, ?it/s]

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  32 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1944s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done  58 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  73 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0317s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done 110 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 144 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0309s.) Setting batch_

In [3]:
from rank_bm25 import BM25Okapi
# Create BM25 model
bm25 = BM25Okapi([doc.split() for doc in docs] )

In [4]:
import pandas as pd
# Read the txt file into a DataFrame
df_rel = pd.read_csv('C:/Users/邵一泉/OneDrive - Queen Mary, University of London/Information/Assignment 3/trecgen2007.all.judgments.tsv.txt', 
                     sep='\t', header=6, names=['TOPIC', 'PMID', 'OFFSET', 'LENGTH', 'RELEVANCE'])
df_rel['RELEVANCE'] = df_rel['RELEVANCE'].replace({'NOT_RELEVANT': 0, 'RELEVANT': 1})
# Read the file into a list of strings
with open('C:/Users/邵一泉/OneDrive - Queen Mary, University of London/Information/Assignment 3/2007topics.txt', 'r') as f:
    lines = f.read().splitlines()

# Create a list of dictionaries with the question data
data = []
for line in lines:
    q_num, q_text = line.split('>', maxsplit=1)
    q_num = int(q_num[1:])
    data.append({
        'TOPIC': q_num,
        'QUERY': q_text,
    })

# Create a DataFrame from the list of dictionaries
df_topic = pd.DataFrame(data)
df = pd.merge(df_rel, df_topic, on='TOPIC')
df

Unnamed: 0,TOPIC,PMID,OFFSET,LENGTH,RELEVANCE,QUERY
0,200,7493960,39964,2283,0,What serum [PROTEINS] change expression in ass...
1,200,7657648,17646,1004,0,What serum [PROTEINS] change expression in ass...
2,200,7713924,19475,2376,0,What serum [PROTEINS] change expression in ass...
3,200,8557639,3092,1488,0,What serum [PROTEINS] change expression in ass...
4,200,8621728,17314,1683,0,What serum [PROTEINS] change expression in ass...
...,...,...,...,...,...,...
35991,235,16275766,76498,41890,0,Which [GENES] involved in NFkappaB signaling r...
35992,235,16291660,8048,1214,0,Which [GENES] involved in NFkappaB signaling r...
35993,235,16314491,95642,458,0,Which [GENES] involved in NFkappaB signaling r...
35994,235,16339966,110929,344,0,Which [GENES] involved in NFkappaB signaling r...


In [5]:
queries = df['QUERY'].unique()

In [6]:
queries[0]

'What serum [PROTEINS] change expression in association with high disease activity in lupus?'

In [7]:
import nltk
nltk.download("stopwords")
stop_words = nltk.corpus.stopwords.words("english")
tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
stemmer = nltk.stem.PorterStemmer()
import html

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\邵一泉\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
#define a query function
def query(query, k=5):
    query = html.unescape(query)  # Decode HTML entities
    preprocessed_query = [stemmer.stem(token) for token in tokenizer.tokenize(query.lower()) if token not in stop_words]
    doc_scores = bm25.get_scores(preprocessed_query)

    # Get top-k similar documents
    index = []
    top_k_doc_indices = sorted(range(len(doc_scores)), key=lambda i: doc_scores[i], reverse=True)[:k]
    for i, doc_index in enumerate(top_k_doc_indices):
        index.append(int(doc_ids[doc_index]))
    return index

In [9]:
query(queries[0])

[12595613, 12642605, 12456587, 12154204, 12730535]

In [10]:
import numpy as np
#define evaluation function(MAP) caculate the average score of all queries
def evaluate_query(query_func, df, k=5):
    aps = []
    for query in df['QUERY'].unique():
        relevant_docs = set(df.loc[(df['QUERY'] == query) & (df['RELEVANCE'] == 1), 'PMID'])
        retrieved_docs = query_func(query, k)
        p_at_k = []
        for i, doc_id in enumerate(retrieved_docs):
            if doc_id in relevant_docs:
                p_at_k.append((len(p_at_k) + 1) / (i + 1))
        if len(p_at_k) > 0:
            aps.append(np.mean(p_at_k))
    return np.mean(aps)

map_score = evaluate_query(query, df)
print(f"Mean Average Precision (MAP) score for the query function: {map_score:.4f}")

Mean Average Precision (MAP) score for the query function: 0.7952


In [11]:
#define evaluation function(NDCG) caculate the average score of all queries
def dcg(relevances, k):
    """Compute the Discounted Cumulative Gain (DCG) at k."""
    discounts = np.log2(np.arange(2, k + 2))
    return np.sum(relevances[:k] / discounts)

def ndcg(query_func, df, k=5):
    ndcg_scores = []
    for query in df['QUERY'].unique():
        relevant_docs = set(df.loc[(df['QUERY'] == query) & (df['RELEVANCE'] == 1), 'PMID'])
        retrieved_docs = query_func(query, k)
        relevances = [1 if doc_id in relevant_docs else 0 for doc_id in retrieved_docs]
        idcg = dcg(sorted(relevances, reverse=True), k)
        if idcg == 0:
            ndcg_scores.append(0)
        else:
            ndcg_scores.append(dcg(relevances, k) / idcg)
    return np.nanmean(ndcg_scores)

ndcg_score = ndcg(query, df)
print(f"Normalized Discounted Cumulative Gain (NDCG) score for the query function: {ndcg_score:.4f}")

Normalized Discounted Cumulative Gain (NDCG) score for the query function: 0.5481


In [12]:
#define evaluation function(NDCG) caculate the each score of all queries
def ndcg(query_func, df, k=5):
    ndcg_scores = {}
    for query in df['QUERY'].unique():
        relevant_docs = set(df.loc[(df['QUERY'] == query) & (df['RELEVANCE'] == 1), 'PMID'])
        retrieved_docs = query_func(query, k)
        relevances = [1 if doc_id in relevant_docs else 0 for doc_id in retrieved_docs]
        idcg = dcg(sorted(relevances, reverse=True), k)
        if idcg == 0:
            ndcg_scores[query] = 0
        else:
            ndcg_scores[query] = dcg(relevances, k) / idcg
    return ndcg_scores

In [13]:
#print the each score of all queries
ndcg_scores = ndcg(query, df)
for query, ndcg_score in ndcg_scores.items():
    print(f"NDCG score for query '{query}': {ndcg_score:.4f}")

NDCG score for query 'What serum [PROTEINS] change expression in association with high disease activity in lupus?': 0.9829
NDCG score for query 'What [MUTATIONS] in the Raf gene are associated with cancer?': 1.0000
NDCG score for query 'What [DRUGS] are associated with lysosomal abnormalities in the nervous system?': 0.0000
NDCG score for query 'What [CELL OR TISSUE TYPES] express receptor binding sites for vasoactive intestinal peptide (VIP) on their cell surface?': 1.0000
NDCG score for query 'What nervous system [CELL OR TISSUE TYPES] synthesize neurosteroids in the brain?': 1.0000
NDCG score for query 'What [SIGNS OR SYMPTOMS] of anxiety disorder are related to coronary artery disease?': 0.9197
NDCG score for query 'What [TOXICITIES] are associated with zoledronic acid?': 1.0000
NDCG score for query 'What [TOXICITIES] are associated with etidronate?': 0.0000
NDCG score for query 'What [BIOLOGICAL SUBSTANCES] have been used to measure toxicity in response to zoledronic acid?': 0.850

In [15]:
import tkinter as tk
from tkinter import scrolledtext
from functools import partial
import html

# Define the query function
def query(query, k=5):
    query = html.unescape(query)  # Decode HTML entities
    preprocessed_query = [stemmer.stem(token) for token in tokenizer.tokenize(query.lower()) if token not in stop_words]
    doc_scores = bm25.get_scores(preprocessed_query)

    # Get top-k similar documents
    index = []
    top_k_doc_indices = sorted(range(len(doc_scores)), key=lambda i: doc_scores[i], reverse=True)[:k]
    for i, doc_index in enumerate(top_k_doc_indices):
        index.append(int(doc_ids[doc_index]))
    return index

# Initialize Tkinter window（UI)
window = tk.Tk()
window.title("Group 10 - search engine")

# Define query function to handle user input
def handle_query(query_entry, result_text):
    # Call the query function with user's query and default k value of 5
    results = query(query_entry.get())
    
    # Display the top k results in the result_text widget
    result_text.delete('1.0', tk.END)
    for i, result in enumerate(results):
        result_text.insert(tk.END, f"{i+1}. Document {result}\n")
        
# Create widgets for the search engine UI
query_label = tk.Label(window, text="Enter your query:")
query_entry = tk.Entry(window, width=50)
result_text = scrolledtext.ScrolledText(window, width=80, height=20)
query_button = tk.Button(window, text="Search", command=partial(handle_query, query_entry, result_text))

# Pack the widgets into the window
query_label.pack()
query_entry.pack()
query_button.pack()
result_text.pack()

# Run the Tkinter event loop
window.mainloop()