# CST 4802 Information Retrieval Final Project


Option #5: Build a Question-Answering System (QA)

In [None]:
%pip install wikipedia-api nltk gensim rank-bm25 sentence-transformers transformers evaluate



In [None]:
import wikipediaapi

def fetch_wikipedia_pages(topics, lang='en', user_agent=None):
    """
    Fetch content for multiple Wikipedia topics.
    :param topics: List of Wikipedia page titles (a single title string is also accepted)
    :param lang: Language code ('en' for English)
    :param user_agent: User-Agent string passed to the Wikipedia client;
                       when None, the notebook's original bot identifier is used
    :return: Dictionary of page title -> page content, containing only the pages that exist
    """
    if user_agent is None:
        user_agent = "MyWikipediaBot/1.0 (khansilma@gmail.com)"

    #A bare string would otherwise be iterated character by character
    if isinstance(topics, str):
        topics = [topics]

    wiki_wiki = wikipediaapi.Wikipedia(language=lang, user_agent=user_agent)
    pages = {}

    for topic in topics:
        page = wiki_wiki.page(topic)
        if page.exists():
            pages[topic] = page.text
            print(f"Fetched '{topic}' successfully!")
        else:
            #Missing pages are reported and left out of the result
            print(f"Page '{topic}' does not exist.")

    return pages

#Wikipedia page titles that make up the QA corpus
topics = [
    'Information retrieval',
    'Natural language processing',
    'Artificial intelligence',
]

#Raw page text keyed by title
wiki_corpus = fetch_wikipedia_pages(topics)

Fetched 'Information retrieval' successfully!
Fetched 'Natural language processing' successfully!
Fetched 'Artificial intelligence' successfully!


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
import re

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

def preprocess_text(text):
    """
    Split raw text into sentences and lightly clean each one.
    Punctuation and sentence structure are kept; only whitespace runs and
    non-ASCII characters are normalized.
    :param text: Raw text (None or an empty string yields an empty list)
    :return: List of cleaned, non-empty sentences
    """
    if not text:
        return []

    cleaned_sentences = []
    for sent in sent_tokenize(text):
        #Collapse whitespace first so non-ASCII whitespace becomes a plain space
        clean_sent = re.sub(r'\s+', ' ', sent)
        #Remove non-ASCII characters
        clean_sent = re.sub(r'[^\x00-\x7F]+', '', clean_sent)
        #Removing a character between two spaces leaves a double space; collapse again
        clean_sent = re.sub(r' {2,}', ' ', clean_sent).strip()
        #Skip sentences that had nothing left after cleaning
        if clean_sent:
            cleaned_sentences.append(clean_sent)

    return cleaned_sentences

for topic, content in wiki_corpus.items():
    print(f"Processing '{topic}'...")
    #wiki_corpus is overwritten in place, so on a re-run of this cell the values
    #are already sentence lists; only raw page strings are sent to preprocess_text
    if isinstance(content, str):
        wiki_corpus[topic] = preprocess_text(content)

Processing 'Information retrieval'...
Processing 'Natural language processing'...
Processing 'Artificial intelligence'...


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
from rank_bm25 import BM25Okapi

def build_bm25_index(corpus, k1=1.5, b=0.75):
    """
    Build a BM25 index using the preprocessed corpus.
    :param corpus: Dictionary {title -> list of sentences}
    :param k1: BM25 term-frequency saturation parameter passed to BM25Okapi
    :param b: BM25 length-normalization parameter passed to BM25Okapi
    :return: BM25 index and corresponding sentences (index i of the BM25
             scores refers to sentences[i])
    :raises ValueError: If the corpus contains no sentences
    """
    sentences = [sentence for content in corpus.values() for sentence in content]
    if not sentences:
        raise ValueError("Cannot build a BM25 index from an empty corpus.")

    #Lowercase and drop punctuation so tokens such as "Retrieval." can match
    #the lowercased query terms
    tokenized_corpus = [re.findall(r"\w+", sentence.lower()) for sentence in sentences]
    bm25 = BM25Okapi(tokenized_corpus, k1=k1, b=b)  #Experiment with these values to see how the ranking changes

    return bm25, sentences

#Build the index once over every preprocessed sentence; the query cells reuse both objects
bm25, all_sentences = build_bm25_index(wiki_corpus)

In [None]:
from sentence_transformers import SentenceTransformer, util
import torch

#Load a pre-trained Sentence Transformer model
#This module-level `model` is the encoder answer_query_hybrid uses for semantic re-ranking
model = SentenceTransformer('all-MiniLM-L6-v2')

def answer_query_hybrid(query, bm25, sentences, top_n=3, candidate_pool=10):
    """
    Retrieve top N answers to a query using BM25 and refine them with semantic similarity.
    :param query: User query (string)
    :param bm25: BM25 index
    :param sentences: List of sentences, in the same order used to build the index
    :param top_n: Number of answers to retrieve
    :param candidate_pool: Number of BM25 candidates passed to the semantic re-ranker;
                           raised to top_n when top_n is larger
    :return: List of up to top N relevant sentences, most similar first
    """
    #Nothing to rank: skip the encoder entirely
    if not sentences or top_n <= 0:
        return []

    #Lowercase and strip punctuation so "processing?" becomes the term "processing"
    tokenized_query = re.findall(r"\w+", query.lower())
    scores = bm25.get_scores(tokenized_query)

    #Stage 1: lexical shortlist. The pool is never smaller than top_n, so a
    #large top_n is not silently capped
    pool_size = max(candidate_pool, top_n)
    top_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:pool_size]
    candidate_sentences = [sentences[i] for i in top_indices]

    #Stage 2: re-rank the shortlist by cosine similarity to the full query text
    query_embedding = model.encode(query, convert_to_tensor=True)
    candidate_embeddings = model.encode(candidate_sentences, convert_to_tensor=True)
    similarities = util.pytorch_cos_sim(query_embedding, candidate_embeddings)[0]

    #Convert to plain ints before indexing the Python list
    ranked_indices = torch.argsort(similarities, descending=True)[:top_n].tolist()
    top_answers = [candidate_sentences[i] for i in ranked_indices]
    return top_answers

In [None]:
#Ask the hybrid retriever about natural language processing
query = "What is natural language processing?"
answers = answer_query_hybrid(query, bm25, all_sentences)

#Show the answers as a numbered list, best match first
print("Top Answers:")
for rank, answer in enumerate(answers, start=1):
    print(f"{rank}. {answer}")

Top Answers:
1. Major tasks in natural language processing are speech recognition, text classification, natural-language understanding, and natural-language generation.
2. Though natural language processing tasks are closely intertwined, they can be subdivided into categories for convenience.
3. Natural language understanding involves the identification of the intended semantic from the multiple possible semantics which can be derived from a natural language expression which usually takes the form of organized notations of natural language concepts.


In [None]:
#Ask the hybrid retriever about information retrieval
query = "What is information retrieval?"
answers = answer_query_hybrid(query, bm25, all_sentences)

#Show the answers as a numbered list, best match first
print("Top Answers:")
for rank, answer in enumerate(answers, start=1):
    print(f"{rank}. {answer}")

Top Answers:
1. Information retrieval (IR) in computing and information science is the task of identifying and retrieving information system resources that are relevant to an information need.
2. Automated information retrieval systems are used to reduce what has been called information overload.
3. Performance and correctness measures The evaluation of an information retrieval system' is the process of assessing how well a system meets the information needs of its users.


In [None]:
#Ask the hybrid retriever about artificial intelligence
query = "What is artificial intelligence?"
answers = answer_query_hybrid(query, bm25, all_sentences)

#Show the answers as a numbered list, best match first
print("Top Answers:")
for rank, answer in enumerate(answers, start=1):
    print(f"{rank}. {answer}")

Top Answers:
1. Dick considers the idea that our understanding of human subjectivity is altered by technology created with artificial intelligence.
2. Already in 1950, Alan Turing published an article titled "Computing Machinery and Intelligence" which proposed what is now called the Turing test as a criterion of intelligence, though at the time that was not articulated as a problem separate from artificial intelligence.
3. Several works use AI to force us to confront the fundamental question of what makes us human, showing us artificial beings that have the ability to feel, and thus to suffer.


In [None]:
#Evaluation fixtures: each entry pairs a query with its expected answer phrases.
#NOTE(review): `test_queries` is assigned again in the evaluation cell further down
#(with a different first expected phrase), so this definition is overwritten
#before evaluate_model is called.
test_queries = [
    {"query": "What is natural language processing?", "expected_answers": ["natural language understanding", "text classification"]},
    {"query": "What is information retrieval?", "expected_answers": ["identifying and retrieving information"]},
    {"query": "What is artificial intelligence?", "expected_answers": ["Turing test", "intelligence"]}
]

In [None]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

#Encoder used by evaluate_model to embed expected and predicted answers
#(same 'all-MiniLM-L6-v2' checkpoint name as the retrieval `model`)
evaluation_model = SentenceTransformer('all-MiniLM-L6-v2')

def evaluate_model(test_queries, bm25, all_sentences, top_n=3, threshold=0.7):
    """
    Evaluate the question-answering model.
    An expected answer and a predicted answer "match" when the cosine
    similarity of their embeddings is above the threshold.
    :param test_queries: List of dicts with "query" and "expected_answers" keys
    :param bm25: BM25 index
    :param all_sentences: List of sentences in the corpus
    :param top_n: Number of top answers to retrieve
    :param threshold: Cosine similarity a pair must exceed to count as a match
    :return: Average precision, recall, and F1-score as floats
             (all 0.0 when there are no test queries)
    """
    #np.mean of an empty list would produce NaN
    if not test_queries:
        return 0.0, 0.0, 0.0

    precisions = []
    recalls = []

    for test_query in test_queries:
        query = test_query["query"]
        expected_answers = test_query["expected_answers"]

        predicted_answers = answer_query_hybrid(query, bm25, all_sentences, top_n=top_n)

        #Nothing to compare: score the query as a miss without calling the encoder
        if not predicted_answers or not expected_answers:
            precisions.append(0.0)
            recalls.append(0.0)
            continue

        expected_embeddings = evaluation_model.encode(expected_answers, convert_to_tensor=True)
        predicted_embeddings = evaluation_model.encode(predicted_answers, convert_to_tensor=True)

        #Rows correspond to expected answers, columns to predicted answers
        similarity_matrix = util.pytorch_cos_sim(expected_embeddings, predicted_embeddings)
        matches = similarity_matrix > threshold

        #Count each prediction / expected answer at most once so both ratios
        #stay within [0, 1]; summing every matching pair could exceed 1
        matched_predictions = matches.any(dim=0).sum().item()
        matched_expected = matches.any(dim=1).sum().item()

        precisions.append(matched_predictions / len(predicted_answers))
        recalls.append(matched_expected / len(expected_answers))

    avg_precision = float(np.mean(precisions))
    avg_recall = float(np.mean(recalls))

    #F1 is the harmonic mean of the averaged precision and recall
    denominator = avg_precision + avg_recall
    f1_score = 2 * (avg_precision * avg_recall) / denominator if denominator > 0 else 0.0

    return avg_precision, avg_recall, f1_score

#Queries paired with the phrases an acceptable answer is compared against
test_queries = [
    {
        "query": "What is natural language processing?",
        "expected_answers": ["natural language", "text classification"],
    },
    {
        "query": "What is information retrieval?",
        "expected_answers": ["identifying and retrieving information"],
    },
    {
        "query": "What is artificial intelligence?",
        "expected_answers": ["Turing test", "intelligence"],
    },
]

#Score the hybrid retriever and report each metric to four decimals
precision, recall, f1_score = evaluate_model(test_queries, bm25, all_sentences)
for label, value in (("Precision", precision), ("Recall", recall), ("F1-Score", f1_score)):
    print(f"{label}: {value:.4f}")

Precision: 0.0000
Recall: 0.0000
F1-Score: 0.0000
