<a href="https://colab.research.google.com/github/sonia73b/tech400asst/blob/main/W4ASST.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import re
from collections import defaultdict
import math
import numpy as np

In [None]:
# Preprocessing function
def preprocess(text):
    return re.findall(r'\b\w+\b', text.lower())

In [None]:
# Load documents
def load_documents(folder_path):
    docs = {}
    for filename in os.listdir(folder_path):
        if filename.endswith('.txt'):
            with open(os.path.join(folder_path, filename), 'r') as file:
                docs[filename] = preprocess(file.read())
    return docs

In [None]:
# Load queries
def load_queries(query_file_path):
    with open(query_file_path, 'r') as file:
        return [line.strip() for line in file.readlines()]

In [13]:
def compute_statistics(docs):
    term_freq = defaultdict(lambda: defaultdict(int))
    term_doc_freq = defaultdict(int)
    collection_freq = defaultdict(int) # Total times a term appears in the whole corpus
    doc_lengths = {} # Length of each document
    total_corpus_len = 0

    for doc_id, tokens in docs.items():
        doc_lengths[doc_id] = len(tokens)
        total_corpus_len += len(tokens)

        # Count terms
        unique_tokens = set(tokens)
        for token in tokens:
            term_freq[doc_id][token] += 1
            collection_freq[token] += 1

        for token in unique_tokens:
            term_doc_freq[token] += 1

    num_docs = len(docs)
    avg_doc_length = total_corpus_len / num_docs if num_docs > 0 else 0

    return term_freq, term_doc_freq, collection_freq, doc_lengths, avg_doc_length, total_corpus_len, num_docs

In [14]:
def compute_bm25_score(query, doc_id, docs, term_freq, term_doc_freq, doc_lengths, avg_doc_length, num_docs):
    k1 = 1.2
    b = 0.75
    score = 0.0
    doc_len = doc_lengths[doc_id]

    for term in query:
        if term not in docs[doc_id]:
            continue

        tf = term_freq[doc_id][term]
        df = term_doc_freq.get(term, 0)

        # Inverse Document Frequency (IDF)
        idf = math.log(1 + (num_docs - df + 0.5) / (df + 0.5))

        # Term Frequency saturation
        numerator = tf * (k1 + 1)
        denominator = tf + k1 * (1 - b + b * (doc_len / avg_doc_length))

        score += idf * (numerator / denominator)

    return score

In [15]:
def compute_lm_jm_score(query, doc_id, docs, term_freq, collection_freq, doc_lengths, total_corpus_len):
    lambda_param = 0.7  # Smoothing parameter
    score = 0.0
    doc_len = doc_lengths[doc_id]

    for term in query:
        # P(t|D): Probability of term in document
        tf = term_freq[doc_id].get(term, 0)
        p_t_d = tf / doc_len if doc_len > 0 else 0

        # P(t|C): Probability of term in collection
        cf = collection_freq.get(term, 0)
        p_t_c = cf / total_corpus_len if total_corpus_len > 0 else 0

        # JM Smoothing: Mix the two probabilities
        smoothed_prob = (lambda_param * p_t_d) + ((1 - lambda_param) * p_t_c)

        # Log probability to avoid underflow
        if smoothed_prob > 0:
            score += math.log(smoothed_prob)
        else:
            score += -20 # Penalty for zero probability

    return score

In [16]:
def retrieve_documents_and_result(trump_speeches_path, query_file_path, output_file_name):
    docs = load_documents(trump_speeches_path)
    queries = load_queries(query_file_path)

    # UNPACKING FIXED: Now accepting all 7 values returned by compute_statistics
    term_freq, term_doc_freq, collection_freq, doc_lengths, avg_doc_len, total_corpus_len, num_docs = compute_statistics(docs)

    with open(output_file_name, 'w') as results_file:
        for query in queries:
            query_terms = preprocess(query)
            if not query_terms:
                continue

            # Calculate BM25 Scores
            bm25_scores = []
            for doc_id in docs:
                score = compute_bm25_score(query_terms, doc_id, docs, term_freq, term_doc_freq, doc_lengths, avg_doc_len, num_docs)
                bm25_scores.append((doc_id, score))
            bm25_scores.sort(key=lambda x: x[1], reverse=True)

            # Calculate LM-JM Scores
            lm_scores = []
            for doc_id in docs:
                score = compute_lm_jm_score(query_terms, doc_id, docs, term_freq, collection_freq, doc_lengths, total_corpus_len)
                lm_scores.append((doc_id, score))
            lm_scores.sort(key=lambda x: x[1], reverse=True)

            # Write Combined Results
            results_file.write(f"Query: {query}\n")
            results_file.write("-" * 40 + "\n")

            results_file.write("Results (Okapi BM25):\n")
            for doc_id, score in bm25_scores:
                results_file.write(f"  Document: {doc_id}, Score: {score:.4f}\n")

            results_file.write("\nResults (LM with Jelinek-Mercer):\n")
            for doc_id, score in lm_scores:
                results_file.write(f"  Document: {doc_id}, Score: {score:.4f}\n")

            results_file.write("\n" + "="*40 + "\n\n")

    print(f"Results written to {output_file_name}")

In [12]:
# Main function
def main():
    trump_speeches_path = '/content/drive/MyDrive/TECH400TXTFILES'
    query_file_path = '/content/drive/MyDrive/TECH400TXTFILES/queries1.txt'
    output_file_name = 'resultsBM25.txt'
    retrieve_documents_and_result(trump_speeches_path, query_file_path, output_file_name)

if __name__ == "__main__":
    main()

Results written to resultsBM25.txt


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## **Sample of Retrieval Results**

| Query | Top Doc (BM25) | Score (BM25) | Top Doc (LM-JM) | Score (LM-JM) |
| :--- | :--- | :--- | :--- | :--- |
| "to" | speech_30.txt | 0.0192 | speech_13.txt | -3.1941 |
| "america strong" | speech_2.txt | 0.5943 | speech_13.txt | -11.6011 |
| "to bring us" | speech_36.txt | 0.4018 | speech_3.txt | -15.7044 |
| "white" | speech_49.txt | 0.8017 | speech_49.txt | -6.6489 |
| "future i" | speech_20.txt | 0.6607 | speech_11.txt | -9.9884 |
| "victory and" | speech_27.txt | 1.5360 | speech_27.txt | -9.7200 |