# Assignment 2A, Part 3: Multifield retrieval

Implement BM25F and the Mixture of Language Models (MLM). Use two fields: title and content.

In [1]:
QUERY_FILE = "data/queries.txt"  # make sure the query file exists on this location
OUTPUT_FILE = "data/output.txt"  # output the ranking

## Load index

In [2]:
# TODO: place the indexing related code here. This may be copy-pasted from Part 1.
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import nltk, unicodedata, re, csv, hashedindex, string, os
import re
import gzip
from bs4 import BeautifulSoup
import numpy as np
import glob
import pandas as pd
import math
from IPython.display import clear_output 
import pickle

In [3]:
def compute_stats(inverse_idx):
    TOTAL_DOC_LENGTH = 0
    for doc in inverse_idx.documents():
        TOTAL_DOC_LENGTH += inverse_idx.get_document_length(doc)
    
    AVERAGE_DOC_LENGTH =  TOTAL_DOC_LENGTH / len(inverse_idx.documents())
    NUM_DOCS = len(inverse_idx.documents())
    COLLECTION_DOC_LENGTH = 0 #total length of documents in the collection
    for docID in inverse_idx.documents():
            COLLECTION_DOC_LENGTH += inverse_idx.get_document_length(docID)

    return {
        "AVERAGE_DOC_LENGTH" : AVERAGE_DOC_LENGTH,
        "NUM_DOCS" : NUM_DOCS,
        "COLLECTION_DOC_LENGTH" : COLLECTION_DOC_LENGTH,
        "TOTAL_DOC_LENGTH" : TOTAL_DOC_LENGTH
    }

In [4]:
#inv_idx = pickle.load(open("data/index/indexer.p", "rb" )) #file inside index is a test pickle
inv_idx = {}
inv_idx_stats = {}
inv_idx['content'] = pickle.load(open("data/backup/indexer.p", "rb" ))
inv_idx_stats["content"] = compute_stats(inv_idx['content'])

In [5]:
inv_idx['title'] = pickle.load(open("data/backup/TitleIndexer.p", "rb" ))
inv_idx_stats["title"] = compute_stats(inv_idx['title'])

### Load the queries from the file

See the assignment description for the format of the query file [here](https://github.com/kbalog/uis-dat640-fall2019/tree/master/assignments/assignment-2a#queries).

In [6]:
def load_queries(query_file):
    queries = {}
    with open(query_file, "r") as fin:
        for line in fin.readlines():
            qid, query = line.strip().split(" ", 1)
            queries[qid] = query
    return queries

In [7]:
queries = load_queries(QUERY_FILE)

## Retrieval models

In [8]:
def bm25_idf(doc_containing_query_term):
    val = 1 + ((NUM_DOCS - doc_containing_query_term + 0.5)/(doc_containing_query_term + 0.5))
    return math.log(val)

def tokenize_query_text(query):
    clean_tokens = []
    query = unicodedata.normalize('NFKD', query).encode('ascii', 'ignore').decode('utf-8', 'ignore') # remove non ascii
    query = query.lower() #convert to lowercase
    query = query.translate(str.maketrans('', '', string.punctuation)) #remove punctuations
    for token in nltk.word_tokenize(query):
        # skip stop words
        if token in set(stopwords.words('english')):
            continue

        token = PorterStemmer().stem(token) # stem    
        clean_tokens.append(token)

    return clean_tokens

In [9]:
def create_collection_lm(inverse_idx,collection_doc_length):
    CLM = {}
    for term in inverse_idx.terms():
        total_term_frequency = inverse_idx.get_total_term_frequency(term)
        CLM[term] = total_term_frequency / collection_doc_length
    
    return CLM

In [10]:
CLM={}
CLM['content'] = create_collection_lm(inv_idx['content'],inv_idx_stats['content']['COLLECTION_DOC_LENGTH'])
CLM['title'] = create_collection_lm(inv_idx['title'],inv_idx_stats['title']['COLLECTION_DOC_LENGTH'])

In [11]:
def bm25F( query, k1 = 1.2, b = {"title":.75,"content":.75},number_of_results=100, field_weight = {"title":.7,"content":.3},get_score = True): 
    doc_scores = {} #stores score for each document for the provided query
    term_score_summation = 0
    for query_term in tokenize_query_text(query):
        
        for i, f_w in field_weight.items():
            
            if query_term in inv_idx[i].items():
                query_term_frequency_in_doc = len(inv_idx[i][query_term])
                idf = math.log((inv_idx_stats[i]['NUM_DOCS'] - query_term_frequency_in_doc + 0.5) / (query_term_frequency_in_doc + 0.5))
                pseudo_term_frequency={}
                for (doc_id, f_td) in inv_idx[i][query_term].items(): 
                    doc_length = inv_idx[i].get_document_length(doc_id)
                    Bi = (1 - b[i] + b[i] * (doc_length/inv_idx_stats[i]["AVERAGE_DOC_LENGTH"]))
                    pseudo_term_frequency[doc_id] = pseudo_term_frequency.get(doc_id, 0) + (f_w*(f_td/Bi))
                    term_score = (pseudo_term_frequency[doc_id]/(k1 + pseudo_term_frequency[doc_id]))
                    term_score_w_idf=round(term_score *idf,3)
                    doc_scores[doc_id] = doc_scores.get(doc_id, 0) + term_score_w_idf

    
    sorted_list = sorted(doc_scores.items(), key=lambda score: score[1], reverse = True)[:number_of_results]
    if not get_score:
        relevant_articles = []
        for x in sorted_list:
            relevant_articles.append(x[0])

        return relevant_articles
    return sorted_list

In [12]:
def MLM_JMS(query, Lamda={"title":.75,"content":.75} ,number_of_results=100, field_weight = {"title":.7,"content":.3}):
    doc_scores={}
    for query_term in tokenize_query_text(query):
        
        for i, f_w in field_weight.items():
            
            if query_term in inv_idx[i].items():
                for (doc_id, f_td) in inv_idx[i][query_term].items(): 
                    doc_length = inv_idx[i].get_document_length(doc_id)
                    
                    term_score = f_w*((1-Lamda[i])*(f_td / doc_length) + (Lamda[i]*CLM[i][query_term]))
                    doc_scores[doc_id] = doc_scores.get(doc_id, 0) + term_score
    sorted_list = sorted(doc_scores.items(), key=lambda score: score[1], reverse = True)[:number_of_results]
    
    return doc_scores

In [21]:
def save_output(results,filename=OUTPUT_FILE):
    with open(filename, mode='w') as index_file:
        csv_writer = csv.writer(index_file, delimiter=',', quotechar='"')
        csv_writer.writerow(["QueryId","DocumentId"])
        for term in results:
            for article in results[term]:
                csv_writer.writerow([term,article]) #save as counter object

In [25]:
results_bm25F = {}
for q_id, query in queries.items():
    results_bm25F[q_id] = bm25F( query, k1 = 1.2, b = {"title":0.4,"content":0.1},number_of_results=100, field_weight = {"title":.1,"content":.9},get_score = False)

print("completed ranking")
save_output(results_bm25F,"data/bm25F_output.csv")

completed ranking


In [23]:
results_MLM = {}
for q_id, query in queries.items():
    results_MLM[q_id] = MLM_JMS(query, Lamda = {"title":0.1,"content":0.1},number_of_results=100, field_weight = {"title":.1,"content":.9})

print("completed ranking")
save_output(results_MLM,"data/output_mlm_jm.csv")

completed ranking


### Perform retrieval

**TODO** Generate a ranking for each query and output the results to `OUTPUT_FILE`

See the assignment description for the format of the output file [here](https://github.com/kbalog/uis-dat640-fall2019/tree/master/assignments/assignment-2a#output-file-format).

## Evaluation

Report on the evaluation results (using the [Evaluation notebook](1_Evaluation.ipynb)) here.

Describe the parameter settings used for the two methods and the method you used for exploring the parameter space.

Specifically, explain how did you decide on 
  - the models' parameters ($k1$ and $b$ for BM25,smoothing method and smoothing parameter for LM);
  - the field weights ($w_{title}$ and $w_{content}$).

*TODO*

Report only the best performing setting for each model in the table below. The corresponding result files should be pushed to your repository.


| **Method** | **Parameter settings** | **Output file** | **P@10** | **MAP** | **MRR** |
| -- | -- | -- | -- | -- | -- |
| BM25F | k1: *1.2*, b: *{"title":0.4,"content":.1}*, $w_{title}$: *.1*, $w_{content}$: *.9* | `data/output_bm25f.csv` | *0.228* | *0.079* | *0.327* |      
| MLM | Smoothing method: *jelinek merker*, smoothing param: *{"title":0.1,"content":0.1}*,$w_{title}$: *.1*, $w_{content}$: *.9* | `data/output_mlm_jm.csv` | *0.044* | *0.028* | *0.073* |
