# Task 4

In [7]:
import numpy as np
import pandas as pd

In [8]:
from task1 import *
from task2 import *
from task3 import *

## Prepare Data

In [9]:
vocab = get_vocab()
candidate_passages, pid_passage_dict = passage_to_id()
inverted_index = get_inverted_index(vocab, pid_passage_dict)

vocabulary loaded!
pid-to-passage mapped!


100%|██████████████████████████████████| 182469/182469 [03:26<00:00, 883.40it/s]


In [10]:
test_queries, qid_query_dict = query_to_id()

## Query likelihood Language Models

Use **test-queries.tsv** and **candidate-passages-top1000.tsv** for this task. 

Implement the query likelihood language model with (a) **Laplace smoothing**, (b) **Lidstone correction** with ϵ = 0.1, and (c) **Dirichlet smoothing** with µ = 50, and retrieve **100 passages** from within the 1000 candidate passages for
**each test query**. 

Store the respective outcomes in the files **laplace.csv, lidstone.csv, and dirichlet.csv**. In these files, the **column score** should report the **natural logarithm of the probability scores**.

In [11]:
def implement_language_model(test_queries, qid_query_dict, pid_passage_dict, inverted_index, model_params, top=100):
    
    # initialization
    df = pd.DataFrame()
    qids, pids, scores = np.array([]), np.array([]), np.array([])
    
    # vocabulary size of passages
    V = len(inverted_index)
    
    # total number of terms in passages
    N = np.sum([len(passage) for pid, passage in pid_passage_dict.items()])
    
    for qid, query in tqdm(test_queries.values):
        
        query = qid_query_dict[qid]
        qids_i, pids_i, scores_i = np.array([]), np.array([]), np.array([])
        
        for pid in candidate_passages[candidate_passages['qid']==qid]['pid']:
            
            passage = pid_passage_dict[pid]
            
            D = len(passage)    # passage length
            
            score = 0
            for term in query:
                # term frequency in passage and total occurance
                m = inverted_index[term][pid][0] if term in passage else 0
                # compute the score
                if model_params['model'] == 'laplace':
                    num = (m+1)
                    dom = (D+V)
                elif model_params['model'] == 'lidstone':
                    num = (m+model_params['epsilon'])
                    dom = (D+V*model_params['epsilon'])
                elif model_params['model'] == 'dirichlet':
                    # compute the background probability of the term
                    if term in inverted_index:
                        cqi = np.sum([v[0] for k,v in inverted_index[term].items()])
                    else:
                        cqi = 0
                    p_term = cqi / N
                    num = (m+model_params['mu']*p_term)
                    dom = (D+model_params['mu'])
                # check whether there exists a zero probability
                if num/dom == 0:
                    score = -float('inf')
                    break
                else:
                    score += np.log(num/dom)
                
            qids_i = np.append(qids_i, qid)
            pids_i = np.append(pids_i, pid)
            scores_i = np.append(scores_i, score)
            
        descending_idx = np.argsort(scores_i)[::-1]
        qids_i = qids_i[descending_idx]
        pids_i = pids_i[descending_idx]
        scores_i = scores_i[descending_idx]
        
        qids = np.append(qids, qids_i[:top])
        pids = np.append(pids, pids_i[:top])
        scores = np.append(scores, scores_i[:top])
        
    df['qid'] = qids
    df['pid'] = pids
    df['score'] = scores
    
    df['qid'] = df['qid'].astype(int)
    df['pid'] = df['pid'].astype(int)
        
    return df

In [12]:
model_params = {'model': 'laplace',
                'epsilon': 0.1,
                'mu': 50}

df_laplace = implement_language_model(test_queries, qid_query_dict, pid_passage_dict, inverted_index, model_params, top=100)
df_laplace.head()

100%|█████████████████████████████████████████| 200/200 [00:03<00:00, 65.11it/s]


Unnamed: 0,qid,pid,score
0,1108939,3647358,-28.923688
1,1108939,3899060,-29.260348
2,1108939,7919347,-29.300231
3,1108939,242056,-29.36462
4,1108939,4390191,-29.432824


In [13]:
len(df_laplace)

19290

In [14]:
df_laplace.to_csv('laplace.csv', header=False, sep=',', index=False)

In [15]:
model_params = {'model': 'lidstone',
                'epsilon': 0.1,
                'mu': 50}

df_lidstone = implement_language_model(test_queries, qid_query_dict, pid_passage_dict, inverted_index, model_params, top=100)
df_lidstone.head()

100%|█████████████████████████████████████████| 200/200 [00:03<00:00, 63.89it/s]


Unnamed: 0,qid,pid,score
0,1108939,6707713,-23.400292
1,1108939,2068541,-23.472331
2,1108939,8596285,-23.507025
3,1108939,3130232,-23.687222
4,1108939,6758075,-24.080049


In [16]:
len(df_lidstone)

19290

In [17]:
df_lidstone.to_csv('lidstone.csv', header=False, sep=',', index=False)

In [18]:
model_params = {'model': 'dirichlet',
                'epsilon': 0.1,
                'mu': 50}

df_dirichlet = implement_language_model(test_queries, qid_query_dict, pid_passage_dict, inverted_index, model_params, top=100)
df_dirichlet.head()

100%|█████████████████████████████████████████| 200/200 [03:21<00:00,  1.01s/it]


Unnamed: 0,qid,pid,score
0,1108939,3130232,-10.43321
1,1108939,8596285,-10.449774
2,1108939,2068541,-10.48902
3,1108939,6707713,-10.515604
4,1108939,2846462,-10.796922


In [19]:
len(df_dirichlet)

19290

In [20]:
df_dirichlet.to_csv('dirichlet.csv', header=False, sep=',', index=False)