# Assignment 3, Model 1: MLM

In this notebook you will implement MLM re-ranking of the first-pass ranking retrieved from your index. 

Your implementation of the mixture of language models (MLM) approach should work with two fields, `title` and `content`, with weights 0.2 and 0.8, respectively. 

Content should be the "catch-all" field. Use Dirichlet smoothing with the smoothing parameter set to 2000.

Be sure to use markdown cells with section headings and explanations, and to write readable code, so that your intention is clear at each step of the way through the notebook. 

In [350]:
import pandas as pd
from elasticsearch import Elasticsearch
from IPython.display import clear_output, display, HTML
import pandas as pd
import urllib,os,math
from nltk.stem import PorterStemmer
from pprint import pprint
import itertools
from collections import OrderedDict
# Elasticsearch client created without arguments, i.e. with the library's default connection settings
es = Elasticsearch()
# NOTE(review): presumably a connectivity check -- the return value is not used or displayed
es.info()
# Porter stemmer applied to the query text before retrieval and scoring
stemmer = PorterStemmer()
# Name of the Elasticsearch index used for first-pass retrieval and term vectors
indexname = "dbpedia_text"

In [274]:
# Input files, given relative to the notebook's working directory
QUERIES_FILE = "data/queries.txt"  # first query set
QRELS_FILE = "data/qrels.csv"  # presumably the relevance judgments for the first query set -- TODO confirm
QUERIES2_FILE = "data/queries2.txt"  # second query set

In [250]:
def load_queries(query_file):
    """Load queries from a whitespace-separated text file.

    Each non-blank line holds a query ID followed by the query text. Runs of
    whitespace inside the query text are collapsed to single spaces.

    Args:
        query_file: path of the file to read.

    Returns:
        dict mapping query ID (str) to query text (str). A line that holds
        only an ID maps to the empty string. If an ID occurs more than once,
        the last occurrence wins.
    """
    queries = {}
    with open(query_file, "r") as fin:
        for line in fin:
            tokens = line.split()
            if not tokens:
                # Blank (or whitespace-only) line: nothing to parse
                continue
            qid, query_terms = tokens[0], tokens[1:]
            queries[qid] = " ".join(query_terms)
    return queries

In [251]:
# First query set: dict mapping query ID -> query text
queries = load_queries(QUERIES_FILE)

In [275]:
# Second query set: dict mapping query ID -> query text
queries2 = load_queries(QUERIES2_FILE)

In [253]:
def load_qrels(qrels_file):
    """Load a CSV file into a DataFrame.

    Args:
        qrels_file: path of the CSV file to read.

    Returns:
        pandas.DataFrame with the contents of `qrels_file`, parsed with the
        default `pd.read_csv` settings.
    """
    # Read the path that was passed in rather than a hard-coded,
    # Windows-style 'data\qrels.csv'.
    return pd.read_csv(qrels_file)

In [254]:
# Load the qrels CSV into a pandas DataFrame
qrels=load_qrels(QRELS_FILE)

In [255]:
# MLM configuration
# Document fields whose term vectors are requested and mixed by score_mlm
FIELDS = ["names", "catch_all"]
# Mixture weight of each field, in the same order as FIELDS
FIELD_WEIGHTS = [0.2, 0.8]
# NOTE(review): only referenced by the commented-out Jelinek-Mercer line in score_mlm
LAMBDA = 0.8
# Dirichlet smoothing parameter (mu)
miu=2000

In [422]:
def score_mlm(es,  qterms, doc_id):
    """Score a document for a query with a mixture of language models (MLM).

    Every field in FIELDS gets a Dirichlet-smoothed language model (smoothing
    parameter `miu`). The field models are combined linearly with
    FIELD_WEIGHTS, and the per-term mixture probabilities are accumulated in
    log space.

    Args:
        es: Elasticsearch client used to fetch the document's term vectors.
        qterms: iterable of query terms; a repeated term is scored once per
            occurrence.
        doc_id: ID of the document in the index `indexname`.

    Returns:
        log P(q|d) as a float. 0.0 when `qterms` is empty; -inf when some
        query term has zero mixture probability.
    """
    # Both the document term frequencies and the collection statistics
    # (`ttf`, `sum_ttf`) are read from the term vectors, so
    # `term_statistics=True` is required here.
    tv = es.termvectors(index=indexname, id=doc_id, fields=FIELDS,
                        term_statistics=True).get("term_vectors", {})

    # Per-field statistics, gathered once instead of once per query term.
    # A field missing from `tv` (empty in this document) contributes zero
    # probability mass, so it is simply left out.
    field_models = []
    for field, weight in zip(FIELDS, FIELD_WEIGHTS):
        if field not in tv:
            continue
        terms = tv[field]["terms"]
        len_d_i = sum(stats["term_freq"] for stats in terms.values())  # |d_i|
        sum_ttf = tv[field]["field_statistics"]["sum_ttf"]  # |C_i|
        field_models.append((weight, terms, len_d_i, sum_ttf))

    score = 0.0  # log P(q|d)
    for t in qterms:
        Pt_theta_d = 0.0  # P(t|\theta_d)
        for weight, terms, len_d_i, sum_ttf in field_models:
            stats = terms.get(t, {})
            Ft_di = stats.get("term_freq", 0)  # f(t, d_i)
            # P(t|C_i); guard against a non-positive collection length
            Pt_Ci = stats.get("ttf", 0) / sum_ttf if sum_ttf > 0 else 0.0
            # P(t|\theta_{d_i}) with Dirichlet smoothing
            Pt_theta_di = (Ft_di + miu * Pt_Ci) / (len_d_i + miu)
            Pt_theta_d += weight * Pt_theta_di
        if Pt_theta_d <= 0:
            # NOTE(review): the term vector only lists terms that occur in
            # this document, so no collection statistics are available for an
            # absent term and its smoothed probability comes out as zero.
            # The log-likelihood is then -inf; fetching collection statistics
            # separately would be needed for proper smoothing.
            return float("-inf")
        # Sum of logs (product of probabilities), not a sum of probabilities
        score += math.log(Pt_theta_d)

    return score
# score_mlm(es,['szechwan','dish','food','cuisine'],'<dbpedia:Indian_Chinese_cuisine>')

In [329]:
# The first two queries of the first query set
sliced = {qid: text for qid, text in itertools.islice(queries.items(), 2)}

In [375]:
# MLM scores for the first query set.
# For each query: retrieve a first-pass ranking of up to 100 documents from
# the index, then score every retrieved document with score_mlm.
scores = {}
for qid, query in sorted(queries.items()):
    # PorterStemmer.stem() works on a single word, so stem term by term
    # instead of passing the whole query string at once.
    qterms = [stemmer.stem(term) for term in query.lower().split()]
    stemmed_query = " ".join(qterms)
    # First-pass retrieval; `index` is passed by keyword because the position
    # of this argument differs between elasticsearch-py versions.
    res = es.search(index=indexname, q=stemmed_query, size=100)['hits']['hits']
    dscores = {}
    for doc in res:
        doc_id = doc['_id']
        doc_score = score_mlm(es, qterms, doc_id)
        # Documents with a score of exactly zero are left out of the ranking
        if doc_score != 0:
            dscores[doc_id] = doc_score
    scores[qid] = dscores

In [436]:
# For every query, order its (document ID, score) pairs by descending score
# and keep at most the 100 best.
final_dic = {
    qid: sorted(doc_scores.items(), key=lambda pair: pair[1], reverse=True)[:100]
    for qid, doc_scores in scores.items()
}

In [437]:
# Flatten the rankings into [QueryId, EntityId] rows, keeping the ranked order.
# Each row takes the entity at its own rank position, not the top-ranked
# entity repeated for every row.
df_list = [
    [qid, entity_id]
    for qid, ranking in final_dic.items()
    for entity_id, _score in ranking
]

In [438]:
# df_list

In [439]:
# Two-column table of (QueryId, EntityId) rows built from df_list
df=pd.DataFrame(df_list,columns=['QueryId','EntityId'])

In [440]:
# Write the ranking for the first query set to CSV, without the DataFrame index column
df.to_csv('data/ranking_model1.csv',index=False)

In [423]:
# MLM scores for the second query set.
# For each query: retrieve a first-pass ranking of up to 100 documents from
# the index, then score every retrieved document with score_mlm.
scores2 = {}
for qid, query in sorted(queries2.items()):
    # PorterStemmer.stem() works on a single word, so stem term by term
    # instead of passing the whole query string at once.
    qterms = [stemmer.stem(term) for term in query.lower().split()]
    stemmed_query = " ".join(qterms)
    # First-pass retrieval; `index` is passed by keyword because the position
    # of this argument differs between elasticsearch-py versions.
    res = es.search(index=indexname, q=stemmed_query, size=100)['hits']['hits']
    dscores = {}
    for doc in res:
        doc_id = doc['_id']
        doc_score = score_mlm(es, qterms, doc_id)
        # Documents with a score of exactly zero are left out of the ranking
        if doc_score != 0:
            dscores[doc_id] = doc_score
    scores2[qid] = dscores

In [425]:
# scores2

In [441]:
# For every query, order its (document ID, score) pairs by descending score
# and keep at most the 100 best.
final_dic2 = {
    qid: sorted(doc_scores.items(), key=lambda pair: pair[1], reverse=True)[:100]
    for qid, doc_scores in scores2.items()
}

In [442]:
# final_dic2

In [443]:
# Flatten the rankings into [QueryId, EntityId] rows, keeping the ranked order.
# Each row takes the entity at its own rank position, not the top-ranked
# entity repeated for every row.
df_list2 = [
    [qid, entity_id]
    for qid, ranking in final_dic2.items()
    for entity_id, _score in ranking
]

In [444]:
# df_list2

In [446]:
# Ranking for the second query set as a (QueryId, EntityId) table.
# The DataFrame itself is kept in `mlm_prediction`; `to_csv` returns None
# when given a path, so its result must not be assigned.
mlm_prediction = pd.DataFrame(df_list2, columns=['QueryId','EntityId'])
# File name required by the assignment for the second query set of Model 1
mlm_prediction.to_csv('data/ranking2_model1.csv', index=False)

The resulting rankings for the two query sets should be saved and pushed to GitHub as `data/ranking_model1.csv` and `data/ranking2_model1.csv`.