In [1]:
import re

import pandas as pd
import numpy as np
import nltk
from rank_bm25 import *
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from gensim.parsing.preprocessing import STOPWORDS
from rank_eval import Qrels, Run, evaluate
from tqdm import tqdm
# Fetch the NLTK 'punkt' tokenizer data; word_tokenize (used by
# stopwprds_removal below) depends on it. The call is a no-op when the
# package is already present locally.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/shamim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
def stemming(lst):
    """Stem every whitespace-separated word of each string in ``lst``.

    Each element is split on whitespace, every piece is passed through an
    NLTK ``PorterStemmer``, and the resulting stems are re-joined with
    single spaces.

    Args:
        lst: Iterable of strings.

    Returns:
        A new list holding one stemmed string per input element, in input
        order.
    """
    stemmer = PorterStemmer()
    return [
        " ".join(map(stemmer.stem, text.split()))
        for text in lst
    ]

In [3]:
def stopwprds_removal(lst):
    """Remove stop words from each string in ``lst``.

    Every element is tokenized with NLTK's ``word_tokenize``; tokens that are
    members of gensim's ``STOPWORDS`` set are dropped and the remaining
    tokens are re-joined with single spaces.

    The membership test is exact and case-sensitive: tokens are not
    lower-cased before being compared against ``STOPWORDS``.

    Args:
        lst: Iterable of strings.

    Returns:
        A new list with one filtered string per input element, in input
        order.
    """
    cleaned = []
    # `text` rather than `str`: the loop variable must not shadow the builtin.
    for text in lst:
        tokens = word_tokenize(text)
        kept = [token for token in tokens if token not in STOPWORDS]
        cleaned.append(" ".join(kept))
    return cleaned

In [4]:
def spl_chars_removal(lst):
    """Replace every non-alphanumeric character with a space.

    Any character outside ``0-9``, ``a-z`` and ``A-Z`` (including existing
    whitespace, punctuation and non-ASCII letters) is replaced by exactly one
    space. Runs of such characters are not collapsed, so the output string
    always has the same length as the input string.

    Args:
        lst: Iterable of strings.

    Returns:
        A new list with one cleaned string per input element, in input order.
    """
    # Comprehension instead of a loop that rebinds the builtin name `str`
    # (and pre-assigned it a value that was never read).
    return [re.sub("[^0-9a-zA-Z]", " ", element) for element in lst]

In [5]:
# Directory prefix that is concatenated with the CSV file names when the
# train/test splits are loaded. It is an absolute, machine-specific location
# and must end with "/" because the file names are appended directly.
path = "/home/shamim/Documents/MS_Life/CS839/project/dataset/"

In [6]:
# Load the train and test splits from the dataset directory. Only test_df is
# used by the retrieval/evaluation cells further down; train_df is loaded and
# previewed but not otherwise referenced in this notebook.
train_df = pd.read_csv(path+"new_train.csv")
test_df = pd.read_csv(path+"new_test.csv")

In [7]:
# Preview the first rows of the training split.
train_df.head()

Unnamed: 0.1,Unnamed: 0,id,url,question,answer,long_answer,final_answer
0,0,0,http://www.freebase.com/view/en/justin_bieber,what is the name of justin bieber brother?,Jazmyn Bieber,"Justin Bieber has three half-siblings, includi...",Jazmyn Bieber Justin Bieber has three half-sib...
1,1,1,http://www.freebase.com/view/en/natalie_portman,what character did natalie portman play in sta...,Padmé Amidala,Padme Amidala,Padmé Amidala Padme Amidala
2,2,2,http://www.freebase.com/view/en/selena_gomez,what state does selena gomez?,New York City,"Selena Gomez is an actress, singer and songwri...","New York City Selena Gomez is an actress, sing..."
3,3,3,http://www.freebase.com/view/en/grand_bahama,what country is the grand bahama island in?,Bahamas,The Grand Bahama Island is a part of the Bahamas.,Bahamas The Grand Bahama Island is a part of t...
4,4,4,http://www.freebase.com/view/en/the_bahamas,what kind of money to take to bahamas?,Bahamian dollar,This question is incomplete. Please provide a ...,Bahamian dollar This question is incomplete. P...


In [8]:
# Preview the first rows of the test split (its "question" and "final_answer"
# columns drive the retrieval experiment below).
test_df.head()

Unnamed: 0.1,Unnamed: 0,id,url,question,answer,long_answer,final_answer
0,0,0,http://www.freebase.com/view/en/jamaica,what does jamaican people speak?,Jamaican Creole English Language,"Jamaica has its own dialect known as ""Jamiean""...",Jamaican Creole English Language Jamaica has i...
1,1,1,http://www.freebase.com/view/en/james_k_polk,what did james k polk do before he was president?,Lawyer,James K. Polk was a lawyer and politician who ...,Lawyer James K. Polk was a lawyer and politici...
2,2,2,http://www.freebase.com/view/en/oregon_ducks,what is the oregon ducks 2012 football schedule?,University of Oregon,The Oregon Ducks played at USC on September 7t...,University of Oregon The Oregon Ducks played a...
3,3,3,http://www.freebase.com/view/en/ken_barlow,who plays ken barlow in coronation street?,Tony Warren,William Roache has played the role of Ken Barl...,Tony Warren William Roache has played the role...
4,4,4,http://www.freebase.com/view/en/chiune_sugihara,what happened after mr. sugihara died?,Yaotsu,"After Harukichi Sugiura passed away in 1935, h...",Yaotsu After Harukichi Sugiura passed away in ...


In [27]:
# Build the retrieval corpus from the test split's "final_answer" column.
# Document i of the corpus corresponds to row i of test_df.
# Pipeline: strip non-alphanumerics -> drop stop words -> Porter-stem.
removed_special_char = spl_chars_removal(test_df["final_answer"])
stop_words_removed = stopwprds_removal(removed_special_char)
corpus = stemming(stop_words_removed)
# BM25 expects each document as a list of tokens. Splitting on a single space
# means an empty document becomes [""] rather than [].
tokenized_corpus = [doc.split(" ") for doc in corpus]

In [38]:
# Index the tokenized corpus with the Okapi variant of BM25.
# NOTE(review): the evaluation cells at the end of the notebook are labelled
# BM25L / BM25Plus / BM25OKAPI, so this constructor was presumably swapped by
# hand between runs; only the BM25Okapi variant is reproducible as written.
bm25 = BM25Okapi(tokenized_corpus)

In [39]:
def prepare_Qrels(size):
    """Build ground-truth relevance judgements for ``size`` queries.

    Query ``"i"`` is given exactly one relevant document, the one whose id is
    also ``"i"``, with a relevance score of 100.

    Args:
        size: Number of queries (and documents) to generate judgements for.

    Returns:
        A ``Qrels`` object populated through ``add_multi``.
    """
    ids = [str(idx) for idx in range(size)]
    qrels = Qrels()
    qrels.add_multi(
        q_ids=ids,
        doc_ids=[[doc_id] for doc_id in ids],
        scores=[[100] for _ in ids],
    )
    return qrels

In [40]:
# Ground truth for the test split: query i (row i's question) is relevant
# only to document i (row i's answer text).
qrels = prepare_Qrels(len(test_df))

In [41]:
# Retrieve the top_n documents for every test question and collect the
# results into a rank_eval Run. Query i is identified as str(i), and document
# ids are the positional indices of the corpus, matching the ids in `qrels`.
top_n = 5
run = Run()

predicted_q_ids = []
predicted_doc_ids = []
predicted_scores = []

for i, q in enumerate(test_df["question"]):
    # Apply the same preprocessing to the query as was applied to the corpus.
    # Query-specific names are used so the corpus-level intermediates
    # (removed_special_char, stop_words_removed) are not overwritten.
    query_no_special = spl_chars_removal([q])
    query_no_stopwords = stopwprds_removal(query_no_special)
    stemmed_query = stemming(query_no_stopwords)
    tokenized_query = stemmed_query[0].split(" ")

    # Score every document once, then rank by descending score. This is the
    # same ordering bm25.get_top_n uses internally.
    doc_scores = bm25.get_scores(tokenized_query)
    ranked_idx = np.argsort(doc_scores)[::-1][:top_n]

    # Take document ids straight from the ranking instead of looking the
    # returned texts up again in `corpus`: documents whose preprocessed text
    # is identical would otherwise all resolve to the first occurrence,
    # yielding wrong or duplicated ids (and an O(n) scan per hit).
    predicted_q_ids.append(str(i))
    predicted_doc_ids.append([str(idx) for idx in ranked_idx.tolist()])
    predicted_scores.append(doc_scores[ranked_idx].tolist())

run.add_multi(q_ids=predicted_q_ids, doc_ids=predicted_doc_ids, scores=predicted_scores)

In [32]:
# BM25L results.
# NOTE(review): the output recorded below presumably comes from an earlier
# execution in which `bm25` was built with BM25L. As the notebook stands,
# `bm25` is a BM25Okapi instance, so running this cell top-to-bottom evaluates
# the BM25Okapi run instead. Confirm before quoting these numbers.
evaluate(qrels, run, ["map@5", "mrr", "ndcg@5", "precision", "recall"])

{'map@5': 0.3889763779527559,
 'mrr': 0.3889763779527559,
 'ndcg@5': 0.4330061444011331,
 'precision': 0.11318897637795274,
 'recall': 0.5654527559055118}

In [37]:
# BM25Plus results.
# NOTE(review): the output recorded below presumably comes from an earlier
# execution in which `bm25` was built with BM25Plus. As the notebook stands,
# `bm25` is a BM25Okapi instance, so running this cell top-to-bottom evaluates
# the BM25Okapi run instead. Confirm before quoting these numbers.
evaluate(qrels, run, ["map@5", "mrr", "ndcg@5", "precision", "recall"])

{'map@5': 0.5513533464566929,
 'mrr': 0.5513533464566929,
 'ndcg@5': 0.5823280250817667,
 'precision': 0.13551509186351707,
 'recall': 0.6747047244094488}

In [42]:
# BM25Okapi results: evaluates `run` against `qrels` on the listed metrics.
# This is the only variant that matches the `bm25` object as constructed in
# this notebook.
evaluate(qrels, run, ["map@5", "mrr", "ndcg@5", "precision", "recall"])

{'map@5': 0.5505249343832022,
 'mrr': 0.5505249343832022,
 'ndcg@5': 0.5813440395989641,
 'precision': 0.13521981627296586,
 'recall': 0.6732283464566929}

In [48]:
# run.mean_scores

In [24]:
# dict(run.scores)