# Basic Q&A System

### Sentence Retrival

In [1]:
import math,numpy,json,re,nltk,csv,time,re,os.path,sys,gensim,ast
from gensim import corpora
from operator import add
from nltk.tokenize import word_tokenize
from nltk.tag import StanfordNERTagger
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict

#from sklearn import metrics
#from math import sqrt
#from collections import OrderedDict
#from nltk.stem.wordnet import WordNetLemmatizer as WNL
#from sklearn.feature_extraction import DictVectorizer
#from nltk import FreqDist, DictionaryProbDist


#### Load data from json file

In [2]:
t0 = time.time()
filename_ls = ['QA_dev.json']
dataset = []
train_path = os.path.abspath('data/QA_dev.json')

dataset = []
with open(train_path) as f:
    for line in f:
        dataset+=(json.loads(line))
print "Import Successful "
print "There are totally", len(dataset),'documents in this dataset'

Import Successful 
There are totally 40 documents in this dataset


#### Build modle and evaluate the result

In [3]:
stopwords = set(nltk.corpus.stopwords.words('english')) # wrap in a set() (see below)
stemmer = nltk.stem.PorterStemmer() 

def my_tokenizer(doc):
    terms = set()
    for token in nltk.word_tokenize(doc):
        if token not in stopwords: # 'in' and 'not in' operations are much faster over sets that lists
            terms.add(stemmer.stem(token.lower()))
    return list(terms)


class MostRelevantSentenceModel:
    def __init__(self, vectorizer, collection_matrix):
        self.vectorizer = vectorizer
        self.collection_matrix = collection_matrix
        feature_array = vectorizer.get_feature_names()
        self.features = dict()
        for index in range(len(feature_array)):
            term = feature_array[index]
            self.features[term] = index

    def predictTop10(self, queies):
        predictions = [self.inverted_index_score(i) for i in  queies]
        return predictions

    def inverted_index_score(self, query_sent):
        """
        now we implement inverted index to handle query
        
        :param query_sent: 
        :return: 
        
        """
        query_words = my_tokenizer(query_sent)
        score = defaultdict(float)

        for w in query_words:
            try:
                col_i = self.features[w]
                inverted_ix = self.collection_matrix[:, col_i]
                for di in range(inverted_ix.shape[0]):
                    score[di] += inverted_ix[di, 0]
            except KeyError:
                pass

        index_score = sorted(score.items(), key=lambda (k, v): v, reverse=True)

        if index_score:
            top10_doc_index = [i[0] for i in index_score[:10]]
            return top10_doc_index
        else:
            print 'error occured' ,query_sent
            return -1, 0
        
#build model for each document collaction
for document in dataset:
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=1, use_idf=True,norm='l1',stop_words=None, tokenizer=my_tokenizer)
    document_collections_sents = document['sentences']
    tfidf_matrix = tfidf_vectorizer.fit_transform(document_collections_sents)
    document['model'] = MostRelevantSentenceModel(vectorizer=tfidf_vectorizer,collection_matrix=tfidf_matrix)

#### Output with predtion and actual values

In [4]:
def build_model_and_evaluate(model, query ,report=False):
    evaluate_row = []
    pred = model.predictTop10(query)
    quest_index = 0
    for pred_index in pred:
        drow = dict()
        if report:
            print pred_index
        drow['question_ID'] = quest_index
        drow['prediction_ID'] = pred_index
        evaluate_row.append(drow)
        quest_index += 1
    return evaluate_row

csv_file = open('evaluatin_dev_results.csv', mode='w',)
fieldnames = ['document_ID', 'question_ID','question','prediction_ID','prediction_sentence']
writer = csv.DictWriter(csv_file, fieldnames=fieldnames, )
writer.writeheader()

ddi = 0
for document in dataset:
    questions = [i['question'] for i in document['qa']]
    model = document['model']
    result_row = build_model_and_evaluate(model, questions)
    doc_sents = document['sentences']
    for r in result_row:
        r['document_ID'] = ddi
        r['question'] = questions[r['question_ID']].encode('utf-8')
        r['prediction_sentence'] = doc_sents[r['prediction_ID'][0]].encode('utf-8')
        writer.writerow(r)
    ddi += 1

error occured Who was the runner up?
error occured Who was the runner up?
error occured Where did it open?
error occured What did the actof of milno do?
error occured What is an Etsudiantinas? 
error occured What is crosspicking?


#### Build BM25 model

In [5]:
class BM25:
    def __init__(self, fn_docs, delimiter='|') :
        self.dictionary = corpora.Dictionary()
        self.DF = {}
        self.delimiter = delimiter
        self.DocTF = []
        self.DocIDF = {}
        self.N = 0
        self.DocAvgLen = 0
        self.fn_docs = fn_docs
        self.DocLen = []
        self.buildDictionary()
        self.TFIDF_Generator()

    def buildDictionary(self) :
        raw_data = []
        for line in self.fn_docs:
            raw_data.append(line.strip().split(self.delimiter))
        self.dictionary.add_documents(raw_data)

    def TFIDF_Generator(self, base=math.e) :
        docTotalLen = 0
        for line in self.fn_docs:
            doc = line.strip().split(self.delimiter)
            docTotalLen += len(doc)
            self.DocLen.append(len(doc))
            bow = dict([(term, freq*1.0/len(doc)) for term, freq in self.dictionary.doc2bow(doc)])
            for term, tf in bow.items() :
                if term not in self.DF :
                    self.DF[term] = 0
                self.DF[term] += 1
            self.DocTF.append(bow)
            self.N = self.N + 1
        for term in self.DF:
            self.DocIDF[term] = math.log((self.N - self.DF[term] +0.5) / (self.DF[term] + 0.5), base)
        self.DocAvgLen = docTotalLen / self.N

    def BM25Score(self, Query=[], k1=1.5, b=0.75, k3 =0) :
        query_bow = self.dictionary.doc2bow(Query)
        query_freq =dict(query_bow)
        scores = []
        for idx, doc in enumerate(self.DocTF) :
            commonTerms = set(dict(query_bow).keys()) & set(doc.keys())
            tmp_score = []
            doc_terms_len = self.DocLen[idx]
            for term in commonTerms :
                upper = (doc[term] * (k1+1))
                below = ((doc[term]) + k1*(1 - b + b*doc_terms_len/self.DocAvgLen))
                frq_q_t = query_freq[term]
                tmp_score.append(self.DocIDF[term] * upper / below * (k3 +1)*frq_q_t/(k3+frq_q_t))
            scores.append(sum(tmp_score))
        sorted_scores =sorted(list(enumerate(scores)), key=lambda (k,v): v, reverse=True)
        return  sorted_scores

    def TFIDF(self) :
        tfidf = []
        for doc in self.DocTF :
            doc_tfidf  = [(term, tf*self.DocIDF[term]) for term, tf in doc.items()]
            doc_tfidf.sort()
            tfidf.append(doc_tfidf)
        return tfidf

    def Items(self) :
        # Return a list [(term_idx, term_desc),]
        items = self.dictionary.items()
        items.sort()
        return items
    
    def predictTop10(self,query,k1=1.5, b=0.75, k3 =0):
        query = query.split()
        socres = self.BM25Score(query,k1,b,k3)
        if socres:
            socres = [i[0] for i in socres[:10]]
            return socres
        else:
            print 'error occured' ,query
            return -1, 0
for document in dataset:
    document_collections_sents = document['sentences']
    document['model'] = BM25(document_collections_sents, delimiter=' ')

In [6]:
def model_evaluation_query(model, questions ,report=False):
    evaluate_row = []
    quest_index = 0
    for index,query in enumerate(questions):
        preditons = model.predictTop10(query)
        drow = dict()
        if report:
            print preditons
        drow['question_ID'] = quest_index
        drow['prediction_ID'] = preditons
        evaluate_row.append(drow)
        quest_index += 1
    return evaluate_row
csv_file = open('BM25_dev_results.csv', mode='w',)
fieldnames = ['document_ID', 'question_ID','question','prediction_ID','prediction_sentence']
writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
writer.writeheader()

ddi = 0
for document in dataset:
    questions = [i['question'] for i in document['qa']]
    model = document['model']
    result_row = model_evaluation_query(model,questions)
    doc_sents = document['sentences']
    for r in result_row:
        r['document_ID'] = ddi
        r['question'] = questions[r['question_ID']].encode('utf-8')
        r['prediction_sentence'] = doc_sents[r['prediction_ID'][0]].encode('utf-8')
        writer.writerow(r)
    ddi += 1
    
print 'Time counts for running models :', time.time() - t0

Time counts for running models : 203.21600008


#### Import Prdictions from CSV file

In [7]:
csv_file = 'evaluatin_dev_results.csv'
TF_IDF_preds = []
with open(csv_file) as csvfile:
    readCSV = csv.DictReader(csvfile, delimiter=',')
    for row in readCSV:
        tempDict = {}
        tempDict['DocID'] = int(row['document_ID'])
        tempDict['Predictions'] = ast.literal_eval(row['prediction_ID'])
        tempDict['QuestionIndex'] = int(row['question_ID'])
        tempDict['Question'] = row['question']
        TF_IDF_preds.append(tempDict)

In [8]:
csv_file = 'BM25_dev_results.csv'
sentence_BM25_predictons = []
BM25_preds = []
with open(csv_file) as csvfile:
    readCSV = csv.DictReader(csvfile, delimiter=',')
    for row in readCSV:
        tempDict = {}
        tempDict['DocID'] = int(row['document_ID'])
        tempDict['Predictions'] = ast.literal_eval(row['prediction_ID'])
        tempDict['QuestionIndex'] = int(row['question_ID'])
        tempDict['Question'] = row['question']
        BM25_preds.append(tempDict)

### comparsion of results

In [10]:
count = 0
bingo = 0
for d in TF_IDF_preds:
    guess = d['Predictions'] 
    doc_i = d['DocID']
    qus_i = d['QuestionIndex']
    act_i = dataset[doc_i]['qa'][qus_i]['answer_sentence']
    if act_i in guess:
        bingo += 1
    count += 1
print float(bingo)/float(count)

0.816613494033


In [11]:
count = 0
bingo = 0
for d in BM25_preds:
    guess = d['Predictions'] 
    doc_i = d['DocID']
    qus_i = d['QuestionIndex']
    act_i = dataset[doc_i]['qa'][qus_i]['answer_sentence']
    if act_i in guess:
        bingo += 1
    count += 1
print float(bingo)/float(count)

0.600520340587


### Entity Extraction

In [None]:
def json_load_byteified(file_handle):
    return _byteify(json.load(file_handle, object_hook=_byteify),ignore_dicts=True)

def _byteify(data, ignore_dicts = False):
    # if this is a unicode string, return its string representation
    if isinstance(data, unicode):
        return data.encode('utf-8')
    # if this is a list of values, return list of byteified values
    if isinstance(data, list):
        return [ _byteify(item, ignore_dicts=True) for item in data ]
    # if this is a dictionary, return dictionary of byteified keys and values
    # but only if we haven't already byteified it
    if isinstance(data, dict) and not ignore_dicts:
        return {
            _byteify(key, ignore_dicts=True): _byteify(value, ignore_dicts=True)
            for key, value in data.iteritems()
        }
    # if it's anything else, return it in its original form
    return data

with open("data/QA_test.json") as json_file:
    json_data = json_load_byteified(json_file)
print 'import success'

In [None]:
cwd = os.getcwd()
st = StanfordNERTagger(cwd+'\data\english.all.3class.distsim.crf.ser.gz',cwd+'\data\stanford-ner.jar')

if not os.path.isfile("NERtest.json"):    
    start = time.time()
    progressT = len(json_data)    
    listOfDocument=[]
    i=0
    for jd in json_data:
        aList=[]        
        aList.append(st.tag_sents([word_tokenize(re.sub(',', '',re.sub('[^a-zA-Z0-9-_*., ]', ' ',x['question']))) for x in jd['qa']]))
        #remove the below file if running on test set
        #aList.extend([st.tag_sents([word_tokenize(re.sub(',', '',re.sub('[^a-zA-Z0-9-_*., ]', ' ',x['answer']))) for x in jd['qa']])])
        aList.append(st.tag_sents([word_tokenize(re.sub(',', '',re.sub('[^a-zA-Z0-9-_*., ]', ' ',x))) for x in jd['sentences']]))
        listOfDocument.append(aList)
        i+=1
        sys.stdout.write('\r')
        sys.stdout.write("%d%%" % (i*100/progressT))
        sys.stdout.flush()    
    for document in range(0,len(listOfDocument)):
        #change [2] to [1] if test set
        for sentence in range(0,len(listOfDocument[document][1])):
            for word in range(0,len(listOfDocument[document][1][sentence])):   
                listOfDocument[document][1][sentence][word]= (listOfDocument[document][1][sentence][word][0],listOfDocument[document][1][sentence][word][1] if not listOfDocument[document][1][sentence][word][0].isdigit() else u'NUMBER')
    with open('NERtest.json', 'w') as outfile:
        json.dump(listOfDocument, outfile)
    end = time.time()
    print '\nTime spending:',end - start    
else:    
    print 'there is a file'
with open("NERtest.json") as json_file:
        json_dataNER = json_load_byteified(json_file)