# Basic Q&A System

### Sentence Retrieval

In [79]:
import json
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from collections import defaultdict
from numpy import multiply
from math import sqrt
from nltk.tokenize import word_tokenize
# from nltk.corpus import Wo
import os.path as path
from collections import OrderedDict
import re
import nltk
from nltk.stem.wordnet import WordNetLemmatizer as WNL
import time
from sklearn.feature_extraction import DictVectorizer
from nltk import FreqDist, DictionaryProbDist
from operator import add
import math
import numpy

In [80]:
# Start of the wall-clock measurement for this run.
t0 = time.time()
filename_ls = ['QA_train.json']
train_path = path.abspath('data/QA_train.json')

# Every line of the training file is parsed as JSON on its own, and the
# items of each parsed value are appended to one flat list.
dataset = []
with open(train_path) as train_file:
    for raw_line in train_file:
        dataset.extend(json.loads(raw_line))
print("There are totally %d documents" % len(dataset))

There are totally 360 documents


In [81]:
# NLTK's English stopword list is lower-case; keeping it in a set makes the
# membership test in my_tokenizer a constant-time lookup.
stopwords = set(nltk.corpus.stopwords.words('english'))
stemmer = nltk.stem.PorterStemmer()


def my_tokenizer(doc):
    """Return the distinct stemmed, lower-cased, non-stopword terms of *doc*.

    :param doc: text to tokenize (a sentence or a question).
    :return: list of unique terms; the order is arbitrary because the terms
        are collected in a set.
    """
    terms = set()
    for token in nltk.word_tokenize(doc):
        # Lower-case BEFORE the stopword test: the stopword list is
        # lower-case, so testing the raw token let capitalised stopwords
        # such as "The" or "What" through.
        lowered = token.lower()
        if lowered not in stopwords:
            terms.add(stemmer.stem(lowered))
    return list(terms)


class MostRelevantSentenceModel(object):
    """Retrieve the row of a term-weight matrix that best matches a query.

    ``collection_matrix`` is indexed as ``[row, column]``: columns are the
    vectorizer's vocabulary terms and each row is one candidate (the caller
    in this file passes one row per sentence). A row's score for a query is
    the sum of its weights over the query terms found in the vocabulary.
    """

    def __init__(self, vectorizer, collection_matrix):
        """Store the fitted vectorizer and matrix and index the vocabulary.

        :param vectorizer: fitted vectorizer exposing ``get_feature_names_out()``
            or the older ``get_feature_names()``.
        :param collection_matrix: 2-D matrix supporting ``m[:, j]`` column
            slicing, whose slices support ``.shape`` and ``[i, 0]`` access.
        """
        self.vectorizer = vectorizer
        self.collection_matrix = collection_matrix
        # Recent scikit-learn releases dropped get_feature_names() in favour
        # of get_feature_names_out(); accept whichever one is available.
        names_getter = getattr(vectorizer, 'get_feature_names_out', None)
        if names_getter is None:
            names_getter = vectorizer.get_feature_names
        # term -> column index in collection_matrix
        self.features = {
            term: index for index, term in enumerate(names_getter())
        }

    def predict(self, queies):
        """Return the best-matching row index for every query.

        :param queies: iterable of query strings.
        :return: list of row indices, ``-1`` for a query with no known term.
        """
        return [self.inverted_index_score(query)[0] for query in queies]

    def inverted_index_score(self, query_sent):
        """Score every row against the query and return the best one.

        :param query_sent: query text; it is split with ``my_tokenizer``.
        :return: ``(row_index, score)`` of the highest-scoring row, or
            ``(-1, 0)`` when none of the query terms is in the vocabulary.
        """
        query_words = my_tokenizer(query_sent)
        score = defaultdict(float)

        for w in query_words:
            col_i = self.features.get(w)
            if col_i is None:
                # Term never seen while fitting: it cannot contribute.
                continue
            column = self.collection_matrix[:, col_i]
            for di in range(column.shape[0]):
                score[di] += column[di, 0]

        if not score:
            return -1, 0
        # max() returns the first maximal item, which is the same element a
        # stable descending sort would put first.
        return max(score.items(), key=lambda item: item[1])

In [82]:
def build_model_and_evaluate(model, question, document, report=False):
    """Run ``model`` on the questions and compare against the expected answers.

    :param model: object whose ``predict(question)`` returns one prediction
        per question.
    :param question: sequence of questions handed to ``model.predict``.
    :param document: sequence of expected values, aligned with ``question``.
    :param report: when true, print each prediction next to its expected value.
    :return: ``(rows, accuracy)``. ``rows`` holds one dict per compared pair
        with the keys ``question_i``, ``prediction_i``, ``actual_yi`` and
        ``correctness`` (1 for a match, 0 otherwise). ``accuracy`` is the
        number of matches divided by ``len(document)``, or ``0.0`` when
        ``document`` is empty.
    """
    evaluate_row = []
    correct = 0
    predictions = model.predict(question)
    for quest_index, (pred_index, actual_index) in enumerate(
            zip(predictions, document)):
        if report:
            print('%s \t%s' % (pred_index, actual_index))
        is_correct = int(pred_index == actual_index)
        correct += is_correct
        evaluate_row.append({
            'question_i': quest_index,
            'prediction_i': pred_index,
            'actual_yi': actual_index,
            # Always present, so consumers never see a missing field.
            'correctness': is_correct,
        })
    total = len(document)
    # float() forces true division: on Python 2, int / int truncated the
    # accuracy to 0 unless every prediction was correct.
    accuracy = float(correct) / total if total else 0.0
    return evaluate_row, accuracy


# Build one retrieval model per document collection: a TF-IDF vectorizer is
# fitted on that document's sentences only and stored with the resulting
# sentence-by-term matrix under document['model'].

for document in dataset:
    document_collections_sents = document['sentences']
    tfidf_vectorizer = TfidfVectorizer(
        tokenizer=my_tokenizer,
        stop_words=None,
        use_idf=True,
        min_df=1,
        max_df=0.95,
    )
    tfidf_matrix = tfidf_vectorizer.fit_transform(document_collections_sents)
    document['model'] = MostRelevantSentenceModel(
        vectorizer=tfidf_vectorizer,
        collection_matrix=tfidf_matrix,
    )

#### Output with prediction and actual values

In [None]:
import csv

fieldnames = ['document_i', 'question_i', 'prediction_i',
              'actual_yi', 'correctness', 'question', 'prediction_sentence', 'actual_y_sentence']

# Write one CSV row per question with the predicted and the expected sentence.
# The with-block guarantees the file is flushed and closed even if a row fails.
with open('evaluatin_result.csv', mode='w') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()

    for ddi, document in enumerate(dataset):
        sentences = document['sentences']
        qX = [i['question'] for i in document['qa']]
        qy = [i['answer_sentence'] for i in document['qa']]
        model = document['model']
        table, acc = build_model_and_evaluate(model, qX, qy)
        for t in table:
            t['document_i'] = ddi
            # Rows for wrong predictions may lack this key; write an explicit 0.
            t.setdefault('correctness', 0)
            # NOTE(review): text is UTF-8 encoded before writing, presumably
            # because this targets Python 2's byte-oriented csv module.
            t['question'] = qX[t['question_i']].encode('utf-8')
            if t['prediction_i'] < 0:
                # The model returns -1 when no query term is known. Used as
                # an index it would silently report the LAST sentence.
                t['prediction_sentence'] = ''
            else:
                t['prediction_sentence'] = sentences[t['prediction_i']].encode('utf-8')
            t['actual_y_sentence'] = sentences[t['actual_yi']].encode('utf-8')
            writer.writerow(t)

print('EXEC: %s' % (time.time() - t0))