# Basic Q&A System

### Sentence Retrival

In [1]:
import math,numpy,json,re,nltk,csv,time,re,os.path,sys,gensim,ast,itertools
from gensim import corpora
from operator import add
from nltk.tokenize import word_tokenize
from nltk.tag import StanfordNERTagger
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
from nltk import FreqDist, DictionaryProbDist

#from sklearn import metrics
#from math import sqrt
#from collections import OrderedDict
#from nltk.stem.wordnet import WordNetLemmatizer as WNL
#from sklearn.feature_extraction import DictVectorizer



#### Load data from json file

In [2]:
t0 = time.time()
filename_ls = ['QA_dev.json']
dataset = []
train_path = os.path.abspath('data/QA_dev.json')

dataset = []
with open(train_path) as f:
    for line in f:
        dataset+=(json.loads(line))
print "Import Successful "
print "There are totally", len(dataset),'documents in this dataset'

Import Successful 
There are totally 40 documents in this dataset


#### Build tf-idf model 

In [3]:
stopwords = set(nltk.corpus.stopwords.words('english')) # wrap in a set() (see below)
stemmer = nltk.stem.PorterStemmer() 

def my_tokenizer(doc):
    terms = set()
    for token in nltk.word_tokenize(doc):
        if token not in stopwords and token.isalpha(): # 'in' and 'not in' operations are much faster over sets that lists
            terms.add(stemmer.stem(token.lower()))
    return list(terms)

#### Build BM25 model

In [5]:
class BM25_Model(object): 
    def __init__(self, document_collection, k1=1.5, b=0.75, k3=1.0, EPS=0.25, tokenizer=my_tokenizer): 
        self.tokenizer = tokenizer 
        self.document_collection_length = len(document_collection) 
        self.avg_doc_length = sum(map(lambda x: float(len(x)), document_collection)) / self.document_collection_length 
        self.document_collection = [self.tokenizer(doc) for doc in document_collection] 
        self.tf = [] 
        self.df = defaultdict(int) 
        self.bm25_idf = defaultdict(float) 
        self.average_idf = -1 
        self.k1 = k1 
        self.k3 = k3 
        self.EPSILON = EPS 
        self.b = b 
        self.inverted_index = defaultdict(list) 
        self.initialize() 
    def initialize(self):
        for index, document in enumerate(self.document_collection): 
            doc_term_freq = FreqDist(document) 
            self.tf.append(doc_term_freq) 
            
            for word, freq in doc_term_freq.items(): 
                self.df[word] += 1 
                self.inverted_index[word].append(index) 
        for word, freq in self.df.items(): 
            self.bm25_idf[word] = math.log(self.document_collection_length - freq + 0.5) - math.log(freq + 0.5)  
            self.average_idf = sum(map(lambda k: float(self.bm25_idf[k]), self.bm25_idf.keys())) / len(self.bm25_idf.keys()) 
    
    def predict(self, queryX, limit=1): 
        q_prediction = [] 
        for query in queryX: 
            answers = self.bm25_get_most_relevant(query)[:limit] 
            q_prediction.append([i[0] for i in answers]) 
        return q_prediction 
 

    def bm25_get_most_relevant(self, query): 
        query_tks = self.tokenizer(query) 
        scores = defaultdict(float) 
        for q_token in query_tks: 
            for doc_index in self.inverted_index[q_token]: 
                idf = self.bm25_idf[q_token] if self.bm25_idf[q_token] >= 0 else self.EPSILON * self.average_idf 
                top = self.tf[doc_index][q_token] * (self.k1 + 1) 
                below = self.tf[doc_index][q_token] + self.k1 * (1 - self.b + self.b * self.document_collection_length / self.avg_doc_length) 
                scores[doc_index] += idf * top / below 
        prels = scores.items() 
        sorted_socres = sorted(prels, key=lambda (k, v): v, reverse=True) 
        return sorted_socres 

#### Build Language Model 

In [6]:
class LM_Model:
    def __init__(self,documents,a = 0.5 ,tokenizer=my_tokenizer): 
        self.tokenizer = tokenizer  
        self.alpha = a
        self.document_collection = [self.tokenizer(doc) for doc in documents] 
        self.document_corpus = list(itertools.chain.from_iterable(self.document_collection))
        self.corpus_term_prob = {}
        self.corpus_term_freq = FreqDist(self.document_corpus)
        self.vocabulary = self.corpus_term_freq.keys()
        self.lmp = []
        self.initialize()
    def initialize(self): 
        document_freq = [FreqDist(doc) for doc in self.document_collection]
        length_corpus = len(self.document_corpus)
        for term,occurs in self.corpus_term_freq.items():
            self.corpus_term_prob[term] = float(occurs)/float(length_corpus)
        for sent_freq in document_freq:
            tempDict = {}
            for term in self.vocabulary:
                upper = sent_freq.get(term,0) + self.alpha*self.corpus_term_prob.get(term,0)
                below = self.corpus_term_freq.get(term,0) + self.alpha
                tempDict[term] = float(upper)/float(below)
            self.lmp.append(tempDict)
    def get_lm_socres(self,Query):
        doc_socres = []
        Query = my_tokenizer(Query)
        for doc_prob in  self.lmp:
            term_score = []
            for term in Query:
                if term in self.vocabulary:
                    term_score.append(doc_prob[term])
            query_score = numpy.product(term_score)
            doc_socres.append(query_score)
        sorted_score = sorted(list(enumerate(doc_socres)), key=lambda (k,v): v, reverse=True)
        doc_indexs = [i for i in sorted_score]
        return doc_indexs
    def predict(self,questions,limit = 3):
        predictions = [] 
        for query in questions: 
            answers = self.get_lm_socres(query)[:limit] 
            prediction.append([i[0] for i in answers]) 
        return predictions 

#### Output Model Results

In [7]:
def write_csv(csv_name,dataset,limit=1):
    csv_file = open(csv_name, mode='w',)
    fieldnames = ['document_ID', 'question_ID','question','prediction_ID','prediction_sentence']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames, )
    writer.writeheader()

    ddi = 0
    for document in dataset:
        evaluate_row = []
        questions = [i['question'] for i in document['qa']]
        model = document['model']
        pred = model.predict(questions,limit)
        quest_index = 0
        for pred_index in pred:
            drow = dict()
            drow['question_ID'] = quest_index
            drow['prediction_ID'] = pred_index
            evaluate_row.append(drow)
            quest_index += 1
        doc_sents = document['sentences']
        for r in evaluate_row:
            r['document_ID'] = ddi
            r['question'] = questions[r['question_ID']].encode('utf-8')
            if len(r['prediction_ID']) != 0:
                r['prediction_sentence'] = doc_sents[r['prediction_ID'][0]].encode('utf-8')
            else:
                print 'error prediction',ddi,r['question_ID'],r['prediction_ID']
            writer.writerow(r)
        ddi += 1

In [19]:
#build model for each document collaction
for document in dataset:
    document_collections = document['sentences']
    document['model'] = tf_idf_Model(document_collections)
write_csv('tf_idf.csv',dataset,1)

error occured What does ITU-R stand for?
error occured What does D-VHS stand for?
error occured How is The Bahre-Nagassi translated?
error occured Who was the runner up?
error occured Who was the runner up?
error occured Where did it open?
error occured What did the actof of milno do?
error occured What is an Etsudiantinas? 
error occured What is crosspicking?
error occured When was Chanakya alive?
error occured What is q-glass?
error occured What hardens glass-ceramics?


In [9]:
for document in dataset:
    document_collections_sents = document['sentences']
    document['model'] = BM25_Model(document_collections_sents)
write_csv('test.csv',dataset,1)

error prediction 2 65 []
error prediction 2 192 []
error prediction 11 783 []
error prediction 18 182 []
error prediction 25 245 []
error prediction 36 89 []
error prediction 36 92 []


In [10]:
for document in dataset:
    document_collections_sents = document['sentences']
    document['model'] = LM_Model(document_collections_sents)
write_csv('lm.csv',dataset,1)

#### Import Prdictions from CSV file

In [15]:
def get_csv_vales(filename):
    preds = []
    with open(filename) as csvfile:
        readCSV = csv.DictReader(csvfile, delimiter=',')
        for row in readCSV:
            tempDict = {}
            tempDict['DocID'] = int(row['document_ID'])
            tempDict['Predictions'] = ast.literal_eval(row['prediction_ID'])
            tempDict['QuestionIndex'] = int(row['question_ID'])
            tempDict['Question'] = row['question']
            preds.append(tempDict)
    count = 0
    bingo = 0
    for d in preds:
        guess = d['Predictions'] 
        doc_i = d['DocID']
        qus_i = d['QuestionIndex']
        act_i = dataset[doc_i]['qa'][qus_i]['answer_sentence']
        if act_i in guess:
            bingo += 1
        count += 1
    print "correctness results :",float(bingo)/float(count)
    return preds

In [21]:
tf_idf = get_csv_vales('tf_idf.csv')

correctness results : 0.567765567766


In [22]:
lm = get_csv_vales('lm.csv')

correctness results : 0.618574973414


In [23]:
bm25 = get_csv_vales('test.csv')

correctness results : 0.625192012289


#### Add surrending for top one results

In [24]:
def ADDsurreding(g):
    n = []
    for i in g:
        if i != 0:
            n.append(i+1)
            n.append(i-1)
        else:
            n.append(i+1)
    return g+n

In [30]:
count = 0
bingo = 0
for d in bm25:
    guess = d['Predictions'] 
    doc_i = d['DocID']
    qus_i = d['QuestionIndex']
    new_guess = ADDsurreding(guess)
    act_i = dataset[doc_i]['qa'][qus_i]['answer_sentence']
    if act_i in new_guess:
        bingo += 1
    count += 1
print "results after add surrending:",float(bingo)/float(count)

results after add surrending: 0.697388632873


### Entity Extraction

In [None]:
def json_load_byteified(file_handle):
    return _byteify(json.load(file_handle, object_hook=_byteify),ignore_dicts=True)

def _byteify(data, ignore_dicts = False):
    # if this is a unicode string, return its string representation
    if isinstance(data, unicode):
        return data.encode('utf-8')
    # if this is a list of values, return list of byteified values
    if isinstance(data, list):
        return [ _byteify(item, ignore_dicts=True) for item in data ]
    # if this is a dictionary, return dictionary of byteified keys and values
    # but only if we haven't already byteified it
    if isinstance(data, dict) and not ignore_dicts:
        return {
            _byteify(key, ignore_dicts=True): _byteify(value, ignore_dicts=True)
            for key, value in data.iteritems()
        }
    # if it's anything else, return it in its original form
    return data

with open("data/QA_test.json") as json_file:
    json_data = json_load_byteified(json_file)
print 'import success'

In [None]:
cwd = os.getcwd()
st = StanfordNERTagger(cwd+'\data\english.all.3class.distsim.crf.ser.gz',cwd+'\data\stanford-ner.jar')

if not os.path.isfile("NERtest.json"):    
    start = time.time()
    progressT = len(json_data)    
    listOfDocument=[]
    i=0
    for jd in json_data:
        aList=[]        
        aList.append(st.tag_sents([word_tokenize(re.sub(',', '',re.sub('[^a-zA-Z0-9-_*., ]', ' ',x['question']))) for x in jd['qa']]))
        #remove the below file if running on test set
        #aList.extend([st.tag_sents([word_tokenize(re.sub(',', '',re.sub('[^a-zA-Z0-9-_*., ]', ' ',x['answer']))) for x in jd['qa']])])
        aList.append(st.tag_sents([word_tokenize(re.sub(',', '',re.sub('[^a-zA-Z0-9-_*., ]', ' ',x))) for x in jd['sentences']]))
        listOfDocument.append(aList)
        i+=1
        sys.stdout.write('\r')
        sys.stdout.write("%d%%" % (i*100/progressT))
        sys.stdout.flush()    
    for document in range(0,len(listOfDocument)):
        #change [2] to [1] if test set
        for sentence in range(0,len(listOfDocument[document][1])):
            for word in range(0,len(listOfDocument[document][1][sentence])):   
                listOfDocument[document][1][sentence][word]= (listOfDocument[document][1][sentence][word][0],listOfDocument[document][1][sentence][word][1] if not listOfDocument[document][1][sentence][word][0].isdigit() else u'NUMBER')
    with open('NERtest.json', 'w') as outfile:
        json.dump(listOfDocument, outfile)
    end = time.time()
    print '\nTime spending:',end - start    
else:    
    print 'there is a file'
with open("NERtest.json") as json_file:
        json_dataNER = json_load_byteified(json_file)