# Basic Q&A System

## Sentence Retrival

#### Preprocessing 

In [13]:
import math,numpy,json,re,nltk,csv,time,re,os.path,sys,ast,itertools,string
from operator import add
from math import sqrt
from numpy import multiply
from nltk import FreqDist, DictionaryProbDist
from nltk.tokenize import word_tokenize,RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer as WNL
from nltk.tag import StanfordNERTagger
from nltk.tag.stanford import StanfordPOSTagger
from nltk.parse.stanford import StanfordDependencyParser
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn import metrics
from collections import defaultdict,OrderedDict
from nltk import RegexpParser


stopwords = set(nltk.corpus.stopwords.words('english')) # wrap in a set() (see below)

#### Load data from json file

In [4]:
t0 = time.time()

test_path = os.path.abspath('data/QA_test.json')
dev_path = os.path.abspath('data/QA_dev.json')
train_path = os.path.abspath('data/QA_train.json')

def load_jsonfile(filepath):
    dataset = []
    with open(filepath) as jsonfile:
        for line in jsonfile:
            dataset += (json.loads(line))
    return dataset


dataset_dev = load_jsonfile(dev_path)
dataset_test = load_jsonfile(test_path)
dataset_train = load_jsonfile(train_path)
print "Import Successful "
print "There are totally", len(dataset_dev),'documents in this dev dataset'
print "There are totally", len(dataset_test),'documents in this test dataset'
print "There are totally", len(dataset_train),'documents in this  train dataset'

Import Successful 
There are totally 40 documents in this dev dataset
There are totally 42 documents in this test dataset
There are totally 360 documents in this  train dataset


#### Build tf-idf model 

In [18]:
stemmer = nltk.stem.PorterStemmer() 
from nltk import word_tokenize
def my_tokenizer(doc):
    terms = set()
    for token in word_tokenize(doc):
        if token not in stopwords and token not in string.punctuation: # 'in' and 'not in' operations are much faster over sets that lists
            terms.add(stemmer.stem(token.lower()))
    return list(terms)

In [19]:
class tf_idf_Model:
    def __init__(self, collection):
        self.vectorizer = TfidfVectorizer(max_df=0.95, min_df=1, use_idf=True,norm='l2',stop_words=None, tokenizer=my_tokenizer)
        self.collection_matrix = self.vectorizer.fit_transform(document_collections)
        feature_array = self.vectorizer.get_feature_names()
        self.features = dict()
        for index in range(len(feature_array)):
            term = feature_array[index]
            self.features[term] = index

    def predict(self, queryX,limit=3):
        predictions = [self.inverted_index_score(i,limit) for i in  queryX]
        return predictions

    def inverted_index_score(self, query_sent,limit=3):
        """
        now we implement inverted index to handle query
        
        :param query_sent: 
        :return: 
        
        """
        query_words = my_tokenizer(query_sent)
        score = defaultdict(float)

        for w in query_words:
            try:
                col_i = self.features[w]
                inverted_ix = self.collection_matrix[:, col_i]
                for doc_i in range(inverted_ix.shape[0]):
                    score[doc_i] += inverted_ix[doc_i, 0]
            except KeyError:
                pass

        index_score = sorted(score.items(), key=lambda (k, v): v, reverse=True)

        if index_score:
            doc_indexs = [i[0] for i in index_score[:limit]]
            return doc_indexs
        else:
            return []

#### Build BM25 model

In [20]:
class BM25_Model(object): 
    def __init__(self, document_collection, k1=1.5, b=0.75, k3=0.0, EPS=0.25, tokenizer=my_tokenizer): 
        self.tokenizer = tokenizer 
        self.document_collection_length = len(document_collection) 
        self.avg_doc_length = sum(map(lambda x: float(len(x)), document_collection)) / self.document_collection_length 
        self.document_collection = [self.tokenizer(doc) for doc in document_collection] 
        self.document_corpus = list(itertools.chain.from_iterable(self.document_collection))
        self.corpus_freq = FreqDist(self.document_corpus)
        self.tf = [] 
        self.df = defaultdict(int) 
        self.bm25_idf = defaultdict(float) 
        self.average_idf = -1 
        self.k1 = k1 
        self.k3 = k3 
        self.EPSILON = EPS 
        self.b = b 
        self.inverted_index = defaultdict(list) 
        self.initialize() 
    def initialize(self):
        for index, document in enumerate(self.document_collection): 
            doc_term_freq = FreqDist(document) 
            self.tf.append(doc_term_freq) 
            
            for word, freq in doc_term_freq.items(): 
                self.df[word] += 1 
                self.inverted_index[word].append(index) 
        for word, freq in self.df.items(): 
            self.bm25_idf[word] = math.log(self.document_collection_length - freq + 0.5) - math.log(freq + 0.5) 
            
            self.average_idf = sum(map(lambda k: float(self.bm25_idf[k]), self.bm25_idf.keys())) / len(self.bm25_idf.keys()) 
    
    def predict(self, queryX, limit=1): 
        q_prediction = [] 
        for query in queryX: 
            answers = self.bm25_get_most_relevant(query)[:limit] 
            if len(answers) == 0:
                q_prediction.append([]) 
            else:
                q_prediction.append([i[0] for i in answers]) 
        return q_prediction 

    def bm25_get_most_relevant(self, query): 
        query_tks = self.tokenizer(query) 
        scores = defaultdict(float) 
        for q_token in query_tks: 
            for doc_index in self.inverted_index[q_token]: 
                idf = self.bm25_idf[q_token] if self.bm25_idf[q_token] >= 0 else self.EPSILON * self.average_idf 
                top = self.tf[doc_index][q_token] * (self.k1 + 1) 
                below = self.tf[doc_index][q_token] + self.k1 * (1 - self.b + self.b * self.document_collection_length / self.avg_doc_length) 
                frq_q_t = self.corpus_freq[q_token]
                scores[doc_index] += idf * top / below *(self.k3 +1)*frq_q_t/(self.k3+frq_q_t)
        prels = scores.items() 
        sorted_socres = sorted(prels, key=lambda (k, v): v, reverse=True) 
        return sorted_socres 

#### Build Language Model 

In [21]:
class LM_Model:
    def __init__(self,documents,a = 0.5 ,tokenizer=my_tokenizer): 
        self.tokenizer = tokenizer  
        self.alpha = a
        self.document_collection = [self.tokenizer(doc) for doc in documents] 
        self.document_corpus = list(itertools.chain.from_iterable(self.document_collection))
        self.corpus_term_prob = {}
        self.corpus_term_freq = FreqDist(self.document_corpus)
        self.vocabulary = self.corpus_term_freq.keys()
        self.lmp = []
        self.initialize()
    def initialize(self): 
        document_freq = [FreqDist(doc) for doc in self.document_collection]
        length_corpus = len(self.document_corpus)
        for term,occurs in self.corpus_term_freq.items():
            self.corpus_term_prob[term] = float(occurs)/float(length_corpus)
        for sent_freq in document_freq:
            tempDict = {}
            for term in self.vocabulary:
                upper = sent_freq.get(term,0) + self.alpha*self.corpus_term_prob.get(term,0)
                below = self.corpus_term_freq.get(term,0) + self.alpha
                tempDict[term] = float(upper)/float(below)
            self.lmp.append(tempDict)
    def get_lm_socres(self,Query):
        doc_socres = []
        Query = my_tokenizer(Query)
        for doc_prob in  self.lmp:
            term_score = []
            for term in Query:
                if term in self.vocabulary:
                    term_score.append(doc_prob[term])
            query_score = numpy.product(term_score)
            doc_socres.append(query_score)
        sorted_score = sorted(list(enumerate(doc_socres)), key=lambda (k,v): v, reverse=True)
        doc_indexs = [i for i in sorted_score]
        return doc_indexs
    def predict(self,questions,limit = 3):
        predictions = [] 
        for query in questions: 
            answers = self.get_lm_socres(query)[:limit] 
            predictions.append([i[0] for i in answers]) 
        return predictions 

#### Output Various Model Results

In [22]:
def write_csv(csv_name,model_name,dataset,limit=1):
    csv_file = open(csv_name, mode='w',)
    fieldnames = ['document_ID', 'question_ID','question','prediction_ID','prediction_sentence']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames, )
    writer.writeheader()

    ddi = 0
    for document in dataset:
        evaluate_row = []
        questions = [i['question'] for i in document['qa']]
        model = document[model_name]
        predictions = model.predict(questions,limit)
        quest_index = 0
        for pred_index in predictions:
            drow = dict()
            drow['question_ID'] = quest_index
            drow['prediction_ID'] = pred_index
            evaluate_row.append(drow)
            quest_index += 1
        doc_sents = document['sentences']
        for r in evaluate_row:
            r['document_ID'] = ddi
            r['question'] = questions[r['question_ID']].encode('utf-8')
            if len(r['prediction_ID']) != 0:
                r['prediction_sentence'] = doc_sents[r['prediction_ID'][0]].encode('utf-8')
            else:
                print 'error prediction',ddi,r['question_ID'],r['question']
            writer.writerow(r)
        ddi += 1

In [23]:
#build model for each document collaction
for document in dataset_dev:
    document_collections = document['sentences']
    document['tf_idf_model'] = tf_idf_Model(document_collections)
    document['bm25_model'] = BM25_Model(document_collections)
    document['lm_model'] = LM_Model(document_collections)
print 'The following are errors made by different models,doucnment Index,Query Index, Query'
print ''
write_csv('data/tf_idf_dev_predictions.csv','tf_idf_model',dataset_dev,1)
print '----------------------------------------------------'
write_csv('data/bm25_dev_predictions.csv','bm25_model',dataset_dev,1)
print '----------------------------------------------------'
write_csv('data/lm_dev_predictions.csv','lm_model',dataset_dev,1)

The following are errors made by different models,doucnment Index,Query Index, Query

error prediction 11 317 Who was the runner up?
error prediction 11 520 Who was the runner up?
error prediction 11 783 Where did it open?
error prediction 18 182 What did the actof of milno do?
error prediction 22 94 What is an Etsudiantinas? 
error prediction 22 181 What is crosspicking?
error prediction 25 245 When was Chanakya alive?
----------------------------------------------------
error prediction 11 783 Where did it open?
error prediction 18 182 What did the actof of milno do?
error prediction 25 245 When was Chanakya alive?
----------------------------------------------------


#### Import different result and compare predictions from CSV file

In [24]:
def check_prediction_accuracy(filename):
    preds = []
    with open(filename) as csvfile:
        readCSV = csv.DictReader(csvfile, delimiter=',')
        for row in readCSV:
            tempDict = {}
            tempDict['DocID'] = int(row['document_ID'])
            tempDict['Predictions'] = ast.literal_eval(row['prediction_ID'])
            tempDict['QuestionIndex'] = int(row['question_ID'])
            tempDict['Question'] = row['question']
            preds.append(tempDict)
    count = 0
    bingo = 0
    for d in preds:
        guess = d['Predictions'] 
        doc_i = d['DocID']
        qus_i = d['QuestionIndex']
        act_i = dataset_dev[doc_i]['qa'][qus_i]['answer_sentence']
        if act_i in guess:
            bingo += 1
        count += 1
    print "Model correctness results :",float(bingo)/float(count)

In [25]:
check_prediction_accuracy('data/tf_idf_dev_predictions.csv')
check_prediction_accuracy('data/bm25_dev_predictions.csv')
check_prediction_accuracy('data/lm_dev_predictions.csv')

Model correctness results : 0.584308164953
Model correctness results : 0.647879002718
Model correctness results : 0.638071605814


#### Choose BM25  Model and predict sentence  for test set
* in this case BM25 will be used for sentence retrival

In [27]:
#Build models for test data set
for document in dataset_test:
    document_collections = document['sentences']
    document['bm25_model'] = BM25_Model(document_collections)
# write to a CSV file for test data predictions
write_csv('data/bm25_test_predictions.csv','bm25_model',dataset_test,1)

error prediction 4 95 What does ICRISTAT stand for?
error prediction 9 75 What is okanye?
error prediction 10 46 What was destroyed in the fire?
error prediction 10 118 What is the Barengraben?
error prediction 17 67 What are phrynges?
error prediction 19 74 What happeded between 1927-1934?
error prediction 20 106 What is 'gorg'?
error prediction 36 107 When was the overbudget amount discovered?


## Entity Extraction

In [17]:
cwd = os.getcwd()
path_to_jar = 'data/stanford-parser.jar'
path_to_models_jar = 'data/stanford-parser-3.7.0-models.jar'
NERtagger = StanfordNERTagger(cwd+'/data/english.all.3class.distsim.crf.ser.gz',cwd+'/data/stanford-ner.jar')
POStagger = StanfordPOSTagger(cwd+'/data/wsj-0-18-left3words-distsim.tagger',cwd+'/data/stanford-postagger.jar') 

def NERandPOS_writer(filenameNER,filenamePOS,dataset,l):
    if not os.path.isfile(filenameNER):    
        start = time.time()
        progressT = len(dataset)    
        listOfDocumentNER=[]
        listOfDocumentPOS=[]
        i=0
        for jd in dataset:
            aListNER=[]
            aListPOS=[]
            aListNER.append(NERtagger.tag_sents([word_tokenize(re.sub(',', '',re.sub('[^a-zA-Z0-9-_*., ]', ' ',x['question']))) for x in jd['qa']]))            
            aListPOS.append(POStagger.tag_sents([word_tokenize(re.sub(',', '',re.sub('[^a-zA-Z0-9-_*., ]', ' ',x['question']))) for x in jd['qa']]))
            if l!=1:
                aListNER.append(NERtagger.tag_sents([word_tokenize(re.sub(',', '',re.sub('[^a-zA-Z0-9-_*., ]', ' ',x['answer']))) for x in jd['qa']]))
                aList.append(POStagger.tag_sents([word_tokenize(re.sub(',', '',re.sub('[^a-zA-Z0-9-_*., ]', ' ',x['answer']))) for x in jd['qa']]))                         
            aListPOS.append(POStagger.tag_sents([word_tokenize(re.sub(',', '',re.sub('[^a-zA-Z0-9-_*., ]', ' ',x))) for x in jd['sentences']]))
            aListNER.append(NERtagger.tag_sents([word_tokenize(re.sub(',', '',re.sub('[^a-zA-Z0-9-_*., ]', ' ',x))) for x in jd['sentences']]))
            listOfDocumentNER.append(aListNER)
            listOfDocumentPOS.append(aListPOS)
            i+=1
            sys.stdout.write('\r')
            sys.stdout.write("Processing: %d%%" % (i*100/progressT))
            sys.stdout.flush()    
        if l!=1:
            for document in range(0,len(listOfDocumentNER)):
                for sentence in range(0,len(listOfDocumentNER[document][1])):
                    for word in range(0,len(listOfDocumentNER[document][1][sentence])):   
                        listOfDocumentNER[document][1][sentence][word]= \
                        (listOfDocumentNER[document][1][sentence][word][0],\
                         listOfDocumentNER[document][1][sentence][word][1] \
                         if not listOfDocumentNER[document][1][sentence][word][0].isdigit() else u'NUMBER')
        with open(filenameNER, 'w') as outfile:
            json.dump(listOfDocumentNER, outfile)
        with open(filenamePOS, 'w') as outfile:
            json.dump(listOfDocumentPOS, outfile)
        end = time.time()
        print '\nTime spending:',end - start    
    else:    
        print filename,'is alrady exist'

In [18]:
NERandPOS_writer('data/NERtest.json','data/POStest.json',dataset_test,1)

Processing: 100%
Time spending: 225.929000139


In [25]:
t1 = time.time() - t0
print 'Running time is ',t1

Running time is  974.636730194


##  Answer rank 

#### Building Ranking Systems

In [19]:
class answer_rank():
    def __init__(self,json_data,json_dataOrg,json_dataPOS,json_dataPOSOrg):
        self.json_data = json_data
        self.json_dataOrg = json_dataOrg
        self.json_dataPOS = json_dataPOS
        self.json_dataPOSOrg = json_dataPOSOrg
        self.dictDoc = {}
        self.dictDocOrg = {}
        self.initialize()

        
    def initialize(self):

        for document in range(len(self.json_data)):
            for thing in range(len(self.json_data[document])):
                for sentence in range(len(self.json_data[document][thing])):
                    for word in range(len(self.json_data[document][thing][sentence])):
                        self.json_data[document][thing][sentence][word][1] = 'U'\
                        if word!=0 and self.json_data[document][thing][sentence][word][0][0].isupper()\
                        and self.json_data[document][thing][sentence][word][1]=='O'\
                        else self.json_data[document][thing][sentence][word][1]
                        
                        self.json_data[document][thing][sentence][word][0] = self.json_data[document][thing][sentence][word][0].lower()
        
                        if self.json_data[document][thing][sentence][word][0] \
                        in ['one','two','three','four','five','six','seven','eight','nine','ten','zero']\
                        or self.isfloat(self.json_data[document][thing][sentence][word][0]):
                            self.json_data[document][thing][sentence][word][1] = 'NUMBER'
        for document in range(len(self.json_dataPOS)):
            for thing in range(len(self.json_dataPOS[document])):
                for sentence in range(len(self.json_dataPOS[document][thing])):
                    for word in range(len(self.json_dataPOS[document][thing][sentence])):
                        self.json_dataPOS[document][thing][sentence][word][0] = self.json_dataPOS[document][thing][sentence][word][0].lower()
                        if self.have_number(self.json_dataPOS[document][thing][sentence][word][0]):
                            self.json_dataPOS[document][thing][sentence][word][1] = 'CD'
        print 'NER json file import to system successful'
        
    
    def have_number(self,s):
        return any(i.isdigit() for i in s)

    def isfloat(self,value):
        try:
            float(value)
            return True
        except ValueError:
            return False

    def readCSV(self,filename,devdata = False):
        csv_file = filename
        i=0
        answerSecondFilter = []
        dictDocOrg={}
        t_i = 1
        if devdata:
            t_i = 2            
        with open(csv_file, 'rb') as csvfile:
            readCSV = csv.DictReader(csvfile, delimiter=',')
            for row in readCSV:
                document_i = int(row['document_ID'])
                question_i = int(row['question_ID'])
                filteredlistOfNERSentence = []
                question_type = []
                predictionList = ast.literal_eval(row['prediction_ID'])
                prediction_index = predictionList[0] if len(predictionList)!=0 else 1
                
                print document_i,question_i,prediction_index,t_i
                question_type = self.detectQuestion(document_i,question_i,prediction_index,t_i)
                
                self.dictDoc[document_i,question_i]=prediction_index,question_type                                              



    def detectQuestion(self,i, j, k, l):
        openclassword=[]
        kindOfAnswer = []
        questionPOS = self.json_dataPOS[i][0][j]
        questionNER = self.json_data[i][0][j]        
        answerPOS = self.json_dataPOS[i][l][k]
        answerNER = self.json_data[i][l][k]
        specialcommand=[]
        #print questionNER, answerPOS
        originalWithout = [x[0] for x in questionPOS]
        originalWithoutA = [x[0] for x in answerPOS]
        getIndexOfWH = [y for y,x in enumerate(questionPOS) if 'W' in x[1]]
        #if wh
        if len(getIndexOfWH) != 0:
            getPOS = ''.join([x[1][0] for x in questionPOS])        

            #not at last word
            if getIndexOfWH[0]+1!=len(questionPOS):            
                searchWordAfterWh1 = re.search('W(.*?)V', getPOS, re.IGNORECASE)
                if searchWordAfterWh1:
                    if len(searchWordAfterWh1.group(1))!=0:
                        openclassword.append([searchWordAfterWh1.start()+len(searchWordAfterWh1.group(1))])                    
                    else:
                        openclassword.append([])
                else:
                    searchWordAfterWh1 = re.search('W(.*?)', getPOS, re.IGNORECASE)
                    if searchWordAfterWh1:
                        openclassword.append([searchWordAfterWh1.start()+len(searchWordAfterWh1.group(0))-1])
                    else:
                        openclassword.append([])
                frontPart = range(0,searchWordAfterWh1.start()) if searchWordAfterWh1 else range(0,len(getPOS)/2)
                backPart = range(searchWordAfterWh1.start()+len(searchWordAfterWh1.group(0)),len(getPOS)) if searchWordAfterWh1 else range(len(getPOS)/2,len(getPOS))                      
                openclassword.append(frontPart+backPart)      
                #print openclassword
            else:
                openclassword=[[],[y for y,x in enumerate(questionPOS)][:-1]]                        
            #remove Stopwords        
            openclassword[1] = [x for x in openclassword[1] if questionPOS[x][0] not in stopwords]
            if len(openclassword[0])!=0:            
                numberIndicator = ['year','length','percentage', 'many','much']            
                for x in numberIndicator: 
                    for y in originalWithout[getIndexOfWH[0]:openclassword[0][0]+2]:
                        if x in y:
                            kindOfAnswer = ['NUMBER'] if len(kindOfAnswer)==0 else kindOfAnswer
                            specialcommand.append(x)
                personIndicator = ['name']
                for x in personIndicator:
                    for y in originalWithout[getIndexOfWH[0]:openclassword[0][0]+2]:
                        if x in y:
                            kindOfAnswer = ['PERSON','ORGANIZATION'] if len(kindOfAnswer)==0 else kindOfAnswer
                placeIndicator = ['location','place','country','city','area']
                for x in placeIndicator:
                    for y in originalWithout[getIndexOfWH[0]:openclassword[0][0]+2]:
                        if x in y:
                            kindOfAnswer = ['LOCATION'] if len(kindOfAnswer)==0 else kindOfAnswer
                            specialcommand=['location']
                if len(kindOfAnswer)==0:    
                    kindOfAnswer = ['O','U']
            else:
                numberIndicator = ['when']            
                personIndicator = ['who','whom','whose','name']
                placeIndicator = ['where','located']
                if originalWithout[getIndexOfWH[0]] in numberIndicator:
                    kindOfAnswer = ['NUMBER']
                    specialcommand = ['year']
                elif originalWithout[getIndexOfWH[0]] in personIndicator:
                    kindOfAnswer = ['PERSON','ORGANIZATION']
                elif originalWithout[getIndexOfWH[0]] in placeIndicator:
                    kindOfAnswer = ['LOCATION']
                    specialcommand=['location']
                else:
                    if len([y for y in openclassword[1] if questionPOS[y][0] in personIndicator])!=0:
                        kindOfAnswer = ['PERSON','ORGANIZATION']
                    elif len([y for y in openclassword[1] if questionPOS[y][0] in placeIndicator])!=0:
                        kindOfAnswer = ['LOCATION']
                        specialcommand=['location']
                    else: 
                        kindOfAnswer = ['O','U']
        else:
            openclassword = [[],[x for x,y in enumerate(originalWithout) if y not in stopwords]]
            kindOfAnswer = ['O','U']
        #determine whether it requires number entity
        newList1=[]
        newList2=[]
        for x in range(len(openclassword[0])):
            if originalWithout[openclassword[0][x]] in originalWithoutA:            
                newList1.extend([f for f,h in enumerate(originalWithoutA) if originalWithout[openclassword[0][x]] == h])        
        for x in range(len(openclassword[1])):
            if originalWithout[openclassword[1][x]] in originalWithoutA:
                newList2.extend([f for f,h in enumerate(originalWithoutA) if originalWithout[openclassword[1][x]] == h])
        openclassword[0]=newList1
        openclassword[1]=newList2
        return openclassword, kindOfAnswer,specialcommand     

    def createNP(self,answerToReturn,answerPOS,specialcommand,q,j):
        newAnswer = answerPOS[answerToReturn[1]][0]
        if (newAnswer.isdigit() and 'year' not in specialcommand):
            newAnswer = "{:,}".format(int(answerPOS[answerToReturn[1]][0]))
        for i in range(answerToReturn[1],0,-1):
            if answerToReturn[1] != 0:
                if answerPOS[i][1] =='NNP':
                    if 'NNP' in answerPOS[i-1][1] or (('DT' in answerPOS[i-1][1] or 'IN' in answerPOS[i-1][1]) \
                                                      and answerPOS[i-1][0] !='at' \
                                                      and 'location' in specialcommand):
                        newAnswer = answerPOS[i-1][0]+ " " +newAnswer
                    else:
                        break
                elif answerPOS[i][1] =='NN':
                    if 'JJ' in answerPOS[i-1][1] or 'DT' in answerPOS[i-1][1] \
                    or 'CD' in answerPOS[i-1][1] or answerPOS[i-1][1] =='NN':
                        newAnswer = answerPOS[i-1][0]+ " " +newAnswer
                    else:
                        break
                elif answerPOS[i][1] =='JJ':
                    if answerPOS[i-1][1] =='JJ':
                        newAnswer = answerPOS[i-1][0]+ " " +newAnswer
                    else:
                        break
                elif answerPOS[i][1] =='NNS':
                    if 'JJ' in answerPOS[i-1][1]:
                        newAnswer = answerPOS[i-1][0]+ " " +newAnswer
                    else:
                        break
                elif 'DT' in answerPOS[i][1] and 'location' not in specialcommand:
                    if 'TO' in answerPOS[i-1][1]:
                        newAnswer = answerPOS[i-1][0]+ " " +newAnswer
                    else:
                        break
                else:
                    if answerPOS[i][1] !='CD' and answerPOS[i][1] !='RB':
                        if 'NN' in answerPOS[i-1][1] or 'JJ' in answerPOS[i-1][1] or 'RB' in answerPOS[i-1][1]:
                            newAnswer = answerPOS[i-1][0]+ " " +newAnswer
                        else:
                            break
                    else:
                        break
            else:
                break
        for i in range(answerToReturn[1],len(answerPOS)-1):
            if answerToReturn[1] != len(answerPOS)-1:
                if answerPOS[i][1] =='NNP':                
                    if 'NNP' in answerPOS[i+1][1] or 'CC' in answerPOS[i+1][1] or 'IN' in answerPOS[i+1][1] or 'TO' in answerPOS[i+1][1] or ('NN' in answerPOS[i+1][1] and 'location' in specialcommand):                                        
                        newAnswer = newAnswer+' '+answerPOS[i+1][0]
                    else:                    
                        break
                elif answerPOS[i][1] =='CC':
                    if 'NNP' in answerPOS[i+1][1] or 'NN' == answerPOS[i+1][1]:
                        newAnswer = newAnswer+' '+answerPOS[i+1][0]
                    else:                    
                        break
                elif answerPOS[i][1] =='CD':
                    if 'CD' in answerPOS[i+1][1]:                 
                        newAnswer = newAnswer+' '+answerPOS[i+1][0]
                    else:                    
                        break
                elif answerPOS[i][1] =='TO' or answerPOS[i][1] =='DT':
                    if 'NN' in answerPOS[i+1][1] or 'RB' in answerPOS[i+1][1]:
                        newAnswer = newAnswer+' '+answerPOS[i+1][0]
                    else:
                        break 
                elif answerPOS[i][1] =='IN' and 'location' not in specialcommand:
                    if answerPOS[i+1][1] =='DT' or answerPOS[i+1][1] =='NNP' or answerPOS[i+1][1] =='TO' or answerPOS[i+1][1] =='CD':
                        newAnswer = newAnswer+' '+answerPOS[i+1][0]
                    else:
                        break
                elif answerPOS[i][1] =='JJ':
                    if 'NNS' in answerPOS[i+1][1] or 'NN' == answerPOS[i+1][1] or answerPOS[i+1][1] =='JJ':
                        newAnswer = newAnswer+' '+answerPOS[i+1][0]
                    else:
                        break
                elif answerPOS[i][1] =='NN':
                    if 'NN' in answerPOS[i+1][1] or 'JJ' in answerPOS[i+1][1] or 'IN' in answerPOS[i+1][1] or 'CC' in answerPOS[i+1][1]:
                        newAnswer = newAnswer+' '+answerPOS[i+1][0]
                    else:
                        break
                elif answerPOS[i][1] =='NNS':
                    if 'IN' in answerPOS[i+1][1] and 'ORG' in specialcommand:
                        newAnswer = newAnswer+' '+answerPOS[i+1][0]
                    else:
                        break
                else:
                    break
            else:
                break
        buffernewAnswer1 = newAnswer.split()
        if buffernewAnswer1[-1]=='along' and len(buffernewAnswer1)>1:
            newAnswer=newAnswer[:-6]
        if buffernewAnswer1[-1]=='and' and len(buffernewAnswer1)>1:
            newAnswer=newAnswer[:-4]
        if buffernewAnswer1[-1]=='for' and len(buffernewAnswer1)>1:
            newAnswer=newAnswer[:-4]
        if buffernewAnswer1[-1]=='but' and len(buffernewAnswer1)>1:
            newAnswer=newAnswer[:-4]
        if buffernewAnswer1[-1]=='in' and len(buffernewAnswer1)>1:
            newAnswer=newAnswer[:-3]
        if buffernewAnswer1[-1]=='as' and len(buffernewAnswer1)>1:
            newAnswer=newAnswer[:-3]
        if buffernewAnswer1[-1]=='at' and len(buffernewAnswer1)>1:
            newAnswer=newAnswer[:-3]
        if buffernewAnswer1[-1]=='on' and len(buffernewAnswer1)>1:
            newAnswer=newAnswer[:-3]
        if buffernewAnswer1[-1]=='to' and len(buffernewAnswer1)>1:
            newAnswer=newAnswer[:-3]
        if buffernewAnswer1[-1]=='while' and len(buffernewAnswer1)>1:
            newAnswer=newAnswer[:-6]
        if buffernewAnswer1[-1]=='despite' and len(buffernewAnswer1)>1:
            newAnswer=newAnswer[:-8]
        if buffernewAnswer1[-1]=='because' and len(buffernewAnswer1)>1:
            newAnswer=newAnswer[:-8]
        if buffernewAnswer1[-1].isdigit() and len(buffernewAnswer1[-1])==4 and 'year' in specialcommand:
            newAnswer=buffernewAnswer1[-1]
        if buffernewAnswer1[-1].isdigit() and 'location' in specialcommand:
            newAnswer=buffernewAnswer1[-1]
        if len(specialcommand)!=0 and 'percentage' in specialcommand[0]:
            newAnswer+='%'
        if 'location' in specialcommand:
            bufferNewAnswer=[]        
            for x in newAnswer.split():
                for y in self.json_dataOrg[q]['sentences'][j].split():
                    if x in y:
                        bufferNewAnswer.append(y)
                        break
            newAnswer=' '.join(bufferNewAnswer)
            newAnswer = newAnswer[:-1] if (len(newAnswer)>1 and not (newAnswer[-1].isalnum() or newAnswer[-1]=='%')) else newAnswer
        newAnswer = newAnswer.replace(',','-COMMA-')
        return newAnswer   

    
    def returnAnswer(self,i,j,l):
        answerListPOS = self.json_dataPOS[i][l][self.dictDoc[i,j][0]]    
        answerListNER = self.json_data[i][l][self.dictDoc[i,j][0]]    
        question_type = self.dictDoc[i,j][1][1]
        #print answerListPOS,question_type
        question_typeLocationInAnswer = [x for x,y in enumerate(answerListNER) \
                                         if ('CD' in answerListPOS[answerListNER.index(y)][1] \
                                             or 'NN' in answerListPOS[answerListNER.index(y)][1]) \
                                         and y[1] in question_type and y[0] not in stopwords \
                                         and x not in self.dictDoc[i,j][1][0][0] \
                                         and x not in self.dictDoc[i,j][1][0][1]]
        #print question_typeLocationInAnswer
        if len(question_typeLocationInAnswer) ==0:
            question_typeLocationInAnswer = [x for x,y in enumerate(answerListNER) \
                                             if ('CD' in answerListPOS[answerListNER.index(y)][1] \
                                                 or 'NN' in answerListPOS[answerListNER.index(y)][1]) \
                                             and y[0] not in stopwords and x not in self.dictDoc[i,j][1][0][0] \
                                             and x not in self.dictDoc[i,j][1][0][1]]
        scoreList = 0
        if len(self.dictDoc[i,j][1][0][0])!=0:
            maxScore = sys.maxint        
            for x in question_typeLocationInAnswer:
                score = sum([math.fabs(z-x) for z in self.dictDoc[i,j][1][0][0]])
                if maxScore>score:
                    maxScore=score
                    scoreList = x
        else:
            maxScore = sys.maxint
            scoreList = 0
            for x in question_typeLocationInAnswer:
                score = sum([math.fabs(z-x) for z in self.dictDoc[i,j][1][0][1]])
                if maxScore>score:
                    maxScore=score
                    scoreList = x
                    
        answerToReturn = (self.json_dataPOSOrg[i][l][self.dictDoc[i,j][0]][scoreList][0],scoreList)
        answerPOS = self.json_dataPOSOrg[i][l][self.dictDoc[i,j][0]]
        specialcommand = self.dictDoc[i,j][1][2]
        document_index = i
        question_index = self.dictDoc[i,j][0]
        
        answer = self.createNP(answerToReturn,answerPOS,specialcommand,document_index,question_index)
        
        return answer

    def writeToFile(self,filename,devdata = False):
        t_i = 1
        if devdata:
            t_i = 2
        with open(filename, mode='wb',) as csv_file:
            if devdata :
                fieldnames = ['document_id','question_id','answer_predict',"answer_actual",'tag','sentence','predict','question_type','question']
            else:
                fieldnames = ['id','answer']
            writer = csv.DictWriter(csv_file, fieldnames=fieldnames,delimiter=',')
            writer.writeheader()
            k = 0
            doc_size=len(self.json_data)
            for i in range(0, doc_size):
                for j in range(0,len(self.json_data[i][0])):
                    k+=1            
                    dictToCSV={}
                    if devdata:
                        dictToCSV['document_id'] = i
                        dictToCSV['question_id'] = j
                        try:
                            dictToCSV['answer_predict']= self.returnAnswer(i,j,t_i)
                        except:
                            print '\nErrors on dev set return answers',i,j,devdata
                        dictToCSV['answer_actual'] = self.json_dataOrg[i]['qa'][j]['answer']
                        dictToCSV['tag'] =self.dictDoc[i,j]
                        dictToCSV['question_type'] = self.dictDoc[i,j][1][1]
                        dictToCSV['question'] = self.json_dataOrg[i]['qa'][j]['question']
                        dictToCSV['sentence'] = self.json_dataOrg[i]['qa'][j]['answer_sentence']
                        dictToCSV['predict'] = self.dictDoc[i,j][0]
                    else:
                        dictToCSV={}
                        dictToCSV['id'] = k
                        try:
                            dictToCSV['answer'] = self.returnAnswer(i,j,t_i)
                        except:
                            print '\nError on test return answers',i,j,t_i
                    writer.writerow(dictToCSV)    
                    csv_file.flush()
                sys.stdout.write('\r')
                sys.stdout.write(str(k))
                sys.stdout.flush()
        csv_file.close()
        print '\nsuccess'

#### Loading JSON files for dev and test dataset

In [41]:
json_data_dev = load_jsonfile('data/NERdev.json')
json_dataPOS_dev = load_jsonfile('data/POSdev.json')
json_dataOrg_dev = load_jsonfile('data/QA_dev.json')
json_dataPOSOrg_dev = load_jsonfile('data/POSdev.json')  

In [20]:
json_data_test = load_jsonfile('data/NERtest.json')
json_dataPOS_test = load_jsonfile('data/POStest.json')
json_dataOrg_test = load_jsonfile('data/QA_test.json')
json_dataPOSOrg_test = load_jsonfile('data/POStest.json') 

#### Output data results

In [21]:
qa_dev = answer_rank(json_data_dev,json_dataOrg_dev,json_dataPOS_dev,json_dataPOSOrg_dev)
qa_dev.detectQuestion(0,0,1,2)
answerNER = qa_dev.json_data[0][2][1]
qa_dev.readCSV("data/bm25_dev_predictions.csv",True)
qa_dev.writeToFile("data/bm25_dev_result.csv",True)

qa_test = answer_rank(json_data_test,json_dataOrg_test,json_dataPOS_test,json_dataPOSOrg_test)
qa_test.readCSV("data/bm25_test_predictions.csv",devdata = False)
qa_test.writeToFile("data/bm25_test_result.csv",devdata =False)

NameError: name 'json_data_dev' is not defined

# Error Analysis

** Task 1)** Sentence Retrival:<br >
> **Errors Found**: Errors on prediciting sentences.so there are few sentence that do not have any predictions <br>

> **Reasons**: from tf-idf model there are 6 sentences which is can not produce any relevent sentences to them, after tokenzie those query.For example 'who was then runner up' , the query will left token 'runner' in this query or ecen nothing if all tokens are belong to stopwords.but the document corpus dost not exit. therefore base on current algorithm,  tf-idf model wont predict any sentencs secore back<br> 
However, thery another resons its the Unicode , some the recouse sentences might contain Latin terms, which can not match with ASCII code example of 'chāṇakya' but the query term is 'chanakya', and lastly the typo errors also might occurs, example of term 'mielno' but query term is 'milno'therore it wont reusult answers, <br>

>**Imporovement**: there are two possible solutions when those query can not extract relevant sentences. first one is change model to language model. after we smooth langurage model. the wont occur zero probabbblity to predict sentences. there fore there will least some sentences will be predict. second solutions is to use semantic ways. in term of semantic, this allow models to choose different similarity word to re-score sentences, simple means when if there words in query dosent exist in corpus we substitute the key words to re-score it.
    
**Task 2)** Entity Retrival: <br >
> **Errors Found**: NER provide a entity extraction rules to parse entity from sentenes which can not extract right entity or assign an wrong entity <br>

>**Reasons:** In this project, NER will provide 5 types of entity 'PERSON','NUMBER','ORGANZIATON','LOCATION','OTHER'many of the query. There are alot of date numbers for example 4000 NER will automatic consider as other rather than number. Some really unusall terms cant not be classify by NER. For Example , the term phrase 'Emilio Aguinaldo and Apolinario Mabini' is extreamlly unsuall too seen as "PERSON",but this term is actually belong to 'PERSON'. in term '0.9–14' the punctuation will also affect the entity extractions,moreover if the term collaps between entiries, a example of 'Christian Dior' seems should be 'PERSON' , but it will allocate to 'ORGANZIATION'.

> **Improvemnt** Some entity can be process by rule based. coutiounsly implemanting by human design rule to preprocessing the rules and filter out  from entity 'OTHER', but it comes ineffiency and time consuming for human filter out entity<br>

**Task 3)** Answer Ranking: <br >
> **Errors Found** on reuturning answers the errors occurs when for the focus extraction,ususally rule base ia hard to produce the right part of answer sentene even if right found <br>

>**Resons** Afterwards sentence retrival, base on the rule design, for example if three 'PERSON' tags were found in answer sentences, rule will only filter out the one closet to part based on position distance.The chanllange for answer ranking is the third step to chooose the closed-class word. the varity of of answer still complicated. as example '2000 mm' can be answer for numbers  which contains units, it will incorrectly '2000'. For 'Location' same errors will comes up when 

> **Improvement ** : Beside to implementing rules to selcet rules, it might also evaualtion two condadiate words againest questions.<br>



In [None]:
t1 = time.time() - t0
print 'Running time is ',t1