# Module 1. Prepare BM25

In [1]:
import math,time
from six import iteritems
from nltk import FreqDist, word_tokenize
from collections import defaultdict
from nltk.corpus import stopwords
# English stop words; BM25_Model.my_tokenize drops these before stemming.
stop_english = set(stopwords.words('english'))

from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
# Module-level NLP helpers created once when this cell runs.
stemmer = SnowballStemmer("english")
wnl = WordNetLemmatizer()
# Wall-clock timestamp taken when this cell runs; not read by the code visible in this file.
start  = time.time()


class BM25_Model(object):
    """BM25-style ranker over a collection of raw-text documents.

    Every document is tokenized once at construction time; ``initialize``
    then builds per-document term frequencies, document frequencies, IDF
    values and an inverted index that ``bm25_get_most_relevant`` reuses for
    every query.
    """

    # Multiplier applied to a matching term according to its n-gram order:
    # index 0 = unigram (plain token), 1 = bigram tuple, 2 = trigram tuple.
    NGRAM_WEIGHTS = (0.6, 0.20, 0.20)

    # Query terms present in at least this share of the documents are skipped.
    MAX_DOC_RATIO = 0.80

    def __init__(self, document_collection, K1=0.3, B=0.0, K3=1.0, EPS=0.000001, tokenizer=None):
        """Tokenize and index ``document_collection``.

        :param document_collection: sequence of raw document strings.
        :param K1: term-frequency saturation parameter.
        :param B: length-normalisation strength (``0`` disables it).
        :param K3: stored on the instance; not used by the scoring code here.
        :param EPS: factor applied to the average IDF when a term's IDF is negative.
        :param tokenizer: optional callable turning a string into a list of
            tokens; defaults to :meth:`my_tokenize`.
        """
        # Fix: the ``tokenizer`` argument used to be ignored.
        self.tokenizer = tokenizer if tokenizer is not None else self.my_tokenize
        self.document_collection_length = len(document_collection)
        # ``len()`` of every document exactly as passed in; kept so that the
        # length normalisation can compare a document with the average in
        # the same unit.
        self.doc_lengths = [float(len(doc)) for doc in document_collection]
        if self.doc_lengths:
            self.avg_doc_length = sum(self.doc_lengths) / self.document_collection_length
        else:
            # An empty collection is legal; there is simply nothing to rank.
            self.avg_doc_length = 0.0
        self.document_collection = [self.tokenizer(doc) for doc in document_collection]
        self.f = []
        self.df = defaultdict(int)
        self.bm25_idf = defaultdict(float)
        self.idf_1 = defaultdict(float)
        self.average_idf = -1
        self.K1 = K1
        self.K3 = K3
        self.EPSILON = EPS
        self.B = B
        self.inverted_index = defaultdict(list)
        self.initialize()

    def initialize(self):
        """Build term frequencies, document frequencies, IDFs and the inverted index."""
        for index, document in enumerate(self.document_collection):
            frequencies = FreqDist(document)
            self.f.append(frequencies)
            for word in frequencies:
                self.df[word] += 1
                self.inverted_index[word].append(index)

        for word, freq in self.df.items():
            self.bm25_idf[word] = (math.log(self.document_collection_length - freq + 0.5)
                                   - math.log(freq + 0.5))
        # With no terms at all the sentinel value set in __init__ (-1) is kept.
        if self.bm25_idf:
            self.average_idf = sum(float(v) for v in self.bm25_idf.values()) / len(self.bm25_idf)

    def my_tokenize(self, sentence):
        """Return stemmed non-stop-word tokens followed by their bigrams and trigrams."""
        tokens = word_tokenize(sentence.lower())
        stems = [stemmer.stem(tok) for tok in tokens if tok not in stop_english]
        ngrams = list(stems)
        for order in (2, 3):
            ngrams += self.get_n_gram(stems, order)
        return ngrams

    def get_n_gram(self, ls, n):
        """Return every contiguous ``n``-item window of ``ls`` as a tuple."""
        return [tuple(ls[i:i + n]) for i in range(len(ls) - n + 1)]

    def predict(self, queryX, limit=1):
        """Return, for each query, the indexes of its ``limit`` best documents.

        Queries without any scoring document yield an empty list.
        """
        return [[doc_index for doc_index, _ in self.bm25_get_most_relevant(query)[:limit]]
                for query in queryX]

    def bm25_get_most_relevant(self, query):
        """Score all documents sharing a term with ``query``.

        :returns: list of ``(document index, score)`` sorted by descending score.
        """
        scores = defaultdict(float)
        total_docs = float(self.document_collection_length)

        for q_token in self.tokenizer(query):
            # ``.get`` keeps unseen query tokens from being inserted into the
            # defaultdict-based index as a side effect of a lookup.
            doc_freq = self.df.get(q_token, 0)
            if not doc_freq or doc_freq / total_docs >= self.MAX_DOC_RATIO:
                continue

            idf = self.bm25_idf[q_token]
            if idf < 0:
                idf = self.EPSILON * self.average_idf
            order = len(q_token) if isinstance(q_token, tuple) else 1
            weight = self.NGRAM_WEIGHTS[order - 1]

            for doc_index in self.inverted_index[q_token]:
                tf = self.f[doc_index][q_token]
                length_norm = 1 - self.B
                if self.B and self.avg_doc_length:
                    # Fix: normalise by THIS document's length.  The previous
                    # code used (number of documents / average length), which
                    # is the same for every document and not a length ratio.
                    length_norm += self.B * self.doc_lengths[doc_index] / self.avg_doc_length
                # float() guards against integer division on Python 2 when
                # K1 and B are passed as ints.
                score = idf * (float(tf * (self.K1 + 1)) / (tf + self.K1 * length_norm))
                score *= weight
                scores[doc_index] += score

        # ``lambda (k, v): v`` is Python-2-only syntax; index the pair instead.
        return sorted(scores.items(), key=lambda pair: pair[1], reverse=True)

# Module 2. Prepare the dataset

In [2]:
import json,time,sys,os
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import StanfordNERTagger, StanfordPOSTagger
from nltk.internals import find_jars_within_path

# Generate the path prefix for the tagger files: the current working
# directory followed by '//'.
cwd=os.getcwd()+'//'

# Module-level Stanford POS tagger; the model and jar files are looked up
# under the working directory computed above.
pos_tagger = StanfordPOSTagger(cwd+'wsj-0-18-left3words-distsim.tagger',
                                            cwd+'stanford-postagger.jar', encoding='utf-8')

class Prepare:
    """Tokenize and NER/POS-tag a QA dataset and write it out as JSON lines.

    The dataset is read from the first line of ``source``.  Each element is
    expected to carry a ``'sentences'`` list and a ``'qa'`` list whose items
    have a ``'question'`` (plus ``'answer'`` / ``'answer_sentence'`` for dev
    data), as read by :meth:`launch`.
    """

    # Stanford NER model file for each supported classification scheme.
    NER_MODELS = {
        '7class': 'english.muc.7class.distsim.crf.ser.gz',
        '3class': 'english.all.3class.distsim.crf.ser.gz',
    }

    def __init__(self, source, classification='3class'):
        """Load the dataset and create the Stanford taggers.

        :param source: path of the JSON file (dataset on its first line).
        :param classification: ``'3class'`` or ``'7class'`` NER model.
        :raises ValueError: for any other ``classification`` (previously an
            ``assert False``, which is stripped under ``python -O``).
        """
        if classification not in self.NER_MODELS:
            raise ValueError("classification must be '3class' or '7class', got %r"
                             % (classification,))

        self.source = source
        self.stop_english = set(stopwords.words('english'))
        # ``with`` closes the handle that used to be leaked.
        with open(source) as source_file:
            self.dataset = json.loads(source_file.readline())

        model_file = self.NER_MODELS[classification]
        self.ner_tagger = StanfordNERTagger(cwd+model_file, cwd+'stanford-ner.jar', encoding='utf-8')

        self.pos_tagger = StanfordPOSTagger(cwd+'wsj-0-18-left3words-distsim.tagger',
                                            cwd+'stanford-postagger.jar', encoding='utf-8')

        self.prog_total = len(self.dataset)

    def dmerge(self, ner, pos):
        """Merge the NER and POS tagging of one token.

        A real NER label wins.  Otherwise the token becomes ``NUMBER`` when it
        contains a digit or its POS tag is ``CD``; failing that the POS pair
        is returned unchanged (which may be ``None`` if no POS pair was given).
        """
        # Local import: the notebook cell defining this class does not import
        # ``re`` itself, so the method would otherwise depend on a later cell.
        import re

        if ner[1] != 'O':
            return ner
        if pos and (re.search(r'[0-9]+', ner[0]) or pos[1] == 'CD'):
            return ner[0], 'NUMBER'
        return pos

    def _merge_tag(self, ners, poss):
        """Merge two parallel tag sequences of one sentence into a list."""
        # list(): a bare ``map`` is lazy on Python 3 and not JSON serialisable.
        return list(map(self.dmerge, ners, poss))

    def putd(self, dic, key, value):
        """Store ``value`` under ``key`` in ``dic``."""
        dic[key] = value

    def _tokenize_context(self, col):
        """Add question tokens, the answer sentence and its tokens to every QA item."""
        for qa in col['qa']:
            qa['question_tks'] = word_tokenize(qa['question'])
            qa['ans_sent'] = col['sentences'][qa['answer_sentence']].encode('utf-8').decode('utf-8')
            qa['ans_sent_tks'] = word_tokenize(qa['ans_sent'])

    def _tag_field(self, qas, field):
        """Replace the token list stored under ``field`` by merged (token, tag) pairs."""
        sentences = [qa[field] for qa in qas]
        ner_sents = self.ner_tagger.tag_sents(sentences)
        pos_sents = self.pos_tagger.tag_sents(sentences)
        for qa, ner_sent, pos_sent in zip(qas, ner_sents, pos_sents):
            qa[field] = self._merge_tag(ner_sent, pos_sent)

    def launch(self, simplified=False, purpose='dev', file_name='xdev_cb7_standard.json'):
        """Tag the whole dataset and write one JSON document per line.

        :param simplified: only tokenize/tag the answers and drop
            ``'sentences'`` from the output (dev data only).
        :param purpose: ``'dev'`` (answer sentences are given) or ``'test'``
            (the answer sentence is chosen with BM25).
        :param file_name: output path.
        :returns: the dataset, which is modified in place.
        :raises ValueError: on an unsupported ``purpose`` / ``simplified`` combination.
        """
        print('Souce Using: %s Simplified: %s purpose %s' % (self.source, simplified, purpose))
        ts = time.time()

        if purpose not in ('dev', 'test'):
            raise ValueError("purpose must be 'dev' or 'test', got %r" % (purpose,))
        if simplified and purpose != 'dev':
            raise ValueError("simplified output is only available for purpose='dev'")

        prog_i = 0.0
        # ``with`` guarantees the output file is closed even if tagging fails.
        with open(file_name, 'w') as jfile:
            for col in self.dataset:
                document_collection = col['sentences']
                qas = col['qa']

                if purpose == 'test':
                    bm25_query_model = BM25_Model(document_collection)
                    for qa in qas:
                        # One BM25 query per question (it used to be run twice).
                        ranked = bm25_query_model.predict([qa['question']])[0]
                        qa['answer_sentence'] = ranked[0] if ranked else 0
                    self._tokenize_context(col)

                    try:
                        for qa in qas:
                            qa['answer_tks'] = word_tokenize(qa['answer'])
                    except (KeyError, TypeError):
                        # Best effort: test items presumably carry no usable
                        # 'answer'; stop at the first one that does not.
                        pass

                elif purpose == 'dev':
                    if not simplified:
                        self._tokenize_context(col)
                    for qa in qas:
                        qa['answer_tks'] = word_tokenize(qa['answer'])

                try:
                    if not simplified:
                        self._tag_field(qas, 'question_tks')
                        self._tag_field(qas, 'ans_sent_tks')

                    if qas and 'answer_tks' in qas[0]:
                        self._tag_field(qas, 'answer_tks')

                # NOTE(review): only ImportError is handled here, as in the
                # original code; confirm which tagger failures were meant.
                except ImportError:
                    col['qa'] = None
                    col = None
                    print('Error: %s #####' % prog_i)

                if col:
                    if simplified:
                        del col['sentences']
                    jfile.write(json.dumps(col) + '\n')
                prog_i += 1
                sys.stdout.write('\r')
                sys.stdout.write("%f%%" % (prog_i * 100.0 / self.prog_total))
                sys.stdout.flush()

        print('\n%s is generatred' % file_name)
        print('EXEC:  %s' % (time.time() - ts))
        return self.dataset

In [3]:
from nltk.tokenize import word_tokenize
from collections import defaultdict
from sklearn import metrics
from nltk.stem.snowball import SnowballStemmer
import re,json
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

class ClassificationBuild:
    """Question-type classifier: keyword rules combined with logistic regression.

    Training labels are derived from the NER tags of each answer
    (``'answer_tks'``); questions are vectorised with a bag of stemmed words.
    """

    # Answer tags in the order in which they are looked for; the first one
    # present in the answer decides the question label.
    _TAG_PRIORITY = ('DATE', 'TIME', 'MONEY', 'PERCENT', 'NUMBER',
                     'PERSON', 'LOCATION', 'ORGANIZATION')
    # Tag -> label overrides for each scheme (tags not listed map to themselves).
    _RELABEL_7 = {'TIME': 'DATE'}
    _RELABEL_3 = {'DATE': 'NUMBER', 'TIME': 'NUMBER', 'MONEY': 'NUMBER', 'PERCENT': 'NUMBER'}

    # (keywords, label) rules tried in order on the lower-cased question.
    _RULES_3 = (
        (('where', 'what city', 'which city'), 'LOCATION'),
        (('when', 'how much', 'how many', 'what rate'), 'NUMBER'),
        (('whom', 'who'), 'PERSON'),
    )
    # NOTE(review): unlike the 3-class rules there is no 'how many' rule
    # here; kept as in the original code - confirm whether that is intended.
    _RULES_7 = (
        (('where', 'what city', 'which city'), 'LOCATION'),
        (('when',), 'DATE'),
        (('how much',), 'MONEY'),
        (('what rate',), 'NUMBER'),
        (('whom', 'who'), 'PERSON'),
    )

    def __init__(self, train_data=None, test_data=None, nclass='7class'):
        """Extract questions/labels from both datasets and create the model.

        :param train_data: iterable of collections with a ``'qa'`` list (or None).
        :param test_data: same structure, used for evaluation (or None).
        :param nclass: ``'7class'`` or ``'3class'`` label scheme.
        :raises ValueError: for any other ``nclass``.
        """
        if nclass not in ('7class', '3class'):
            raise ValueError("nclass must be '7class' or '3class', got %r" % (nclass,))
        print('building dataset..')
        tt0 = time.time()

        self.nclass = nclass
        self.stemmer = SnowballStemmer("english")
        self.stop_english = set(stopwords.words('english'))

        self.test_Xo, self.test_y, self.test_by_cat = self._extract(test_data)
        self.train_Xo, self.train_y, self.train_by_cat = self._extract(train_data)
        self.ml = LogisticRegression(C=1.0)
        self.cv = CountVectorizer(stop_words='english', tokenizer=self.my_tokenize)
        self.combined = True
        print('Finish initialization: %s' % (time.time() - tt0))

    def _label_question(self, qa, relabel):
        """Return ``{label: [{'q': question, 'a': answer}]}`` for one QA item."""
        qsent = qa['question']
        answer_tags = [tok[1] for tok in qa['answer_tks']]
        tag = 'O'
        for candidate in self._TAG_PRIORITY:
            if candidate in answer_tags:
                tag = relabel.get(candidate, candidate)
                break

        labelled = defaultdict(list)
        labelled[tag].append({'q': qsent, 'a': qa['answer']})
        return labelled

    def q_qtag_c7(self, qa):
        """Label one QA item with the 7-class scheme (TIME is folded into DATE)."""
        return self._label_question(qa, self._RELABEL_7)

    def q_qtag_c3(self, qa):
        """Label one QA item with the 3-class scheme (all numeric tags become NUMBER)."""
        return self._label_question(qa, self._RELABEL_3)

    def _extract(self, dataset):
        """Collect questions and labels from ``dataset``.

        :returns: ``(questions, labels, items grouped by label)``.  ``None``
            or an empty dataset yields empty results instead of raising.
        """
        if self.nclass == '7class':
            label_of = self.q_qtag_c7
        elif self.nclass == '3class':
            label_of = self.q_qtag_c3
        else:
            raise ValueError("nclass must be '7class' or '3class', got %r" % (self.nclass,))

        # Single grouping pass; the former reduce() copied every accumulated
        # list at each step and failed on an empty dataset.
        train_set_by_cat = defaultdict(list)
        for col in dataset or []:
            # Prepare.launch sets 'qa' to None for collections it failed on.
            for qa in col['qa'] or []:
                for tag, items in label_of(qa).items():
                    train_set_by_cat[tag].extend(items)

        _X, _y = [], []
        for cat in train_set_by_cat:
            for item in train_set_by_cat[cat]:
                _X.append(item['q'])
                _y.append(cat)
        return _X, _y, train_set_by_cat

    def my_tokenize(self, sentence):
        """Tokenize, keep alphanumeric non-stop-word tokens and stem them."""
        tokens = word_tokenize(sentence.lower())
        # Fixes: the comprehension iterated ``ii`` but used ``i``; ``string``
        # was never imported; and ``(stop_english and string.punctuation)``
        # evaluated to the punctuation string only.  Pure punctuation tokens
        # are already rejected by the alphanumeric test.
        return [self.stemmer.stem(tok) for tok in tokens
                if re.search(r'[a-z0-9]+', tok) and tok not in self.stop_english]

    def _fit(self, tXo, ty):
        """Fit the vectoriser and the classifier on raw questions ``tXo`` / labels ``ty``."""
        tX = self.cv.fit_transform(tXo)
        self.ml.fit(tX, ty)
        return

    def _apply_rules(self, x, rules):
        """Return the label of the first rule with a keyword contained in ``x``."""
        text = x.lower()
        for keywords, label in rules:
            if any(keyword in text for keyword in keywords):
                return label
        if 'which' in text and 'team' in text:
            return 'ORGANIZATION'
        return 'O'

    def _pred3_(self, x, xm=None):
        """Rule-based 3-class label of question ``x`` (``'O'`` when no rule fires)."""
        return self._apply_rules(x, self._RULES_3)

    def _pred7_(self, x, xm=None):
        """Rule-based 7-class label of question ``x`` (``'O'`` when no rule fires)."""
        return self._apply_rules(x, self._RULES_7)

    def predict(self, Xo):
        """Predict one label per raw question in ``Xo``.

        The rule label is used unless the rules abstain (``'O'``) or the
        model predicts ``'ORGANIZATION'``; in those cases the model wins.
        An empty ``Xo`` returns an empty list.
        """
        if self.nclass == '7class':
            fun_pred = self._pred7_
        elif self.nclass == '3class':
            fun_pred = self._pred3_
        else:
            raise ValueError("nclass must be '7class' or '3class', got %r" % (self.nclass,))

        if not Xo:
            return []

        # The former branches for "no matrix" / "no questions" could never
        # run (one of them also fed raw text to the model); only the combined
        # path is kept.
        ml_prediction = self.ml.predict(self.cv.transform(Xo))
        combined = []
        for question, ml_p in zip(Xo, ml_prediction):
            rl_p = fun_pred(question)
            if rl_p == ml_p or rl_p == 'O' or ml_p == 'ORGANIZATION':
                combined.append(ml_p)
            else:
                combined.append(rl_p)
        return combined

    def build_model_and_evaluate(self, report=True):
        """Train on the training split, evaluate on the test split.

        :param report: print macro F1, accuracy and the classification report.
        :returns: the accuracy (previously computed but discarded).
        """
        print('starting..')
        # training
        self._fit(self.train_Xo, self.train_y)
        pred = self.predict(Xo=self.test_Xo)
        # score
        accuracy = metrics.accuracy_score(self.test_y, pred)
        if report:
            print('-' * 100)
            print("macro f1 score:   %0.3f" % metrics.f1_score(self.test_y, pred, average='macro'))
            print("accuracy:   %0.3f \n\n" % accuracy)
            print(metrics.classification_report(self.test_y, pred))
        return accuracy

    def __str__(self):
        if self.combined and self.ml:
            return 'Combined ########\n' + str(self.ml)
        else:
            return 'Damaged!'

    def _tagup(self, dcol):
        """Store the predicted label of every question of ``dcol`` under ``'tag'``."""
        # One batched prediction per collection instead of one per question.
        questions = [q['question'] for q in dcol['qa']]
        for q, tag in zip(dcol['qa'], self.predict(questions)):
            q['tag'] = tag
        return dcol

    def tag_prediction(self, dataset, out_file_name='bm25_qtag_test.json'):
        """Tag every collection in place and write them to ``out_file_name`` as JSON lines."""
        data_set_for_tag = [self._tagup(col) for col in dataset]
        # ``with`` closes (and therefore flushes) the file that used to be left open.
        with open(out_file_name, 'w') as output_file:
            for col in data_set_for_tag:
                output_file.write(json.dumps(col) + '\n')
        return data_set_for_tag

In [4]:
# Define the function that loads a dataset from a JSON (or JSON-lines) file
def get_dataset(filen):
    """Load a dataset from a JSON file or a JSON-lines file.

    :param filen: path of the file, or an already opened file object.
    :returns: the decoded document when the whole file is one JSON value;
        otherwise the list of documents decoded from each non-blank line
        (a single-line file yields that one document, an empty file ``[]``).
    :raises ValueError: if a line of a multi-line file is not valid JSON.
    """
    # ``json.load`` needs a file object; handing it the path (as before)
    # always failed and was hidden by a bare ``except``.
    if hasattr(filen, 'read'):
        text = filen.read()
    else:
        with open(filen) as handle:
            text = handle.read()

    try:
        return json.loads(text)
    except ValueError:
        # Not a single JSON value: fall back to one document per line.
        pass

    # Split on '\n' only; blank lines (e.g. a trailing one) are skipped
    # instead of aborting the load.
    dataset = [json.loads(line) for line in text.split('\n') if line.strip()]
    if len(dataset) == 1:
        return dataset[0]
    return dataset

# Module 4. Build word2vector Model

In [5]:
import os,time
from nltk import word_tokenize
import gensim, logging
from nltk.stem import WordNetLemmatizer as WNL
from nltk.tag import PerceptronTagger
from nltk.data import find

# Locate the pickled averaged-perceptron model through nltk.data.find and
# load it into a tagger created with load=False; the tagger's ``tag`` method
# is then exposed as ``pos_tag``.
PICKLE = "averaged_perceptron_tagger.pickle"
AP_MODEL_LOC = 'file:'+str(find('taggers/averaged_perceptron_tagger/'+PICKLE))
tagger = PerceptronTagger(load=False)
tagger.load(AP_MODEL_LOC)
pos_tag = tagger.tag

class W2V_Model:
    """Word2Vec model over lemmatized sentences and questions.

    The model is loaded from ``model_file`` when that file exists; otherwise
    it is trained on ``dataset`` and saved to ``model_file``.
    """

    def __init__(self, model_file='word_siml_lem_s.model', dataset=None):
        """Load or train the model.

        :param model_file: path the model is loaded from / saved to.
        :param dataset: iterable of collections with ``'sentences'`` and a
            ``'qa'`` list of items having a ``'question'``; only needed when
            ``model_file`` does not exist.
        :raises ValueError: if training is required but no dataset was given.
        """
        t0 = time.time()
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                            level=logging.INFO)

        if os.path.exists(model_file):
            self.model = gensim.models.Word2Vec.load(model_file)
            return

        if not dataset:
            raise ValueError('a dataset is required to train a model because %r does not exist'
                             % (model_file,))

        lemmatizer = WNL()
        # Build a fresh list: the former reduce()/+= combination could extend
        # the caller's own 'sentences' list in place and was quadratic.
        corpus = []
        for doc in dataset:
            corpus.extend(doc['sentences'])
        for doc in dataset:
            corpus.extend(qa['question'] for qa in doc['qa'])

        sentences = []
        for sent in corpus:
            tagged = pos_tag(word_tokenize(sent.lower()))
            # Any tag containing 'V' is lemmatized as a verb, everything else
            # as a noun (same test as the former unanchored regex, without
            # needing ``re`` in this cell).
            sentences.append([lemmatizer.lemmatize(tok, 'v' if 'V' in tag else 'n')
                              for tok, tag in tagged])

        self.model = gensim.models.Word2Vec(sentences, size=500, min_count=0, workers=8, iter=50)
        self.model.save(model_file)
        print('Word2vector running time: %s' % (time.time() - t0))

    def similarity(self, w1, w2):
        """Return the model similarity of two words, or 0.0 if it cannot be computed."""
        try:
            # Fix: use this instance's model, not the global ``w2v_model``.
            return self.model.similarity(w1, w2)
        except Exception:
            # Deliberate best effort (e.g. a word missing from the vocabulary).
            return 0.0

    def ws_similarity(self, wls1, wls2):
        """Return the model similarity of two word lists, or 0.0 if it cannot be computed."""
        try:
            return self.model.n_similarity(wls1, wls2)
        except Exception:
            # Deliberate best effort: callers in this file may pass None as a list.
            return 0.0


# Shared word2vec model: W2V_Model loads its model file if it exists,
# otherwise it trains on the two datasets concatenated here.
w2v_train = get_dataset('QA_dev.json') + get_dataset('QA_test.json')  
w2v_model = W2V_Model(dataset=w2v_train)

Word2vector running time: 71.5911831856


### Module 5.1. Define tag extraction

In [6]:
import sys,json

class QASS:
    """Flatten the QA items of a dataset and number them.

    Every QA item receives a ``'doc_id'`` (index of its collection) and a
    ``'qa_id'`` (its index inside the collection); ``all_qa`` lists all items
    in dataset order.
    """

    # Answer tags in lookup order; the first one present decides the tag.
    _TAG_PRIORITY = ('DATE', 'TIME', 'MONEY', 'PERCENT', 'NUMBER',
                     'PERSON', 'LOCATION', 'ORGANIZATION')

    def __init__(self, dataset):
        """Mark and collect the QA items of ``dataset`` (items are modified in place)."""
        self.all_qa = []
        # Plain loops replace ``map`` with tuple-parameter lambdas, which are
        # Python-2-only syntax and lazy (never executed) on Python 3.
        for did, col in enumerate(dataset):
            self.mark(did, col)
            self.all_qa.extend(col['qa'])

    def mark(self, did, col):
        """Store the collection id and the running index on every QA item of ``col``."""
        for qid, qa in enumerate(col['qa']):
            qa['doc_id'] = did
            qa['qa_id'] = qid

    def actual_qtype_tag_7(self, qa):
        """Set ``qa['tag']`` from the tags in ``qa['answer_tks']`` (7-class scheme).

        Items without usable ``'answer_tks'`` are returned untouched.
        """
        try:
            answer_tags = [tok[1] for tok in qa['answer_tks']]
        except (KeyError, TypeError, IndexError):
            # No (or malformed) gold answer tokens: nothing to derive.
            return qa

        tag = 'O'
        for candidate in self._TAG_PRIORITY:
            if candidate in answer_tags:
                # TIME answers are labelled DATE.
                tag = 'DATE' if candidate == 'TIME' else candidate
                break
        qa['tag'] = tag
        return qa

    def get_qass(self, enable_actag=False):
        """Return all QA items, tagging them from their answers first if ``enable_actag``."""
        if enable_actag:
            return [self.actual_qtype_tag_7(qa) for qa in self.all_qa]
        return self.all_qa



### Module 5.2. Define Segment grammar

In [7]:
from nltk import RegexpParser, Tree
from nltk.stem import WordNetLemmatizer as WNL

# WordNet lemmatizer used by the QA class defined further down.
wnl = WNL()

# Chunk grammar handed to nltk.RegexpParser: it groups tagged tokens into
# labelled chunks (CORE, ETY_*, DATE, STRESS, MNE*).  Lines starting with
# '#' inside the string are rules that are switched off.
grammar = """
    # NNP:
    # {<DT><NNP>+}

    CORE:
    <W.*>{<N.*>+}<V.*>

    ETY_ORG:
    {<DT><ORGANIZATION>+<OF><LOCATION>+}
    {<THE><ORGANIZATION>+<BRA><.*>+<BRB>}
    {<THE><ORGANIZATION>+}
    {<ORGANIZATION>+<CC><NNP>}
    {<ORGANIZATION>+}
    {<NNP><NN><OF><LOCATION>}
    {<DT><NNP|CC|OF>+<NNP>}


    # JJ:
    # {<AN><NUMBER>}

    DATE:
    {<DATE>+<NUMBER|,>+<NUMBER|DATE>}
    DATE:
    {<DATE>+<NUMBER>?}
    DATE:
    {<DATE><NUMBER>+}
    DATE:
    {<NUMBER>+<DATE>}


    ETY_DATE:
    {<DATE>+<,>+<DATE>+}
    {<DATE>+<,>?<DATE>+}
    {<DATE>+}

    ETY_TIME:
    {<TIME>+}    

    ETY_MNY:
    {<MONEY><MONEY|NUMBER>+}
    {<MONEY|NUMBER>+<MONEY>}
    {<MONEY>+}

    ETY_NUM:
    {<NUMBER>+}

    ETY_PSN:
    {<PERSON>+}

    ETY_LOC:
    {<DT>?<LOCATION>+}

    ETY_PRCT:
    {<PERCENT>+}

    ETY_STRS:
    <``>{<.*>+}<''>

    STRESS:
    <BRA>{<ETY.*>}<BRB>


    MNE:
    {<THE>?<NNP><JJ>+<NN>+}
    {<THE>?<NNP><NNP|DT|OF>+}

    MNE2:
    {<DT><N.*>+<OF><J.*><N.*>+}
    {<J.*>+<N.*>+}

    MNE4:
    {<AN><N.*>+}
    {<J.*>?<N.*>+}

    MNE5:
    {<J.*>+}

    # NX:
    # {<NE>+<OF>?<DT>?<NE>+}

    # BULLSHIT:
    # {<N.*>+<OF>?<DT>?<J.*>?<N.*>+}
    # {<N.*>+<IN>?<DT>?<J.*>+<N.*>+}
    # {<NNP>+<IN>?<DT>?<J.*>?<NNP>+}
    # {<N.*>+<CC>?<DT>?<J.*>+<N.*>+}
    # {<N.*>+<CC>?<DT>?<J.*>?<N.*>+}
    # <BRA>{<.*>+}<BRB>


"""

# Second chunk grammar: PSV (a BEV tag followed by VBN/VBD tags, presumably
# passive verb groups), PRED (runs of verb tags) and PX (runs of IN tags).
# Lines starting with '#' inside the string are rules that are switched off.
grammar2 = """
    PSV:
    {<BEV><IN|RB>?<VBN|VBD>+<IN|RB>?}

    PRED:
    {<V.*>+<IN|RB>?}

    PX:
    {<IN>+}

    # SUBP:
    # {(?!<VV>)<.*>+(?!<VV>)}<PRED|PSV>
    # 
    # OBJP:
    # <PSV|PRED>{(?!<VV>)<.*>+(?!<VV>)}

"""

import math
from collections import OrderedDict

# One chunk parser per grammar string defined above.
rg_parser = RegexpParser(grammar=grammar)
rg_parser2 = RegexpParser(grammar=grammar2)


### Module 5.3. Answer extraction and ranking

In [8]:
from nltk.metrics import *
import re


class QA:
    """Score the chunks of one answer sentence as answers to one question.

    ``qa`` must provide ``'question_tks'`` and ``'ans_sent_tks'`` as
    sequences of (token, tag) pairs and a ``'tag'`` (expected answer type).
    Construction builds one mask entry (an ``OrderedDict``) per token and
    scores it; :meth:`summary` then ranks the chunks.

    Note: ``qa['ans_sent_tks']`` is rewritten in place with normalised tags.
    """

    _BE_FORMS = ('is', 'are', 'was', 'were', 'am', 'been', 'be', 'being')
    _MONTHS = ('January', 'February', 'March', 'April', 'May', 'June', 'July', 'August',
               'September', 'October', 'November', 'December')
    _UNIT_WORDS = ('nm', 'mi', 'km', 'mile', 'miles', 'per', '%')

    def __init__(self, qa):
        self.qa = qa

        # Question mask: seq / ori / tag / lem for every question token.
        self.qmask = self._new_mask(qa['question_tks'])
        for entry, tok in zip(self.qmask, qa['question_tks']):
            entry['lem'] = wnl.lemmatize(tok[0].lower(), self.vb(tok[1]))

        q_tree = rg_parser.parse([(m['lem'], m['tag']) for m in self.qmask])
        self.core = self.identify_q_core([self._ety_expand(node) for node in q_tree])

        self.q_type = [t[0] for t in qa['question_tks'] if re.search(r'^W.*', t[1])]
        self.highlight = self.highlightq()

        self.a_length = len(qa['ans_sent_tks'])
        self.a_tokens_only = [tok[0] for tok in qa['ans_sent_tks']]
        # Replace the raw tags of the answer sentence by normalised ones.
        self.qa['ans_sent_tks'] = [(tok[0], self.preprocess((seq, tok)))
                                   for seq, tok in enumerate(qa['ans_sent_tks'])]
        self.amask = self._new_mask(self.qa['ans_sent_tks'])

        self.build_mask()

        self.scoring()

    def _new_mask(self, tokens):
        """Return one OrderedDict (keys seq, ori, tag) per (token, tag) pair."""
        mask = []
        for seq, tok in enumerate(tokens):
            entry = OrderedDict()
            entry['seq'] = seq
            entry['ori'] = tok[0]
            entry['tag'] = tok[1]
            mask.append(entry)
        return mask

    def highlightq(self):
        """Return the verb groups of the question as ``{'type', 'voc'}`` dicts.

        ``voc`` holds the verb-lemmatized words of the group, forms of
        "to be" excluded.
        """
        qtks = [(m['ori'], m['tag']) for m in self.qmask]
        be_forms = set(self._BE_FORMS)
        predicates = []
        for node in rg_parser2.parse(qtks):
            grm = self._ety_expand(node)
            if grm and grm[0][1] in ('PRED', 'PSV'):
                d = dict()
                d['type'] = grm[0][1]
                d['voc'] = [wnl.lemmatize(word, 'v')
                            for word in set(g[0] for g in grm) - be_forms]
                predicates.append(d)

        return predicates

    def put(self, dic, cat, content):
        """Store ``content`` under ``cat``."""
        dic[cat] = content

    def adds(self, dic, cat, floatv):
        """Add ``floatv`` to ``dic[cat]`` (a missing key counts as 0.0)."""
        dic[cat] = dic.get(cat, 0.0) + floatv

    def preprocess(self, etg):
        """Return the normalised tag of one answer-sentence token.

        :param etg: ``(position, (token, tag))``.  The checks are ordered;
            the first one that matches decides the tag.
        """
        seq, tg = etg
        word = tg[0]
        if word == 'The' and seq == 0:
            return 'FUK'
        if word == '(':
            return 'BRA'
        if word == ')':
            return 'BRB'
        if word == 'of' and tg[1] == 'IN':
            return 'OF'
        if word == 'The' and tg[1] == 'DT':
            return 'THE'
        if word in ('an', 'a') and tg[1] == 'DT':
            return 'AN'
        if re.search(r'[0-9]+', word):
            return 'NUMBER'
        if word in self._UNIT_WORDS:
            return 'NUMBER'
        if word == 'over':
            return 'OV'
        if word in self._MONTHS:
            return 'DATE'
        if word in self._BE_FORMS:
            return 'BEV'
        if word == 'such':
            return 'INX'
        if word in ('for', 'to'):
            return 'FT'
        if word in ('[', ']'):
            return 'O'
        return tg[1]

    def _semi(self, a, b):
        """Return True when ``a`` and ``b`` are nearly identical by edit distance."""
        total = len(a + b)
        if not total:
            # Two empty strings are identical; avoids a division by zero.
            return True
        simi_rate = 1 - edit_distance(a, b) / float(total) * 2
        return simi_rate > 0.8

    def _ctx(self, a):
        """Classify lemma ``a``: '**' matches the question core, '*' matches
        another question lemma (or 'such'/'as'), '_' matches nothing."""
        if self.core and any(self._semi(qt, a) for qt in self.core):
            return '**'
        q_lem = [m['lem'] for m in self.qmask] + ['such', 'as']
        if any(self._semi(qt, a) for qt in q_lem):
            return '*'
        return '_'

    def vb(self, tk):
        """Return the WordNet POS letter for tag ``tk``: 'v' for V* tags, else 'n'."""
        if re.search(r'^V.*', tk):
            return 'v'
        return 'n'

    def _annotate_chunks(self, parser, word_key, label_key, size_key):
        """Parse the answer mask with ``parser`` and store, on every entry, the
        label of its chunk and the number of tokens in that chunk."""
        tree = parser.parse([(m[word_key], m['tag']) for m in self.amask])
        flat = []
        for node in tree:
            flat.extend(self._ety_expand(node))
        for entry, item in zip(self.amask, flat):
            entry[label_key] = item[1]
            entry[size_key] = item[2]

    def build_mask(self):
        """Complete the answer mask with lem, ctx, ety1/enum and grm/gnum."""
        for entry, tok in zip(self.amask, self.qa['ans_sent_tks']):
            entry['lem'] = wnl.lemmatize(tok[0].lower(), self.vb(tok[1]))

        for entry in self.amask:
            entry['ctx'] = self._ctx(entry['lem'])

        self._annotate_chunks(rg_parser, 'lem', 'ety1', 'enum')
        self._annotate_chunks(rg_parser2, 'ori', 'grm', 'gnum')

    def score_ctx_dup(self, tt):
        """Penalise (-1.0) tokens that repeat a non-core question word."""
        if tt == '*':
            return -1.0
        return 0.0

    def _ety_expand(self, tn):
        """Expand a parse node into ``(word, chunk label, chunk size)`` triples.

        Tokens outside any chunk get the label '%' and size 1.
        """
        if isinstance(tn, Tree):
            tnflat = tn.flatten()
            return [(c[0], tn.label(), len(tnflat)) for c in tnflat]
        else:
            return [(tn[0], '%', 1)]

    def identify_q_core(self, qls):
        """Return the words of the first CORE chunk in ``qls``, or None."""
        for chunk in qls:
            if chunk and chunk[0][1] == 'CORE':
                return [c[0] for c in chunk]
        return None

    def scoring(self):
        """Compute score1 (duplication), score2 (distance to matched runs),
        score3 (word2vec similarity to the core) and score5 (type match)."""

        # score on duplication
        for entry in self.amask:
            entry['score1'] = self.score_ctx_dup(entry['ctx'])

        # score on distance
        ctxx = [(m['seq'], m['lem'], m['ctx']) for m in self.amask]
        ssq = []
        extr = 0.0
        for seq, lem, ctx in ctxx:
            if ctx == '*':
                ssq.append(seq)
            elif ctx == '**':
                ssq.append(seq)
                extr += 1.0
            elif ssq:
                # A run of question words just ended: reward tokens near it.
                for entry, xt in zip(self.amask, ctxx):
                    self.adds(entry, 'score2', self.score_exp_distrb(ssq, xt, extr))
                ssq = []
                extr = 0.0
        # NOTE(review): a run that reaches the end of the sentence is never
        # scored (unchanged from the original) - confirm this is intended.

        # score on w2v
        for entry in self.amask:
            self.adds(entry, 'score3', w2v_model.ws_similarity([entry['lem']], self.core) * 3.5)

        for entry in self.amask:
            self.adds(entry, 'score5', self.score_type(entry) * 1.0)

    def score_type(self, m):
        """Return 1.0 when the chunk label of ``m`` fits the expected answer type."""
        if self.qa['tag'] == 'NUMBER' and m['ety1'] in ['ETY_PRCT', 'ETY_NUM', 'ETY_DATE', 'ETY_MNY']:
            return 1.0
        if self.qa['tag'] in ['PERSON', 'ORGANIZATION'] and m['ety1'] in ['ETY_PSN', 'ETY_ORG', 'MNE']:
            return 1.0
        return 0.0

    def _new_candidate(self):
        """Return an empty answer-candidate record."""
        return {'ori': [], 'ety': None, 'length': 0, 'dup': 0, 'score': 0.0}

    def summary(self):
        """Rank the chunks of the answer sentence.

        :returns: ``(top candidate equals the gold answer, gold answer is
            among the candidates, result row)``.  The row has ``id`` and
            ``answer`` when the QA item has an ``'id'``; otherwise ``q``,
            ``a``, ``apool`` and ``cort``.
        """
        try:
            answer = [tok[0] for tok in self.qa['answer_tks']]
        except (KeyError, TypeError, IndexError):
            answer = []

        wcol = []
        k = self._new_candidate()
        # Single pass over the tokens.  The chunk boundary is now an explicit
        # condition; it used to be detected with ``assert`` inside
        # ``try/except``, which never fires (and then loops forever) under
        # ``python -O``.
        for m in self.amask:
            if m['ety1'] != '%':
                if not k['ety']:
                    k['ety'] = m['ety1']
                    k['length'] = m['enum']
                k['ori'].append(m['ori'])

                # Sanity check: all tokens of one candidate share a label.
                assert k['ety'] == m['ety1']
                k['score'] += m['score1'] + m.get('score2', 0.0) \
                              + m['score3'] + m['score5']

            if m['ctx'] == "*" or m['ctx'] == "**":
                k['dup'] += 1

            if m['ety1'] == '%' or len(k['ori']) >= k['length']:
                # Chunk finished (or token outside any chunk).  A one-token
                # chunk that merely repeats a question word is dropped.
                if k['ori'] and not (k['dup'] == k['length'] == 1):
                    k['score'] /= (k['length'] + 1.0)
                    wcol.append(k)
                k = self._new_candidate()

        wcol = sorted(wcol, key=lambda x: x['score'], reverse=True)
        ans_pool = [x['ori'] for x in wcol]
        # prepare for csv results
        results = dict()
        pred_ans = []
        if ans_pool:
            for j in ans_pool[0]:
                if j == ',':
                    pred_ans.append('-COMMA-')
                elif j == '\"':
                    pass
                else:
                    pred_ans.append(j)

        if 'id' in self.qa:
            results['id'] = str(self.qa['id'])
            results['answer'] = ' '.join(pred_ans)
        else:
            results['q'] = self.qa['question']
            results['a'] = str(self.qa['answer_tks'])
            results['apool'] = str(ans_pool)
            results['cort'] = str(int(answer in ans_pool))

        if ans_pool:
            return answer == ans_pool[0], answer in ans_pool, results
        else:
            return False, False, results

    def _bell(self, amplitude, x):
        """Gaussian bump with unit standard deviation, scaled by ``amplitude``."""
        og = 1.0
        return amplitude ** 1.0 / math.sqrt(2.0 * og ** 2.0 * math.pi) * math.pow(math.e, -x ** 2 / (2 * og ** 2))

    def hlt(self, sec_half, bp, xt, aa=2.0):
        """Score mask entry ``xt`` by its distance from position ``bp``.

        Only entries after ``bp`` (``sec_half`` true) or before it
        (``sec_half`` false) score; all others get 0.0.
        """
        seq = xt['seq']

        if sec_half and seq > bp:
            x = seq - bp - 1
        elif (not sec_half) and seq < bp:
            x = bp - seq - 1
        else:
            return 0.0

        return self._bell(aa, x)

    def score_exp_distrb(self, range, xt, extra=0.0):
        """Score token ``xt`` by its distance from a run of matched positions.

        :param range: positions of the run (first and last element are used).
        :param xt: ``(seq, lemma, ctx)`` of the token to score.
        :param extra: added to the run width used as amplitude.
        :returns: 0.0 for tokens inside the run, else a Gaussian of the gap.
        """
        lo, hi = range[0], range[-1]
        seq, _, ctx = xt

        if seq > hi:
            x = seq - hi - 1
        elif seq < lo:
            x = lo - seq - 1
        else:
            return 0.0

        a = float(hi - lo) + extra
        return self._bell(a, x)


### Module 5.4. Report on QA system

In [None]:
import csv

class answer_writer:
    """Run the QA pipeline over a list of questions and log each result to CSV.

    The CSV header depends on ``dev``: ``cort, a, apool, q`` when true,
    ``id, answer`` otherwise.  Each processed question appends one row.

    :param dev: truthy to use the dev-set column layout.
    :param qass: sequence of question records; each one is handed to ``QA``.
    :param report_file: path of the CSV file to (over)write.
    """

    def __init__(self, dev, qass, report_file='answer.csv'):
        self.correct = 0
        self.prog_i = 0.0
        # Keep the questions on the instance so report() does not depend on a
        # module-level variable that happens to share the name.
        self.qass = qass
        self.prog_total = len(qass)

        if dev:
            field_names = ['cort', 'a', 'apool', 'q']
        else:
            field_names = ['id', 'answer']
        # Keep a handle on the file so it can be closed explicitly.
        self._file = open(report_file, 'w')
        self.writer = csv.DictWriter(self._file, fieldnames=field_names)
        self.writer.writeheader()

    def close(self):
        """Flush and close the CSV file; safe to call more than once."""
        if not self._file.closed:
            self._file.close()

    def guess(self, qa):
        """Answer one question, write its CSV row and update the progress line.

        :param qa: a single question record, passed to ``QA``.
        :return: the first two items of the tuple returned by
            ``QA(qa).summary()``; report() counts them as "correct" and
            "in pool" respectively.
        """
        qa_process = QA(qa)
        top_hit, in_pool, row = qa_process.summary()

        self.prog_i += 1
        # '\r' rewinds the cursor so the percentage overwrites itself in place.
        sys.stdout.write('\r')
        sys.stdout.write("%f%%" % (self.prog_i * 100.0 / self.prog_total))
        sys.stdout.flush()
        self.writer.writerow({k: v.encode('utf8') for k, v in row.items()})
        return top_hit, in_pool

    def report(self):
        """Process every question given to the constructor and print totals.

        The CSV file is closed when processing ends (even on error), so all
        rows are on disk once this returns; the instance cannot write further
        rows afterwards.
        """
        try:
            pre = [self.guess(qa) for qa in self.qass]
        finally:
            self.close()
        print('')
        print('%s correct %s in pool' % (sum(hit for hit, _ in pre),
                                         sum(pooled for _, pooled in pre)))

#### Prepare data and build the classifier

In [None]:
# Training set: query + answer NER tag (3-class, simplified).
# The file name is kept in its own variable so the status message prints the
# name rather than the whole loaded dataset.
train_cb3_simple_dev_file = 'xtrain_simple_cb3.dev.json'
if os.path.exists(train_cb3_simple_dev_file):
    train_cb3_simple_dev = get_dataset(train_cb3_simple_dev_file)
    print('%s is genreated!' % train_cb3_simple_dev_file)
else:
    qA_train_cb3 = Prepare('QA_train.json', classification='3class')
    # NOTE(review): the existence check above uses the bare file name while the
    # output path is prefixed with `cwd` -- confirm both refer to the same file.
    train_cb3_simple_dev = qA_train_cb3.launch(simplified=True, purpose='dev', file_name=cwd+'xtrain_simple_cb3.dev.json')

# NER-processed dev set, contains 7-class NER tags (question sent / ans sent).
dev_cb7_standard_dev_file = 'xdev_standard_cb7.dev.json'
if os.path.exists(dev_cb7_standard_dev_file):
    dev_cb7_standard_dev = get_dataset(dev_cb7_standard_dev_file)
    print('%s is genreated!' % dev_cb7_standard_dev_file)
else:
    qA_dev_cb7 = Prepare('QA_dev.json', classification='7class')
    dev_cb7_standard_dev = qA_dev_cb7.launch(simplified=False, purpose='dev', file_name=cwd+'xdev_standard_cb7.dev.json')

# Train the 3-class question-type classifier and evaluate it on the dev set.
classification3 = ClassificationBuild(train_data=train_cb3_simple_dev, test_data=dev_cb7_standard_dev,nclass='3class')
classification3.build_model_and_evaluate()

Souce Using: QA_train.json Simplified: True purpose dev
4.722222%

#### Output QA results on dev set (BM25 tagged sent, ML classifier tagged qtype) 

In [None]:
# Given the BM25 prediction and the ML-tagged qtype, run answer extraction on
# the dev set.  File names live in their own variables so the status message
# prints the name rather than the whole loaded dataset.
dev_bm25_qtag_c3_file = 'bm25_qtag_c3_dev.json'
if os.path.exists(dev_bm25_qtag_c3_file):
    dev_bm25_qtag_c3 = get_dataset(dev_bm25_qtag_c3_file)
    print('%s is ready!' % dev_bm25_qtag_c3_file)
else:
    # No cached predictions: load (or build) the 7-class tagged dev data, then
    # let the classifier tag it and write the predictions file.
    dev_cb7_standard_test_file = 'xdev_standard_cb7.test.json'
    if os.path.exists(dev_cb7_standard_test_file):
        dev_cb7_standard_test = get_dataset(dev_cb7_standard_test_file)
    else:
        qA_dev_cb7 = Prepare('QA_dev.json', classification='7class')
        dev_cb7_standard_test = qA_dev_cb7.launch(simplified=False, purpose='test',
                                                  file_name=dev_cb7_standard_test_file)
    dev_bm25_qtag_c3 = classification3.tag_prediction(dataset=dev_cb7_standard_test,
                                                      out_file_name=dev_bm25_qtag_c3_file)

qass = QASS(dataset=dev_bm25_qtag_c3).get_qass(enable_actag=True)
answer_writer(dev=True, qass=qass).report()

#### Output QA results on test dataset

In [None]:
# Given the BM25 prediction and the ML-tagged qtype, run answer extraction on
# the test set.  This is what was submitted to Kaggle.

# NER-processed test set, contains 7-class NER tags (question sent / ans sent).
# File names live in their own variables so the status message prints the name
# rather than the whole loaded dataset.
test_cb7_standard_file = 'xtest_standard_cb7.test.json'
if os.path.exists(test_cb7_standard_file):
    test_cb7_standard = get_dataset(test_cb7_standard_file)
    print('%s is ready!' % test_cb7_standard_file)
else:
    qA_test_cb7 = Prepare('QA_test.json', classification='7class')
    test_cb7_standard = qA_test_cb7.launch(simplified=False, purpose='test', file_name=test_cb7_standard_file)

# Question-type predictions for the test set (cached on disk once computed).
test_bm25_qtag_c3_file = 'bm25_qtag_c3_test.json'
if os.path.exists(test_bm25_qtag_c3_file):
    test_bm25_qtag_c3 = get_dataset(test_bm25_qtag_c3_file)
    print('%s is ready!' % test_bm25_qtag_c3_file)
else:
    test_bm25_qtag_c3 = classification3.tag_prediction(dataset=test_cb7_standard,
                                                       out_file_name=test_bm25_qtag_c3_file)
qass = QASS(dataset=test_bm25_qtag_c3).get_qass(enable_actag=False)
# Constructing the writer only emits the CSV header; report() is what actually
# runs the questions and writes the answer rows.
answer_writer(dev=False, qass=qass).report()

In [None]:
# Total wall-clock seconds elapsed since `start` was recorded.
print('EXEC:  %s' % (time.time() - start))