In [102]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
import re


In [103]:
stopwordSet = set(stopwords.words('english'))
punctuationSet = set(string.punctuation)
porterStemmer = PorterStemmer()

### PreProcessing 

In [104]:
def wordTokenize(line):
    wordsInLine = word_tokenize(line)
    return wordsInLine

def removeStopwords(line):
    wordsInLine = wordTokenize(line)
    lineWithoutStopwords = [word.lower() for word in wordsInLine if word.lower() not in stopwordSet and word.lower() not in punctuationSet]
    return lineWithoutStopwords

def stemming(line):
    wordsInLine = wordTokenize(line)
    ps = PorterStemmer()
    lineAfterStemming = [ps.stem(word) for word in wordsInLine]
    return lineAfterStemming


In [105]:
def parseParagraph(paragraph_fileName):
    para_dict = {}
    with open(paragraph_fileName) as f:
        parts = f.read().split("TEXT:")
        headline = parts[0].splitlines()[0]
        date = parts[0].splitlines()[1]
        storyId = parts[0].splitlines()[2]
        text = sent_tokenize(parts[1].lstrip("\n").replace("\n", " "))
        para_dict[(headline, date, storyId)] = text
    return para_dict

In [106]:
def removeStopwordsAndTagPos(paragraph_dict):
    paragraphWithoutStopwords = {}
    paragraphWithPos = {}
    for key in paragraph_dict:
        text = paragraph_dict[key]
        for line in text:
            lineWithoutStopwords = removeStopwords(line)
            paragraphWithoutStopwords[line] = lineWithoutStopwords
            paragraphWithPos[line] = nltk.pos_tag(lineWithoutStopwords)

    return paragraphWithoutStopwords, paragraphWithPos

In [107]:
def camelCase(word):
    return word != word.lower() and word != word.upper() and "_" not in word

def sentWithoutCamel(sent):
    sentWithoutCamel = []
    for word in sent:
        if camelCase(word):
            sentWithoutCamel.append(word)
    return sentWithoutCamel

In [108]:
name_list = []
location_list = []
month_list = []
time_list = []
organization_list = []
occupation_list = []
preposition_list = []
location_preposition_list = []
numberInWords_list = []
relative_timeList = ["today","tomorrow","yesterday","last week","last month","last year","next week","next month","next year"]

### Semantic Classes

In [109]:
def semanticClasses(name_filename):
    with open(name_filename+"names.txt") as f:
        name_list.append(f.read().splitlines())

    with open(name_filename+"location.txt") as f:
        location_list.append(f.read().splitlines())

    with open(name_filename+"month.txt") as f:
        month_list.append(f.read().lower().splitlines())

    with open(name_filename+"time.txt") as f:
        time_list.append(f.read().lower().splitlines())

    with open(name_filename+"occupation.txt") as f:
        occupation_list.append(f.read().lower().splitlines())

    with open(name_filename+"location_preposition.txt") as f:
        location_preposition_list.append(f.read().lower().splitlines())

    with open(name_filename+"preposition.txt") as f:
        preposition_list.append(f.read().lower().splitlines())

    with open(name_filename+"numberInWords.txt") as f:
        numberInWords_list.append(f.read().lower().splitlines())


In [110]:
def wordMatch(ques, line, paraPOS_dict):
    wordsInQuestion = wordTokenize(ques)
    roots = set()
    for word in wordsInQuestion:
        roots.add(porterStemmer.stem(word))
    verbMatch = 0
    rootMatch = 0
    if line in paraPOS_dict:
        scoreOfLine = {}
        for (word, tag) in paraPOS_dict[line]:
            if 'V' in tag:
                ver_root = porterStemmer.stem(word)
                if ver_root in roots:
                    verbMatch += 6
            else:
                word_root = porterStemmer.stem(word)
                if word_root in roots:
                    rootMatch += 3
        scoreOfLine[line] = verbMatch + rootMatch
    return verbMatch + rootMatch


### Who Rules

In [111]:
def containsNoun(ques):
    status = False
    pNList = sentWithoutCamel(ques)
    for p_n in pNList:
        if any(p_n in s for s in name_list):
            status = True
            return status 
        
    return status


In [112]:
def contains_word_name(ans_sent):
    flg = False
    for word in ans_sent:
        if word == "name":
            flg = True
            break
    return flg

def contains_proper_noun(ques):
    flg = False
    for word in ques:
        if (camelCase(word)):
            flg = True
            break
    return flg

In [113]:
def contains_name_occupation(sent):
    flg = False
    nounList = sentWithoutCamel(sent)
    for noun in nounList:
        if any(noun in s for s in name_list):
            flg = True
            return flg 
        
    for word in sent:
        if any(word in s for s in occupation_list):
            flg = True
            return flg
        
    return flg

In [114]:
def contains_month(question):
    wordInQuestion = wordTokenize(question)
    status = False
    for word in wordInQuestion:
        if word.lower() in month_list[0]:
            status = True
    return status

def contains_relativeTime(sent):
    wordsInSent = wordTokenize(sent)
    status = False
    for word in wordsInSent:
        if word.lower() in relative_timeList:
            status = True
    return status

def contains_head(wordsInQues, wordsInSent):
    status = False
    wordsInSentLower = [word.lower() for word in wordsInSent]
    for word in wordsInQues:
        if word.lower() in wordsInSentLower:
            status = True
    return status

def who(quesWithoutStop, sentWithoutStop, paraPOS_dict):
    score = 0
    flg = False
    if(not containsNoun(quesWithoutStop) and containsNoun(sentWithoutStop)):
        score += 6
    if ( not containsNoun(quesWithoutStop) and contains_word_name(sentWithoutStop)):
        score += 4
    status = contains_name_occupation(sentWithoutStop)
    if status:
        score += 4
    return score 


### When Rules

In [115]:
def contains_time_list(sent):
    wordsInSent = wordTokenize(sent)
    wordsInSent = [word for word in wordsInSent if word not in punctuationSet]
    for word in wordsInSent:
        if word.lower() in time_list[0]:
            return True
    return False

def contains_time_other(sent, custom_list):
    wordsInSent = wordTokenize(sent)
    wordsInSent = [word for word in wordsInSent if word not in punctuationSet]
    for word in wordsInSent:
        if word.lower() in custom_list:
            return True
    return False

def when_rule(ques, sent):
    score = 0
    if(contains_time_list(sent)):
        score += 4
    if(contains_time_other(ques,["last"]) and contains_time_other(sent,["first","last","since","ago"])):
        score += 20
    if(contains_time_other(ques,["start","begin"]) and contains_time_other(sent,["start","begin","since","year"])):
        score += 20
    return score


### What rules

In [116]:
def what_rule(ques, sent):
    wordsInQues = wordTokenize(ques)
    wordsInSent = wordTokenize(sent)
    score = 0
    if(contains_month(ques) and contains_relativeTime(sent)):
        score += 3
    
    for ques_word in wordsInQues:
        if ques_word.lower() == "kind":
            for sent_word in wordsInSent:
                if ((sent_word == "call") or (sent_word == "from")):
                    score += 4

    for ques_word in wordsInQues:
        if ques_word == "name":
            for sent_word in wordsInSent:
                if ((sent_word == "name") | (sent_word == "call") | (sent_word == "known")):
                    score += 20
    wordsAfterOfInQues = []
    for ind,ques in enumerate(wordsInQues):
        if ques.lower() == "of":
            for rem_ind in range(ind+1,len(wordsInQues)):
                if wordsInQues[rem_ind].lower() in name_list[0]:
                    wordsAfterOfInQues.append(wordsInQues[rem_ind].lower())

    for ind,ques in enumerate(wordsAfterOfInQues):
        if ques == "name":
            if wordsInQues[ind+1].lower() in preposition_list[0]:
                sentWithoutStop = removeStopwords(sent)
                if contains_proper_noun(sentWithoutStop):
                    if contains_head(wordsAfterOfInQues,wordsInSent):
                        score += 20

    return score
                

### Why rules

In [117]:
def why_rule(sent, BESTlines, text_list, index):
    wordsInASent = wordTokenize(sent)
    score = 0

    if sent in BESTlines:
        score += 3

    if sent not in BESTlines:
        if (index + 1) < len(text_list):
            if text_list[index+1] in BESTlines:
                score += 3
        if text_list[index - 1] in BESTlines:
            score += 4
    for word in wordsInASent:
        if word.lower() == "want":
            score+= 4
        if ((word.lower() == "so") | (word.lower() == "because")):
           score += 4

    return score

def get_bestLines(ques, text_list, paraPOS_dict):
    scoresOfLine = {}
    BESTLines = []
    for line in text_list:
        scoresOfLine[line] = wordMatch(ques, line, paraPOS_dict)
    scoresOfLine = {line:score for line,score in scoresOfLine.items() if score is not None}
    maxIndex = max(scoresOfLine, key=scoresOfLine.get)
    
    mxScore = scoresOfLine[maxIndex]
    twoThirdScore = mxScore * 2 / 3
    for line in scoresOfLine:
        if scoresOfLine[line] >= twoThirdScore:
            BESTLines.append(line[0])
    return BESTLines


### Where Rules

In [118]:
def contains_location_preposition(sent):
    wordsInSent = wordTokenize(sent)
    wordsInSent = [word for word in wordsInSent if word not in punctuationSet]
    for word in wordsInSent:
        if word.lower() in location_preposition_list[0]:
            return True
    return False


def contains_location_list(sent):
    wordsInSent = wordTokenize(sent)
    wordsInSent = [word for word in wordsInSent if word not in punctuationSet]
    for word in wordsInSent:
        if word.lower() in location_list[0]:
            return True
    return False


def where_rule(ques, sent):
    score = 0
    if(contains_location_list(sent)):
        score += 6
    if(contains_location_preposition(sent)):
        score += 4
    return score


def contains_word(ques, check):
    wordsInQues = wordTokenize(ques)
    wordsInQues = [word for word in wordsInQues if word not in punctuationSet]
    for word in wordsInQues:
        if word.lower() == check:
            return True
    return False


def dateLine(question):
    score = 0
    if (contains_word(question, "happen")):
        score = score + 4
    if (contains_word(question, "take") and contains_word(question, "place")):
        score = score + 4
    if (contains_word(question, "this")):
        score = score + 20
    if (contains_word(question, "story")):
        score = score + 20
    return score


### How Rules

In [119]:
def how_rule(sent, question, storyPOS_dict):

    scoreOfHowRule = 0
    wordsInASentence = wordTokenize(sent.lower())
    wordsInAQuestion = wordTokenize(question.lower())

    scoreOfWordMatch = wordMatch(question, sent, storyPOS_dict)
    scoreOfHowRule = scoreOfHowRule + scoreOfWordMatch + 10
    if "cost" in wordsInAQuestion or "much" in wordsInAQuestion or "many" in wordsInAQuestion or "long" in wordsInAQuestion:

        if (("dollar" in wordsInASentence) or ("cost" in wordsInASentence) or (re.match(r'\d+', sent) != None) or ("weigh" in wordsInASentence)):
            scoreOfHowRule = scoreOfHowRule + 12
        for word in wordsInASentence:
            if word in numberInWords_list:
                scoreOfHowRule = scoreOfHowRule + 6

    if "age" in wordsInAQuestion or "old" in wordsInAQuestion:
        if (re.search(r'\d+', sent) != None):
            scoreOfHowRule = scoreOfHowRule + 20
        for word in wordsInASentence:
            if word in numberInWords_list:
                scoreOfHowRule = scoreOfHowRule + 20

    return scoreOfHowRule


In [120]:
def matchForAccuracy(real_answer, predicted_answer):
    real_answer = real_answer.lower()
    predicted_answer = predicted_answer.lower()
    real_answer = real_answer.split()
    predicted_answer = predicted_answer.split()
    score = 0
    for word in predicted_answer:
        if word in real_answer:
            score += 1
    if(len(predicted_answer) == 0):
        return 0    
    return score / len(predicted_answer)

In [121]:
def data_forward(questions_data, story_dict):
    storyWithoutStopWords_dict, storyPOS_dict =removeStopwordsAndTagPos(
        story_dict)
    quest_words = set(['what', 'when', 'why', 'who',
                      'where', 'whose', 'which', 'how'])
    total_accuracy = 0

    
    for question in questions_data:
        for story_key in story_dict:
            text_list = story_dict[story_key]
            questionWithoutStopWords = removeStopwords(question[1])
            BESTlines = get_bestLines(question[1], text_list, storyPOS_dict)
            acc = 0
            ans = ""
            for q in question[1].split():
                if q.lower() in quest_words:
                    if q.lower() == 'who' or q.lower() == 'whose':
                        max_score_who = 0
                        answer = ""
                        for sent in text_list:
                            scoreOfASentence = wordMatch(
                                question[1], sent, storyPOS_dict)
                            sentWithoutStopWords = removeStopwords(sent)
                            who_score = who(
                                questionWithoutStopWords, sentWithoutStopWords, storyPOS_dict)
                            who_score += scoreOfASentence

                            if (max_score_who < who_score):
                                max_score_who = who_score
                                answer = sent
                        ans = ""

                        str1_list = word_tokenize(answer)
                        str2_list = word_tokenize(question[1].lower())
                        for word in str1_list:
                            if word.lower() not in str2_list and word.lower() not in punctuationSet and not word.islower():
                               ans = ans+" "+word
                        if ans == "":
                            for word in str1_list:
                                if word.lower() not in str2_list and word.lower() not in punctuationSet:
                                   ans = ans+" "+word

                        if 'being' in question[0] and 'being' in ans:
                            ans = ans.split('being')[1]

                        print ("QuestionID:", question[0])
                        print ("Answer:", ans)

                        break
                    if (q.lower() == 'when'):
                        max_score_when = 0
                        date = ""
                        count = 0
                        for sent in text_list:
                            scoreOfASentence = wordMatch(
                                question[1], sent, storyPOS_dict)
                            when_score = when_rule(
                                question[1], sent)
                            when_score += scoreOfASentence
                            dateline_score = dateLine(question[1])
                            first_sent = text_list[0]
                            if (max_score_when < when_score):
                                max_score_when = when_score
                                answer = sent

                            if (when_score == max_score_when):
                                count = count+1

                            if count == len(text_list):
                                answer = first_sent
                            if max_score_when == 0:
                                max_score_when = dateline_score
                                date = story_key[1].split(":")[1].lstrip()
                                answer = sent
                        date = ""
                        if date == "":
                            ans = ""
                            str1_list = word_tokenize(answer)
                            str2_list = word_tokenize(question[1].lower())
                            for word in str1_list:
                                if word.lower() not in str2_list and word.lower() not in punctuationSet:
                                   ans = ans+" "+word

                            if 'being' in question[0] and 'being' in ans:
                                ans = ans.split('being')[1]

                            print( "QuestionID:", question[0])
                            if (re.search(r'\d+', ans) != None):
                                print("Answer:", re.sub('[^\d.]', ' ', ans))
                            else:
                                print("Answer:", ans)
                        else:
                            ans = ""
                            str1_list = word_tokenize(answer)
                            str2_list = word_tokenize(question[1].lower())
                            for word in str1_list:
                                if word.lower() not in str2_list and word.lower() not in punctuationSet:
                                   ans = ans+" "+word
                            print("QuestionID:", question[0])
                            print("Answer:", date)
                        break
                    if(q.lower() == 'where'):
                        max_score_where = 0
                        date = ""
                        count = 1
                        for sent in text_list:
                            scoreOfASentence = wordMatch(
                                question[1], sent, storyPOS_dict)
                            where_score = where_rule(
                                question[1], sent)
                            where_score += scoreOfASentence
                            dateline_score = dateLine(question[1])
                            first_sent = text_list[0]

                            if (max_score_where < where_score):
                                max_score_where = where_score
                                answer = sent

                            if (where_score == max_score_where):
                                count = count+1

                            if count == len(text_list):
                                answer = first_sent

                            if max_score_where == 0:
                                max_score_where = dateline_score
                                date = story_key[1].split(":")[1].lstrip()
                                answer = sent
                            date = ""

                        if date == "":
                            ans = ""
                            str1_list = word_tokenize(answer)
                            str2_list = word_tokenize(question[1].lower())
                            for word in str1_list:
                                if word.lower() not in str2_list and word.lower() not in punctuationSet and not word.islower():
                                   ans = ans+" "+word

                            if 'being' in question[0] and 'being' in ans:
                                ans = ans.split('being')[1]

                            if ('where' in question[0]) and 'from ' in ans:
                                ans = ans[ans.index('from '):]

                            print("QuestionID:", question[0])
                            print ("Answer:", ans)
                        else:
                            ans = ""
                            str1_list = word_tokenize(answer)
                            str2_list = word_tokenize(question[1].lower())
                            for word in str1_list:
                                if word.lower() not in str2_list and word.lower() not in punctuationSet:
                                   ans = ans+" "+word
                            print ("QuestionID:", question[0])
                            print ("Answer:", date)
                        break

                    if(q.lower() == 'what' or q.lower() == 'which'):
                        max_score_what = 0
                        answer = ""
                        for sent in text_list:
                            scoreOfASentence = wordMatch(
                                question[1], sent, storyPOS_dict)
                            scoreOfWhatRule = what_rule(
                                question[1], sent)
                            scoreOfWhatRule += scoreOfASentence
                            if (scoreOfWhatRule == max_score_what):
                                answer = answer + " | "+sent

                            if (max_score_what < scoreOfWhatRule):
                                max_score_what = scoreOfWhatRule
                                answer = sent

                        ans = ""
                        if "|" in answer:
                            str1_list = word_tokenize(answer.split("|")[1])
                        else:
                            str1_list = word_tokenize(answer)
                        str2_list = word_tokenize(question[1].lower())
                        for word in str1_list:
                            if word.lower() not in str2_list and word.lower() not in punctuationSet:
                               ans = ans+" "+word

                        if 'being' in question[0] and 'being' in ans:
                            ans = ans.split('being')[1]

                        print("QuestionID:", question[0])
                        print("Answer:", ans)
                        break

                    if(q.lower() == 'why'):
                        index = -1
                        max_score_why = 0
                        answer = ""
                        for sent in text_list:
                            index = index + 1
                            scoreOfWhyRule = why_rule(
                                sent, BESTlines, text_list, index)

                            if (scoreOfWhyRule == max_score_why):
                                answer = answer + " | "+sent

                            if (max_score_why < scoreOfWhyRule):
                                max_score_why = scoreOfWhyRule
                                answer = sent

                        ans = ""
                        str1_list = word_tokenize(answer)
                        str2_list = word_tokenize(question[1].lower())

                        if("|" in answer):
                            ans = answer.split("|")[-1]
                        else:
                            ans = answer

                        if 'being' in question[0] and 'being' in ans:
                            ans = ans.split('being')[1]
                        if 'because' in ans:
                            ans = ans[ans.index('because '):]

                        elif 'for ' in ans:
                            ans = ans[ans.index('for '):]

                        elif 'to ' in ans:
                            ans = ans[ans.index('to '):]

                        print("QuestionID:", question[0])
                        print("Answer:", ans)
                        break

                    if(q.lower() == 'how'):
                        max_score_how = 0
                        for sent in text_list:
                            scoreOfHowRule = how_rule(
                                sent, question[1], storyPOS_dict)
                            if (scoreOfHowRule == max_score_how):
                                answer = answer + " | "+sent

                            if (scoreOfHowRule > max_score_how):
                                max_score_how = scoreOfHowRule
                                answer = sent

                        ans = ""

                        if ("|" in answer):
                            str1_list = word_tokenize(answer.split("|")[1])
                        else:
                            str1_list = word_tokenize(answer)
                        str2_list = word_tokenize(question[1].lower())

                        print("QuestionID:", question[0])
                        tagAnswer = nltk.pos_tag(str1_list)
                        if "how many" in question[1].lower() or "how much" in question[1].lower():
                            for i in range(0, len(tagAnswer)):
                                if 'CD' in tagAnswer[i][1]:
                                    ans = ans + " " + tagAnswer[i][0]
                                else:
                                    ans = answer
                        else:
                            for word in str1_list:
                                if word.lower() not in str2_list and word.lower() not in punctuationSet:
                                   ans = ans+" "+word

                        if 'being' in question[0] and 'being' in ans:
                            ans = ans.split('being')[1]

                        if 'how' in question[0] and 'by ' in ans:
                            ans = ans[ans.index('by '):]

                        if ('how' in question[0]) and 'from ' in ans:
                            ans = ans[ans.index('from '):]
                        print("Answer:", ans)
                        break
            acc += matchForAccuracy(question[2], ans)            

        if not any(word in question[1].lower().split() for word in quest_words):
                max_score_else = 0
                for sent in text_list:
                    current_score = wordMatch(question[1], sent, storyPOS_dict)

                    if (current_score == max_score_else):
                          answer = answer + " | "+sent

                    if current_score > max_score_else:
                        max_score_else = current_score
                        answer = sent

                ans = ""
                if ("|" in answer):
                    str1_list = word_tokenize(answer.split("|")[1])
                else:
                    str1_list = word_tokenize(answer)
                str2_list = word_tokenize(question[1].lower())
                for word in str1_list:
                    if word.lower() not in str2_list and word.lower() not in punctuationSet:
                        ans = ans+" "+word

                if 'being' in question[0] and 'being' in ans:
                    ans = ans.split('being')[1]

                print("QuestionID:", question[0])
                print("Answer:", ans)
                acc += matchForAccuracy(question[2], ans)
        print("\n")

        total_accuracy += acc

    print("Total Accuracy: ", total_accuracy / len(questions_data) * 100)
    return total_accuracy / len(questions_data) * 100


In [122]:
semanticClasses("./")

In [131]:
input_path  = "./testset1/"
input_file = open(input_path+"testset1.txt")
input_data = input_file.read().splitlines()
print(input_data)

['1999-W07-5', '1999-W08-1', '1999-W08-5', '1999-W09-5', '1999-W20-4', '1999-W26-1', '1999-W27-3', '1999-W28-1', '1999-W29-4', '1999-W31-1', '1999-W31-3', '1999-W31-5', '1999-W32-4', '1999-W32-5', '1999-W33-3', '1999-W33-5', '1999-W34-4', '1999-W35-5', '1999-W37-2', '1999-W37-4', '1999-W38-1', '1999-W39-1', '1999-W39-3', '1999-W39-5', '1999-W41-3', '1999-W41-5', '1999-W43-5', '1999-W45-5', '1999-W47-5', '1999-W49-5', '1999-W51-1', '2000-W02-3', '2000-W02-5', '2000-W03-1', '2000-W03-2', '2000-W03-3', '2000-W05-4', '2000-W06-5', '2000-W07-5']


In [132]:
total_accuracy = 0
for i in range(0,len(input_data)):
    story = input_data[i] + ".story"
    questions = input_data[i] + ".questions"
    answers = input_data[i] + ".answers"
    answer_file = open(input_path+answers)
    answer_data = answer_file.read().splitlines()
    answers_total = list(filter(None, answer_data))

    questions_data = []
    for j in range(0, len(answers_total), 4):
        question_temp = []
        quesid = answers_total[j].split(":")[1].lstrip(" ")
        question_temp.append(quesid)
        ques = answers_total[j+1].split(":")[1].lstrip(" ")
        question_temp.append(ques)
        answer = answers_total[j+2].split(":")[1].lstrip(" ")
        question_temp.append(answer)
        questions_data.append(question_temp)
    para_dict = parseParagraph(input_path+story)
    total_accuracy += data_forward(questions_data, para_dict)

print("Total Accuracy: ", total_accuracy / (len(input_data)-1))



QuestionID: 1999-W07-5-1
Answer:  Shivering Alberta


QuestionID: 1999-W07-5-2
Answer:  But Kolla


QuestionID: 1999-W07-5-4
Answer:  Father Michael Kolla was at resort after a sleepless night he got word safe and sound


QuestionID: 1999-W07-5-5
Answer:  Shivering Alberta


QuestionID: 1999-W07-5-6
Answer:  Using their hands two dug into hollow around base of a tree covered floor with pine boughs and settled in for a long cold night


QuestionID: 1999-W07-5-7
Answer:  "We were so happy."


QuestionID: 1999-W07-5-8
Answer:  His Carol Sandpoint ``


QuestionID: 1999-W07-5-12
Answer:  `` They helped my live ''


Total Accuracy:  8.03125
QuestionID: 1999-W08-1-1
Answer:  The senators were split 50-50 on the charge of obstruction of justice and only 45 voted Clinton guilty on the charge of


QuestionID: 1999-W08-1-2
Answer: The senators were split 50-50 on the charge of obstruction of justice, and only 45 voted Clinton "guilty" on the charge of perjury.


QuestionID: 1999-W08-1-3
Answer:  