In [None]:
import numpy as np
import re
import os
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from math import log
import pandas as pd

In [None]:
# prepare tfidf index for corpus (ck12 text books of relevant topics downloaded from web)
# read data
# get question from every row
# get the closest paragraphs for the question
# get the best matched answer for the closest paragraph


In [70]:
# preprocess the line. 
def preprocess(line):
    #remove stop words, all symbols and numbers and split the line in to words
    line = re.sub("[^a-zA-Z]"," ",line)
    line = line.lower().split()
    
    #remove stopwords
    stops = stopwords.words("english")
    line = [word for word in line if not word in stops]
    
    #stemming
    stemmer = PorterStemmer()
    line = [stemmer.stem(word) for word in line]
    
    return line

In [71]:
# Get closest paragraphs from the corpus given a qst and tfidf of corpus
def get_closest_para_for_qst(qst, para_tf, idf):
    matched_para = []
    for para_name, para in para_tf.items():
        w_in_para_score = 0
        for word in qst:
            if word in para:
                w_in_para_score += para_tf[para_name][word] * idf[word]
        
        if w_in_para_score > 0:
            matched_para.append((para, w_in_para_score))
    
    #Get best matched para for the qst. Paragraph with highest score are the best matched ones with the qst
    matched_para = sorted(matched_para, key = lambda k: k[1], reverse = True)
    return matched_para[:3] #Return top 3 matched paragraphs

In [None]:
# Read the corpus - ck12 text books
path = '/Users/homw/Documents/petp/AllenAI/wiki/'

In [None]:
vocab = set()
# Initialize a dictionary to keep {line1:{word1:tf, word2:tf...}...}

total_words = 0
para_tf = {}
num = 1
for fname in os.listdir(path):
    if fname.endswith(".txt"):
        #print(fname)
        file = os.path.join(path, fname)
        for index, line in enumerate(open(file)):

            line = preprocess(line)

            if len(line) > 5:
                total_line_words = 0 #To keep the count of words in paragraph/line
                dic = {}
                for word in line:
                    vocab.add(word)
                    dic.setdefault(word,0) #add a word to dictionary only if it is not existing
                    dic[word] = dic[word]+1
                    total_words = total_words + 1
                    total_line_words += 1

                # Compute term freq for each word in a paragraph
                for word, count in dic.items():
                    dic[word] = 0.5 + 0.5*(count/max(dic.values()))

                # store Tf values of each paragraph in a dictionary
                para_name = "para"+ str(num)
                para_tf[para_name] = dic
                num += 1

In [None]:
# Compute idf values for all the words in vocabulary
idf = {}
for word in list(vocab):
    docs_has_word = 1
    for index,doc in para_tf.items():
        if word in doc:
            docs_has_word += 1
    idf[word] = log(len(para_tf)/docs_has_word)

In [None]:
closest_paras = get_closest_para_for_qst(qst, para_tf, idf)

In [79]:
#read the data 
data = pd.read_csv("/Users/homw/Documents/petp/AllenAI/validation_set.tsv", sep = "\t")

In [None]:
qst = data.question[0]
qst = preprocess(qst)

In [69]:
len(vocab)

67611

In [91]:
prediction = []
missed = 0
for index, record in data.iterrows():
    #print(index, record)
    qst = preprocess(record["question"])
    closest_paras = get_closest_para_for_qst(qst, para_tf, idf) #Get only the paragraph, score is not required
    # Now check which of the options out of A,B,C,D scores highest with the best matched paragraphs of the Qst
    
    opt_A = preprocess(record["answerA"])
    score_A = 0
    for word in opt_A:
        for para, score in list(closest_paras):
            if word in para:
                score_A += para[word] * idf[word]
                
    
    opt_B = preprocess(record["answerB"])
    score_B = 0
    for word in opt_B:
        for para, score in list(closest_paras):
            if word in para:
                score_B += para[word] * idf[word]

                
    
    opt_C = preprocess(record["answerC"])
    score_C = 0
    for word in opt_C:
        for para, score in list(closest_paras):
            if word in para:
                score_C += para[word] * idf[word]

                
    
    opt_D = preprocess(record["answerD"])
    score_D = 0
    for word in opt_D:
        for para, score in list(closest_paras):
            if word in para:
                score_D += para[word] * idf[word]

    if all([score_A,score_B,score_C,score_D]) == 0:
        prediction.append("N")
        missed += 1
    else:
        prediction.append(["A","B","C","D"] [np.argmax([score_A,score_B,score_C,score_D])])
    if len(prediction)%500 == 0:
        print(len(prediction), index) 

500 499
1000 999
1500 1499
2000 1999
2500 2499
3000 2999
3500 3499
4000 3999
4500 4499
5000 4999
5500 5499
6000 5999
6500 6499
7000 6999
7500 7499
8000 7999


In [77]:
len([1 for i, j in zip(prediction, data.correctAnswer) if i == j])

640

In [89]:
prediction = ["A","N","C"]
len([1 for c in prediction if c == "N"])

1

In [93]:
#prediction = 
prediction = ["B" if c == "N" else c for c in prediction]


In [None]:
vocab = set()
# Initialize a dictionary to keep {line1:{word1:tf, word2:tf...}...}

total_words = 0
para_tf = {}
for index, line in enumerate(open("/Users/homw/Documents/petp/AllenAI/wiki/science_experiments.txt")):

    line = preprocess(line)

    if len(line) > 5:
        total_line_words = 0 #To keep the count of words in paragraph/line
        dic = {}
        for word in line:
            vocab.add(word)
            dic.setdefault(word,0) #add a word to dictionary only if it is not existing
            dic[word] = dic[word]+1
            total_words = total_words + 1
            total_line_words += 1

        # Compute term freq for each word in a paragraph
        #for word, count in dic.items():
            #dic[word] = count

        # store Tf values of each paragraph in a dictionary
        para_name = "para"+ str(index)
        para_tf[para_name] = dic

In [94]:
pd.DataFrame({'id': list(data['id']), 'correctAnswer': prediction})[['id', 'correctAnswer']].to_csv("sub2.csv", index = False)

In [None]:
all([0,0,0])==0

In [None]:
sum(prediction == data["correctAnswer"])/2500

In [None]:
para_tf

In [None]:
line = "I am 2.5 inch shorter than the random guy x@# as678"
line = re.sub("[^a-zA-Z]"," ",line)
line

In [119]:
d=0

probs = [np.exp(s) for s in scores]
probs = probs/sum(probs)
probs


array([  9.99954597e-01,   4.53978684e-05,   5.60254205e-09])

In [118]:
scores=[20,10,1]

In [111]:
x = np.arange(-2.0, 6.0, 0.1)
scores = np.vstack([x, np.ones_like(x), 0.2 * np.ones_like(x)])
scores

array([[ -2.00000000e+00,  -1.90000000e+00,  -1.80000000e+00,
         -1.70000000e+00,  -1.60000000e+00,  -1.50000000e+00,
         -1.40000000e+00,  -1.30000000e+00,  -1.20000000e+00,
         -1.10000000e+00,  -1.00000000e+00,  -9.00000000e-01,
         -8.00000000e-01,  -7.00000000e-01,  -6.00000000e-01,
         -5.00000000e-01,  -4.00000000e-01,  -3.00000000e-01,
         -2.00000000e-01,  -1.00000000e-01,   1.77635684e-15,
          1.00000000e-01,   2.00000000e-01,   3.00000000e-01,
          4.00000000e-01,   5.00000000e-01,   6.00000000e-01,
          7.00000000e-01,   8.00000000e-01,   9.00000000e-01,
          1.00000000e+00,   1.10000000e+00,   1.20000000e+00,
          1.30000000e+00,   1.40000000e+00,   1.50000000e+00,
          1.60000000e+00,   1.70000000e+00,   1.80000000e+00,
          1.90000000e+00,   2.00000000e+00,   2.10000000e+00,
          2.20000000e+00,   2.30000000e+00,   2.40000000e+00,
          2.50000000e+00,   2.60000000e+00,   2.70000000e+00,
        