In [2]:
import numpy as np
import re
import os
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from math import log
import pandas as pd

In [None]:
# prepare tfidf index for corpus (ck12 text books of relevant topics downloaded from web)
# read data
# get question from every row
# get the closest paragraphs for the question
# get the best matched answer for the closest paragraph


In [2]:
# preprocess the line. 
def preprocess(line):
    #remove stop words, all symbols and numbers and split the line in to words
    line = re.sub("[^a-zA-Z]"," ",line)
    line = line.lower().split()
    
    #remove stopwords
    stops = stopwords.words("english")
    line = [word for word in line if not word in stops]
    
    #stemming
    stemmer = PorterStemmer()
    line = [stemmer.stem(word) for word in line]
    
    return line

In [48]:
# Get closest paragraphs from the corpus given a qst and tfidf of corpus
def get_closest_para_for_qst(qst, para_tf, idf):
    matched_para = []
    for para_name, para in para_tf.items():
        w_in_para_score = 0
        for word in qst:
            if word in para:
                w_in_para_score += 0.05 * para_tf[para_name][word] * idf[word]
        
        if w_in_para_score > 0:
            matched_para.append((para, w_in_para_score))
    
    #Get best matched para for the qst. Paragraph with highest score are the best matched ones with the qst
    matched_para = sorted(matched_para, key = lambda k: k[1], reverse = True)
    return matched_para[:5] #Return top 3 matched paragraphs

In [4]:
# Read the corpus - ck12 text books
path = '/Users/homw/Documents/petp/AllenAI/Concepts - CK-12 Foundation.txt'

In [26]:
vocab = set()
# Initialize a dictionary to keep {line1:{word1:tf, word2:tf...}...}

total_words = 0
para_tf = {}
for index, line in enumerate(open(path)):
    
    line = preprocess(line)
    
    if len(line) > 5:
        total_line_words = 0 #To keep the count of words in paragraph/line
        dic = {}
        for word in line:
            vocab.add(word)
            dic.setdefault(word,0) #add a word to dictionary only if it is not existing
            dic[word] = dic[word]+1
            total_words = total_words + 1
            total_line_words += 1

        # Compute term freq for each word in a paragraph
        #for word, count in dic.items():
            #dic[word] = count
        
        # store Tf values of each paragraph in a dictionary
        para_name = "para"+ str(index)
        para_tf[para_name] = dic

In [3]:
# Compute idf values for all the words in vocabulary
idf = {}
for word in list(vocab):
    docs_has_word = 1
    for index,doc in para_tf.items():
        if word in doc:
            docs_has_word += 1
    idf[word] = log(len(para_tf)/docs_has_word)

NameError: name 'vocab' is not defined

In [49]:
closest_paras = get_closest_para_for_qst(qst, para_tf, idf)

In [1]:
#read the data 
data = pd.read_csv("/Users/homw/Documents/petp/AllenAI/validation_set.tsv", sep = "\t")

NameError: name 'pd' is not defined

In [47]:
qst
data.question[0]
idf["respiration"]
data.question[0]

'When athletes begin to exercise, their heart rates and respiration rates increase.  At what level of organization does the human body coordinate these functions?'

In [None]:
qst = preprocess(data.question[0])
qst
closest_paras
#closest_paras

In [51]:
prediction = []
missed = 0
for index, record in data.iterrows():
    #print(index, record)
    qst = preprocess(record["question"])
    closest_paras = get_closest_para_for_qst(qst, para_tf, idf) #Get only the paragraph, score is not required
    # Now check which of the options out of A,B,C,D scores highest with the best matched paragraphs of the Qst
    
    opt_A = preprocess(record["answerA"])
    score_A = 0
    for word in opt_A:
        for para, score in list(closest_paras):
            if word in para:
                score_A += para[word] * idf[word]
                
    
    opt_B = preprocess(record["answerB"])
    score_B = 0
    for word in opt_B:
        for para, score in list(closest_paras):
            if word in para:
                score_B += para[word] * idf[word]

                
    
    opt_C = preprocess(record["answerC"])
    score_C = 0
    for word in opt_C:
        for para, score in list(closest_paras):
            if word in para:
                score_C += para[word] * idf[word]

                
    
    opt_D = preprocess(record["answerD"])
    score_D = 0
    for word in opt_D:
        for para, score in list(closest_paras):
            if word in para:
                score_D += para[word] * idf[word]

    if all([score_A,score_B,score_C,score_D]) == 0:
        prediction.append("N")
        missed += 1
    else:
        prediction.append(["A","B","C","D"] [np.argmax([score_A,score_B,score_C,score_D])])
    if len(prediction)%500 == 0:
        print(len(prediction), index) 

500 499
1000 999
1500 1499
2000 1999
2500 2499


In [73]:
len([1 for i, j in zip(prediction, data.correctAnswer) if i == j])

274

In [None]:
# remove items from dictionary - This part is not used
small_para = [k for k,p in para_tf.items() if len(p) <= 5]
for k in small_para: del para_tf[k]

In [64]:
data.columns

Index(['id', 'question', 'correctAnswer', 'answerA', 'answerB', 'answerC',
       'answerD'],
      dtype='object')

In [24]:
a

0.07142857142857142

In [356]:
pd.DataFrame({'id': list(data['id']), 'correctAnswer': prediction})[['id', 'correctAnswer']].to_csv("sub2.csv", index = False)

In [None]:
all([0,0,0])==0

In [362]:
sum(prediction == data["correctAnswer"])/2500

0.27879999999999999

In [369]:
a = 22
b = 4
check=min(a,b)
gcd=max(a,b)
while(check != 0):
     
    x = gcd % check
    gcd = check
    check = x
    
print(gcd)

2


In [4]:
line = "I am 2.5 inch shorter than the random guy x@# as678"
line = re.sub("[^a-zA-Z]"," ",line)
line

'I am     inch shorter than the random guy x   as   '

In [11]:
with open(path, "r") as jsonf:
    jsondoc = json.loads(unicode(jsonf.readlines(), "ISO-8859-1"))
jsonf.close()

TypeError: coercing to Unicode: need string or buffer, list found

In [4]:
import json
path = '/Users/homw/Documents/MSDS16/textmining/yelp/query/query.json'

In [30]:
def dot(a,b):
    return sum([(ai*bi) for ai,bi in zip(a,b)])
def cos_sim(a,b):
    return dot(a,b)/(np.sqrt(dot(a,a)) * np.sqrt(dot(b,b)))   

In [37]:

a = [1,2,3,4,4,78,4]
b = [0,0,0,0,0]
dot(a,b), dot(a,a), dot(b,b)


(0, 6146, 0)

In [38]:
cos_sim(a,b)



nan

In [29]:
[(ai*bi) for ai,bi in zip(a,b)]

[0, 10, 0, 0, 0, 0, 12]