In [1]:
import re
import json
import string
import random
from nltk import pos_tag
from nltk import word_tokenize
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import linear_kernel
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

import sys
sys.path.insert(0, '../SVM')
from encoder_functions import *

In [2]:
def get_corpus(filepath = '../SVM/data/data.json'):
    corpus = []
    with open(filepath) as file:
        data = json.load(file)
    for problem in data:
        corpus.append(problem['question'])
    return corpus

### POS Tagging
NN	  noun, singular 'desk'  
NNS	  noun plural	'desks'  
NNP	  proper noun, singular	'Harrison'  
NNPS  proper noun, plural	'Americans'  

In [3]:
# http://lbcrs.org/common/pages/DisplayFile.aspx%3FitemId%3D3446744
# https://www.purplemath.com/modules/translat.htm
math_terms = [
    # Addition Words
    'add',
    'all',
    'together',
    'altogether',
    'and',
    'both',
    'combined',
    'much',
    'increase',
    'increased',
    'by',
    'plus',
    'sum',
    'total',
    'added',
    'addition',
    # Subtraction words
    'change',
    'decrease',
    'decreased',
    'difference',
    'fewer',
    'left',
    'many',
    'more',
    'longer',
    'shorter',
    'taller', 
    'heavier', 
    'less',
    'lost',
    'minus',
    'need',
    'reduce',
    'remain',
    'subtract',
    'subtraction',
    'take' ,
    'away',
    'over',
    'after',
    'save',
    'comparative',
    # Multiplication words
    'double',
    'each' ,
    'group',
    'every',
    'factor', 
    'multiplied',
    'of',
    'product',
    'times',
    'triple',
    'twice',
    'multiplication',
    'multiply',
    # Division Words
    'cut',
    'share',
    'half',
    'fraction',
    'parts',
    'per',
    'percent',
    'quotient',
    'ratio',
    'separated',
    'equally',
    'divide',
    'division',
    'equal',
    'pieces', 
    'split',
    'average',
    # Equality Words
    'is', 
    'are', 
    'was', 
    'were', 
    'will',
    'gives', 
    'yields',
    'sold',
    'cost',
]

In [4]:
def replace_nouns2(corpus, window_size = 1):
    for j in range(len(corpus)):
        count = window_size + 1
        pos = pos_tag(word_tokenize(corpus[j]))
        for i in range(len(pos)):
            
            try:
                float(pos[i][0])
                count = 0
                continue
            except ValueError:
                count += 1
                
            if (pos[i][1] == 'NNP' or pos[i][1] == 'NNPS') and i > 0:
                corpus[j] = str.replace(corpus[j], pos[i][0], 'PN')
                continue
            
            if count > window_size:
                continue;
                
            if pos[i][0].lower() in math_terms:
                continue
            
            if pos[i][1] == 'NN' or pos[i][1] == 'NNS':
                corpus[j] = str.replace(corpus[j], pos[i][0], 'CN')
    return corpus

def replace_nouns(corpus):
    for j in range(len(corpus)):
        pos = pos_tag(word_tokenize(corpus[j]))
        for i in range(len(pos)):
            if pos[i][0].lower() in math_terms:
                continue
            if pos[i][1] == 'NN' or pos[i][1] == 'NNS':
                corpus[j] = str.replace(corpus[j], pos[i][0], 'CN')
            elif pos[i][1] == 'NNP' or pos[i][1] == 'NNPS':
                corpus[j] = str.replace(corpus[j], pos[i][0], 'PN')
    return corpus

replace_nouns(['John has 4 cards. How many cards does John have if he gets 3 more cards?'])

['PN has 4 CN. How many CN does PN have if he gets 3 more CN?']

In [5]:
def prep_equation_list(filepath, seed = 1):
    equations_list = []

    with open(filepath) as file:
        data = json.load(file)

    for datapoint in data:
        words = datapoint['question'].split(' ')
        words = removeEmptiesAndPunctuation(words)
        wordsAndEquations = replaceNumbers(words, datapoint['equations'], datapoint['unknowns'])

        words = wordsAndEquations[0]
        eqTemplates = wordsAndEquations[1]
        equations_list.append(eqTemplates)
    return equations_list

In [6]:
def redundant_equation_remover(corpus, equations_list, min_app = 2):
    equations_dict = dict()
    for equation in equations_list:
        if equation in equations_dict:
            equations_dict[equation] += 1
        else:
            equations_dict[equation] = 1
    for i in range(len(equations_list) - 1, -1, -1):
        if equations_dict[equations_list[i]] < min_app:
            del equations_list[i]
            del corpus[i]
    return corpus, equations_list

In [7]:
def get_tfidf_matrix(corpus):
    tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
    tfidf_matrix = tf.fit_transform(corpus)
    return tf, tfidf_matrix

In [8]:
# Find documents similar to another document in the tfidf_matrix at given index
def find_similar(tfidf_matrix, index, top_n = 5):
    cosine_similarities = linear_kernel(tfidf_matrix[index:index+1], tfidf_matrix).flatten()
    related_docs_indices = [i for i in cosine_similarities.argsort()[::-1] if i != index]
    return [(index, cosine_similarities[index]) for index in related_docs_indices][0:top_n]

# Find documents similar to a document that is not in the tfidf_matrix
def find_similar_new(tfidf_matrix, new_doc, top_n = 2):
    cosine_similarities = linear_kernel(new_doc[0], tfidf_matrix).flatten()
    related_docs_indices = [i for i in cosine_similarities.argsort()[::-1]]
    return [(index, cosine_similarities[index]) for index in related_docs_indices][0:top_n]

In [9]:
def print_similar_questions(corpus, tf, tfidf_matrix, min_score):
    count = 0
    for i in range(len(corpus)):
        for index, score in find_similar(tfidf_matrix, i, top_n=1):
            if score > 0.5:
                count += 1
                print('Question', i)
                print(corpus[i])
                print('Similarity Score: ', score)
                print('Similar Question: ', corpus[index], '\n')
    print('Number of problems with score above', min_score, ': ', count)
    print('Total number of problems in corpus: ', len(corpus))
    print('Fraction of entire corpus: ', count/len(corpus))

In [10]:
def get_input_and_similar_questions(corpus, equations_list, tf, tfidf_matrix, min_score):
    while True:
        question = input("Type in a question: ")
        temp = question.split(" ")
        temp = removeEmptiesAndPunctuation(temp)
        if question.lower() == 'exit':
            print('Exitting...\n')
            break
        
        numbers = findNumbersInWords(temp)
        question = replace_nouns([question])[0]
        print(question)
        new_doc = tf.transform([question])
        
        template_found = False
        
        for index, score in find_similar_new(tfidf_matrix, new_doc, top_n=15):
            if score > min_score:
                similar_question = corpus[index]
                similar_question = similar_question.split(" ")
                similar_question = removeEmptiesAndPunctuation(similar_question)
                numbers_in_similar_question = findNumbersInWords(similar_question)
                if len(numbers) == len(numbers_in_similar_question):
                    print(len(numbers_in_similar_question))
                    print(numbers_in_similar_question)
                    print(numbers)
                    template_found = True
                    equation = equations_list[index]
                    for i in range(len(numbers)):
                        equation = equation.replace("a" + str(i), str(numbers[i]))
                    print(equation)
                    print('-------------------------------------')
                    print('Similarity Score: ', score)
                    print('Similar Question: ', corpus[index], '\n\n')
                    break
        if not template_found:
            print('No similar questions found!\n')

In [11]:
def get_similar_questions(X_train, y_train, X_test, equation_values, tf, tfidf_matrix, min_score):
    y_pred = []
    for question in X_test:
        temp = question.split(" ")
        temp = removeEmptiesAndPunctuation(temp)

        numbers = findNumbersInWords(temp)
        question = replace_nouns([question])[0]
        new_doc = tf.transform([question])

        template_found = False

        for index, score in find_similar_new(tfidf_matrix, new_doc, top_n=10):
            if score > min_score:
                similar_question = X_train[index]
                similar_question = similar_question.split(" ")
                similar_question = removeEmptiesAndPunctuation(similar_question)
                numbers_in_similar_question = findNumbersInWords(similar_question)
                if len(numbers) == len(numbers_in_similar_question):
                    template_found = True
                    y_pred.append(equation_values[index])
                    break
        if not template_found:
            y_pred.append(0)
#             print('No similar questions found!\n')
    return y_pred

In [38]:
def get_similar_questions_with_min_sim(X_train, y_train, X_test, tf, tfidf_matrix, min_score):
    global num_non_sim_ques
    y_pred = []
    non_sim_index = []
    non_sim_ques = []
    for i in range(len(X_test)):
        question = str(X_test[i])
        temp = question.split(" ")
        temp = removeEmptiesAndPunctuation(temp)

        numbers = findNumbersInWords(temp)
        question = replace_nouns([question])[0]
        new_doc = tf.transform([question])

        template_found = False

        for index, score in find_similar_new(tfidf_matrix, new_doc, top_n=2):
            if score > min_score:
                similar_question = X_train[index]
                similar_question = similar_question.split(" ")
                similar_question = removeEmptiesAndPunctuation(similar_question)
                numbers_in_similar_question = findNumbersInWords(similar_question)
                if len(numbers) == len(numbers_in_similar_question):
                    template_found = True
                    y_pred.append(y_train[index])
                    break
        if not template_found:
            non_sim_index.append(i)
            y_pred.append(0)
            num_non_sim_ques += 1
#             print('No similar questions found!\n')
    return y_pred, non_sim_index

In [40]:
def user_run():
    min_score = 0.9
    filepath = '../SVM/data/data.json'
    
    fp = open('../SVM/data/equations.json')
    equation_dict = json.load(fp)
    
    # Preparing Corpus
    corpus = get_corpus(filepath)
    corpus = replace_nouns(corpus)
    
    # Preparing Equations
    equations_list = prep_equation_list(filepath)
    
    eqns_in_dict = [i for i, x in enumerate(equations_list) if x in equation_dict.keys() ]
    corpus = [corpus[i] for i in eqns_in_dict]
    equations_list = [equations_list[i] for i in eqns_in_dict]
    
    equation_values = list()
    for i in range(len(equations_list)):
        equation_values.append(equation_dict[equations_list[i]])
    
    X_train, X_test, y_train, y_test = train_test_split(corpus, equation_values, test_size = 0.25, random_state=1)
    
#     corpus, equations_list = redundant_equation_remover(corpus, equations_list, min_app = 2)
    tf, tfidf_matrix = get_tfidf_matrix(X_train)
    
#     get_input_and_similar_questions(corpus, equations_list, tf, tfidf_matrix, min_score)
#     print_similar_questions(corpus, tf, tfidf_matrix, min_score)

    print(len(X_train))
#     y_pred = get_similar_questions(corpus, equation_values, corpus[:], equation_values, tf, tfidf_matrix, min_score)
    y_pred, non_sim_index = get_similar_questions_with_min_sim(X_train, y_train, X_test, tf, tfidf_matrix, min_score)
    print(len(y_pred))
    print(len(y_test))
    
    total_ques = len(y_test)
    total_acc = accuracy_score(y_pred, y_test)
    print('Number of Non-Similar Question: ', num_non_sim_ques)
    
    print('Total Number of Questions: ', total_ques)
    print('Accuracy: ', total_acc)
    num_sim = total_ques - num_non_sim_ques
    print('Number of Questions w/ min similarity: ', num_sim, num_sim/total_ques)
    acc_min_sim = (total_acc*total_ques)/(total_ques - num_non_sim_ques)
    print('Accuracy w/ min_similarity: ', acc_min_sim)
    print('Number of question correct with min_similarity: ', acc_min_sim * (total_ques - num_non_sim_ques))
    
    
#     print('Accuracy: ', accuracy_score(equation_values, y_pred))
    
#     for i in range(len(equation_values)):
#         print(equation_values[i], '\t\t', y_pred[i])

In [41]:
num_non_sim_ques = 0
user_run()

9783
3262
3262
Number of Non-Similar Question:  3213
Total Number of Questions:  3262
Accuracy:  0.0006131207847946045
Number of Questions w/ min similarity:  49 0.015021459227467811
Accuracy w/ min_similarity:  0.04081632653061224
Number of question correct with min_similarity:  1.9999999999999998
