In [1]:
import re
import json
import pickle
import random
import string
import pandas as pd
from nltk import pos_tag
from nltk import word_tokenize
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer

import sys
sys.path.insert(0, '../SVM')
from encoder_functions import *

In [2]:
def retrieve_data(flag=0):
    path = '../CNN_model/data'
    if flag == 0:
        # train_ds = pd.read_csv('data/new_train.csv', sep=',', encoding = "ISO-8859-1")
        train_ds = pd.read_csv('../CNN_model/data/train_synthetic.csv', sep=',', encoding = "ISO-8859-1")
        test_ds = pd.read_csv('../CNN_model/data/test.csv', sep=',')

        X_train = list(train_ds['question'].values)
        y_train = list(train_ds['operation'].values)

        X_test = list(test_ds['question'].values)
        y_test = list(test_ds['operation'].values)

        return X_train, y_train, X_test, y_test
    elif flag == 1:
        df = pd.read_csv('../CNN_model/data/formatted_singleop.csv', sep=',', encoding = "ISO-8859-1")
        train_size = int(len(df) * .8)

        questions = list(df['question'].values)
        ops = list(df['operation'].values)

        combined = list(zip(questions, ops))
        random.shuffle(combined)

        X_train = [itx[0] for itx in combined[:train_size]]
        y_train = [itx[1] for itx in combined[:train_size]]

        X_test = [itx[0] for itx in combined[train_size:]]
        y_test = [itx[1] for itx in combined[train_size:]]

        return X_train, y_train, X_test, y_test
    elif flag == 2:
        data_set = None
        with open('../CNN_model/data/singleop_shuffled_num_replaced.pickle', 'rb') as handle:
            data_set = pickle.load(handle)

        train_size = int(len(data_set) * .7)

        X_train = [itx[0] for itx in data_set[:train_size]]
        y_train = [itx[1] for itx in data_set[:train_size]]

        X_test = [itx[0] for itx in data_set[train_size:]]
        y_test = [itx[1] for itx in data_set[train_size:]]

        return X_train, y_train, X_test, y_test
    if flag == 3:
        # data_set = None
        # with open('data/singleop_shuffled_num_replaced.pickle', 'rb') as handle:
        #     data_set = pickle.load(handle)
        #
        # X_train = [itx[0] for itx in data_set]
        # y_train = [itx[1] for itx in data_set]

        train_ds = pd.read_csv('../CNN_model/data/train_synthetic.csv', sep=',', encoding = "ISO-8859-1")

        X_train = list(train_ds['question'].values)
        y_train = list(train_ds['operation'].values)

        test_ds = pd.read_csv('../CNN_model/data/iit_test.csv', sep=',', encoding = "ISO-8859-1")

        X_test = list(test_ds['question'].values)
        y_test = list(test_ds['operation'].values)

        return X_train, y_train, X_test, y_test

### POS Tagging
NN	  noun, singular 'desk'  
NNS	  noun plural	'desks'  
NNP	  proper noun, singular	'Harrison'  
NNPS  proper noun, plural	'Americans'  

In [3]:
# http://lbcrs.org/common/pages/DisplayFile.aspx%3FitemId%3D3446744
# https://www.purplemath.com/modules/translat.htm
math_terms = [
    # Addition Words
    'add',
    'all',
    'together',
    'altogether',
    'and',
    'both',
    'combined',
    'much',
    'increase',
    'increased',
    'by',
    'plus',
    'sum',
    'total',
    'added',
    'addition',
    # Subtraction words
    'change',
    'decrease',
    'decreased',
    'difference',
    'fewer',
    'left',
    'many',
    'more',
    'longer',
    'shorter',
    'taller', 
    'heavier', 
    'less',
    'lost',
    'minus',
    'need',
    'reduce',
    'remain',
    'subtract',
    'subtraction',
    'take' ,
    'away',
    'over',
    'after',
    'save',
    'comparative',
    # Multiplication words
    'double',
    'each' ,
    'group',
    'every',
    'factor', 
    'multiplied',
    'of',
    'product',
    'times',
    'triple',
    'twice',
    'multiplication',
    'multiply',
    # Division Words
    'cut',
    'share',
    'half',
    'fraction',
    'parts',
    'per',
    'percent',
    'quotient',
    'ratio',
    'separated',
    'equally',
    'divide',
    'division',
    'equal',
    'pieces', 
    'split',
    'average',
    # Equality Words
    'is', 
    'are', 
    'was', 
    'were', 
    'will',
    'gives', 
    'yields',
    'sold',
    'cost',
]

In [4]:
def replace_nouns(corpus):
    for j in range(len(corpus)):
        pos = pos_tag(word_tokenize(corpus[j]))
        for i in range(len(pos)):
            if pos[i][0].lower() in math_terms:
                continue
            if pos[i][1] == 'NN' or pos[i][1] == 'NNS':
                corpus[j] = str.replace(corpus[j], pos[i][0], 'commonnoun')
            elif pos[i][1] == 'NNP' or pos[i][1] == 'NNPS':
                corpus[j] = str.replace(corpus[j], pos[i][0], 'propernoun')
    return corpus

# def replace_nouns(corpus, window_size = 1):
#     for j in range(len(corpus)):
#         count = window_size + 1
#         pos = pos_tag(word_tokenize(corpus[j]))
#         for i in range(len(pos)):
            
#             try:
#                 float(pos[i][0])
#                 count = 0
#                 continue
#             except ValueError:
#                 count += 1
                
#             if (pos[i][1] == 'NNP' or pos[i][1] == 'NNPS') and i > 0:
#                 corpus[j] = str.replace(corpus[j], pos[i][0], 'propernoun')
#                 continue
            
#             if count > window_size:
#                 continue;
                
#             if pos[i][0].lower() in math_terms:
#                 continue
            
#             if pos[i][1] == 'NN' or pos[i][1] == 'NNS':
#                 corpus[j] = str.replace(corpus[j], pos[i][0], 'commonnoun')
#     return corpus

In [5]:
def get_tfidf_matrix(corpus):
    tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
    tfidf_matrix = tf.fit_transform(corpus)
    return tf, tfidf_matrix

In [6]:
# Find documents similar to another document in the tfidf_matrix at given index
def find_similar(tfidf_matrix, index, top_n = 5):
    cosine_similarities = linear_kernel(tfidf_matrix[index:index+1], tfidf_matrix).flatten()
    related_docs_indices = [i for i in cosine_similarities.argsort()[::-1] if i != index]
    return [(index, cosine_similarities[index]) for index in related_docs_indices][0:top_n]

# Find documents similar to a document that is not in the tfidf_matrix
def find_similar_new(tfidf_matrix, new_doc, top_n = 2):
    cosine_similarities = linear_kernel(new_doc[0], tfidf_matrix).flatten()
    related_docs_indices = [i for i in cosine_similarities.argsort()[::-1]]
    return [(index, cosine_similarities[index]) for index in related_docs_indices][0:top_n]

In [7]:
def get_input_and_similar_questions(X_train, y_train, X_test, tf, tfidf_matrix, min_score):
    y_pred = []
    for question in X_test:
        temp = question.split(" ")
        temp = removeEmptiesAndPunctuation(temp)

        numbers = findNumbersInWords(temp)
        question = replace_nouns([question])[0]
        new_doc = tf.transform([question])

        template_found = False

        for index, score in find_similar_new(tfidf_matrix, new_doc, top_n=15):
            if score > min_score:
                similar_question = X_train[index]
                similar_question = similar_question.split(" ")
                similar_question = removeEmptiesAndPunctuation(similar_question)
                numbers_in_similar_question = findNumbersInWords(similar_question)
                if len(numbers) == len(numbers_in_similar_question):
                    template_found = True
                    y_pred.append(y_train[index])
                    break
        if not template_found:
            y_pred.append('addition')
#             print('No similar questions found!\n')
    return y_pred

In [8]:
def user_run(flag):
    min_score = 0.0
    
    X_train, y_train, X_test, y_test = retrieve_data(flag)
    
    X_train = replace_nouns(X_train)
    tf, tfidf_matrix = get_tfidf_matrix(X_train)
    y_pred = get_input_and_similar_questions(X_train, y_train, X_test, tf, tfidf_matrix, min_score)
    print('Accuracy: ', accuracy_score(y_pred, y_test))

In [9]:
user_run(flag = 0)
user_run(flag = 1)
user_run(flag = 2)
user_run(flag = 3)

Accuracy:  0.9375
Accuracy:  0.6991150442477876
Accuracy:  0.7218934911242604
Accuracy:  0.006864988558352402
