In [1]:
import pandas as pd
import numpy as np
import re
from unidecode import unidecode
import nltk
import spacy
nlp = spacy.load('en_core_web_sm')

In [2]:
class WordEncoder:
    """Grow-on-demand, bidirectional mapping between words and integer codes.

    Codes are handed out sequentially, in first-seen order, starting at
    ``start_index``. The same word always maps to the same code, and every
    issued code can be decoded back to its word.

    Code 0 is never issued as long as ``start_index`` is positive, which lets
    callers use 0 as a padding / "no token" value.
    """

    def __init__(self, start_index=43):
        """Create an empty vocabulary.

        Args:
            start_index: Code assigned to the first word that gets encoded.
                Defaults to 43, the value this class has always used.
        """
        self.encoding_map = {}   # word -> code
        self.decoding_map = {}   # code -> word
        self.current_index = start_index   # next code to hand out

    def encode_word(self, word):
        """Return the code for ``word``, registering it first if it is new.

        Note that this mutates the vocabulary: an unseen word consumes the
        next free code.
        """
        if word not in self.encoding_map:
            self.encoding_map[word] = self.current_index
            self.decoding_map[self.current_index] = word
            self.current_index += 1
        return self.encoding_map[word]

    def decode_word(self, encoded_word):
        """Return the word registered under ``encoded_word``.

        Raises:
            ValueError: If the code was never issued by this encoder.
        """
        if encoded_word not in self.decoding_map:
            raise ValueError("Invalid encoded word.")
        return self.decoding_map[encoded_word]

In [3]:
# One shared vocabulary for the whole notebook: question words, question-type
# labels and equation tokens are all encoded through this single instance, so
# they draw their integer codes from the same index space.
encoder = WordEncoder()

In [4]:
# WordNet lemmatizer; the preprocessing functions call it with pos='v' to
# reduce verbs to their base form before encoding them.
wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()


In [5]:
# Load the combined dataset and keep only the three columns used below.
# reset_index() adds an 'index' column that the column selection drops again.
# NOTE(review): fillna("General") is applied to all three selected columns, so a
# missing Question or Equation would also become the string "General" - confirm
# that only Type can be missing in the CSV.
df = pd.read_csv('translated_combined.csv').reset_index()[['Question','Equation','Type']].fillna("General")
df

Unnamed: 0,Question,Equation,Type
0,Bryan also read his book.If there are number0 ...,* number0 number1,General
1,"For the fifth grade, the chair is placed in nu...",* number0 number1,General
2,The park currently has number0 short trees and...,+ number0 number2,General
3,Conner has $ number0 in his bank account.He sp...,- number0 * number1 number2,General
4,There are currently number0 wooden trees in th...,+ number0 number1,General
...,...,...,...
4815,how many hours are there in number0 seconds ?,/ / number0 60 60,Common
4816,how many seconds are there in number0 minutes ?,* number0 60,Common
4817,how many minutes are there in number0 seconds ?,/ number0 60,Common
4818,how many minutes are there in number0 hours ?,* number0 60,Common


In [6]:
# Distinct question types present in the data. Any later filter on Type has to
# use these exact spellings (note the hyphenated names).
df.Type.unique()

array(['General', 'Addition', 'Subtraction', 'Sum', 'TVQ-Change',
       'Multiplication', 'Common-Division', 'TVQ-Final', 'TVQ-Initial',
       'Ceil-Division', 'Floor-Division', 'Difference', 'Common'],
      dtype=object)

In [7]:
# Level-1 task: classify a question into its Type, so only these two columns
# are needed.
level1DataFrame = df[['Question','Type']]
level1DataFrame

Unnamed: 0,Question,Type
0,Bryan also read his book.If there are number0 ...,General
1,"For the fifth grade, the chair is placed in nu...",General
2,The park currently has number0 short trees and...,General
3,Conner has $ number0 in his bank account.He sp...,General
4,There are currently number0 wooden trees in th...,General
...,...,...
4815,how many hours are there in number0 seconds ?,Common
4816,how many seconds are there in number0 minutes ?,Common
4817,how many minutes are there in number0 seconds ?,Common
4818,how many minutes are there in number0 hours ?,Common


In [8]:
def l1PreProcessingX(text):
    """Turn a question into the fixed-length level-1 feature vector.

    The text is lower-cased and POS-tagged with spaCy (after replacing "."
    with a space and dropping ","). Words are then encoded with the shared
    ``encoder`` into four zero-padded slots:

        [0:10]   verbs, lemmatised and de-duplicated (first occurrence kept)
        [10:15]  adjectives
        [15:20]  adverbs
        [20:25]  auxiliaries

    Words beyond a slot's width are dropped from the vector but are still
    registered in ``encoder``. Calling this function therefore mutates
    ``encoder`` whenever the text contains unseen words.

    Unlike the earlier array-based implementation, a text that yields no
    tokens (e.g. an empty string) returns an all-zero vector instead of
    raising IndexError, and the result is always an integer array.

    Args:
        text: The question string.

    Returns:
        numpy.ndarray of 25 integer codes (0 = padding).
    """
    text = text.replace(".", " ")
    text = text.replace(",", "")
    doc = nlp(text.lower())

    # Bucket token texts by POS tag in plain Python. Building a 2-D string
    # array first fails on an empty doc because the array collapses to 1-D.
    words_by_pos = {"VERB": [], "ADJ": [], "ADV": [], "AUX": []}
    for token in doc:
        if token.pos_ in words_by_pos:
            words_by_pos[token.pos_].append(token.text)

    # dict.fromkeys de-duplicates while preserving first-seen order.
    verbs = list(dict.fromkeys(
        wordnet_lemmatizer.lemmatize(verb, pos='v') for verb in words_by_pos["VERB"]
    ))

    # (words, slot width) in output order. This is also the order in which new
    # words receive codes, so it must stay verbs -> adjs -> advs -> auxs.
    slots = (
        (verbs, 10),
        (words_by_pos["ADJ"], 5),
        (words_by_pos["ADV"], 5),
        (words_by_pos["AUX"], 5),
    )

    features = np.zeros(sum(width for _, width in slots), dtype=int)
    offset = 0
    for words, width in slots:
        # Encode every word (even those that will be truncated) so the
        # vocabulary grows exactly as before.
        codes = [encoder.encode_word(word) for word in words]
        kept = codes[:width]
        features[offset:offset + len(kept)] = kept
        offset += width
    return features


In [9]:
# Level-1 features: one feature vector per question. This also populates the
# shared encoder with the words found in the questions.
l1X = [l1PreProcessingX(i) for i in list(level1DataFrame['Question'].values)]

In [10]:
# Level-1 targets: the Type label of each question, encoded with the same
# shared encoder so a predicted code can be turned back into the label with
# encoder.decode_word.
l1Y = [encoder.encode_word(i) for i in list(level1DataFrame['Type'].values)]

In [11]:
from sklearn.ensemble import RandomForestClassifier
# Level-1 classifier: question features -> encoded question type.
# A fixed random_state makes the fitted forest (and therefore every prediction
# further down) reproducible across "Restart Kernel -> Run All".
l1model = RandomForestClassifier(random_state=0)
l1model.fit(l1X,l1Y)

In [12]:
def predictType(text) :
    """Predict the question type of ``text`` and return it as a label string.

    The question is converted with ``l1PreProcessingX``, classified by
    ``l1model``, and the predicted integer code is decoded through the shared
    ``encoder``.
    """
    features = l1PreProcessingX(text)
    predicted_code = l1model.predict([features])[0]
    return encoder.decode_word(predicted_code)

In [13]:
# One (Question, Equation) frame per question type, used to train the
# per-type level-2 models. The string compared against Type must match the
# labels in the data exactly; the division types are hyphenated
# ("Ceil-Division", "Floor-Division"), so an un-hyphenated filter would
# silently select zero rows.
dfGeneral = df[df.Type=="General"][["Question","Equation"]]
dfAddition = df[df.Type=="Addition"][["Question","Equation"]]
dfSubtraction = df[df.Type=="Subtraction"][["Question","Equation"]]
dfSum = df[df.Type=="Sum"][["Question","Equation"]]
dfTVQChange = df[df.Type=="TVQ-Change"][["Question","Equation"]]
dfMultiplication = df[df.Type=="Multiplication"][["Question","Equation"]]
dfCommonDivision = df[df.Type=="Common-Division"][["Question","Equation"]]
dfTVQFinal = df[df.Type=="TVQ-Final"][["Question","Equation"]]
dfTVQInitial = df[df.Type=="TVQ-Initial"][["Question","Equation"]]
dfCeilDivision = df[df.Type=="Ceil-Division"][["Question","Equation"]]
dfFloorDivision = df[df.Type=="Floor-Division"][["Question","Equation"]]
dfDifference = df[df.Type=="Difference"][["Question","Equation"]]
dfCommon = df[df.Type=="Common"][["Question","Equation"]]



In [14]:
def l2PreProcessingX(text) :
    """Build the level-2 training feature vector for a masked question.

    The question is expected to contain ``number0``, ``number1``, ...
    placeholders instead of literal numerals. The text is lower-cased and
    POS-tagged with spaCy after removing "." and ",".

    Layout of the returned vector (each slot zero-padded / truncated to 5):

        [0:5]    verbs, lemmatised and de-duplicated
        [5:10]   adverbs
        [10:15]  adjectives
        [15:20]  auxiliaries
        [20:25]  number placeholders, in order of appearance

    A token counts as a placeholder only when it starts with "number"
    followed by a digit. The ordinary words "number" / "numbers" are
    therefore handled by their POS tag, the same way
    ``predictionPreprocessing`` handles them at inference time. (A bare
    '^number' match used to file those words under the placeholder slot.)

    Nouns are registered in the shared ``encoder`` but are deliberately not
    part of the vector; they are still encoded so that code assignment in the
    shared vocabulary is unaffected.

    Args:
        text: Masked question string.

    Returns:
        numpy.ndarray of 25 float codes (0 = padding).
    """
    text = text.replace(".", "")
    text = text.replace(",", "")
    doc = nlp(text.lower())

    words_by_pos = {"VERB": [], "ADV": [], "ADJ": [], "AUX": [], "NOUN": []}
    nums = []
    for token in doc:
        if re.match(r'number\d', token.text):
            # Placeholders are kept out of the POS buckets entirely.
            nums.append(token.text)
        elif token.pos_ in words_by_pos:
            words_by_pos[token.pos_].append(token.text)

    # dict.fromkeys de-duplicates while preserving first-seen order.
    verbs = list(dict.fromkeys(
        wordnet_lemmatizer.lemmatize(verb, pos='v') for verb in words_by_pos["VERB"]
    ))

    def encode_slot(words, width):
        # Encode every word (so the vocabulary grows as before), then
        # zero-pad / truncate to exactly `width` codes.
        codes = [encoder.encode_word(word) for word in words]
        return (codes + [0] * width)[:width]

    # The call order below is the order in which unseen words receive codes.
    encoded_verbs = encode_slot(verbs, 5)
    encoded_adjs = encode_slot(words_by_pos["ADJ"], 5)
    encoded_advs = encode_slot(words_by_pos["ADV"], 5)
    encoded_auxs = encode_slot(words_by_pos["AUX"], 5)
    encoded_nums = encode_slot(nums, 5)

    # Nouns: vocabulary side effect only, not emitted as features.
    for noun in words_by_pos["NOUN"]:
        encoder.encode_word(noun)

    # dtype=float keeps the float64 output this function has always produced.
    return np.array(
        encoded_verbs + encoded_advs + encoded_adjs + encoded_auxs + encoded_nums,
        dtype=float,
    )

In [15]:
def l2PreProcessingY(text) :
    """Encode a prefix-notation equation string as exactly 10 integer codes.

    The equation is split on single spaces, every token is encoded with the
    shared ``encoder``, and the sequence is right-padded with zeros (or cut
    off) to a length of 10.
    """
    codes = []
    for token in text.split(" "):
        codes.append(encoder.encode_word(token))

    shortfall = 10 - len(codes)
    if shortfall > 0:
        codes = np.pad(codes, (0, shortfall))
    return np.array(list(codes[:10]))

In [16]:
# Training data for the "General" level-2 model: question feature vectors (X)
# and the encoded, fixed-length equation token sequences (Y).
l2generalX = [l2PreProcessingX(i) for i in dfGeneral.Question.values]
l2generalY = [l2PreProcessingY(i) for i in dfGeneral.Equation.values]

In [17]:
# Multi-output classifier: every position of the encoded equation is one
# output. A fixed random_state keeps the fitted model reproducible across runs.
l2GeneralModel = RandomForestClassifier(random_state=0)
l2GeneralModel.fit(l2generalX,l2generalY)

In [18]:
# Level-2 model for "Addition" questions: question features -> encoded equation.
l2AdditionX = [l2PreProcessingX(i) for i in dfAddition.Question.values]
l2AdditionY = [l2PreProcessingY(i) for i in dfAddition.Equation.values]
# Fixed random_state so the fitted model is reproducible across runs.
l2AdditionModel = RandomForestClassifier(random_state=0)
l2AdditionModel.fit(l2AdditionX,l2AdditionY)

In [19]:
# Level-2 model for "Subtraction" questions: question features -> encoded equation.
l2SubtractionX = [l2PreProcessingX(i) for i in dfSubtraction.Question.values]
l2SubtractionY = [l2PreProcessingY(i) for i in dfSubtraction.Equation.values]
# Fixed random_state so the fitted model is reproducible across runs.
l2SubtractionModel = RandomForestClassifier(random_state=0)
l2SubtractionModel.fit(l2SubtractionX,l2SubtractionY)

In [20]:
# Level-2 model for "Multiplication" questions: question features -> encoded equation.
l2MultiplicationX = [l2PreProcessingX(i) for i in dfMultiplication.Question.values]
l2MultiplicationY = [l2PreProcessingY(i) for i in dfMultiplication.Equation.values]
# Fixed random_state so the fitted model is reproducible across runs.
l2MultiplicationModel = RandomForestClassifier(random_state=0)
l2MultiplicationModel.fit(l2MultiplicationX,l2MultiplicationY)

In [21]:
def l2PreProcessingXCommon(text) :
    """Build the level-2 training feature vector for a masked "Common" question.

    Same pipeline as ``l2PreProcessingX`` (strip "." and ",", lower-case,
    POS-tag with spaCy), but the vector carries nouns instead of verbs.

    Layout of the returned vector (each slot zero-padded / truncated to 5):

        [0:5]    adverbs
        [5:10]   adjectives
        [10:15]  auxiliaries
        [15:20]  number placeholders, in order of appearance
        [20:25]  nouns

    A token counts as a placeholder only when it starts with "number"
    followed by a digit, so the ordinary words "number" / "numbers" are
    handled by their POS tag, as ``predictionPreprocessingCommon`` does at
    inference time. (A bare '^number' match used to file those words under
    the placeholder slot.)

    Verbs are lemmatised and registered in the shared ``encoder`` but are
    deliberately not part of the vector; they are still encoded so that code
    assignment in the shared vocabulary is unaffected.

    Args:
        text: Masked question string.

    Returns:
        numpy.ndarray of 25 float codes (0 = padding).
    """
    text = text.replace(".", "")
    text = text.replace(",", "")
    doc = nlp(text.lower())

    words_by_pos = {"VERB": [], "ADV": [], "ADJ": [], "AUX": [], "NOUN": []}
    nums = []
    for token in doc:
        if re.match(r'number\d', token.text):
            # Placeholders are kept out of the POS buckets entirely.
            nums.append(token.text)
        elif token.pos_ in words_by_pos:
            words_by_pos[token.pos_].append(token.text)

    # dict.fromkeys de-duplicates while preserving first-seen order.
    verbs = list(dict.fromkeys(
        wordnet_lemmatizer.lemmatize(verb, pos='v') for verb in words_by_pos["VERB"]
    ))

    def encode_slot(words, width):
        # Encode every word (so the vocabulary grows as before), then
        # zero-pad / truncate to exactly `width` codes.
        codes = [encoder.encode_word(word) for word in words]
        return (codes + [0] * width)[:width]

    # Verbs: vocabulary side effect only, not emitted as features. They are
    # encoded first because that is the order in which codes were assigned.
    for verb in verbs:
        encoder.encode_word(verb)

    encoded_adjs = encode_slot(words_by_pos["ADJ"], 5)
    encoded_advs = encode_slot(words_by_pos["ADV"], 5)
    encoded_auxs = encode_slot(words_by_pos["AUX"], 5)
    encoded_nums = encode_slot(nums, 5)
    encoded_nouns = encode_slot(words_by_pos["NOUN"], 5)

    # dtype=float keeps the float64 output this function has always produced.
    return np.array(
        encoded_advs + encoded_adjs + encoded_auxs + encoded_nums + encoded_nouns,
        dtype=float,
    )

In [22]:
def predictionPreprocessingCommon(text) :
    """Build the inference-time feature vector for a raw "Common" question.

    Counterpart of the "Common" training features for un-masked text: "." and
    "," are removed, the text is lower-cased and POS-tagged, and every numeric
    NUM token is replaced by the next ``number<k>`` placeholder.

    The returned vector is the horizontal stack of five slots, each cut to 5
    codes: adverbs, adjectives, auxiliaries, placeholders, nouns. Unseen words
    are added to the shared ``encoder`` as a side effect.
    """
    cleaned = text.replace(".", "").replace(",", "").lower()

    words_by_pos = {"ADV": [], "ADJ": [], "AUX": [], "NOUN": []}
    placeholders = []
    for token in nlp(cleaned):
        tag = token.pos_
        if tag in words_by_pos:
            words_by_pos[tag].append(token.text)
        elif tag == "NUM" and token.text.isnumeric():
            # The k-th numeric literal becomes "number<k>".
            placeholders.append("number{}".format(len(placeholders)))

    def encode_padded(words, width):
        # Encode all words, then right-pad with zeros up to `width` if short.
        codes = [encoder.encode_word(word) for word in words]
        if len(codes) < width:
            codes = np.pad(codes, (0, width - len(codes)))
        return codes

    # Encoding order (adj, adv, aux, num, noun) determines which codes new
    # words receive, so it is kept distinct from the output order below.
    adj_codes = encode_padded(words_by_pos["ADJ"], 5)
    adv_codes = encode_padded(words_by_pos["ADV"], 5)
    aux_codes = encode_padded(words_by_pos["AUX"], 5)
    num_codes = encode_padded(placeholders, 5)
    noun_codes = encode_padded(words_by_pos["NOUN"], 10)

    return np.hstack((adv_codes[:5], adj_codes[:5], aux_codes[:5], num_codes[:5], noun_codes[:5]))

In [23]:
# Level-2 model for "Common" questions, trained on the noun-based feature
# layout produced by l2PreProcessingXCommon.
l2CommonX = [l2PreProcessingXCommon(i) for i in dfCommon.Question.values]
l2CommonY = [l2PreProcessingY(i) for i in dfCommon.Equation.values]
# Fixed random_state so the fitted model is reproducible across runs.
l2CommonModel = RandomForestClassifier(random_state=0)
l2CommonModel.fit(l2CommonX,l2CommonY)

In [24]:
def predictionPreprocessing(text) :
    """Build the inference-time level-2 feature vector for a raw question.

    "." and "," are removed, the text is lower-cased and POS-tagged, and every
    numeric NUM token is replaced by the next ``number<k>`` placeholder.

    The returned vector is the horizontal stack of five slots, each cut to 5
    codes: lemmatised unique verbs, adverbs, adjectives, auxiliaries and
    placeholders. Nouns are encoded into the shared ``encoder`` but contribute
    no features. Unseen words are added to ``encoder`` as a side effect.
    """
    cleaned = text.replace(".", "").replace(",", "").lower()

    words_by_pos = {"VERB": [], "ADV": [], "ADJ": [], "AUX": [], "NOUN": []}
    placeholders = []
    for token in nlp(cleaned):
        tag = token.pos_
        if tag in words_by_pos:
            words_by_pos[tag].append(token.text)
        elif tag == "NUM" and token.text.isnumeric():
            # The k-th numeric literal becomes "number<k>".
            placeholders.append("number{}".format(len(placeholders)))

    # dict.fromkeys de-duplicates while preserving first-seen order.
    unique_verbs = list(dict.fromkeys(
        wordnet_lemmatizer.lemmatize(verb, pos='v') for verb in words_by_pos["VERB"]
    ))

    def encode_padded(words, width):
        # Encode all words, then right-pad with zeros up to `width` if short.
        codes = [encoder.encode_word(word) for word in words]
        if len(codes) < width:
            codes = np.pad(codes, (0, width - len(codes)))
        return codes

    # Encoding order (verb, adj, adv, aux, num, noun) determines which codes
    # new words receive, so it is kept distinct from the output order below.
    verb_codes = encode_padded(unique_verbs, 5)
    adj_codes = encode_padded(words_by_pos["ADJ"], 5)
    adv_codes = encode_padded(words_by_pos["ADV"], 5)
    aux_codes = encode_padded(words_by_pos["AUX"], 5)
    num_codes = encode_padded(placeholders, 5)

    # Nouns only touch the vocabulary; their slot in the stack is empty.
    for noun in words_by_pos["NOUN"]:
        encoder.encode_word(noun)
    no_noun_features = []

    return np.hstack((verb_codes[:5], adv_codes[:5], adj_codes[:5], aux_codes[:5], num_codes[:5], no_noun_features))

In [25]:
def predictFormula(text,model,predalgo) :
    """Predict the prefix-notation formula for a question, with numbers filled in.

    Steps:
      1. ``predalgo(text)`` produces the feature vector and ``model`` predicts
         a row of integer codes; codes are decoded through the shared
         ``encoder`` until the first 0 (padding) is reached.
      2. The numeric literals of ``text`` are collected in order of appearance
         as ``number0``, ``number1``, ... and substituted for the matching
         placeholders in the decoded formula.

    The text is normalised here exactly as the feature extractors in this
    notebook do (drop "." and ",", lower-case) so that placeholder numbering
    agrees with the features the model saw - e.g. "1,000" counts as one
    number in both places. As a consequence a decimal such as "3.5" is read
    as "35", just as it is in the feature extractors.

    A placeholder that the model predicts but the text does not provide
    (e.g. ``number2`` for a question with two numbers) is left in the result
    unchanged instead of raising KeyError.

    Args:
        text: Raw question string.
        model: Fitted estimator whose ``predict`` returns rows of encoded
            formula tokens.
        predalgo: Callable turning ``text`` into the model's feature vector.

    Returns:
        list of str: formula tokens in prefix order.
    """
    predicted_codes = model.predict([predalgo(text)])[0]

    tokens = []
    for code in predicted_codes:
        if code == 0:
            # 0 is the padding value: everything after it is not formula.
            break
        tokens.append(encoder.decode_word(code))

    cleaned = text.replace(".", "").replace(",", "")
    numbers = {}
    for token in nlp(cleaned.lower()):
        if token.pos_ == "NUM" and token.text.isnumeric():
            numbers["number{}".format(len(numbers))] = token.text

    # Only "number<k>" keys exist in `numbers`, so operators and constants
    # pass through untouched, as do placeholders with no matching literal.
    final = [numbers.get(token, token) for token in tokens]
    return(final)

In [26]:
def evalPrediction(text,model,predAlgo) :
    """Predict the formula for ``text`` and evaluate it to a number.

    The formula returned by ``predictFormula`` is in prefix notation; it is
    evaluated by scanning the tokens right-to-left with a stack. Integer
    tokens are pushed as ``int`` and decimal tokens such as "0.5" as
    ``float`` (decimals used to be skipped silently). Tokens that are neither
    numbers nor one of ``+ - * /`` are ignored.

    The raw result is then post-processed with two heuristics:
      * a negative result is replaced by its absolute value;
      * a result strictly between 0 and 1 is replaced by its reciprocal.

    Args:
        text: Raw question string.
        model: Fitted level-2 model, forwarded to ``predictFormula``.
        predAlgo: Feature extractor, forwarded to ``predictFormula``.

    Returns:
        int or float: the post-processed value of the formula.

    Raises:
        ValueError: If the formula has an operator without two operands, or
            contains no operands at all.
        ZeroDivisionError: If the formula divides by zero.
    """
    final = predictFormula(text,model,predAlgo)
    operators = {'+': lambda x, y: x + y,
                 '-': lambda x, y: x - y,
                 '*': lambda x, y: x * y,
                 '/': lambda x, y: x / y}

    stack = []
    for token in reversed(final):
        if token.isdecimal():
            stack.append(int(token))
        elif re.fullmatch(r'\d+\.\d*|\.\d+', token):
            stack.append(float(token))
        elif token in operators:
            if len(stack) < 2:
                raise ValueError(
                    "Malformed formula {!r}: operator {!r} is missing an operand.".format(
                        " ".join(final), token))
            # Scanning right-to-left, the first value popped is the operator's
            # left operand.
            op1 = stack.pop()
            op2 = stack.pop()
            stack.append(operators[token](op1, op2))

    if not stack:
        raise ValueError(
            "Formula {!r} contains nothing to evaluate.".format(" ".join(final)))

    answer = stack[-1]
    # Heuristics: report a magnitude, and flip proper fractions.
    if answer < 0 :
        answer = -answer
    if answer < 1 and answer > 0 :
        answer = 1/answer
    return answer

    

In [27]:
import time
def generateText(text) :
    """Run the full pipeline on a question and print the result, typewriter style.

    1. Predict the question type with ``predictType``.
    2. Pick the level-2 model (and feature extractor) for that type.
    3. Predict the equation and evaluate it.
    4. Print the type, the model used, the equation and the result.

    Level-2 models exist only for General, Addition, Subtraction,
    Multiplication and Common. For any other predicted type the General model
    is used as a fallback and the printed "Using ..." line says so;
    previously such a type left ``model`` unassigned and the call failed with
    UnboundLocalError.

    Note: the formula is predicted twice (once for display, once inside
    ``evalPrediction``).

    Args:
        text: Raw question string.

    Returns:
        None. Output is written to stdout.
    """
    t = predictType(text)
    delay = 0.02   # seconds between printed characters

    predalgo = predictionPreprocessing
    model_label = t
    # if/elif rather than a lookup table so that only the model that is
    # actually needed has to exist in the kernel.
    if t=="General" :
        model = l2GeneralModel
    elif t=="Addition" :
        model = l2AdditionModel
    elif t=="Subtraction" :
        model = l2SubtractionModel
    elif t=="Multiplication" :
        model = l2MultiplicationModel
    elif t=="Common" :
        model = l2CommonModel
        predalgo = predictionPreprocessingCommon
    else :
        model = l2GeneralModel
        model_label = "General (fallback: no dedicated model for type {})".format(t)

    def type_out(message) :
        # Print one character at a time, then a blank line.
        for char in message :
            print(char,end="",flush=True)
            time.sleep(delay)
        print("\n")

    # Build every message before printing so a failure produces no partial output.
    text1 = "Predicted Type of the question is {}.".format(t)
    text2 = "The Equation generated for the above question is \"{}\".".format(" ".join(predictFormula(text,model,predalgo)))
    text3 = "The result predicted for the above question is {}.".format(evalPrediction(text,model,predalgo))

    type_out(text1)
    print("Using {} Model ...\n".format(model_label))
    type_out(text2)
    type_out(text3)

In [50]:
# End-to-end demo on an unseen question containing two numbers (10 and 23).
text = "A man starts walking with speed of 10 kilometers per hour. how much distance will he cover in 23 hours?"
generateText(text)

Predicted Type of the question is General.

Using General Model ...

The Equation generated for the above question is "- 10 23".

The result predicted for the above question is 13.

