In [179]:
import pandas as pd
import numpy as np
import re
from unidecode import unidecode
import nltk
import spacy
nlp = spacy.load('en_core_web_sm')

In [180]:
from gensim.models import Word2Vec,KeyedVectors

In [181]:
model = KeyedVectors.load_word2vec_format('./Data/GoogleNews-vectors-negative300.bin', binary=True)


In [182]:
class WordEncoder:
    def __init__(self):
        self.encoding_map = {}
        self.decoding_map = {}
        self.current_index = 43
    
    def encode_word(self, word):
        if word not in self.encoding_map:
            self.encoding_map[word] = self.current_index
            self.decoding_map[self.current_index] = word
            self.current_index += 1
        return self.encoding_map[word]
    
    def decode_word(self, encoded_word):
        if encoded_word in self.decoding_map:
            return self.decoding_map[encoded_word]
        else:
            #if encoded_word==0:
                
            print(f"Invalid encoded word: {self,encoded_word}")
            raise ValueError("Invalid encoded word.")

In [183]:
encoder = WordEncoder()

In [184]:
wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()


In [185]:
df = pd.read_csv('final_1.csv').reset_index()[['Question','Equation','Type']].fillna("General")
df

Unnamed: 0,Question,Equation,Type
0,gino has number0 popsicle sticks . i have numb...,+ number0 number1,General
1,lino picked up number0 shells at the seashore ...,+ number0 number1,General
2,there were number0 parents in the program and ...,+ number0 number1,General
3,last saturday marie sold number0 magazines and...,+ number0 number1,General
4,there are number0 birds on the fence . number1...,+ number0 number1,General
...,...,...,...
6526,an industrial machine made number0 shirts yest...,/ + number0 number1 number2,General
6527,an industrial machine made number0 shirts yest...,/ number1 number2,General
6528,an industrial machine can make number0 shirts ...,/ + number1 number2 number0,General
6529,an industrial machine can make number0 shirts ...,* number0 + number1 number2,General


In [186]:
df.Type.unique()

array(['General', 'Common', 'LCM_HCF', 'Physics', 'Time'], dtype=object)

In [187]:
values_to_replace = ['Addition', 'Subtraction', 'Sum', 'TVQ-Change',
                     'Multiplication', 'Common-Division', 'TVQ-Final',
                     'TVQ-Initial', 'Ceil-Division', 'Floor-Division', 'Difference']

# Replace the values in the 'Type' column with 'General'
df['Type'] = df['Type'].replace(values_to_replace, 'General')

In [188]:
df.Type.unique()

array(['General', 'Common', 'LCM_HCF', 'Physics', 'Time'], dtype=object)

In [189]:
level1DataFrame = df[['Question','Type']]
level1DataFrame

Unnamed: 0,Question,Type
0,gino has number0 popsicle sticks . i have numb...,General
1,lino picked up number0 shells at the seashore ...,General
2,there were number0 parents in the program and ...,General
3,last saturday marie sold number0 magazines and...,General
4,there are number0 birds on the fence . number1...,General
...,...,...
6526,an industrial machine made number0 shirts yest...,General
6527,an industrial machine made number0 shirts yest...,General
6528,an industrial machine can make number0 shirts ...,General
6529,an industrial machine can make number0 shirts ...,General


In [190]:
def l1PreProcessingX(text):
        text = text.replace(".", " ")
        text = text.replace(",", "")
        doc = nlp(text.lower())
        tagged_words = np.array(list(map(lambda x: (x.text, x.pos_), doc)))

        verbs_raw = tagged_words[tagged_words[:, 1] == 'VERB'][:, 0]
        verbs_raw = list(dict.fromkeys([wordnet_lemmatizer.lemmatize(i,pos='v') for i in verbs_raw]))
        encoded_verbs = [encoder.encode_word(i) for i in verbs_raw]
        if len(encoded_verbs) < 10:
                encoded_verbs = np.pad(encoded_verbs, (0, 10 - len(encoded_verbs)))

        
        
        adj_raw = tagged_words[tagged_words[:, 1] == 'ADJ'][:, 0]
        encoded_adjs = [encoder.encode_word(i) for i in adj_raw]
        if len(encoded_adjs) < 5:
                encoded_adjs = np.pad(encoded_adjs, (0, 5 - len(encoded_adjs)))

        adv_raw = tagged_words[tagged_words[:, 1] == 'ADV'][:, 0]
        encoded_advs = [encoder.encode_word(i) for i in adv_raw]
        if len(encoded_advs) < 5:
                encoded_advs = np.pad(encoded_advs, (0, 5 - len(encoded_advs)))

        aux_raw = tagged_words[tagged_words[:, 1] == 'AUX'][:, 0]
        encoded_auxs = [encoder.encode_word(i) for i in aux_raw]
        if len(encoded_auxs) < 5:
                encoded_auxs = np.pad(encoded_auxs, (0, 5 - len(encoded_auxs)))


        return np.array(list(encoded_verbs[:10]) + list(encoded_adjs[:5]) + list(encoded_advs[:5]) + list(encoded_auxs[:5]))


In [191]:
l1X = [l1PreProcessingX(i) for i in list(level1DataFrame['Question'].values)]

In [192]:
l1Y = [encoder.encode_word(i) for i in list(level1DataFrame['Type'].values)]

In [193]:
from sklearn.ensemble import RandomForestClassifier
l1model = RandomForestClassifier()
l1model.fit(l1X,l1Y)

In [194]:
def predictType(text) :
    return(encoder.decode_word(l1model.predict([l1PreProcessingX(text)])[0]))

In [195]:
df.drop_duplicates(subset=["Question"], keep="first", inplace=True)
df

Unnamed: 0,Question,Equation,Type
0,gino has number0 popsicle sticks . i have numb...,+ number0 number1,General
1,lino picked up number0 shells at the seashore ...,+ number0 number1,General
2,there were number0 parents in the program and ...,+ number0 number1,General
3,last saturday marie sold number0 magazines and...,+ number0 number1,General
4,there are number0 birds on the fence . number1...,+ number0 number1,General
...,...,...,...
6526,an industrial machine made number0 shirts yest...,/ + number0 number1 number2,General
6527,an industrial machine made number0 shirts yest...,/ number1 number2,General
6528,an industrial machine can make number0 shirts ...,/ + number1 number2 number0,General
6529,an industrial machine can make number0 shirts ...,* number0 + number1 number2,General


In [196]:
dfGeneral = df[df.Type=="General"][["Question","Equation"]]
dfCommon = df[df.Type=="Common"][["Question","Equation"]]
dfPhysics = df[df.Type=="Physics"][["Question","Equation"]]
dfTime = df[df.Type=="Time"][["Question","Equation"]]
dfLcm_Hcf = df[df.Type=="LCM_HCF"][["Question","Equation"]]



In [197]:
dfPhysics


Unnamed: 0,Question,Equation
4852,a fill pipe can fill number0 / number1 of cist...,/ number2 number0
4853,what is the distance covered by a train if it ...,* number0 number1
4854,one pipe can fill a tank four times as fast as...,* + 1 4 number0
4855,the speed of a boat in still water is number0 ...,- number0 number1
4856,a space shuttle orbits the earth at about numb...,* number0 3600
...,...,...
5020,a boat can travel with a speed of number0 km /...,/ number1 + number0 number0
5021,a boat running up stram takes number0 hours to...,- number1 number0
5022,a tank is filled in number0 hours by number1 p...,/ number0 / 2 + number1 4
5023,a man goes from a to b at a speed of number0 k...,/ + number0 number0 2


In [198]:
def l2PreProcessingX(text) :
    text = text.replace(".", "")
    text = text.replace(",", "")
    doc = nlp(text)
    tagged_words = np.array(list(map(lambda x: (x.text, x.pos_), doc)))
    verbs = []
    advs = []
    adjs = []
    nouns = []
    nums = []
    auxs = []
    for i in tagged_words :
        
        if re.search('^number', i[0]) :
            nums.append(i[0])

        else :
            if i[1]== "VERB":
                verbs.append(i[0])
            if i[1]== "ADV":
                advs.append(i[0])
            if i[1]== "ADJ":
                adjs.append(i[0])
            if i[1]== "AUX":
                auxs.append(i[0])
            if i[1]== "NOUN":
                nouns.append(i[0])



    verbs = list(dict.fromkeys([wordnet_lemmatizer.lemmatize(i,pos='v') for i in verbs]))
    # print(verbs,advs,adjs,auxs,nouns)
    encoded_verbs = []
    for i in verbs :
          encoded_verbs.extend(model[i.capitalize()][:10])
    if len(encoded_verbs) < 50:
            encoded_verbs = np.pad(encoded_verbs, (0, 50 - len(encoded_verbs)))

    encoded_adjs = []
    for i in adjs :
          encoded_adjs.extend(model[i.capitalize()][:10])
    if len(encoded_adjs) < 30:
            encoded_adjs = np.pad(encoded_adjs, (0, 30 - len(encoded_adjs)))

    encoded_advs = []
    for i in advs :
          encoded_advs.extend(model[i.capitalize()][:10])
    if len(encoded_advs) < 30:
            encoded_advs = np.pad(encoded_advs, (0, 30 - len(encoded_advs)))

    encoded_auxs = []
    for i in auxs :
          encoded_auxs.extend(model[i.capitalize()][:10])
    if len(encoded_auxs) < 30:
            encoded_auxs = np.pad(encoded_auxs, (0, 30 - len(encoded_auxs)))

    encoded_nums = [encoder.encode_word(i) for i in nums]
    if len(encoded_nums) < 5:
            encoded_nums = np.pad(encoded_nums, (0, 5 - len(encoded_nums)))

    encoded_nouns = []
    for i in advs :
          encoded_nouns.extend(model[i.capitalize()][:10])
    if len(encoded_nouns) < 50:
            encoded_nouns = np.pad(encoded_nouns, (0, 50 - len(encoded_nouns)))
            
    return np.hstack((encoded_nums[:5], encoded_verbs[:50] , encoded_advs[:30] , encoded_adjs[:30] , encoded_auxs[:30]  , encoded_nouns[:50]))

In [199]:
def l2PreProcessingY(text) :
    formula = text.split(" ")
    encoded_formula = [encoder.encode_word(i) for i in formula]
    if len(encoded_formula) < 10:
            encoded_formula = np.pad(encoded_formula, (0, 10 - len(encoded_formula)))
    return np.array(list(encoded_formula[:10]))

In [200]:
l2generalX = []
l2generalY = []
for i in range(len(dfGeneral.Question.values)) :
    try :
        l2generalX.append(l2PreProcessingX(dfGeneral.Question.values[i]))
        l2generalY.append(l2PreProcessingY(dfGeneral.Equation.values[i]))
    except :
        pass


In [201]:
pd.DataFrame(l2generalX)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,185,186,187,188,189,190,191,192,193,194
0,140.0,55.0,0.0,0.0,0.0,-0.065918,-0.063477,-0.122559,0.373047,0.050293,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,140.0,55.0,0.0,0.0,0.0,0.018677,0.285156,0.088867,0.213867,0.421875,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,140.0,55.0,0.0,0.0,0.0,0.166992,0.388672,-0.021484,0.255859,0.136719,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,140.0,55.0,1473.0,0.0,0.0,0.088379,0.075195,-0.122559,0.155273,-0.142578,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,140.0,55.0,0.0,0.0,0.0,0.166992,0.388672,-0.021484,0.255859,0.136719,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5167,140.0,55.0,129.0,0.0,0.0,-0.188477,0.036133,0.125977,0.202148,0.240234,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5168,140.0,55.0,129.0,0.0,0.0,-0.188477,0.036133,0.125977,0.202148,0.240234,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5169,140.0,55.0,129.0,0.0,0.0,-0.188477,0.036133,0.125977,0.202148,0.240234,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5170,140.0,55.0,129.0,0.0,0.0,-0.188477,0.036133,0.125977,0.202148,0.240234,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [202]:
l2GeneralModel = RandomForestClassifier()
l2GeneralModel.fit(l2generalX,l2generalY)

In [203]:
#y_train

In [204]:
l2PreProcessingX(dfGeneral.Question.values[10])

array([ 1.40000000e+02,  5.50000000e+01,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  2.55859375e-01, -3.02734375e-02, -1.66992188e-01,
        5.56640625e-02,  4.12109375e-01,  1.84570312e-01, -1.58203125e-01,
       -1.20239258e-02, -2.57568359e-02,  2.86865234e-03, -6.59179688e-02,
       -6.34765625e-02, -1.22558594e-01,  3.73046875e-01,  5.02929688e-02,
        3.44238281e-02, -2.65625000e-01, -7.66601562e-02, -3.44238281e-02,
        4.16015625e-01,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  

In [205]:
"""l2AdditionX = [l2PreProcessingX(i) for i in dfAddition.Question.values]
l2AdditionY = [l2PreProcessingY(i) for i in dfAddition.Equation.values]
l2AdditionModel = RandomForestClassifier()
l2AdditionModel.fit(l2AdditionX,l2AdditionY)"""

'l2AdditionX = [l2PreProcessingX(i) for i in dfAddition.Question.values]\nl2AdditionY = [l2PreProcessingY(i) for i in dfAddition.Equation.values]\nl2AdditionModel = RandomForestClassifier()\nl2AdditionModel.fit(l2AdditionX,l2AdditionY)'

In [206]:
"""l2SubtractionX = [l2PreProcessingX(i) for i in dfSubtraction.Question.values]
l2SubtractionY = [l2PreProcessingY(i) for i in dfSubtraction.Equation.values]
l2SubtractionModel = RandomForestClassifier()
l2SubtractionModel.fit(l2SubtractionX,l2SubtractionY)"""

'l2SubtractionX = [l2PreProcessingX(i) for i in dfSubtraction.Question.values]\nl2SubtractionY = [l2PreProcessingY(i) for i in dfSubtraction.Equation.values]\nl2SubtractionModel = RandomForestClassifier()\nl2SubtractionModel.fit(l2SubtractionX,l2SubtractionY)'

In [207]:
"""l2MultiplicationX = [l2PreProcessingX(i) for i in dfMultiplication.Question.values]
l2MultiplicationY = [l2PreProcessingY(i) for i in dfMultiplication.Equation.values]
l2MultiplicationModel = RandomForestClassifier()
l2MultiplicationModel.fit(l2MultiplicationX,l2MultiplicationY)"""

'l2MultiplicationX = [l2PreProcessingX(i) for i in dfMultiplication.Question.values]\nl2MultiplicationY = [l2PreProcessingY(i) for i in dfMultiplication.Equation.values]\nl2MultiplicationModel = RandomForestClassifier()\nl2MultiplicationModel.fit(l2MultiplicationX,l2MultiplicationY)'

In [208]:
"""l2CommonDivisionX = [l2PreProcessingX(i) for i in dfCommonDivision.Question.values]
l2CommonDivisionY = [l2PreProcessingY(i) for i in dfCommonDivision.Equation.values]
l2CommonDivisionModel = RandomForestClassifier()
l2CommonDivisionModel.fit(l2CommonDivisionX,l2CommonDivisionY)"""

'l2CommonDivisionX = [l2PreProcessingX(i) for i in dfCommonDivision.Question.values]\nl2CommonDivisionY = [l2PreProcessingY(i) for i in dfCommonDivision.Equation.values]\nl2CommonDivisionModel = RandomForestClassifier()\nl2CommonDivisionModel.fit(l2CommonDivisionX,l2CommonDivisionY)'

In [209]:
l2PhysicsX = []
l2PhysicsY = []
for i in range(len(dfPhysics.Question.values)) :
    try :
        l2PhysicsX.append(l2PreProcessingX(dfPhysics.Question.values[i]))
        l2PhysicsY.append(l2PreProcessingY(dfPhysics.Equation.values[i]))
    except :
        pass
l2PhysicsModel = RandomForestClassifier()
l2PhysicsModel.fit(l2PhysicsX,l2PhysicsY)

In [210]:
def l2PreProcessingXCommon(text) :
    text = text.replace(".", "")
    text = text.replace(",", "")
    doc = nlp(text.lower())
    tagged_words = np.array(list(map(lambda x: (x.text, x.pos_), doc)))
    verbs = []
    advs = []
    adjs = []
    nouns = []
    nums = []
    auxs = []
    for i in tagged_words :
        
        if re.search('^number', i[0]) :
            nums.append(i[0])

        else :
            if i[1]== "VERB":
                verbs.append(i[0])
            if i[1]== "ADV":
                advs.append(i[0])
            if i[1]== "ADJ":
                adjs.append(i[0])
            if i[1]== "AUX":
                auxs.append(i[0])
            if i[1]== "NOUN":
                nouns.append(i[0])



    verbs = list(dict.fromkeys([wordnet_lemmatizer.lemmatize(i,pos='v') for i in verbs]))

    print(verbs,advs,adjs,auxs,nums,nouns)

    encoded_verbs = [encoder.encode_word(i) for i in verbs]
    if len(encoded_verbs) < 5:
            encoded_verbs = np.pad(encoded_verbs, (0, 5 - len(encoded_verbs)))

    encoded_adjs = [encoder.encode_word(i) for i in adjs]
    if len(encoded_adjs) < 5:
            encoded_adjs = np.pad(encoded_adjs, (0, 5 - len(encoded_adjs)))

    encoded_advs = [encoder.encode_word(i) for i in advs]
    if len(encoded_advs) < 5:
            encoded_advs = np.pad(encoded_advs, (0, 5 - len(encoded_advs)))

    encoded_auxs = [encoder.encode_word(i) for i in auxs]
    if len(encoded_auxs) < 5:
            encoded_auxs = np.pad(encoded_auxs, (0, 5 - len(encoded_auxs)))

    encoded_nums = [encoder.encode_word(i) for i in nums]
    if len(encoded_nums) < 5:
            encoded_nums = np.pad(encoded_nums, (0, 5 - len(encoded_nums)))

    encoded_nouns = [encoder.encode_word(i) for i in nouns]
    if len(encoded_nouns) < 10:
            encoded_nouns = np.pad(encoded_nouns, (0, 10 - len(encoded_nouns)))
    encoded_verbs = []
    return np.hstack((encoded_verbs[:5] , encoded_advs[:5] , encoded_adjs[:5] , encoded_auxs[:5] , encoded_nums[:5] , encoded_nouns[:5]))

In [211]:
dfGeneral

Unnamed: 0,Question,Equation
0,gino has number0 popsicle sticks . i have numb...,+ number0 number1
1,lino picked up number0 shells at the seashore ...,+ number0 number1
2,there were number0 parents in the program and ...,+ number0 number1
3,last saturday marie sold number0 magazines and...,+ number0 number1
4,there are number0 birds on the fence . number1...,+ number0 number1
...,...,...
6526,an industrial machine made number0 shirts yest...,/ + number0 number1 number2
6527,an industrial machine made number0 shirts yest...,/ number1 number2
6528,an industrial machine can make number0 shirts ...,/ + number1 number2 number0
6529,an industrial machine can make number0 shirts ...,* number0 + number1 number2


In [212]:
def predictionPreprocessingCommon(text) :
    text = text.replace(".", "")
    text = text.replace(",", "")
    doc = nlp(text.lower())
    tagged_words = np.array(list(map(lambda x: (x.text, x.pos_), doc)))
    advs = []
    adjs = []
    nouns = []
    nums = []
    auxs = []
    count=0
    for i in tagged_words :
        if i[1]== "ADV":
            advs.append(i[0])
        if i[1]== "ADJ":
            adjs.append(i[0])
        if i[1]== "AUX":
            auxs.append(i[0])
        if i[1]== "NOUN":
            nouns.append(i[0])
        if i[1]== "NUM" and i[0].isnumeric() :
              nums.append("number{}".format(count))
              count+=1

    encoded_adjs = [encoder.encode_word(i) for i in adjs]
    if len(encoded_adjs) < 5:
            encoded_adjs = np.pad(encoded_adjs, (0, 5 - len(encoded_adjs)))

    encoded_advs = [encoder.encode_word(i) for i in advs]
    if len(encoded_advs) < 5:
            encoded_advs = np.pad(encoded_advs, (0, 5 - len(encoded_advs)))

    encoded_auxs = [encoder.encode_word(i) for i in auxs]
    if len(encoded_auxs) < 5:
            encoded_auxs = np.pad(encoded_auxs, (0, 5 - len(encoded_auxs)))

    encoded_nums = [encoder.encode_word(i) for i in nums]
    if len(encoded_nums) < 5:
            encoded_nums = np.pad(encoded_nums, (0, 5 - len(encoded_nums)))

    encoded_nouns = [encoder.encode_word(i) for i in nouns]
    if len(encoded_nouns) < 10:
            encoded_nouns = np.pad(encoded_nouns, (0, 10 - len(encoded_nouns)))
    return np.hstack((encoded_advs[:5] , encoded_adjs[:5] , encoded_auxs[:5] , encoded_nums[:5] , encoded_nouns[:5]))

In [213]:
l2CommonX = [l2PreProcessingXCommon(i) for i in dfCommon.Question.values]
l2CommonY = [l2PreProcessingY(i) for i in dfCommon.Equation.values]
l2CommonModel = RandomForestClassifier()
l2CommonModel.fit(l2CommonX,l2CommonY)

[] ['there'] ['many'] ['are'] ['number0'] ['seconds', 'hours']
[] ['there'] ['many'] ['are'] ['number0'] ['hours', 'seconds']
[] ['there'] ['many'] ['are'] ['number0'] ['seconds', 'minutes']
[] ['there'] ['many'] ['are'] ['number0'] ['minutes', 'seconds']
[] ['there'] ['many'] ['are'] ['number0'] ['minutes', 'hours']
[] ['there'] ['many'] ['are'] ['number0'] ['hours', 'minutes']


In [214]:
def predictionPreprocessing(text) :
    text = text.replace(".", "")
    text = text.replace(",", "")
    doc = nlp(text.lower())
    tagged_words = np.array(list(map(lambda x: (x.text, x.pos_), doc)))
    verbs = []
    advs = []
    adjs = []
    nouns = []
    nums = []
    auxs = []
    count=0
    for i in tagged_words :
        if i[1]== "VERB":
            verbs.append(i[0])
        if i[1]== "ADV":
            advs.append(i[0])
        if i[1]== "ADJ":
            adjs.append(i[0])
        if i[1]== "AUX":
            auxs.append(i[0])
        if i[1]== "NOUN":
            nouns.append(i[0])
        if i[1]== "NUM" and i[0].isnumeric() :
              nums.append("number{}".format(count))
              count+=1



    verbs = list(dict.fromkeys([wordnet_lemmatizer.lemmatize(i,pos='v') for i in verbs]))
    # print(verbs,advs,adjs,auxs,nouns)
    encoded_verbs = []
    for i in verbs :
          encoded_verbs.extend(model[i.capitalize()][:10])
    if len(encoded_verbs) < 50:
            encoded_verbs = np.pad(encoded_verbs, (0, 50 - len(encoded_verbs)))

    encoded_adjs = []
    for i in adjs :
          encoded_adjs.extend(model[i.capitalize()][:10])
    if len(encoded_adjs) < 30:
            encoded_adjs = np.pad(encoded_adjs, (0, 30 - len(encoded_adjs)))

    encoded_advs = []
    for i in advs :
          encoded_advs.extend(model[i.capitalize()][:10])
    if len(encoded_advs) < 30:
            encoded_advs = np.pad(encoded_advs, (0, 30 - len(encoded_advs)))

    encoded_auxs = []
    for i in auxs :
          encoded_auxs.extend(model[i.capitalize()][:10])
    if len(encoded_auxs) < 30:
            encoded_auxs = np.pad(encoded_auxs, (0, 30 - len(encoded_auxs)))

    encoded_nums = [encoder.encode_word(i) for i in nums]
    if len(encoded_nums) < 5:
            encoded_nums = np.pad(encoded_nums, (0, 5 - len(encoded_nums)))

    encoded_nouns = []
    for i in advs :
          encoded_nouns.extend(model[i.capitalize()][:10])
    if len(encoded_nouns) < 50:
            encoded_nouns = np.pad(encoded_nouns, (0, 50 - len(encoded_nouns)))
            
    return np.hstack((encoded_nums[:5], encoded_verbs[:50] , encoded_advs[:30] , encoded_adjs[:30] , encoded_auxs[:30]  , encoded_nouns[:50]))

In [215]:
def predictFormula(text,model_ugh,predalgo) :
    formula = model_ugh.predict([predalgo(text)])
    #print (formula)
    formula = list(formula[0])
#print ("hjgjh",formula)
    L = []
    for i in formula :
        if i != 0  :
            L.append(encoder.decode_word(i))
        else :
            break
    print("L: ",L)
    nums = dict()
    doc = nlp(text.lower())
    tagged_words = np.array(list(map(lambda x: (x.text, x.pos_), doc)))
    count = 0
    for i in tagged_words :
        if i[1]== "NUM" and i[0].isnumeric() :
              nums["number{}".format(count)] = i[0]
              
              count+=1
    #print(nums)
    final = [nums[i] if re.search('^number',i) else i for i in L]
    return(final)

In [216]:
import math
def evalPrediction(text,model,predAlgo) :
    final = predictFormula(text,model,predAlgo)
    #print('final in eval:',final)
    operators = {'+': lambda x, y: x + y,
                 '-': lambda x, y: x - y,
                 '*': lambda x, y: x * y,
                 '/': lambda x, y: x / y,
                 '√': lambda x:math.sqrt(x),
                 '^': lambda x,y: x**y}

    stack = []
    for token in reversed(final):
        if token.isnumeric():
            stack.append(int(token))
        elif token in operators:
            op1 = stack.pop()
            op2 = stack.pop()
            result = operators[token](op1, op2)
            stack.append(result)
    answer = stack[-1]
    if answer < 0 :
        answer = -answer
    if answer < 1 and answer > 0 :
        answer = 1/answer   
    return answer

    

In [217]:
import time
def generateText(text) :
    t = predictType(text)
    delay = 0.02
    text1 = "Predicted Type of the question is {}.".format(t)
    predalgo = predictionPreprocessing
    if t=="General" :
        model = l2GeneralModel
    """if t=="Addition" :
        model = l2AdditionModel
    if t=="Subtraction" :
        model = l2SubtractionModel
    if t=="Multiplication" :
        model = l2MultiplicationModel"""
    if t=="Common" :
        model = l2CommonModel
        predalgo = predictionPreprocessingCommon
    if t=='Physics':
        model = l2PhysicsModel
        #predalgo = predictionPreprocessingCommon

    text2 = "The Equation generated for the above question is \"{}\".".format(" ".join(predictFormula(text,model,predalgo)))
    text3 = "The result predicted for the above question is {}.".format(evalPrediction(text,model,predalgo))
    for char in text1 :
        print(char,end="",flush=True)
        time.sleep(delay)
    print("\n")
    print("Using {} Model ...\n".format(t))
    for char in text2 :
        print(char,end="",flush=True)
        time.sleep(delay)
    print("\n")
    for char in text3 :
        print(char,end="",flush=True)
        time.sleep(delay)
    print("\n")


In [218]:
txt = "Rachel baked 24 cookies. She wants to share them equally among 6 friends. How many cookies will each friend receive?"
generateText(txt)


L:  ['/', 'number0', 'number1']
L:  ['/', 'number0', 'number1']
Predicted Type of the question is General.

Using General Model ...

The Equation generated for the above question is "/ 24 6".

The result predicted for the above question is 4.0.



In [219]:
txt = "How many seconds are there in 5 hours"
generateText(txt)


L:  ['*', '*', 'number0', '60', '60']
L:  ['*', '*', 'number0', '60', '60']
P

redicted Type of the question is Common.

Using Common Model ...

The Equation generated for the above question is "* * 5 60 60".

The result predicted for the above question is 18000.



In [220]:

txt = "A painter needed to paint 12 rooms in a building. Each room takes 7 hours to paint. If he already painted 5 rooms, how much longer will he take to paint the rest? "
generateText(txt)


L:  ['*', 'number1', '-', 'number0', 'number2']
L:  ['*', 'number1', '-', 'number0', 'number2']
Predicted Type of the question is General.

Using General Model ...

The Equation generated for the above question is "* 7 - 12 5".

The result predicted for the above question is 49.



In [221]:
text = 'Lisa wants to save $50. She saves $10 each week. How many weeks will it take for her to reach her goal?'
t = predictType(text)
generateText(text)
text1 = "Predicted Type of the question is {}.".format(t)
print(text1)

L:  ['/', 'number0', 'number1']
L:  ['/', 'number0', 'number1']
Predicted Type of the question is General.

Using General Model ...

The Equation generated for the above question is "/ 50 10".

The result predicted for the above question is 5.0.

Predicted Type of the question is General.


In [222]:
"""# Step 1: Decode y_train equations
decoded_y_train = [["".join(encoder.decode_word(i) for i in equation)] for equation in y_train]

# Step 2: Solve y_train equations to get true answers
true_answers = [evalPrediction(equation[0], model=None, predAlgo=None) for equation in decoded_y_train]

# Step 3: Predict y_pred equations using the trained model
y_pred = l2GeneralModel.predict(X_test)

# Step 4: Decode y_pred equations and solve them to get predicted answers
decoded_y_pred = [["".join(encoder.decode_word(i) for i in equation)] for equation in y_pred]
predicted_answers = [evalPrediction(equation[0], model=None, predAlgo=None) for equation in decoded_y_pred]

# Step 5: Compare true_answers and predicted_answers to calculate accuracy
def calculate_accuracy(true_answers, predicted_answers):
    correct_count = sum(1 for true, pred in zip(true_answers, predicted_answers) if abs(true - pred) < 1e-6)
    total_count = len(true_answers)
    return correct_count / total_count

accuracy = calculate_accuracy(true_answers, predicted_answers)
print("Accuracy:", accuracy)
"""

'# Step 1: Decode y_train equations\ndecoded_y_train = [["".join(encoder.decode_word(i) for i in equation)] for equation in y_train]\n\n# Step 2: Solve y_train equations to get true answers\ntrue_answers = [evalPrediction(equation[0], model=None, predAlgo=None) for equation in decoded_y_train]\n\n# Step 3: Predict y_pred equations using the trained model\ny_pred = l2GeneralModel.predict(X_test)\n\n# Step 4: Decode y_pred equations and solve them to get predicted answers\ndecoded_y_pred = [["".join(encoder.decode_word(i) for i in equation)] for equation in y_pred]\npredicted_answers = [evalPrediction(equation[0], model=None, predAlgo=None) for equation in decoded_y_pred]\n\n# Step 5: Compare true_answers and predicted_answers to calculate accuracy\ndef calculate_accuracy(true_answers, predicted_answers):\n    correct_count = sum(1 for true, pred in zip(true_answers, predicted_answers) if abs(true - pred) < 1e-6)\n    total_count = len(true_answers)\n    return correct_count / total_coun

In [223]:
"""from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Assuming you have l2generalX and l2generalY as your features and labels, respectively

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(l2generalX, l2generalY, test_size=0.2, random_state=42)

# Create and train the model
l2GeneralModel = RandomForestClassifier()
l2GeneralModel.fit(X_train, y_train)"""


'from sklearn.model_selection import train_test_split\nfrom sklearn.ensemble import RandomForestClassifier\n\n# Assuming you have l2generalX and l2generalY as your features and labels, respectively\n\n# Split the data into training and testing sets\nX_train, X_test, y_train, y_test = train_test_split(l2generalX, l2generalY, test_size=0.2, random_state=42)\n\n# Create and train the model\nl2GeneralModel = RandomForestClassifier()\nl2GeneralModel.fit(X_train, y_train)'

In [224]:
df_test=pd.read_csv("final_test_1.csv")
df_test

Unnamed: 0,Question,Equation,Type
0,number0 red apples and number1 green apples ar...,+ number0 number1,General
1,ellen has number0 more balls than marin . mari...,+ number0 number1,General
2,janet has number0 oranges and sharon has numbe...,+ number0 number1,General
3,allan brought number0 balloons and jake brough...,+ number0 number1,General
4,adam has number0 more apples than jackie . jac...,+ number0 number1,General
...,...,...,...
7113,there are number0 skittles in steven 's skittl...,/ + number0 number1 number2,General
7114,there are number0 skittles in steven 's skittl...,/ number1 number2,General
7115,there are number0 skittles in steven 's skittl...,/ number0 number3,General
7116,there are number0 students in a school . if th...,/ number0 number1,General


In [225]:
dfGeneral_t = df_test[df_test.Type=="General"][["Question","Equation"]]
dfCommon_t = df_test[df_test.Type=="Common"][["Question","Equation"]]
dfPhysics_t = df_test[df_test.Type=="Physics"][["Question","Equation"]]
dfTime_t = df_test[df_test.Type=="Time"][["Question","Equation"]]
dfLcm_Hcf_t = df_test[df_test.Type=="LCM_HCF"][["Question","Equation"]]

In [226]:
l2generalX_t = []
l2generalY_t = []
for i in range(len(dfGeneral_t.Question.values)) :
    try :
        l2generalX_t.append(l2PreProcessingX(dfGeneral_t.Question.values[i]))
        l2generalY_t.append(l2PreProcessingY(dfGeneral_t.Equation.values[i]))
    except :
        pass


In [227]:
y_test = l2generalY_t

In [228]:
y_test = [list(equation) for equation in y_test]
y_test

[[1544, 140, 55, 0, 0, 0, 0, 0, 0, 0],
 [1544, 140, 55, 0, 0, 0, 0, 0, 0, 0],
 [1544, 140, 55, 0, 0, 0, 0, 0, 0, 0],
 [1544, 140, 55, 0, 0, 0, 0, 0, 0, 0],
 [1544, 140, 55, 0, 0, 0, 0, 0, 0, 0],
 [1544, 140, 55, 0, 0, 0, 0, 0, 0, 0],
 [1544, 55, 140, 0, 0, 0, 0, 0, 0, 0],
 [1545, 55, 140, 0, 0, 0, 0, 0, 0, 0],
 [1544, 55, 140, 0, 0, 0, 0, 0, 0, 0],
 [1544, 140, 55, 0, 0, 0, 0, 0, 0, 0],
 [1545, 140, 55, 0, 0, 0, 0, 0, 0, 0],
 [1544, 140, 55, 0, 0, 0, 0, 0, 0, 0],
 [1544, 140, 55, 0, 0, 0, 0, 0, 0, 0],
 [1545, 140, 55, 0, 0, 0, 0, 0, 0, 0],
 [1544, 140, 55, 0, 0, 0, 0, 0, 0, 0],
 [1545, 55, 140, 0, 0, 0, 0, 0, 0, 0],
 [1545, 55, 140, 0, 0, 0, 0, 0, 0, 0],
 [1545, 55, 140, 0, 0, 0, 0, 0, 0, 0],
 [1545, 55, 140, 0, 0, 0, 0, 0, 0, 0],
 [1544, 140, 55, 0, 0, 0, 0, 0, 0, 0],
 [1545, 140, 55, 0, 0, 0, 0, 0, 0, 0],
 [1545, 55, 140, 0, 0, 0, 0, 0, 0, 0],
 [1545, 140, 55, 0, 0, 0, 0, 0, 0, 0],
 [1545, 140, 55, 0, 0, 0, 0, 0, 0, 0],
 [1544, 140, 55, 0, 0, 0, 0, 0, 0, 0],
 [1545, 140, 55, 0, 0, 0,

In [229]:
y_pred = l2GeneralModel.predict(l2generalX_t)
y_pred = [list(equation) for equation in y_pred]
y_pred

[[1544, 140, 55, 0, 0, 0, 0, 0, 0, 0],
 [1544, 140, 55, 0, 0, 0, 0, 0, 0, 0],
 [1544, 140, 55, 0, 0, 0, 0, 0, 0, 0],
 [1544, 140, 55, 0, 0, 0, 0, 0, 0, 0],
 [1544, 140, 55, 0, 0, 0, 0, 0, 0, 0],
 [1544, 140, 55, 0, 0, 0, 0, 0, 0, 0],
 [1544, 55, 140, 0, 0, 0, 0, 0, 0, 0],
 [1545, 55, 140, 0, 0, 0, 0, 0, 0, 0],
 [1544, 140, 55, 0, 0, 0, 0, 0, 0, 0],
 [1544, 140, 55, 0, 0, 0, 0, 0, 0, 0],
 [1545, 140, 55, 0, 0, 0, 0, 0, 0, 0],
 [1544, 140, 55, 0, 0, 0, 0, 0, 0, 0],
 [1544, 140, 55, 0, 0, 0, 0, 0, 0, 0],
 [1545, 140, 55, 0, 0, 0, 0, 0, 0, 0],
 [1544, 140, 55, 0, 0, 0, 0, 0, 0, 0],
 [1545, 55, 140, 0, 0, 0, 0, 0, 0, 0],
 [1545, 55, 140, 0, 0, 0, 0, 0, 0, 0],
 [1545, 55, 140, 0, 0, 0, 0, 0, 0, 0],
 [1545, 55, 140, 0, 0, 0, 0, 0, 0, 0],
 [1544, 140, 55, 0, 0, 0, 0, 0, 0, 0],
 [1545, 140, 55, 0, 0, 0, 0, 0, 0, 0],
 [1545, 55, 140, 0, 0, 0, 0, 0, 0, 0],
 [1545, 140, 55, 0, 0, 0, 0, 0, 0, 0],
 [1545, 140, 55, 0, 0, 0, 0, 0, 0, 0],
 [1544, 140, 55, 0, 0, 0, 0, 0, 0, 0],
 [1545, 140, 55, 0, 0, 0,

In [230]:
import numpy as np
from sklearn.metrics import accuracy_score

# Convert y_test and y_pred into NumPy arrays
y_test_array = np.array(y_test)
y_pred_array = np.array(y_pred)

# Flatten the arrays, as accuracy_score expects 1D arrays
y_test_flat = y_test_array.flatten()
y_pred_flat = y_pred_array.flatten()

# Calculate the accuracy
accuracy = accuracy_score(y_test_flat, y_pred_flat)

print("Accuracy:", accuracy*100)


Accuracy: 92.66464471403812
