# Transforming dataset to replace Named Entities

Numerical and named entities are replaced by their corresponding named-entity (NER) tag. Every other token is replaced by its lemma (each token is POS-tagged and lemmatized).

E.g:

*Joan found 70 seashells on the beach . she gave Sam some of her seashells . She has 27 seashell . How many seashells did she give to Sam ?*

will be converted to:

*PERSON find NUMBER seashell on the beach . she give PERSON some of she seashell . she have NUMBER seashell . how many seashell do she give to PERSON ?*

In [22]:
from pycorenlp import StanfordCoreNLP
import json
from nltk.corpus import wordnet as wn

# Client for the CoreNLP server; a server must already be listening at this URL.
nlp = StanfordCoreNLP('http://localhost:9000')

# Names of the dataset files (set INPUT_PATH to the one to transform):
#
# addition problems dataset:
#   add_1.json
#   add_2.json
#   add_3.json
#   add_4.json
#
# subtraction problems:
#   sub_1.json
#   sub_2.json
#   sub_3.json
#   sub_4.json

INPUT_PATH = "data.json"
OUTPUT_PATH = 'clean_lemma_ner_data.json'

# NER tags whose tokens are replaced by the tag itself.
named = ['PERSON', 'LOCATION', 'ORGANIZATION', 'MISC']
numerical = ['MONEY', 'NUMBER', 'ORDINAL', 'PERCENT']

# One set for both groups: a single constant-time lookup per token.
replaceable_tags = frozenset(named + numerical)


def _normalize_sentence(tokens):
    """Return one sentence as a space-joined string of normalized tokens.

    A token whose 'ner' value is listed in ``named`` or ``numerical`` is
    replaced by that tag; every other token is replaced by its 'lemma'.
    """
    return ' '.join(
        token['ner'] if token['ner'] in replaceable_tags else token['lemma']
        for token in tokens
    )


with open(INPUT_PATH, 'r', encoding='utf-8') as f:
    dataset = json.load(f)

clean_problems = []

for index, problem in enumerate(dataset):
    prob = problem['sQuestion']

    res = nlp.annotate(
        prob,
        properties={
            'annotators': "tokenize,ssplit,pos,lemma,ner",
            'outputFormat': 'json',
        },
    )

    # The loop below needs the parsed JSON document. Anything else (e.g. a
    # raw error/timeout message from the server) would otherwise fail with
    # an opaque TypeError when indexed, so report it explicitly instead.
    if not isinstance(res, dict):
        raise RuntimeError(
            'CoreNLP did not return JSON for problem %d: %r'
            % (index, str(res)[:200])
        )

    # Sentences are joined with single spaces and no leading space, so the
    # result matches the tokenized form shown in the notebook's example.
    clean_problems.append({
        'iIndex': index,
        'sQuestion': ' '.join(
            _normalize_sentence(sentence['tokens'])
            for sentence in res['sentences']
        ),
        'origQuestion': prob,
    })

# writing result to file
with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
    json.dump(clean_problems, f)