In [147]:
import pandas as pd
import ast
import json
import copy
from tqdm.notebook import tqdm

In [129]:
# Integer tag codes carried by each (token, tag, corrected_token) triple in
# the parsed training data:
#   POI_TAG    -> the token is counted into the POI correction dictionary
#   STREET_TAG -> the token is counted into the street correction dictionary
#   OTHER_TAG  -> any remaining token (not referenced by the cells below)
POI_TAG, STREET_TAG, OTHER_TAG = 0, 1, 2

In [123]:
# Load the parsed training set, discard rows with any missing value, and turn
# the stringified `parsed` column back into Python objects (the loop further
# down unpacks each entry as a (token, tag, corrected_token) triple).
df = (
    pd.read_csv("./scl-2021-ds/parsed_train.csv")
    .dropna()
    .assign(parsed=lambda frame: frame["parsed"].apply(ast.literal_eval))
)

## Creating Dictionaries

In [124]:
# Correction-frequency dictionaries, one per tag type.
# Shape: {token: {corrected_token: occurrence_count}}
# e.g. {neg : {negeri : 10, negara : 1}}
poi_dict = dict()
street_dict = dict()

# Iterating the Series directly lets tqdm pick up its length, so the progress
# bar shows a real total instead of "0it [..., ?it/s]".
for curr_parsed in tqdm(df['parsed']):
    for (token, tag, corrected_token) in curr_parsed:
        # Pick the dictionary that matches the tag; other tags are ignored.
        if tag == POI_TAG:
            target_dict = poi_dict
        elif tag == STREET_TAG:
            target_dict = street_dict
        else:
            continue

        # Every observation adds exactly 1. The previous version initialised a
        # new corrected_token to 1 and then incremented it again, so any
        # corrected form first seen for an already-known token started at 2
        # and skewed the percentages computed later.
        corrections = target_dict.setdefault(token, {})
        corrections[corrected_token] = corrections.get(corrected_token, 0) + 1

0it [00:00, ?it/s]

## Post Processing

In [125]:
# Rescale every POI token's counts in place so that the values for one token
# are percentages of that token's total occurrences.
for corrections in poi_dict.values():
    scale = 100 / sum(corrections.values())
    for correct_token in corrections:
        corrections[correct_token] *= scale

In [126]:
# Rescale every street token's counts in place so that the values for one
# token are percentages of that token's total occurrences.
for corrections in street_dict.values():
    scale = 100 / sum(corrections.values())
    for correct_token in corrections:
        corrections[correct_token] *= scale

## Saving Dict to Json

In [127]:
# Persist the POI correction dictionary as pretty-printed JSON.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open("postmodel_poi_correction_dict.json", "w") as poi_dict_file:
    json.dump(poi_dict, poi_dict_file, indent = 4)

In [128]:
# Persist the street correction dictionary as pretty-printed JSON.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open("postmodel_street_correction_dict.json", "w") as street_dict_file:
    json.dump(street_dict, street_dict_file, indent = 4)

## Replacing Words in Prediction File

In [133]:
# Load the model predictions that will be post-processed below.
# NOTE(review): the preview of this frame shows an "Unnamed: 0" column next to
# "id" — presumably an index that was saved into the CSV. If it should not be
# carried into the corrected output, consider reading with index_col=0; confirm
# against the expected output columns.
output_df = pd.read_csv("output_bilstm.csv")

In [134]:
# Preview the raw predictions (pandas abbreviates the display to the first and
# last rows).
output_df

Unnamed: 0,id,POI/street
0,0,/s. par
1,1,/angg per
2,2,asma laun/mand imog
3,3,ud agung rej/raya nga
4,4,/cut mutia
...,...,...
49995,49995,toko mbak farid/
49996,49996,vie - tk. ridho kids/vete 3 cari
49997,49997,mart dan roti bakar malabar/nasio
49998,49998,graha indah/jl. mujair raya bambu apus


In [155]:
# For every token keep only the corrected form with the highest occurrence %,
# giving flat {token: best_corrected_token} lookup tables. On ties, max()
# returns the candidate that was inserted first.
poi_dict_max = {
    token: max(candidates, key=candidates.get)
    for token, candidates in poi_dict.items()
}
street_dict_max = {
    token: max(candidates, key=candidates.get)
    for token, candidates in street_dict.items()
}

In [176]:
def _correct_poi_street(value):
    """Return the "POI/street" string with each half replaced by its correction.

    The string is split on the FIRST "/" only: the left part is looked up in
    `poi_dict_max`, the right part in `street_dict_max`. A part with no entry
    is kept unchanged.

    Values that are not strings (e.g. NaN) or that contain no "/" are returned
    untouched instead of raising. Text after a second "/" is kept as part of
    the street rather than being silently dropped.

    NOTE(review): the lookup uses the whole POI / street phrase as the key,
    while the dictionaries are keyed by `token` from the parsed training data.
    If those tokens are single words, multi-word phrases will never match and a
    per-word replacement may be what is intended — confirm.
    """
    if not isinstance(value, str):
        return value
    poi, separator, street = value.partition("/")
    if not separator:
        return value
    poi = poi_dict_max.get(poi, poi)
    street = street_dict_max.get(street, street)
    return poi + "/" + street


# Single pass over the column instead of iterrows() with per-row .loc writes.
output_df["POI/street"] = output_df["POI/street"].map(_correct_poi_street)

In [177]:
# Write the corrected predictions to disk; index=False leaves the DataFrame
# index out of the file, so only the frame's columns are written.
output_df.to_csv("post_output_bilstm_correction.csv", index=False, index_label=False)