In [32]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [16]:
# Integer class ids attached to each token by the tagging loop:
# point-of-interest, street, and everything else.
POI_TAG, STREET_TAG, OTHER_TAG = 0, 1, 2

In [86]:
# Read the raw training data, cast every column to object dtype, and add an
# empty 'parsed' column that the tagging loop fills in row by row.
# NOTE(review): the object cast is presumably what lets a cell hold a Python
# list later on -- confirm.
train_df = (
    pd.read_csv("./scl-2021-ds/train.csv")
    .astype(object)
    .assign(parsed=None)
)

In [88]:
def to_tokens(string):
    """Split ``string`` on whitespace, treating every comma as its own token.

    Commas are padded with spaces first, so ``"a, b"`` becomes
    ``["a", ",", "b"]``. An empty or whitespace-only string yields ``[]``.
    """
    return string.replace(",", " , ").split()

In [112]:
def EditDistDP(str1, str2, insertion_cost=1, deletion_cost=100, substitution_cost=100):
    """Weighted edit distance for turning ``str1`` into ``str2``.

    Adapted from https://www.geeksforgeeks.org/edit-distance-dp-5/ (two-row
    dynamic programme), but with asymmetric operation costs: by default,
    inserting a character into ``str1`` is cheap (1) while deleting or
    substituting one of its characters is expensive (100).

    Parameters
    ----------
    str1 : str
        Source string (the one being edited).
    str2 : str
        Target string.
    insertion_cost : int or float, optional
        Cost of inserting one character of ``str2`` into ``str1``.
    deletion_cost : int or float, optional
        Cost of deleting one character of ``str1``.
    substitution_cost : int or float, optional
        Cost of replacing one character of ``str1`` by one of ``str2``.

    Returns
    -------
    int or float
        Total cost of the cheapest edit script found by the recurrence.
    """
    # Row for an empty target prefix: every source character must be deleted.
    # This is charged at ``deletion_cost`` so that removing a leading
    # character costs the same as removing one anywhere else in ``str1``.
    prev_row = [j * deletion_cost for j in range(len(str1) + 1)]

    for i, target_char in enumerate(str2, start=1):
        # Column for an empty source prefix: i characters must be inserted.
        curr_row = [i * insertion_cost]
        for j, source_char in enumerate(str1, start=1):
            if source_char == target_char:
                # Matching characters are carried over at no cost.
                curr_row.append(prev_row[j - 1])
            else:
                curr_row.append(min(
                    insertion_cost + prev_row[j],         # insert target_char
                    deletion_cost + curr_row[j - 1],      # delete source_char
                    substitution_cost + prev_row[j - 1],  # replace source_char
                ))
        # Only the previous row is ever needed, so keep two rows in total.
        prev_row = curr_row

    return prev_row[len(str1)]

In [116]:
def find_exact(base_tokens, pattern_tokens):
    """Return the first index at which ``pattern_tokens`` occurs in ``base_tokens``.

    The match must be contiguous and token-for-token identical.

    Parameters
    ----------
    base_tokens : list
        Token sequence to search in.
    pattern_tokens : list
        Token sequence to search for.

    Returns
    -------
    int
        Start index of the first match, or ``-1`` when there is none
        (including when the pattern is longer than the base). An empty
        pattern matches at index 0.
    """
    width = len(pattern_tokens)
    # Only start positions whose window fits inside base_tokens are tried;
    # the range is empty when the pattern is longer than the base.
    for start in range(len(base_tokens) - width + 1):
        if base_tokens[start:start + width] == pattern_tokens:
            return start
    return -1

In [132]:
def find_almost_exact(base_tokens, pattern_tokens):
    """Return the start of the window in ``base_tokens`` closest to ``pattern_tokens``.

    Every contiguous window with as many tokens as the pattern is joined with
    spaces and scored against the space-joined pattern using ``EditDistDP``.
    The earliest window with the lowest score wins. ``-1`` is returned when
    no window was scored (for example, the pattern has more tokens than the
    base).
    """
    width = len(pattern_tokens)
    closest_start, closest_dist = -1, 1e9

    for offset in range(len(base_tokens)):
        stop = offset + width
        if stop > len(base_tokens):
            # This window and all later ones would run past the end.
            break

        score = EditDistDP(
            ' '.join(base_tokens[offset:stop]),
            ' '.join(pattern_tokens),
        )
        # Strict comparison keeps the earliest window on ties.
        if score < closest_dist:
            closest_start, closest_dist = offset, score

    return closest_start

In [141]:
def _tag_span(outputs, raw_tokens, label_tokens, tag):
    """Tag the span of ``raw_tokens`` that corresponds to ``label_tokens``.

    An exact match is tried first, then the closest window from
    ``find_almost_exact``. On success the matching entries of ``outputs`` are
    replaced in place by ``(raw_token, tag, label_token)`` and True is
    returned; if neither search yields a position, ``outputs`` is left
    untouched and False is returned.
    """
    start = find_exact(raw_tokens, label_tokens)
    if start == -1:
        start = find_almost_exact(raw_tokens, label_tokens)
    if start == -1:
        return False

    for offset, label_token in enumerate(label_tokens):
        position = start + offset
        outputs[position] = (outputs[position][0], tag, label_token)
    return True


def _parse_address(raw_address, poi_street):
    """Build the per-token tagging for one row.

    ``poi_street`` is the ``"POI/street"`` label string. Returns a list of
    ``(raw_token, tag, label_token)`` tuples, one per token of
    ``raw_address``, or None when a non-empty POI or street could not be
    located in the raw address.
    """
    POI, street = poi_street.split('/')
    raw_tokens = to_tokens(raw_address)

    # Every token starts out as OTHER, paired with itself.
    outputs = [(x, OTHER_TAG, x) for x in raw_tokens]

    # Street is tagged first and POI second, so POI tags win on any overlap.
    labelled_spans = (
        (to_tokens(street), STREET_TAG),
        (to_tokens(POI), POI_TAG),
    )
    for label_tokens, tag in labelled_spans:
        if not label_tokens:
            # Nothing to locate for an empty label.
            continue
        if not _tag_span(outputs, raw_tokens, label_tokens, tag):
            return None
    return outputs


for idx, row in tqdm(train_df.iterrows(), total=train_df.shape[0]):
    train_df.at[idx, 'parsed'] = _parse_address(row['raw_address'], row['POI/street'])

  0%|          | 0/300000 [00:00<?, ?it/s]

In [142]:
# Write the frame, including the filled-in 'parsed' column, to disk without
# the row index.
# NOTE(review): 'parsed' cells hold Python lists of tuples; presumably they are
# written as their string form and must be re-parsed when read back -- confirm.
train_df.to_csv("./scl-2021-ds/parsed_train.csv", index=False, index_label=False)

In [143]:
# Display the frame to spot-check the 'parsed' column against 'POI/street'.
train_df

Unnamed: 0,id,raw_address,POI/street,parsed
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika,"[(jl, 1, jl), (kapuk, 1, kapuk), (timur, 1, ti..."
1,1,"aye, jati sampurna",/,"[(aye, 2, aye), (,, 2, ,), (jati, 2, jati), (s..."
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung,"[(setu, 2, setu), (siung, 1, siung), (119, 2, ..."
3,3,"toko dita, kertosono",toko dita/,"[(toko, 0, toko), (dita, 0, dita), (,, 2, ,), ..."
4,4,jl. orde baru,/jl. orde baru,"[(jl., 1, jl.), (orde, 1, orde), (baru, 1, baru)]"
...,...,...,...,...
299995,299995,jend ahmad yani 331 kertasari ciamis,/jend ahmad yani,"[(jend, 1, jend), (ahmad, 1, ahmad), (yani, 1,..."
299996,299996,"raya cila kko, cilandak timur kel.",/raya cila kko,"[(raya, 1, raya), (cila, 1, cila), (kko, 1, kk..."
299997,299997,tanjung gusta jl. yaya 2 no 17,/,"[(tanjung, 2, tanjung), (gusta, 2, gusta), (jl..."
299998,299998,jalan cipadu jaya taman asri gang bijaksana 3 ...,taman asri/,"[(jalan, 2, jalan), (cipadu, 2, cipadu), (jaya..."
