In [1]:
import numpy as np
import pandas as pd 
import re
from nltk.tokenize import word_tokenize
from time import time
import pickle

In [2]:
df = pd.read_csv('train.csv')
poi_street_df = df["POI/street"].str.split("/", n = 1, expand = True) 
df["POI"]= poi_street_df[0] 
df["street"]= poi_street_df[1] 
df1 = df.drop(["POI/street"], axis=1) 
df1

Unnamed: 0,id,raw_address,POI,street
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,,jl kapuk timur delta sili iii lippo cika
1,1,"aye, jati sampurna",,
2,2,setu siung 119 rt 5 1 13880 cipayung,,siung
3,3,"toko dita, kertosono",toko dita,
4,4,jl. orde baru,,jl. orde baru
...,...,...,...,...
299995,299995,jend ahmad yani 331 kertasari ciamis,,jend ahmad yani
299996,299996,"raya cila kko, cilandak timur kel.",,raya cila kko
299997,299997,tanjung gusta jl. yaya 2 no 17,,
299998,299998,jalan cipadu jaya taman asri gang bijaksana 3 ...,taman asri,


In [3]:
def find_first_index(ra_split, st_split):
    
    if len(st_split) <= len(ra_split): # new
    
        num_iter = len(ra_split) - len(st_split) + 1
        overlap_list = []

        for i in range(num_iter):
            window = ra_split[i: i+len(st_split)]
            overlap = list(set(window) & set(st_split))
            overlap_list.append(len(overlap))

        max_overlap = [e for e in range(len(overlap_list)) if overlap_list[e] == max(overlap_list)]
        if len(max_overlap) == 1:
            return max_overlap[0]

        else:
            count_list = []
            for idx in max_overlap:
                subset_ra = ra_split[idx: idx+len(st_split)]
                count = 0
                for e in range(len(subset_ra)):
                    if subset_ra[e] not in st_split[e]:
                        count += 0
                    else:
                        count += 1
                count_list.append(count)
            index = count_list.index(max(count_list))
            return max_overlap[index]
    else:
        return 0

In [4]:
def fix_street_errors(row):
    
    raw_add = row['raw_address']

    # If a street name is extracted...
    if row['street'] != "":

        raw_add_split = word_tokenize(raw_add)
        
        extr_street = row['street']
        extr_street_split = word_tokenize(extr_street)
        
        # If the extracted street is in the raw address as an entire string, good!
        if extr_street in raw_add:
            return raw_add
        
        # This is where there are discrepancies!
        else:
            index_in_ra = find_first_index(raw_add_split, extr_street_split)
            raw_add_split[index_in_ra: index_in_ra+len(extr_street_split)] = extr_street_split
            updated_raw_add = ' '.join(raw_add_split).replace(' ,', ',').replace(' .', '.').replace(' )', ')').replace(' (', '(').replace(' ?', '?')          
            return updated_raw_add
      
    # If a street name is originally an empty string, we just assume there's no error. 
    else:
        return raw_add

In [5]:
start = time()

df1['cleaned_raw_add'] = df1.apply(fix_street_errors, axis=1)

print("Executed in {} minutes.".format(round((time() - start)/60, 3)))

# Sanity checks
df1.loc[[69, 86, 117, 130, 135, 169], :]

Executed in 0.942 minutes.


Unnamed: 0,id,raw_address,POI,street,cleaned_raw_add
69,69,cak 11 nagasari karawang barat,,cakrad,cakrad 11 nagasari karawang barat
86,86,simpang tiga kah nasu no 112 28284 bukit raya,,kaharu nasu,simpang tiga kaharu nasu no 112 28284 bukit raya
117,117,ahmad dah iv kukusan beji,,ahmad dahlan iv,ahmad dahlan iv kukusan beji
130,130,mangla rege no h1 3 cimekar kel. cileunyi,,manglayang rege,manglayang rege no h1 3 cimekar kel. cileunyi
135,135,"cipinang besar selatan lintas ibadah, cipi jay...",lintas ibadah,cipinang jaya 1a,"cipinang besar selatan lintas ibadah, cipinang..."
169,169,"taman mer, 13 electr laun system, 2 meruya utara",electrolux laundry system,taman meruya,"taman meruya, 13 electr laun system, 2 meruya ..."


In [6]:
def get_street_mapping_dict(row):
    
    raw_add = row['raw_address']

    # If a street name is extracted...
    if row['street'] != "":

        raw_add_split = word_tokenize(raw_add)
        
        extr_street = row['street']
        extr_street_split = word_tokenize(extr_street)
        
        # If the extracted street is in the raw address as an entire string, good!
        if extr_street in raw_add:
            return None
        
        # This is where there are discrepancies!
        else:
            index_in_ra = find_first_index(raw_add_split, extr_street_split)
            before = raw_add_split[index_in_ra: index_in_ra+len(extr_street_split)] 
            before = ' '.join(before)
            return before, extr_street
      
    # If a street name is originally an empty string, we just assume there's no error. 
    else:
        return None


In [7]:
start = time()

df1['street_mapping'] = df1.apply(get_street_mapping_dict, axis=1)

print("Executed in {} minutes.".format(round((time() - start)/60, 3)))

# Sanity checks
df1.loc[[69, 86, 117, 130, 135, 169], :]

Executed in 0.917 minutes.


Unnamed: 0,id,raw_address,POI,street,cleaned_raw_add,street_mapping
69,69,cak 11 nagasari karawang barat,,cakrad,cakrad 11 nagasari karawang barat,"(cak, cakrad)"
86,86,simpang tiga kah nasu no 112 28284 bukit raya,,kaharu nasu,simpang tiga kaharu nasu no 112 28284 bukit raya,"(kah nasu, kaharu nasu)"
117,117,ahmad dah iv kukusan beji,,ahmad dahlan iv,ahmad dahlan iv kukusan beji,"(ahmad dah iv, ahmad dahlan iv)"
130,130,mangla rege no h1 3 cimekar kel. cileunyi,,manglayang rege,manglayang rege no h1 3 cimekar kel. cileunyi,"(mangla rege, manglayang rege)"
135,135,"cipinang besar selatan lintas ibadah, cipi jay...",lintas ibadah,cipinang jaya 1a,"cipinang besar selatan lintas ibadah, cipinang...","(cipi jaya 1a, cipinang jaya 1a)"
169,169,"taman mer, 13 electr laun system, 2 meruya utara",electrolux laundry system,taman meruya,"taman meruya, 13 electr laun system, 2 meruya ...","(taman mer, taman meruya)"


In [8]:
# Create a separate dataframe containing only values in `street_mapping` columns, i.e., rows where changes occurred
df_with_street_mappings = df1[df1['street_mapping'].notnull()]

street_mapping_dict = dict()

# Create a mapping dictionary, where key is the truncated word/words and the value is the corresponding correct word/words
for row, col in df_with_street_mappings.iterrows():
    street_mapping_dict[col['street_mapping'][0]] = col['street_mapping'][1]
    
# How many street errors are there altogether?
print("Length of street mapping dictionary:", len(street_mapping_dict))


Length of street mapping dictionary: 10591


In [9]:
def fix_poi_errors(row):
    
    raw_add = row['cleaned_raw_add']

    # If a POI name is extracted...
    if row['POI'] != "":

        raw_add_split = word_tokenize(raw_add)
        
        extr_poi = row['POI']
        extr_poi_split = word_tokenize(extr_poi)
        
        # If the extracted POI is in the raw address as an entire string, good!
        if extr_poi in raw_add:
            return raw_add
        
        # This is where there are discrepancies!
        else:
            index_in_ra = find_first_index(raw_add_split, extr_poi_split)
            raw_add_split[index_in_ra: index_in_ra+len(extr_poi_split)] = extr_poi_split
            updated_raw_add = ' '.join(raw_add_split).replace(' ,', ',').replace(' .', '.').replace(' )', ')').replace(' (', '(').replace(' ?', '?')          
            return updated_raw_add
      
    # If a POI name is originally an empty string, we just assume there's no error. 
    else:
        return raw_add

In [10]:
start = time()

df1['cleaned_raw_add_1'] = df1.apply(fix_poi_errors, axis=1)

print("Executed in {} minutes.".format(round((time() - start)/60, 3)))

# Sanity checks
df1.loc[[10, 11, 40, 110, 152, 157, 169], :]

Executed in 0.537 minutes.


Unnamed: 0,id,raw_address,POI,street,cleaned_raw_add,street_mapping,cleaned_raw_add_1
10,10,"cikahuripan sd neg boj 02 klap boj, no 5 16877",sd negeri bojong 02,klap boj,"cikahuripan sd neg boj 02 klap boj, no 5 16877",,"cikahuripan sd negeri bojong 02 klap boj, no 5..."
11,11,"yaya atohar,",yayasan atohariyah,,"yaya atohar,",,"yayasan atohariyah,"
40,40,mar tabl metro iringmulyo metro timur,markaz tabligh metro,,mar tabl metro iringmulyo metro timur,,markaz tabligh metro iringmulyo metro timur
110,110,"cv. hin oto kenc, sido ii, q 29",cv. hingdi oto kencana,sido ii,"cv. hin oto kenc, sido ii, q 29",,"cv. hingdi oto kencana, sido ii, q 29"
152,152,"ujung harapan, no 48a pd. karya warga mand cim...",pd. karya warga mandiri cimb,ujung harapan,"ujung harapan, no 48a pd. karya warga mand cim...",,"ujung harapan, no 48a pd. karya warga mandiri ..."
157,157,"alun - alun kuli nusan, kar",alun - alun kuliner nusantara,kar,"alun - alun kuli nusan, kar",,"alun - alun kuliner nusantara, kar"
169,169,"taman mer, 13 electr laun system, 2 meruya utara",electrolux laundry system,taman meruya,"taman meruya, 13 electr laun system, 2 meruya ...","(taman mer, taman meruya)","taman meruya, 13 electrolux laundry system, 2 ..."


In [11]:
def get_poi_mapping_dict(row):
    
    raw_add = row['cleaned_raw_add']

    # If a POI name is extracted...
    if row['POI'] != "":

        raw_add_split = word_tokenize(raw_add)
        
        extr_poi = row['POI']
        extr_poi_split = word_tokenize(extr_poi)
        
        # If the extracted POI is in the raw address as an entire string, good!
        if extr_poi in raw_add:
            return None
        
        # This is where there are discrepancies!
        else:
            index_in_ra = find_first_index(raw_add_split, extr_poi_split)
            before = raw_add_split[index_in_ra: index_in_ra+len(extr_poi_split)] 
            before = ' '.join(before)
            return before, extr_poi
      
    # If a POI name is originally an empty string, we just assume there's no error. 
    else:
        return None

In [12]:
start = time()

df1['poi_mapping'] = df1.apply(get_poi_mapping_dict, axis=1)

print("Executed in {} minutes.".format(round((time() - start)/60, 3)))

# Sanity checks
df1.loc[[10, 11, 40, 110, 152, 157, 169], :]

Executed in 0.533 minutes.


Unnamed: 0,id,raw_address,POI,street,cleaned_raw_add,street_mapping,cleaned_raw_add_1,poi_mapping
10,10,"cikahuripan sd neg boj 02 klap boj, no 5 16877",sd negeri bojong 02,klap boj,"cikahuripan sd neg boj 02 klap boj, no 5 16877",,"cikahuripan sd negeri bojong 02 klap boj, no 5...","(sd neg boj 02, sd negeri bojong 02)"
11,11,"yaya atohar,",yayasan atohariyah,,"yaya atohar,",,"yayasan atohariyah,","(yaya atohar, yayasan atohariyah)"
40,40,mar tabl metro iringmulyo metro timur,markaz tabligh metro,,mar tabl metro iringmulyo metro timur,,markaz tabligh metro iringmulyo metro timur,"(mar tabl metro, markaz tabligh metro)"
110,110,"cv. hin oto kenc, sido ii, q 29",cv. hingdi oto kencana,sido ii,"cv. hin oto kenc, sido ii, q 29",,"cv. hingdi oto kencana, sido ii, q 29","(cv . hin oto kenc, cv. hingdi oto kencana)"
152,152,"ujung harapan, no 48a pd. karya warga mand cim...",pd. karya warga mandiri cimb,ujung harapan,"ujung harapan, no 48a pd. karya warga mand cim...",,"ujung harapan, no 48a pd. karya warga mandiri ...","(pd . karya warga mand cimb, pd. karya warga m..."
157,157,"alun - alun kuli nusan, kar",alun - alun kuliner nusantara,kar,"alun - alun kuli nusan, kar",,"alun - alun kuliner nusantara, kar","(alun - alun kuli nusan, alun - alun kuliner n..."
169,169,"taman mer, 13 electr laun system, 2 meruya utara",electrolux laundry system,taman meruya,"taman meruya, 13 electr laun system, 2 meruya ...","(taman mer, taman meruya)","taman meruya, 13 electrolux laundry system, 2 ...","(electr laun system, electrolux laundry system)"


In [13]:
# Create a separate dataframe containing only values in `poi_mapping` columns, i.e., rows where changes occurred
df_with_poi_mappings = df1[df1['poi_mapping'].notnull()]

poi_mapping_dict = dict()

# Create a mapping dictionary, where key is the truncated word/words and the value is the corresponding correct word/words
for row, col in df_with_poi_mappings.iterrows():
    poi_mapping_dict[col['poi_mapping'][0]] = col['poi_mapping'][1]
    
# How many POI errors are there altogether?
print("Length of POI mapping dictionary:", len(poi_mapping_dict))

Length of POI mapping dictionary: 43382


In [14]:
# Select required columns
cleaned_df = df1[['id','cleaned_raw_add_1', 'POI', 'street']]

# Rename columns
cleaned_df.columns = ['id', 'raw_address', 'POI', 'street']

# Sanity checks
cleaned_df.loc[[10, 11, 40, 69, 86, 110, 117, 130, 135, 152, 157, 169], :]

Unnamed: 0,id,raw_address,POI,street
10,10,"cikahuripan sd negeri bojong 02 klap boj, no 5...",sd negeri bojong 02,klap boj
11,11,"yayasan atohariyah,",yayasan atohariyah,
40,40,markaz tabligh metro iringmulyo metro timur,markaz tabligh metro,
69,69,cakrad 11 nagasari karawang barat,,cakrad
86,86,simpang tiga kaharu nasu no 112 28284 bukit raya,,kaharu nasu
110,110,"cv. hingdi oto kencana, sido ii, q 29",cv. hingdi oto kencana,sido ii
117,117,ahmad dahlan iv kukusan beji,,ahmad dahlan iv
130,130,manglayang rege no h1 3 cimekar kel. cileunyi,,manglayang rege
135,135,"cipinang besar selatan lintas ibadah, cipinang...",lintas ibadah,cipinang jaya 1a
152,152,"ujung harapan, no 48a pd. karya warga mandiri ...",pd. karya warga mandiri cimb,ujung harapan


In [15]:
# Load test dataset
test_df = pd.read_csv('test.csv')

# Preview
test_df.head()

Unnamed: 0,id,raw_address
0,0,s. par 53 sidanegara 4 cilacap tengah
1,1,"angg per, baloi indah kel. lubuk baja"
2,2,"asma laun, mand imog,"
3,3,"ud agung rej, raya nga sri wedari karanganyar"
4,4,"cut mutia, 35 baiturrahman"


In [16]:
new_street_mapping_dict = {k: v for k, v in street_mapping_dict.items() if len(k.split()) > 1}
new_poi_mapping_dict = {k: v for k, v in poi_mapping_dict.items() if len(k.split()) > 1}

print("Length of street mapping dictionary after removing single words:", len(new_street_mapping_dict))
print("Length of POI mapping dictionary after removing single words:", len(new_poi_mapping_dict))

Length of street mapping dictionary after removing single words: 9111
Length of POI mapping dictionary after removing single words: 41735


In [17]:
start = time()

# Replace truncated words in raw_address of test set with correct street labels
count1 = 0
for row, col in test_df.iterrows():
    for k, v in new_street_mapping_dict.items():
        if k in col['raw_address']:
            test_df.loc[row, 'raw_address'] = test_df.loc[row, 'raw_address'].replace(k, v)
            count1 += 1
            
print("Number of raw addresses updated due to errors in street labels:", count1)

# Replace truncated words in raw_address of test set with correct POI labels
count2 = 0
for row, col in test_df.iterrows():
    for k, v in new_poi_mapping_dict.items():
        if k in col['raw_address']:
            test_df.loc[row, 'raw_address'] = test_df.loc[row, 'raw_address'].replace(k, v)
            count2 += 1

print("Number of raw addresses updated due to errors in POI labels:", count2)

print("Executed in {} minutes.".format(round((time() - start)/60, 3)))

test_df

Number of raw addresses updated due to errors in street labels: 18913
Number of raw addresses updated due to errors in POI labels: 4553
Executed in 183.043 minutes.


Unnamed: 0,id,raw_address
0,0,s. parman 53 sidanegara 4 cilacap tengah
1,1,"anggrek per, baloi indah kel. lubuk baja"
2,2,"asma laun, mangund imog,"
3,3,"ud agung rejeki, raya ngawi- sri wedari karang..."
4,4,"cut mutia, 35 baiturrahman"
...,...,...
49995,49995,toko mbak farid semboro semboro
49996,49996,"vie - tk. ridho kids, vete 3 cari, 16720 ciawi"
49997,49997,"mart dan roti bakar malabar, nasio,"
49998,49998,graha indah pamulang jl. mujair raya bambanbu ...


In [18]:
# Save the cleaned train dataset
cleaned_df.to_csv('cleaned_train.csv', index=False)

# Save the cleaned test dataset
test_df.to_csv('cleaned_test.csv', index=False)

In [19]:
# Save the dictionaries
s_file = open("street_mapping_dict.pkl", "wb")
pickle.dump(street_mapping_dict, s_file)
s_file.close()

p_file = open("poi_mapping_dict.pkl", "wb")
pickle.dump(street_mapping_dict, p_file)
p_file.close()

In [41]:
test_df

Unnamed: 0,id,raw_address
0,0,s. parman 53 sidanegara 4 cilacap tengah
1,1,"anggrek per, baloi indah kel. lubuk baja"
2,2,"asma laun, mangund imog,"
3,3,"ud agung rejeki, raya ngawi- sri wedari karang..."
4,4,"cut mutia, 35 baiturrahman"
...,...,...
49995,49995,toko mbak farid semboro semboro
49996,49996,"vie - tk. ridho kids, vete 3 cari, 16720 ciawi"
49997,49997,"mart dan roti bakar malabar, nasio,"
49998,49998,graha indah pamulang jl. mujair raya bambanbu ...


In [91]:
test_clean = pd.read_csv('cleaned_test.csv')
test_clean

Unnamed: 0,id,raw_address
0,0,s. parman 53 sidanegara 4 cilacap tengah
1,1,"anggrek per, baloi indah kel. lubuk baja"
2,2,"asma laun, mangund imog,"
3,3,"ud agung rejeki, raya ngawi- sri wedari karang..."
4,4,"cut mutia, 35 baiturrahman"
...,...,...
49995,49995,toko mbak farid semboro semboro
49996,49996,"vie - tk. ridho kids, vete 3 cari, 16720 ciawi"
49997,49997,"mart dan roti bakar malabar, nasio,"
49998,49998,graha indah pamulang jl. mujair raya bambanbu ...


In [94]:
#split col poi and street
test_clean['street'] = test_clean['raw_address'].str.split("jl", n = 0, expand = True)[0]
test_clean['POI'] = test_clean['raw_address'].str.split(",", n = 0, expand = True)[1]

In [95]:
test_clean

Unnamed: 0,id,raw_address,street,POI
0,0,s. parman 53 sidanegara 4 cilacap tengah,s. parman 53 sidanegara 4 cilacap tengah,
1,1,"anggrek per, baloi indah kel. lubuk baja","anggrek per, baloi indah kel. lubuk baja",baloi indah kel. lubuk baja
2,2,"asma laun, mangund imog,","asma laun, mangund imog,",mangund imog
3,3,"ud agung rejeki, raya ngawi- sri wedari karang...","ud agung rejeki, raya ngawi- sri wedari karang...",raya ngawi- sri wedari karanganyar
4,4,"cut mutia, 35 baiturrahman","cut mutia, 35 baiturrahman",35 baiturrahman
...,...,...,...,...
49995,49995,toko mbak farid semboro semboro,toko mbak farid semboro semboro,
49996,49996,"vie - tk. ridho kids, vete 3 cari, 16720 ciawi","vie - tk. ridho kids, vete 3 cari, 16720 ciawi",vete 3 cari
49997,49997,"mart dan roti bakar malabar, nasio,","mart dan roti bakar malabar, nasio,",nasio
49998,49998,graha indah pamulang jl. mujair raya bambanbu ...,graha indah pamulang,


In [96]:
test_clean.replace(to_replace=[None], value="", inplace=True)
test_clean

Unnamed: 0,id,raw_address,street,POI
0,0,s. parman 53 sidanegara 4 cilacap tengah,s. parman 53 sidanegara 4 cilacap tengah,
1,1,"anggrek per, baloi indah kel. lubuk baja","anggrek per, baloi indah kel. lubuk baja",baloi indah kel. lubuk baja
2,2,"asma laun, mangund imog,","asma laun, mangund imog,",mangund imog
3,3,"ud agung rejeki, raya ngawi- sri wedari karang...","ud agung rejeki, raya ngawi- sri wedari karang...",raya ngawi- sri wedari karanganyar
4,4,"cut mutia, 35 baiturrahman","cut mutia, 35 baiturrahman",35 baiturrahman
...,...,...,...,...
49995,49995,toko mbak farid semboro semboro,toko mbak farid semboro semboro,
49996,49996,"vie - tk. ridho kids, vete 3 cari, 16720 ciawi","vie - tk. ridho kids, vete 3 cari, 16720 ciawi",vete 3 cari
49997,49997,"mart dan roti bakar malabar, nasio,","mart dan roti bakar malabar, nasio,",nasio
49998,49998,graha indah pamulang jl. mujair raya bambanbu ...,graha indah pamulang,


In [97]:
data_submission_test=test_clean[['id','raw_address','street','POI']]
data_submission_test['POI/street']=data_submission_test['POI'].str.cat(data_submission_test['street'], sep='/')
data_submission_test

Unnamed: 0,id,raw_address,street,POI,POI/street
0,0,s. parman 53 sidanegara 4 cilacap tengah,s. parman 53 sidanegara 4 cilacap tengah,,/s. parman 53 sidanegara 4 cilacap tengah
1,1,"anggrek per, baloi indah kel. lubuk baja","anggrek per, baloi indah kel. lubuk baja",baloi indah kel. lubuk baja,"baloi indah kel. lubuk baja/anggrek per, balo..."
2,2,"asma laun, mangund imog,","asma laun, mangund imog,",mangund imog,"mangund imog/asma laun, mangund imog,"
3,3,"ud agung rejeki, raya ngawi- sri wedari karang...","ud agung rejeki, raya ngawi- sri wedari karang...",raya ngawi- sri wedari karanganyar,raya ngawi- sri wedari karanganyar/ud agung r...
4,4,"cut mutia, 35 baiturrahman","cut mutia, 35 baiturrahman",35 baiturrahman,"35 baiturrahman/cut mutia, 35 baiturrahman"
...,...,...,...,...,...
49995,49995,toko mbak farid semboro semboro,toko mbak farid semboro semboro,,/toko mbak farid semboro semboro
49996,49996,"vie - tk. ridho kids, vete 3 cari, 16720 ciawi","vie - tk. ridho kids, vete 3 cari, 16720 ciawi",vete 3 cari,"vete 3 cari/vie - tk. ridho kids, vete 3 cari..."
49997,49997,"mart dan roti bakar malabar, nasio,","mart dan roti bakar malabar, nasio,",nasio,"nasio/mart dan roti bakar malabar, nasio,"
49998,49998,graha indah pamulang jl. mujair raya bambanbu ...,graha indah pamulang,,/graha indah pamulang


In [98]:
data_submission_testing=data_submission_test[['id','POI/street']]
data_submission_testing

Unnamed: 0,id,POI/street
0,0,/s. parman 53 sidanegara 4 cilacap tengah
1,1,"baloi indah kel. lubuk baja/anggrek per, balo..."
2,2,"mangund imog/asma laun, mangund imog,"
3,3,raya ngawi- sri wedari karanganyar/ud agung r...
4,4,"35 baiturrahman/cut mutia, 35 baiturrahman"
...,...,...
49995,49995,/toko mbak farid semboro semboro
49996,49996,"vete 3 cari/vie - tk. ridho kids, vete 3 cari..."
49997,49997,"nasio/mart dan roti bakar malabar, nasio,"
49998,49998,/graha indah pamulang


In [100]:
data_submission_testing.to_csv('submission-faraway.csv', index=False)

# Part2

In [101]:
import numpy as np
import pandas as pd
import re

In [109]:
test_clean2 = pd.read_csv('cleaned_test.csv')
test_clean2

Unnamed: 0,id,raw_address
0,0,s. parman 53 sidanegara 4 cilacap tengah
1,1,"anggrek per, baloi indah kel. lubuk baja"
2,2,"asma laun, mangund imog,"
3,3,"ud agung rejeki, raya ngawi- sri wedari karang..."
4,4,"cut mutia, 35 baiturrahman"
...,...,...
49995,49995,toko mbak farid semboro semboro
49996,49996,"vie - tk. ridho kids, vete 3 cari, 16720 ciawi"
49997,49997,"mart dan roti bakar malabar, nasio,"
49998,49998,graha indah pamulang jl. mujair raya bambanbu ...


In [110]:
#split col poi and street
test_clean2['street'] = test_clean2['raw_address'].str.split(",", n = 0, expand = True)[0]
test_clean2['POI'] = test_clean2['raw_address'].str.split(",", n = 0, expand = True)[1]
test_clean2

Unnamed: 0,id,raw_address,street,POI
0,0,s. parman 53 sidanegara 4 cilacap tengah,s. parman 53 sidanegara 4 cilacap tengah,
1,1,"anggrek per, baloi indah kel. lubuk baja",anggrek per,baloi indah kel. lubuk baja
2,2,"asma laun, mangund imog,",asma laun,mangund imog
3,3,"ud agung rejeki, raya ngawi- sri wedari karang...",ud agung rejeki,raya ngawi- sri wedari karanganyar
4,4,"cut mutia, 35 baiturrahman",cut mutia,35 baiturrahman
...,...,...,...,...
49995,49995,toko mbak farid semboro semboro,toko mbak farid semboro semboro,
49996,49996,"vie - tk. ridho kids, vete 3 cari, 16720 ciawi",vie - tk. ridho kids,vete 3 cari
49997,49997,"mart dan roti bakar malabar, nasio,",mart dan roti bakar malabar,nasio
49998,49998,graha indah pamulang jl. mujair raya bambanbu ...,graha indah pamulang jl. mujair raya bambanbu ...,


In [111]:
POI_in_raw_address = 12321

In [112]:
test_clean2['POI_in_raw_address'] = np.where(test_clean2['POI'].isnull(), False, test_clean2.apply(lambda x: str(x.POI) in str(x.raw_address), axis=1))
mapping_improved_POI = test_clean2['POI_in_raw_address'].sum() >= POI_in_raw_address
print("Mapping improved POI is in raw_address:", mapping_improved_POI, test_clean2['POI_in_raw_address'].sum())

Mapping improved POI is in raw_address: True 25486


In [113]:
test_clean2

Unnamed: 0,id,raw_address,street,POI,POI_in_raw_address
0,0,s. parman 53 sidanegara 4 cilacap tengah,s. parman 53 sidanegara 4 cilacap tengah,,False
1,1,"anggrek per, baloi indah kel. lubuk baja",anggrek per,baloi indah kel. lubuk baja,True
2,2,"asma laun, mangund imog,",asma laun,mangund imog,True
3,3,"ud agung rejeki, raya ngawi- sri wedari karang...",ud agung rejeki,raya ngawi- sri wedari karanganyar,True
4,4,"cut mutia, 35 baiturrahman",cut mutia,35 baiturrahman,True
...,...,...,...,...,...
49995,49995,toko mbak farid semboro semboro,toko mbak farid semboro semboro,,False
49996,49996,"vie - tk. ridho kids, vete 3 cari, 16720 ciawi",vie - tk. ridho kids,vete 3 cari,True
49997,49997,"mart dan roti bakar malabar, nasio,",mart dan roti bakar malabar,nasio,True
49998,49998,graha indah pamulang jl. mujair raya bambanbu ...,graha indah pamulang jl. mujair raya bambanbu ...,,False


In [116]:
street_in_raw_address = 32123
test_clean2['street_in_raw_address'] = np.where(test_clean2['street'].isnull(), False, test_clean2.apply(lambda x: str(x.street) in str(x.raw_address), axis=1))
mapping_improved_street = test_clean2['street_in_raw_address'].sum() >= street_in_raw_address
print("Mapping improved street is in raw_address:",mapping_improved_street, test_clean2['street_in_raw_address'].sum())

Mapping improved street is in raw_address: True 50000


In [117]:
def tokenize_address_by_sep(test_clean2, sep = ' '):
    test_clean2['raw_address'] = test_clean2['raw_address'].str.replace(",",", ")
    test_clean2['raw_address'] = test_clean2['raw_address'].str.replace(", "," , ")
    test_clean2['raw_address'] = test_clean2['raw_address'].str.replace("  "," ")
    data = pd.concat([pd.Series(row['id'], row['raw_address'].split(sep))
                      for _, row in test_clean2.iterrows()]).reset_index()
    data.columns = ['word', 'sentence_idx']
    data['tag'] = np.nan
    
    # auto tag all commas as comma
    data['tag'] = np.where(data['word']==',', 'comma', data['tag'])
    
    data = data[['sentence_idx','word','tag']]
    return data

In [119]:
test_clean2

Unnamed: 0,id,raw_address,street,POI,POI_in_raw_address,street_in_raw_address
0,0,s. parman 53 sidanegara 4 cilacap tengah,s. parman 53 sidanegara 4 cilacap tengah,,False,True
1,1,"anggrek per, baloi indah kel. lubuk baja",anggrek per,baloi indah kel. lubuk baja,True,True
2,2,"asma laun, mangund imog,",asma laun,mangund imog,True,True
3,3,"ud agung rejeki, raya ngawi- sri wedari karang...",ud agung rejeki,raya ngawi- sri wedari karanganyar,True,True
4,4,"cut mutia, 35 baiturrahman",cut mutia,35 baiturrahman,True,True
...,...,...,...,...,...,...
49995,49995,toko mbak farid semboro semboro,toko mbak farid semboro semboro,,False,True
49996,49996,"vie - tk. ridho kids, vete 3 cari, 16720 ciawi",vie - tk. ridho kids,vete 3 cari,True,True
49997,49997,"mart dan roti bakar malabar, nasio,",mart dan roti bakar malabar,nasio,True,True
49998,49998,graha indah pamulang jl. mujair raya bambanbu ...,graha indah pamulang jl. mujair raya bambanbu ...,,False,True


In [120]:
test_tokens = tokenize_address_by_sep(test)
test_tokens['tag'] = test_tokens['tag'].astype(str)
test_tokens.to_csv('test_tokens.csv', index=False)

In [121]:
def tag_using_col(test_tokens, col):
    test_tokens = test_tokens.merge(test_clean2[['id', col]].dropna(), how='left', left_on='sentence_idx', right_on='id')

    if col == 'street':
        # check if word is in street, split(" ") to ensure that only full words are captured and substrings of words are ignored, eg. "a"
        test_tokens['word_in_street'] = np.where(test_tokens[col].notnull(),
                                                  test_tokens.apply(lambda x: str(x.word) in str(x.street).split(" "), axis=1),
                                                  False)
        # tag word as street
        test_tokens['tag'] = np.where(test_tokens['word_in_street'], 'street_cont', test_tokens['tag'])
    else:
        # check if word is in POI, split(" ") to ensure that only full words are captured and substrings of words are ignored, eg. "a"
        test_tokens['word_in_POI'] = np.where(test_tokens[col].notnull(),
                                               test_tokens.apply(lambda x: str(x.word) in str(x.POI).split(" "), axis=1),
                                               False)
        # tag word as POI
        test_tokens['tag'] = np.where(test_tokens['word_in_POI'], 'POI_cont', test_tokens['tag'])
                
    # get first word of target col
    test_tokens[col+'START'] = test_tokens[col].str.split(" ").str[0]
    # assign first word of target col as <target>_start            
    test_tokens['tag'] = np.where(test_tokens['word']==test_tokens[col+'START'], col+'_start', test_tokens['tag'])
    
    # if word is longer than one word, assign an end
    test_tokens[col+'END'] = test_tokens[col].str.split(" ").str[-1]
    test_tokens['tag'] = np.where((test_tokens['word']==test_tokens[col+'END'])&(test_tokens[col].str.count(' ')!=0), col+'_end', test_tokens['tag'])
        
    
    test_tokens = test_tokens.drop(columns=['id', col, col+'START', col+'END', 'word_in_'+col])
    return test_tokens

In [122]:
test_tokens

Unnamed: 0,sentence_idx,word,tag
0,0,s.,
1,0,parman,
2,0,53,
3,0,sidanegara,
4,0,4,
...,...,...,...
381016,49998,no.,
381017,49998,11,
381018,49999,adi,
381019,49999,",",comma


In [123]:
test_clean2

Unnamed: 0,id,raw_address,street,POI,POI_in_raw_address,street_in_raw_address
0,0,s. parman 53 sidanegara 4 cilacap tengah,s. parman 53 sidanegara 4 cilacap tengah,,False,True
1,1,"anggrek per, baloi indah kel. lubuk baja",anggrek per,baloi indah kel. lubuk baja,True,True
2,2,"asma laun, mangund imog,",asma laun,mangund imog,True,True
3,3,"ud agung rejeki, raya ngawi- sri wedari karang...",ud agung rejeki,raya ngawi- sri wedari karanganyar,True,True
4,4,"cut mutia, 35 baiturrahman",cut mutia,35 baiturrahman,True,True
...,...,...,...,...,...,...
49995,49995,toko mbak farid semboro semboro,toko mbak farid semboro semboro,,False,True
49996,49996,"vie - tk. ridho kids, vete 3 cari, 16720 ciawi",vie - tk. ridho kids,vete 3 cari,True,True
49997,49997,"mart dan roti bakar malabar, nasio,",mart dan roti bakar malabar,nasio,True,True
49998,49998,graha indah pamulang jl. mujair raya bambanbu ...,graha indah pamulang jl. mujair raya bambanbu ...,,False,True


In [125]:
test_tokens = tokenize_address_by_sep(test_clean2)
test_tokens = tag_using_col(test_tokens, 'POI')
test_tokens = tag_using_col(test_tokens, 'street')
test_tokens = test_tokens[test_tokens['word'].notnull()].reset_index(drop=True)
test_tokens['tag'] = test_tokens['tag'].astype(str)

# ensure that there are no duplicates in sentence_idx of test_tokens in train_tokens
test_tokens['sentence_idx'] = test_tokens['sentence_idx']+len(test)
test_tokens.to_csv('test_tokens.csv', index=False)

In [126]:
test_tokens

Unnamed: 0,sentence_idx,word,tag
0,50000,s.,street_start
1,50000,parman,street_cont
2,50000,53,street_cont
3,50000,sidanegara,street_cont
4,50000,4,street_cont
...,...,...,...
381030,99998,no.,street_cont
381031,99998,11,street_end
381032,99999,adi,street_start
381033,99999,",",comma


In [127]:
test_tokens['tag'].value_counts()

street_cont     154005
street_start     51071
street_end       50466
POI_cont         48239
comma            34382
POI_end          22869
nan              15024
POI_start         4979
Name: tag, dtype: int64

In [128]:
test_tokens['tag'] = test_tokens['tag'].str.split("_").str[0]
tag_seq = test_tokens[test_tokens['tag']!='comma'].drop_duplicates(subset=['sentence_idx', 'tag']).groupby('sentence_idx')['tag'].apply(list).reset_index(name='tag_seq')
tag_seq['tag_seq'] = tag_seq['tag_seq'].astype(str)
tag_seq['tag_seq'].value_counts()

['street']                  24643
['street', 'POI']           19505
['street', 'POI', 'nan']     5816
['street', 'nan']              36
Name: tag_seq, dtype: int64

In [129]:
tag_seq

Unnamed: 0,sentence_idx,tag_seq
0,50000,['street']
1,50001,"['street', 'POI']"
2,50002,"['street', 'POI']"
3,50003,"['street', 'POI']"
4,50004,"['street', 'POI']"
...,...,...
49995,99995,['street']
49996,99996,"['street', 'POI', 'nan']"
49997,99997,"['street', 'POI']"
49998,99998,['street']


In [130]:
test_tokens

Unnamed: 0,sentence_idx,word,tag
0,50000,s.,street
1,50000,parman,street
2,50000,53,street
3,50000,sidanegara,street
4,50000,4,street
...,...,...,...
381030,99998,no.,street
381031,99998,11,street
381032,99999,adi,street
381033,99999,",",comma


# Test2

In [139]:
test_clean3 = pd.read_csv('cleaned_test.csv')
test_clean3

Unnamed: 0,id,raw_address
0,0,s. parman 53 sidanegara 4 cilacap tengah
1,1,"anggrek per, baloi indah kel. lubuk baja"
2,2,"asma laun, mangund imog,"
3,3,"ud agung rejeki, raya ngawi- sri wedari karang..."
4,4,"cut mutia, 35 baiturrahman"
...,...,...
49995,49995,toko mbak farid semboro semboro
49996,49996,"vie - tk. ridho kids, vete 3 cari, 16720 ciawi"
49997,49997,"mart dan roti bakar malabar, nasio,"
49998,49998,graha indah pamulang jl. mujair raya bambanbu ...


In [140]:
#split col poi and street
test_clean3['street'] = test_clean2['raw_address'].str.split(",", n = 0, expand = True)[0]
test_clean3['POI'] = test_clean2['raw_address'].str.split(",", n = 0, expand = True)[1]
test_clean3

Unnamed: 0,id,raw_address,street,POI
0,0,s. parman 53 sidanegara 4 cilacap tengah,s. parman 53 sidanegara 4 cilacap tengah,
1,1,"anggrek per, baloi indah kel. lubuk baja",anggrek per,baloi indah kel. lubuk baja
2,2,"asma laun, mangund imog,",asma laun,mangund imog
3,3,"ud agung rejeki, raya ngawi- sri wedari karang...",ud agung rejeki,raya ngawi- sri wedari karanganyar
4,4,"cut mutia, 35 baiturrahman",cut mutia,35 baiturrahman
...,...,...,...,...
49995,49995,toko mbak farid semboro semboro,toko mbak farid semboro semboro,
49996,49996,"vie - tk. ridho kids, vete 3 cari, 16720 ciawi",vie - tk. ridho kids,vete 3 cari
49997,49997,"mart dan roti bakar malabar, nasio,",mart dan roti bakar malabar,nasio
49998,49998,graha indah pamulang jl. mujair raya bambanbu ...,graha indah pamulang jl. mujair raya bambanbu ...,


In [141]:
test_clean3.replace(to_replace=[None], value="/", inplace=True)
test_clean3

Unnamed: 0,id,raw_address,street,POI
0,0,s. parman 53 sidanegara 4 cilacap tengah,s. parman 53 sidanegara 4 cilacap tengah,/
1,1,"anggrek per, baloi indah kel. lubuk baja",anggrek per,baloi indah kel. lubuk baja
2,2,"asma laun, mangund imog,",asma laun,mangund imog
3,3,"ud agung rejeki, raya ngawi- sri wedari karang...",ud agung rejeki,raya ngawi- sri wedari karanganyar
4,4,"cut mutia, 35 baiturrahman",cut mutia,35 baiturrahman
...,...,...,...,...
49995,49995,toko mbak farid semboro semboro,toko mbak farid semboro semboro,/
49996,49996,"vie - tk. ridho kids, vete 3 cari, 16720 ciawi",vie - tk. ridho kids,vete 3 cari
49997,49997,"mart dan roti bakar malabar, nasio,",mart dan roti bakar malabar,nasio
49998,49998,graha indah pamulang jl. mujair raya bambanbu ...,graha indah pamulang jl. mujair raya bambanbu ...,/


In [142]:
data_submit=test_clean3[['id','raw_address','street','POI']]
data_submit['POI/street']=data_submit['POI'].str.cat(data_submit['street'])
data_submit=data_submit[['id','POI/street']]
data_submit

Unnamed: 0,id,POI/street
0,0,/s. parman 53 sidanegara 4 cilacap tengah
1,1,baloi indah kel. lubuk bajaanggrek per
2,2,mangund imog asma laun
3,3,raya ngawi- sri wedari karanganyarud agung re...
4,4,35 baiturrahmancut mutia
...,...,...
49995,49995,/toko mbak farid semboro semboro
49996,49996,vete 3 cari vie - tk. ridho kids
49997,49997,nasio mart dan roti bakar malabar
49998,49998,/graha indah pamulang jl. mujair raya bambanbu...


In [143]:
data_submit.to_csv('data_submit.csv', index=False)