In [1]:
import polars as pl
import re

from collections import Counter 


In [354]:
data = pl.read_csv('data/merchant_location_training.csv', separator=',', quote_char='"')

In [351]:
data = pl.read_csv('data/merchant_locs_train.csv', separator=',', quote_char='"')

In [353]:
data

source,merchant_name,merchant_location,online_offline_flag
str,str,str,str
"""CREDIT_CARD""","""YOG3102 POP MART DR WAHIDYOGYA…",,"""ONLINE"""
"""QRIS_WONDR""","""ANIMO BAKERY""","""TANGERANG""","""OFFLINE"""
"""QRIS_WONDR""","""APOTEK 92 FARMA""","""SURABAYA (KOTA)""","""OFFLINE"""
"""QRIS_WONDR""","""CHATERAISE HIVE KHI""","""BEKASI""","""OFFLINE"""
"""QRIS_WONDR""","""FOTOCOPY ALFATTAH MASTRIP""","""KAB. JEMBER""","""OFFLINE"""
…,…,…,…
"""QRIS_WONDR""","""JUICE KABITA, DASANA INDA""","""TANGERANG""","""OFFLINE"""
"""CREDIT_CARD""","""TRAVELOKA*1202604564 JAKARTA I…",,"""ONLINE"""
"""QRIS_WONDR""","""KEDAI LINGGA""","""SUMEDANG""","""OFFLINE"""
"""QRIS_WONDR""","""MUTIARA MANDIRI COLL""","""MOJOKERTO""","""OFFLINE"""


In [4]:
data['online_offline_flag'].value_counts()

online_offline_flag,count
str,u32
,443739
"""OFFLINE""",1573240


In [4]:
data.filter(pl.col('merchant_location').is_null())

source,merchant_name,merchant_location,online_offline_flag
str,str,str,str
"""CREDIT_CARD""","""CHOP BUNTUT CAK YO SLEMAN (KAB…",,
"""CREDIT_CARD""","""TBL* THE SEED LEVEL AMSTERDAM …",,
"""CREDIT_CARD""","""SILOAM CLINIC GRH RAYA TANGERA…",,
"""CREDIT_CARD""","""SWALAYAN ZOOM MBL TANJUNG PINA…",,
"""CREDIT_CARD""","""PEDRO SUN PLAZA MEDAN MEDAN KO…",,
…,…,…,…
"""CREDIT_CARD""","""AUSSIE TOYS ONLINE PAKENHAM AU""",,
"""CREDIT_CARD""","""TOKO EMAS GINBERS MEDAN KOT. I…",,"""OFFLINE"""
"""CREDIT_CARD""","""BINTANG SPORT SERANG (KOTA)ID""",,
"""CREDIT_CARD""","""ACE HARDWARE MALL PI I JAKARTA…",,


In [5]:
data.filter(pl.col('merchant_location').is_not_null()).sample(10000).sort('merchant_name').write_csv('data_exploration/not_null_merchant_locations.csv')

# CREATE NER LABELS

1. Country Label (ID, JP) ISO 3166-1 alpha-2
2. City Label
3. Area Label
4. Merchant Name

In [4]:
#for merchant with known location, 
#we assume the country code is ID, city label is the merchant_location, and merchant name is the merchant name
data_location_known = data.filter(pl.col("merchant_location").is_not_null()) 
data_location_unknown = data.filter(pl.col("merchant_location").is_null())

In [5]:
# split the merchant name into country, city, merchant_name
# 
# def split_backward(text):
#     splitted = []
#     element = ""
#     split_count = 0
#     for i, c in enumerate(reversed(text)):
#         if split_count == 2:
#             element += c
#             if i == len(text) -1:
#                 splitted.append(element[::-1])
        
#         elif c == " ":
#             splitted.append(element[::-1])
#             element = ""
#             split_count += 1
#         else:
#             element += c

#     return splitted

def get_word_ngram(text, n):
    words = text.split()
    if len(words) < n:
        return []
        
    return [tuple(words[i:i+n]) for i in range(len(words) - n + 1)]

def split_logically(text):

    remainder = text
    # last two characters are country id
    country_id = remainder[len(text)-2] + remainder[len(text)-1]
    remainder = text[:len(text)-2].lower()

    #return country_id, get_word_ngram(remainder.strip(),1)
    return country_id, re.split(r"\s+", re.sub(r'[^a-zA-Z0-9\s]','',remainder).strip())


In [6]:
data_location_unknown = data_location_unknown.with_columns(
    pl.col("merchant_name").map_elements(lambda x: split_logically(x)[0], return_dtype = pl.String).alias("country_code"),
    pl.col("merchant_name").map_elements(lambda x: split_logically(x)[1], return_dtype = pl.List(pl.String)).alias("word_list"),  
)

In [45]:
data_location_unknown.sample(1000).write_json("data_exploration/split_1.json")

In [7]:
data_location_unknown[45]

source,merchant_name,merchant_location,online_offline_flag,country_code,word_list
str,str,str,str,str,list[str]
"""CREDIT_CARD""","""W519 ALFAMART KRUENG M LHOKSEU…",,"""OFFLINE""","""ID""","[""w519"", ""alfamart"", … ""lhokseumawe""]"


In [111]:
re.sub(r'[^a-zA-Z0-9\s]','', "W519 ALFAMART KRUENG M LHOKSEUMAWE (").strip()

'W519 ALFAMART KRUENG M LHOKSEUMAWE'

In [186]:
data_location_unknown['country_code'].value_counts()

country_code,count
str,u32
"""DO""",12
"""YA""",4
"""KN""",5
"""LL""",1
"""I.""",1
…,…
"""SE""",651
"""JP""",17203
"""GP""",1
"""JO""",91


In [380]:
data_location_unknown.filter(pl.col('country_code')== 'ID').sample(1)['word_list']

word_list
list[str]
"[""iboxaasp"", ""dago"", ""bandung""]"


# CREATE DICTIONARY OF INDONESIAN LOCATIONS

In [2]:
cities_regencies = pl.read_csv('data/regencies.csv', new_columns=['id', 'id_provinces', 'name'])

In [3]:
cities_regencies

id,id_provinces,name
i64,i64,str
1102,11,"""KABUPATEN ACEH SINGKIL"""
1103,11,"""KABUPATEN ACEH SELATAN"""
1104,11,"""KABUPATEN ACEH TENGGARA"""
1105,11,"""KABUPATEN ACEH TIMUR"""
1106,11,"""KABUPATEN ACEH TENGAH"""
…,…,…
9433,94,"""KABUPATEN PUNCAK"""
9434,94,"""KABUPATEN DOGIYAI"""
9435,94,"""KABUPATEN INTAN JAYA"""
9436,94,"""KABUPATEN DEIYAI"""


In [4]:
# ngram
def get_word_ngram(text, n):
    words = text.split()
    if len(words) < n:
        return []
        
    return [tuple(words[i:i+n]) for i in range(len(words) - n + 1)]

def get_word_ngram_from_list(wordlist, n):
    if len(wordlist) < n:
        return []
        
    return [tuple(wordlist[i:i+n]) for i in range(len(wordlist) - n + 1)]

def get_char_ngrams(text, n):
    """Generate character n-grams from text."""
    # Convert text to lowercase and remove spaces for character-level comparison
    text = text.lower().replace(" ", "")
    return [text[i:i+n] for i in range(len(text) - n + 1)]

In [5]:
cities_regencies = cities_regencies.with_columns(
    pl.col('name').str.replace(r'(KABUPATEN|KOTA)','').str.strip_chars().alias('name_clean')
)

In [6]:
cities_regencies

id,id_provinces,name,name_clean
i64,i64,str,str
1102,11,"""KABUPATEN ACEH SINGKIL""","""ACEH SINGKIL"""
1103,11,"""KABUPATEN ACEH SELATAN""","""ACEH SELATAN"""
1104,11,"""KABUPATEN ACEH TENGGARA""","""ACEH TENGGARA"""
1105,11,"""KABUPATEN ACEH TIMUR""","""ACEH TIMUR"""
1106,11,"""KABUPATEN ACEH TENGAH""","""ACEH TENGAH"""
…,…,…,…
9433,94,"""KABUPATEN PUNCAK""","""PUNCAK"""
9434,94,"""KABUPATEN DOGIYAI""","""DOGIYAI"""
9435,94,"""KABUPATEN INTAN JAYA""","""INTAN JAYA"""
9436,94,"""KABUPATEN DEIYAI""","""DEIYAI"""


In [7]:
cities_regencies = cities_regencies.with_columns(
    pl.col('name_clean').map_elements(lambda x: get_char_ngrams(x,1), return_dtype=pl.Object).alias('unigram'),
    pl.col('name_clean').map_elements(lambda x: get_char_ngrams(x,2), return_dtype=pl.Object).alias('bigram'),
    pl.col('name_clean').map_elements(lambda x: get_char_ngrams(x,3), return_dtype=pl.Object).alias('trigram'),
)

In [8]:
cities_regencies

id,id_provinces,name,name_clean,unigram,bigram,trigram
i64,i64,str,str,object,object,object
1102,11,"""KABUPATEN ACEH SINGKIL""","""ACEH SINGKIL""","['a', 'c', 'e', 'h', 's', 'i', 'n', 'g', 'k', 'i', 'l']","['ac', 'ce', 'eh', 'hs', 'si', 'in', 'ng', 'gk', 'ki', 'il']","['ace', 'ceh', 'ehs', 'hsi', 'sin', 'ing', 'ngk', 'gki', 'kil']"
1103,11,"""KABUPATEN ACEH SELATAN""","""ACEH SELATAN""","['a', 'c', 'e', 'h', 's', 'e', 'l', 'a', 't', 'a', 'n']","['ac', 'ce', 'eh', 'hs', 'se', 'el', 'la', 'at', 'ta', 'an']","['ace', 'ceh', 'ehs', 'hse', 'sel', 'ela', 'lat', 'ata', 'tan']"
1104,11,"""KABUPATEN ACEH TENGGARA""","""ACEH TENGGARA""","['a', 'c', 'e', 'h', 't', 'e', 'n', 'g', 'g', 'a', 'r', 'a']","['ac', 'ce', 'eh', 'ht', 'te', 'en', 'ng', 'gg', 'ga', 'ar', 'ra']","['ace', 'ceh', 'eht', 'hte', 'ten', 'eng', 'ngg', 'gga', 'gar', 'ara']"
1105,11,"""KABUPATEN ACEH TIMUR""","""ACEH TIMUR""","['a', 'c', 'e', 'h', 't', 'i', 'm', 'u', 'r']","['ac', 'ce', 'eh', 'ht', 'ti', 'im', 'mu', 'ur']","['ace', 'ceh', 'eht', 'hti', 'tim', 'imu', 'mur']"
1106,11,"""KABUPATEN ACEH TENGAH""","""ACEH TENGAH""","['a', 'c', 'e', 'h', 't', 'e', 'n', 'g', 'a', 'h']","['ac', 'ce', 'eh', 'ht', 'te', 'en', 'ng', 'ga', 'ah']","['ace', 'ceh', 'eht', 'hte', 'ten', 'eng', 'nga', 'gah']"
…,…,…,…,…,…,…
9433,94,"""KABUPATEN PUNCAK""","""PUNCAK""","['p', 'u', 'n', 'c', 'a', 'k']","['pu', 'un', 'nc', 'ca', 'ak']","['pun', 'unc', 'nca', 'cak']"
9434,94,"""KABUPATEN DOGIYAI""","""DOGIYAI""","['d', 'o', 'g', 'i', 'y', 'a', 'i']","['do', 'og', 'gi', 'iy', 'ya', 'ai']","['dog', 'ogi', 'giy', 'iya', 'yai']"
9435,94,"""KABUPATEN INTAN JAYA""","""INTAN JAYA""","['i', 'n', 't', 'a', 'n', 'j', 'a', 'y', 'a']","['in', 'nt', 'ta', 'an', 'nj', 'ja', 'ay', 'ya']","['int', 'nta', 'tan', 'anj', 'nja', 'jay', 'aya']"
9436,94,"""KABUPATEN DEIYAI""","""DEIYAI""","['d', 'e', 'i', 'y', 'a', 'i']","['de', 'ei', 'iy', 'ya', 'ai']","['dei', 'eiy', 'iya', 'yai']"


In [15]:
bandung = cities_regencies.filter(pl.col('name') == 'KOTA BANDUNG')['unigram'][0]
bandung

['b', 'a', 'n', 'd', 'u', 'n', 'g']

In [17]:
cities_regencies.filter(pl.col('name') == 'KOTA JAKARTA BARAT')

id,id_provinces,name,name_clean,unigram,bigram,trigram
i64,i64,str,str,object,object,object
3174,31,"""KOTA JAKARTA BARAT""",""" JAKARTA BARAT""","['j', 'a', 'k', 'a', 'r', 't', 'a', 'b', 'a', 'r', 'a', 't']","['ja', 'ak', 'ka', 'ar', 'rt', 'ta', 'ab', 'ba', 'ar', 'ra', 'at']","['jak', 'aka', 'kar', 'art', 'rta', 'tab', 'aba', 'bar', 'ara', 'rat']"


In [291]:
def jaccard_similarity(ngrams1, ngrams2):
    _ngrams1 = set(ngrams1)
    _ngrams2 = set(ngrams2)
    
    intersection = len(_ngrams1.intersection(_ngrams2))
    union = len(_ngrams1.union(_ngrams2))

    return intersection / union
    

In [19]:
merchant_words = data_location_unknown.filter(pl.col('country_code') == 'ID')['word_list'][0].to_list()
merchant_words

['chop', 'buntut', 'cak', 'yo', 'sleman', 'kab']

In [20]:
city_unigram = cities_regencies.filter(pl.col('name').str.contains(r'SLEMAN'))['unigram'][0]
city_unigram

['s', 'l', 'e', 'm', 'a', 'n']

In [21]:
city_bigram = cities_regencies.filter(pl.col('name').str.contains(r'SLEMAN'))['bigram'][0]
city_bigram

['sl', 'le', 'em', 'ma', 'an']

In [17]:
city_trigram = cities_regencies.filter(pl.col('name').str.contains(r'SLEMAN'))['trigram'][0]
city_trigram

['kab',
 'abu',
 'bup',
 'upa',
 'pat',
 'ate',
 'ten',
 'ens',
 'nsl',
 'sle',
 'lem',
 'ema',
 'man']

In [15]:
def get_word_ngram_from_list(wordlist, n):
    if len(wordlist) < n:
        return []
    return [tuple(wordlist[i:i+n]) for i in range(len(wordlist) - n + 1)]

# def get_word_ngram_from_list(wordlist, n):
#     if len(wordlist) < n:
#         return []

#     result = []
#     for i in range(len(wordlist) - n + 1):
#         word = []
#         for j in range(i,i+n):
#             word.append((wordlist[j],j))
#         result.append(word)
        
#     return result

In [23]:
get_word_ngram_from_list(merchant_words,2)

[('chop', 'buntut'),
 ('buntut', 'cak'),
 ('cak', 'yo'),
 ('yo', 'sleman'),
 ('sleman', 'kab')]

In [20]:
merchant_unigram = get_word_ngram_from_list(merchant_words,1)
merchant_unigram

[('chop',), ('buntut',), ('cak',), ('yo',), ('sleman',), ('kab',)]

In [23]:
# unigram matching
unigram_threshold = 0.5
word_matches = []

for i, word in enumerate(merchant_unigram):
    best_match = None
    best_score = 0

    string_word = " ".join(word)
    word_char = get_char_ngrams(string_word,1)
    
    score = jaccard_similarity(city_unigram, word_char)
    if score > best_score:
        best_score = score
    
    is_location = best_score >= unigram_threshold
    match_info = {
            'word': word,
            'is_location': is_location,
            'score': best_score,
            'index_interval': [i,i+len(word)-1]
        }
    word_matches.append(match_info)
    #results['labeled_words'].append((word, is_location, best_match, best_score))

In [24]:
word_matches

[{'word': ('chop',),
  'is_location': False,
  'score': 0.07142857142857142,
  'index_interval': [0, 0]},
 {'word': ('buntut',),
  'is_location': False,
  'score': 0.36363636363636365,
  'index_interval': [1, 1]},
 {'word': ('cak',),
  'is_location': False,
  'score': 0.16666666666666666,
  'index_interval': [2, 2]},
 {'word': ('yo',), 'is_location': False, 'score': 0, 'index_interval': [3, 3]},
 {'word': ('sleman',),
  'is_location': True,
  'score': 0.5454545454545454,
  'index_interval': [4, 4]},
 {'word': ('kab',),
  'is_location': False,
  'score': 0.2727272727272727,
  'index_interval': [5, 5]}]

In [25]:
merchant_list = get_word_ngram_from_list(merchant_words, 2)
merchant_list

[('chop', 'buntut'),
 ('buntut', 'cak'),
 ('cak', 'yo'),
 ('yo', 'sleman'),
 ('sleman', 'kab')]

In [26]:
merchant_list[4]

('sleman', 'kab')

In [27]:
city_bigram

['ka',
 'ab',
 'bu',
 'up',
 'pa',
 'at',
 'te',
 'en',
 'ns',
 'sl',
 'le',
 'em',
 'ma',
 'an']

In [38]:
# bigram matching
bigram_threshold = 0.4
word_matches = []

merchant_bigram = get_word_ngram_from_list(merchant_words, 2)

In [39]:
for i, word in enumerate(merchant_bigram):
    best_match = None
    best_score = 0

    string_word = " ".join(word)
    
    word_char = get_char_ngrams(string_word,2)
    
    score = jaccard_similarity(city_bigram, word_char)
    if score > best_score:
        best_score = score
    
    is_location = best_score >= bigram_threshold
    match_info = {
            'word': word,
            'is_location': is_location,
            'score': best_score,
            'index_interval': [i, i+len(word)-1]
        }
    word_matches.append(match_info)
    #results['labeled_words'].append((word, is_location, best_match, best_score))

In [40]:
word_matches

[{'word': ('chop', 'buntut'),
  'is_location': False,
  'score': 0.045454545454545456,
  'index_interval': [0, 1]},
 {'word': ('buntut', 'cak'),
  'is_location': False,
  'score': 0.047619047619047616,
  'index_interval': [1, 2]},
 {'word': ('cak', 'yo'),
  'is_location': False,
  'score': 0,
  'index_interval': [2, 3]},
 {'word': ('yo', 'sleman'),
  'is_location': False,
  'score': 0.3125,
  'index_interval': [3, 4]},
 {'word': ('sleman', 'kab'),
  'is_location': True,
  'score': 0.4666666666666667,
  'index_interval': [4, 5]}]

In [342]:
city_trigram

['kab',
 'abu',
 'bup',
 'upa',
 'pat',
 'ate',
 'ten',
 'ens',
 'nsl',
 'sle',
 'lem',
 'ema',
 'man']

In [42]:
# trigram matching

trigram_threshold = 0.4
word_matches = []

merchant_trigram = get_word_ngram_from_list(merchant_words, 3)
for i, word in enumerate(merchant_trigram):
    best_match = None
    best_score = 0

    string_word = "".join(word)
    
    word_char = get_char_ngrams(string_word,1)
    
    score = jaccard_similarity(city_unigram, word_char)
    if score > best_score:
        best_score = score
    
    is_location = best_score >= trigram_threshold
    match_info = {
            'word': word,
            'is_location': is_location,
            'score': best_score,
            'index_interval': [i, i+len(word)-1]
        }
    word_matches.append(match_info)
    #results['labeled_words'].append((word, is_location, best_match, best_score))

In [349]:
word_matches

{'unigram': [(0.8333333333333334,
   {'word': ('samsung',),
    'score': 0.8333333333333334,
    'index_interval': [0, 0]}),
  (0.75, {'word': ('samsung',), 'score': 0.75, 'index_interval': [0, 0]}),
  (0.75, {'word': ('samsung',), 'score': 0.75, 'index_interval': [0, 0]}),
  (0.7142857142857143,
   {'word': ('samsung',),
    'score': 0.7142857142857143,
    'index_interval': [0, 0]}),
  (0.625, {'word': ('samsung',), 'score': 0.625, 'index_interval': [0, 0]}),
  (0.625, {'word': ('samsung',), 'score': 0.625, 'index_interval': [0, 0]}),
  (0.7142857142857143,
   {'word': ('samsung',),
    'score': 0.7142857142857143,
    'index_interval': [0, 0]}),
  (0.6, {'word': ('samsung',), 'score': 0.6, 'index_interval': [0, 0]}),
  (0.8333333333333334,
   {'word': ('samsung',),
    'score': 0.8333333333333334,
    'index_interval': [0, 0]}),
  (0.6923076923076923,
   {'word': ('metrobekasi',),
    'score': 0.6923076923076923,
    'index_interval': [2, 2]})],
 'bigram': [],
 'trigram': []}

# LOOP TO ALL CITY DICTIONARY

In [292]:
merchant_words = data_location_unknown['word_list'][9997].to_list()
merchant_words

['samsung', 'rstore', 'metrobekasi']

In [9]:
city_dictionary = {
    'unigram':{},
    'bigram':{},
}

In [10]:
for unigram in cities_regencies['unigram']:
    if unigram[0] not in city_dictionary['unigram']:
        unigram_list = [unigram]
        city_dictionary['unigram'][unigram[0]] = unigram_list
    else:
        city_dictionary['unigram'][unigram[0]].append(unigram)

In [11]:
for bigram in cities_regencies['bigram']:
    if bigram[0][0] not in city_dictionary['bigram']:
        bigram_list = [bigram]
        city_dictionary['bigram'][bigram[0][0]] = bigram_list
    else:
        city_dictionary['bigram'][bigram[0][0]].append(bigram)

In [314]:
unigram_threshold = 0.81
merchant_unigram = get_word_ngram_from_list(merchant_words,1)

In [315]:
merchant_unigram[0][0][0]

's'

In [280]:
word_matches = {'unigram':[], 'bigram':[], 'trigram':[]}

In [282]:
%%time
unigram_threshold = 0.6
merchant_unigram = get_word_ngram_from_list(merchant_words,1)

# unigram matching
for i, word in enumerate(merchant_unigram):
    first_char = word[0][0]

    if first_char not in city_dictionary['unigram']:
         continue
        
    for city_unigram in city_dictionary['unigram'][first_char]:
    #for city_unigram in cities_regencies['unigram']:
        
        best_match = None
        best_score = 0
    
        string_word = " ".join(word)
        word_char = get_char_ngrams(string_word,1)
        
        score = jaccard_similarity(city_unigram, word_char)
        if score > best_score:
            best_score = score
        
        is_location = best_score >= unigram_threshold
        match_info = (
            best_score,
                {
                'word': word,
                'score': best_score,
                'index_interval': [i,i+len(word)-1]
                }
        )
        if is_location:
            print(city_unigram)
            word_matches['unigram'].append(match_info)

['s', 'i', 'm', 'a', 'l', 'u', 'n', 'g', 'u', 'n']
['s', 'u', 'm', 'e', 'd', 'a', 'n', 'g']
['s', 'u', 'b', 'a', 'n', 'g']
['s', 'e', 'm', 'a', 'r', 'a', 'n', 'g']
['s', 'e', 'm', 'a', 'r', 'a', 'n', 'g']
['s', 'a', 'm', 'p', 'a', 'n', 'g']
['s', 'u', 'm', 'b', 'a', 't', 'e', 'n', 'g', 'a', 'h']
['s', 'a', 'n', 'g', 'g', 'a', 'u']
['m', 'a', 'n', 'o', 'k', 'w', 'a', 'r', 'i', 's', 'e', 'l', 'a', 't', 'a', 'n']
CPU times: user 601 μs, sys: 7 μs, total: 608 μs
Wall time: 595 μs


In [283]:
word_matches

{'unigram': [(0.8333333333333334,
   {'word': ('samsung',),
    'score': 0.8333333333333334,
    'index_interval': [0, 0]}),
  (0.75, {'word': ('samsung',), 'score': 0.75, 'index_interval': [0, 0]}),
  (0.75, {'word': ('samsung',), 'score': 0.75, 'index_interval': [0, 0]}),
  (0.7142857142857143,
   {'word': ('samsung',),
    'score': 0.7142857142857143,
    'index_interval': [0, 0]}),
  (0.625, {'word': ('samsung',), 'score': 0.625, 'index_interval': [0, 0]}),
  (0.625, {'word': ('samsung',), 'score': 0.625, 'index_interval': [0, 0]}),
  (0.7142857142857143,
   {'word': ('samsung',),
    'score': 0.7142857142857143,
    'index_interval': [0, 0]}),
  (0.6, {'word': ('samsung',), 'score': 0.6, 'index_interval': [0, 0]}),
  (0.8333333333333334,
   {'word': ('samsung',),
    'score': 0.8333333333333334,
    'index_interval': [0, 0]}),
  (0.6923076923076923,
   {'word': ('metrobekasi',),
    'score': 0.6923076923076923,
    'index_interval': [2, 2]})],
 'bigram': [],
 'trigram': []}

In [269]:
cities_regencies['bigram'][0][0][0]

'a'

In [270]:
bigram_threshold = 0.55
merchant_bigram = get_word_ngram_from_list(merchant_words,2)

In [271]:
merchant_bigram

[('samsung', 'rstore'), ('rstore', 'metrobekasi')]

In [272]:
%%time
#word_matches = {'unigram':[], 'bigram':[], 'trigram':[]}
bigram_threshold = 0.65
merchant_bigram = get_word_ngram_from_list(merchant_words,2)

for i, word in enumerate(merchant_bigram):
    first_chars = (word[0][0], word[1][0])

    for first_char in first_chars:
        if first_char not in city_dictionary['bigram']:
            continue
        
        for city_bigram in city_dictionary['bigram'][first_char]:
            best_match = None
            best_score = 0
        
            string_word = " ".join(word)
            word_char = get_char_ngrams(string_word,2)
            
            score = jaccard_similarity(city_bigram, word_char)
            if score > best_score:
                best_score = score
            
            is_location = best_score >= bigram_threshold
            match_info = {
                    'word': word,
                    'score': best_score,
                    'index_interval': [i,i+len(word)-1]
                }
            if is_location:
                print(city_bigram)
                word_matches['bigram'].append(match_info)

CPU times: user 850 μs, sys: 0 ns, total: 850 μs
Wall time: 854 μs


In [273]:
word_matches

{'unigram': [(0.8333333333333334,
   {'word': ('samsung',),
    'score': 0.8333333333333334,
    'index_interval': [0, 0]})],
 'bigram': [],
 'trigram': []}

In [229]:
#word_matches = {'unigram':[], 'bigram':[], 'trigram':[]}
trigram_threshold = 0.7
merchant_trigram = get_word_ngram_from_list(merchant_words,3)

for city_trigram in cities_regencies['trigram']:
    # trigram matching
    for i, word in enumerate(merchant_trigram):
        best_match = None
        best_score = 0
    
        string_word = " ".join(word)
        word_char = get_char_ngrams(string_word,3)
        
        score = jaccard_similarity(city_trigram, word_char)
        if score > best_score:
            best_score = score
        
        is_location = best_score >= trigram_threshold
        match_info = {
                'word': word,
                'is_location': is_location,
                'score': best_score,
                'index_interval': [i,i+len(word)-1]
            }
        if is_location:
            print(city_trigram)
            word_matches['trigram'].append(match_info)

In [230]:
get_word_ngram_from_list(merchant_words,3)

[('alfamrt', 'x761', 'pr2'), ('x761', 'pr2', 'pand'), ('pr2', 'pand', 'bogor')]

In [231]:
word_matches

{'unigram': [{'word': ('bogor',),
   'is_location': True,
   'score': 1.0,
   'index_interval': [4, 4]},
  {'word': ('bogor',),
   'is_location': True,
   'score': 1.0,
   'index_interval': [4, 4]}],
 'bigram': [],
 'trigram': []}

# FIND & EXTRACT LOCATION INDEX

In [175]:
def find_location_index(merchant_words, 
                        cities_regencies, 
                        unigram_threshold=0.9,
                        bigram_threshold=0.65,
                       ):
    word_matches = {'unigram':[], 'bigram':[]}
    
    # unigram matching
    #unigram_threshold = 0.81
    merchant_unigram = get_word_ngram_from_list(merchant_words,1)

    for city_unigram in cities_regencies['unigram']:
        for i, word in enumerate(merchant_unigram):
            best_match = None
            best_score = 0
        
            string_word = " ".join(word)
            word_char = get_char_ngrams(string_word,1)
            
            score = jaccard_similarity(city_unigram, word_char)
            if score > best_score:
                best_score = score


            is_location = best_score >= unigram_threshold
            match_info = (
                best_score,
                    {
                    'word': word,
                    'score': best_score,
                    'index_interval': [i,i+len(word)-1]
                    }
            )
            if is_location:
                word_matches['unigram'].append(match_info)

    # bigram matching 
    merchant_bigram = get_word_ngram_from_list(merchant_words,2)
    for city_bigram in cities_regencies['bigram']:
        for i, word in enumerate(merchant_bigram):
            best_match = None
            best_score = 0
        
            string_word = " ".join(word)
            word_char = get_char_ngrams(string_word,2)
            
            score = jaccard_similarity(city_bigram, word_char)
            if score > best_score:
                best_score = score
            
            is_location = best_score >= bigram_threshold
            match_info = (
                best_score,
                    {
                    'word': word,
                    'score': best_score,
                    'index_interval': [i,i+len(word)-1]
                    }
            )
            if is_location:
                word_matches['bigram'].append(match_info)    

    return word_matches

In [22]:
# WITH DICTIONARY
def find_location_index(merchant_words, 
                        city_dict, 
                        unigram_threshold=0.9,
                        bigram_threshold=0.67,
                       ):
    word_matches = {'unigram':[], 'bigram':[]}
    
    # unigram matching
    #unigram_threshold = 0.81
    merchant_unigram = get_word_ngram_from_list(merchant_words,1)
    for i, word in enumerate(merchant_unigram):   
        if len(word) == 0:
            continue
            
        if len(word[0]) > 0:
            first_char = word[0][0]
        else:
            continue
            
        if first_char not in city_dict['unigram']:
             continue
            
        for city_unigram in city_dict['unigram'][first_char]:
            best_match = None
            best_score = 0
        
            string_word = " ".join(word)
            word_char = get_char_ngrams(string_word,1)
            
            score = jaccard_similarity(city_unigram, word_char)
            if score > best_score:
                best_score = score
            
            is_location = best_score >= unigram_threshold
            match_info = (
                best_score,
                    {
                    'word': word,
                    'score': best_score,
                    'index_interval': [i,i+len(word)-1]
                    }
            )
            if is_location:
                word_matches['unigram'].append(match_info)

    # bigram matching 
    merchant_bigram = get_word_ngram_from_list(merchant_words,2)
    for i, word in enumerate(merchant_bigram):
        first_chars = []
        
        for w in word:
            if len(w) == 0:
                continue
            if len(w[0]) > 0:
                first_chars.append(w[0])
                
        #first_chars = (word[0][0], word[1][0])
    
        for first_char in first_chars:
            if first_char not in city_dict['bigram']:
                continue
            
            for city_bigram in city_dict['bigram'][first_char]:
                best_match = None
                best_score = 0
            
                string_word = " ".join(word)
                word_char = get_char_ngrams(string_word,2)
                
                score = jaccard_similarity(city_bigram, word_char)
                if score > best_score:
                    best_score = score
                
                is_location = best_score >= bigram_threshold
                match_info = (
                    best_score,
                        {
                        'word': word,
                        'score': best_score,
                        'index_interval': [i,i+len(word)-1]
                        }
                )
                if is_location:
                    word_matches['bigram'].append(match_info) 

    return word_matches

In [23]:
merchant_words = data_location_unknown['word_list'][9997].to_list()
merchant_words

['samsung', 'rstore', 'metrobekasi']

In [24]:
%%time
result = find_location_index(merchant_words,
                             city_dictionary
                            )

NameError: name 'city_dictionary' is not defined

In [190]:
result

{'unigram': [], 'bigram': []}

In [191]:
sorted(result['bigram'], key= lambda x: x[0], reverse=True)

[]

In [192]:
def extract_location_index(word_matches):
    unigram_sorted = sorted(word_matches['unigram'], key=lambda x: x[0], reverse=True)
    bigram_sorted = sorted(word_matches['bigram'], key=lambda x: x[0], reverse=True)

    result = [-1,-1]
    unigram_index = set()
    bigram_index = set()
    
    if len(unigram_sorted) > 0:
        unigram_index = set(unigram_sorted[0][1]['index_interval'])

    if len(bigram_sorted) > 0:
        bigram_index = set(bigram_sorted[0][1]['index_interval'])

    result[0] = min(unigram_index.union(bigram_index), default=-1)
    result[1] = max(unigram_index.union(bigram_index), default=-1)
    
    return result

In [193]:
extract_location_index(result)

[-1, -1]

In [194]:
def process_loc_index_on_dataframe(merchant_words, cities_regencies):
    result = find_location_index(merchant_words,cities_regencies)
    return extract_location_index(result)

In [197]:
%%time
data_location_unknown = data_location_unknown.with_columns(
    pl.when(pl.col('country_code') == 'ID').then(
        pl.col('word_list').map_elements(
            lambda word: process_loc_index_on_dataframe(word, city_dictionary), 
            return_dtype = pl.List(pl.Int32)
        )
    ).otherwise(pl.lit(None)).alias('location_index')
)

CPU times: user 13min 35s, sys: 9.33 s, total: 13min 44s
Wall time: 13min 25s


In [198]:
data_location_unknown

source,merchant_name,merchant_location,online_offline_flag,country_code,word_list,location_index
str,str,str,str,str,list[str],list[i32]
"""CREDIT_CARD""","""CHOP BUNTUT CAK YO SLEMAN (KAB…",,,"""ID""","[""chop"", ""buntut"", … ""kab""]","[3, 4]"
"""CREDIT_CARD""","""TBL* THE SEED LEVEL AMSTERDAM …",,,"""NL""","[""tbl"", ""the"", … ""amsterdam""]",
"""CREDIT_CARD""","""SILOAM CLINIC GRH RAYA TANGERA…",,,"""ID""","[""siloam"", ""clinic"", … ""slt""]","[4, 4]"
"""CREDIT_CARD""","""SWALAYAN ZOOM MBL TANJUNG PINA…",,,"""ID""","[""swalayan"", ""zoom"", … ""pinan""]","[3, 4]"
"""CREDIT_CARD""","""PEDRO SUN PLAZA MEDAN MEDAN KO…",,,"""ID""","[""pedro"", ""sun"", … ""kot""]","[3, 4]"
…,…,…,…,…,…,…
"""CREDIT_CARD""","""AUSSIE TOYS ONLINE PAKENHAM AU""",,,"""AU""","[""aussie"", ""toys"", … ""pakenham""]",
"""CREDIT_CARD""","""TOKO EMAS GINBERS MEDAN KOT. I…",,"""OFFLINE""","""ID""","[""toko"", ""emas"", … ""kot""]","[3, 3]"
"""CREDIT_CARD""","""BINTANG SPORT SERANG (KOTA)ID""",,,"""ID""","[""bintang"", ""sport"", … ""kota""]","[2, 2]"
"""CREDIT_CARD""","""ACE HARDWARE MALL PI I JAKARTA…",,,"""ID""","[""ace"", ""hardware"", … ""jakarta""]","[-1, -1]"


In [200]:
data_location_unknown.write_parquet('data_location_labeled.parquet')

# LOAD LABELED LOCATION FROM FILE

In [16]:
data_labeled = pl.read_parquet('data_location_labeled.parquet')

In [17]:
data_labeled

source,merchant_name,merchant_location,online_offline_flag,country_code,word_list,location_index,extracted_location
str,str,str,str,str,list[str],list[i32],str
"""CREDIT_CARD""","""CHOP BUNTUT CAK YO SLEMAN (KAB…",,,"""ID""","[""chop"", ""buntut"", … ""kab""]","[3, 4]","""yo sleman"""
"""CREDIT_CARD""","""TBL* THE SEED LEVEL AMSTERDAM …",,,"""NL""","[""tbl"", ""the"", … ""amsterdam""]",,
"""CREDIT_CARD""","""SILOAM CLINIC GRH RAYA TANGERA…",,,"""ID""","[""siloam"", ""clinic"", … ""slt""]","[4, 4]","""tangerang"""
"""CREDIT_CARD""","""SWALAYAN ZOOM MBL TANJUNG PINA…",,,"""ID""","[""swalayan"", ""zoom"", … ""pinan""]","[3, 4]","""tanjung pinan"""
"""CREDIT_CARD""","""PEDRO SUN PLAZA MEDAN MEDAN KO…",,,"""ID""","[""pedro"", ""sun"", … ""kot""]","[3, 4]","""medan medan"""
…,…,…,…,…,…,…,…
"""CREDIT_CARD""","""AUSSIE TOYS ONLINE PAKENHAM AU""",,,"""AU""","[""aussie"", ""toys"", … ""pakenham""]",,
"""CREDIT_CARD""","""TOKO EMAS GINBERS MEDAN KOT. I…",,"""OFFLINE""","""ID""","[""toko"", ""emas"", … ""kot""]","[3, 3]","""medan"""
"""CREDIT_CARD""","""BINTANG SPORT SERANG (KOTA)ID""",,,"""ID""","[""bintang"", ""sport"", … ""kota""]","[2, 2]","""serang"""
"""CREDIT_CARD""","""ACE HARDWARE MALL PI I JAKARTA…",,,"""ID""","[""ace"", ""hardware"", … ""jakarta""]","[-1, -1]",


In [18]:
sample = data_labeled.filter(pl.col('country_code') == 'ID').tail(5)

In [19]:
sample.filter(pl.col('merchant_name') == 'ACE HARDWARE MALL PI I JAKARTA ID')['word_list']

word_list
list[str]
"[""ace"", ""hardware"", … ""jakarta""]"


In [30]:
merchant_words = sample.filter(pl.col('merchant_name') == "ACE HARDWARE MALL PI I JAKARTA ID")['word_list'].to_list()[0]
merchant_words

['ace', 'hardware', 'mall', 'pi', 'i', 'jakarta']

In [31]:
%%time
find_location_index(merchant_words,
                             city_dictionary,
                             unigram_threshold = 0.8,
                             bigram_threshold = 0.7
                            )

CPU times: user 1.43 ms, sys: 0 ns, total: 1.43 ms
Wall time: 1.44 ms


{'unigram': [(0.8333333333333334,
   {'word': ('jakarta',),
    'score': 0.8333333333333334,
    'index_interval': [5, 5]}),
  (0.8333333333333334,
   {'word': ('jakarta',),
    'score': 0.8333333333333334,
    'index_interval': [5, 5]})],
 'bigram': []}

In [298]:
x = [1,2,3,4,5]

In [310]:
x[3:4+1]

[4, 5]

In [302]:
merchant_words = sample['word_list'][4]
merchant_words

"""idm"""
"""tjaj"""
"""lombok"""
"""timur"""


In [32]:
def extract_location(row):
    word_list = row["word_list"]
    loc_index = row["location_index"]

    if loc_index is None or loc_index == [-1,-1]:
        return None
    
    loc_index_start = loc_index[0]
    loc_index_end = loc_index[1]
        
    return " ".join(word_list[loc_index_start:loc_index_end+1])

In [33]:
data_labeled = data_labeled.with_columns(
    pl.struct(["word_list","location_index"]).map_elements(extract_location, return_dtype = pl.String).alias("extracted_location")
)

In [34]:
data_labeled

source,merchant_name,merchant_location,online_offline_flag,country_code,word_list,location_index,extracted_location
str,str,str,str,str,list[str],list[i32],str
"""CREDIT_CARD""","""CHOP BUNTUT CAK YO SLEMAN (KAB…",,,"""ID""","[""chop"", ""buntut"", … ""kab""]","[3, 4]","""yo sleman"""
"""CREDIT_CARD""","""TBL* THE SEED LEVEL AMSTERDAM …",,,"""NL""","[""tbl"", ""the"", … ""amsterdam""]",,
"""CREDIT_CARD""","""SILOAM CLINIC GRH RAYA TANGERA…",,,"""ID""","[""siloam"", ""clinic"", … ""slt""]","[4, 4]","""tangerang"""
"""CREDIT_CARD""","""SWALAYAN ZOOM MBL TANJUNG PINA…",,,"""ID""","[""swalayan"", ""zoom"", … ""pinan""]","[3, 4]","""tanjung pinan"""
"""CREDIT_CARD""","""PEDRO SUN PLAZA MEDAN MEDAN KO…",,,"""ID""","[""pedro"", ""sun"", … ""kot""]","[3, 4]","""medan medan"""
…,…,…,…,…,…,…,…
"""CREDIT_CARD""","""AUSSIE TOYS ONLINE PAKENHAM AU""",,,"""AU""","[""aussie"", ""toys"", … ""pakenham""]",,
"""CREDIT_CARD""","""TOKO EMAS GINBERS MEDAN KOT. I…",,"""OFFLINE""","""ID""","[""toko"", ""emas"", … ""kot""]","[3, 3]","""medan"""
"""CREDIT_CARD""","""BINTANG SPORT SERANG (KOTA)ID""",,,"""ID""","[""bintang"", ""sport"", … ""kota""]","[2, 2]","""serang"""
"""CREDIT_CARD""","""ACE HARDWARE MALL PI I JAKARTA…",,,"""ID""","[""ace"", ""hardware"", … ""jakarta""]","[-1, -1]",


In [337]:
# export data
data_labeled.write_parquet('data_location_labeled.parquet')

In [525]:
data_labeled

source,merchant_name,merchant_location,online_offline_flag,country_code,word_list,location_index,extracted_location,enhanced_location,cleaned_location
str,str,str,str,str,list[str],list[i32],str,str,str
"""CREDIT_CARD""","""CHOP BUNTUT CAK YO SLEMAN (KAB…",,,"""ID""","[""chop"", ""buntut"", … ""kab""]","[3, 4]","""yo sleman""","""sleman""","""sleman"""
"""CREDIT_CARD""","""TBL* THE SEED LEVEL AMSTERDAM …",,,"""NL""","[""tbl"", ""the"", … ""amsterdam""]",,,,
"""CREDIT_CARD""","""SILOAM CLINIC GRH RAYA TANGERA…",,,"""ID""","[""siloam"", ""clinic"", … ""slt""]","[4, 4]","""tangerang""","""tangerang""","""tangerang"""
"""CREDIT_CARD""","""SWALAYAN ZOOM MBL TANJUNG PINA…",,,"""ID""","[""swalayan"", ""zoom"", … ""pinan""]","[3, 4]","""tanjung pinan""","""tanjung pinang""","""tanjung pinan"""
"""CREDIT_CARD""","""PEDRO SUN PLAZA MEDAN MEDAN KO…",,,"""ID""","[""pedro"", ""sun"", … ""kot""]","[3, 4]","""medan medan""","""medan""","""medan"""
…,…,…,…,…,…,…,…,…,…
"""CREDIT_CARD""","""AUSSIE TOYS ONLINE PAKENHAM AU""",,,"""AU""","[""aussie"", ""toys"", … ""pakenham""]",,,,
"""CREDIT_CARD""","""TOKO EMAS GINBERS MEDAN KOT. I…",,"""OFFLINE""","""ID""","[""toko"", ""emas"", … ""kot""]","[3, 3]","""medan""","""medan""","""medan"""
"""CREDIT_CARD""","""BINTANG SPORT SERANG (KOTA)ID""",,,"""ID""","[""bintang"", ""sport"", … ""kota""]","[2, 2]","""serang""","""serang""","""serang"""
"""CREDIT_CARD""","""ACE HARDWARE MALL PI I JAKARTA…",,,"""ID""","[""ace"", ""hardware"", … ""jakarta""]","[-1, -1]",,,


In [346]:
data_labeled.filter(pl.col('extracted_location').str.contains(r'jakarta sel'))

source,merchant_name,merchant_location,online_offline_flag,country_code,word_list,location_index,extracted_location
str,str,str,str,str,list[str],list[i32],str
"""CREDIT_CARD""","""SUSHI HIRO PASIFIC PLA JAJAKAR…",,,"""ID""","[""sushi"", ""hiro"", … ""selat""]","[4, 5]","""jajakarta selat"""
"""CREDIT_CARD""","""KYZN KUNINGAN MBL 1 JAKARTA SE…",,,"""ID""","[""kyzn"", ""kuningan"", … ""selat""]","[1, 5]","""kuningan mbl 1 jakarta selat"""
"""CREDIT_CARD""","""IRWAN TEAM JAKARTA SELATID""",,,"""ID""","[""irwan"", ""team"", … ""selat""]","[2, 3]","""jakarta selat"""
"""CREDIT_CARD""","""KEBULI BY KHAN CAB.FAT JAKARTA…",,,"""ID""","[""kebuli"", ""by"", … ""selat""]","[4, 5]","""jakarta selat"""
"""CREDIT_CARD""","""HARDWARE BG JAKARTA SELATID""",,,"""ID""","[""hardware"", ""bg"", … ""selat""]","[2, 3]","""jakarta selat"""
…,…,…,…,…,…,…,…
"""CREDIT_CARD""","""DAPUR SATE MAK SYUKUR JAKARTA …",,,"""ID""","[""dapur"", ""sate"", … ""selat""]","[4, 5]","""jakarta selat"""
"""CREDIT_CARD""","""GUESS - KOTA KASABLAN JAKARTA …",,,"""ID""","[""guess"", ""kota"", … ""sela""]","[3, 4]","""jakarta sela"""
"""CREDIT_CARD""","""SUCK MY DUCK JAKARTA SELATID""",,,"""ID""","[""suck"", ""my"", … ""selat""]","[3, 4]","""jakarta selat"""
"""CREDIT_CARD""","""INDRI COLLECTION JAKARTA SELAT…",,,"""ID""","[""indri"", ""collection"", … ""selat""]","[2, 3]","""jakarta selat"""


# TEXT ADJUSTMENT (COMPLETION AND DELETION) TO ENHANCE THE LABEL

In [316]:
# Description preprocessing (Training pipeline)
# 1. Extract and remove last 2 characters (Country code)
# 2. Remove non-alphanumeric
# 3. Remove extra spaces (should be just 1 space)
# 4. Enhance extracted location (completion & deletion)
# 5. Put the enhanced location back to description
# 6. Extract location starting & ending index
# DONE - Go to Training

# Description preprocessing (Inference pipeline)
# 1. Extract and remove last 2 characters (Country code)
# 2. Remove non-alphanumeric
# 3. Remove extra spaces (should be just 1 space)
# DONE - Go to inference

# Postprocessing (Inference pipeline)
# 1. Enhance extracted location (completion & deletion)
data_labeled

source,merchant_name,merchant_location,online_offline_flag,country_code,word_list,location_index,extracted_location
str,str,str,str,str,list[str],list[i32],str
"""CREDIT_CARD""","""CHOP BUNTUT CAK YO SLEMAN (KAB…",,,"""ID""","[""chop"", ""buntut"", … ""kab""]","[3, 4]","""yo sleman"""
"""CREDIT_CARD""","""TBL* THE SEED LEVEL AMSTERDAM …",,,"""NL""","[""tbl"", ""the"", … ""amsterdam""]",,
"""CREDIT_CARD""","""SILOAM CLINIC GRH RAYA TANGERA…",,,"""ID""","[""siloam"", ""clinic"", … ""slt""]","[4, 4]","""tangerang"""
"""CREDIT_CARD""","""SWALAYAN ZOOM MBL TANJUNG PINA…",,,"""ID""","[""swalayan"", ""zoom"", … ""pinan""]","[3, 4]","""tanjung pinan"""
"""CREDIT_CARD""","""PEDRO SUN PLAZA MEDAN MEDAN KO…",,,"""ID""","[""pedro"", ""sun"", … ""kot""]","[3, 4]","""medan medan"""
…,…,…,…,…,…,…,…
"""CREDIT_CARD""","""AUSSIE TOYS ONLINE PAKENHAM AU""",,,"""AU""","[""aussie"", ""toys"", … ""pakenham""]",,
"""CREDIT_CARD""","""TOKO EMAS GINBERS MEDAN KOT. I…",,"""OFFLINE""","""ID""","[""toko"", ""emas"", … ""kot""]","[3, 3]","""medan"""
"""CREDIT_CARD""","""BINTANG SPORT SERANG (KOTA)ID""",,,"""ID""","[""bintang"", ""sport"", … ""kota""]","[2, 2]","""serang"""
"""CREDIT_CARD""","""ACE HARDWARE MALL PI I JAKARTA…",,,"""ID""","[""ace"", ""hardware"", … ""jakarta""]","[-1, -1]",


In [12]:
city_dictionary['full_text'] = {}
for city_name in cities_regencies['name_clean']:
    city_name = city_name.strip().lower()
    first_char = city_name[0]
    if first_char not in city_dictionary['full_text']:
        city_dictionary['full_text'][first_char] = [city_name]
    else:
        city_dictionary['full_text'][first_char].append(city_name)


In [14]:
import pickle

In [15]:
file_path=open("data/city_dictionary.pkl", "wb")
pickle.dump(city_dictionary,file_path)
file_path.close()

In [204]:
def edit_distance(str1, str2):
    """
    Calculate the Levenshtein (edit) distance between two strings.
    
    The edit distance is the minimum number of single-character operations
    (insertions, deletions, or substitutions) required to change one string
    into another.
    
    Args:
        str1: First string
        str2: Second string
        
    Returns:
        The edit distance between str1 and str2
    """
    # Create a matrix of size (len(str1)+1) x (len(str2)+1)
    dp = [[0 for _ in range(len(str2) + 1)] for _ in range(len(str1) + 1)]
    
    # Initialize the first row and column
    for i in range(len(str1) + 1):
        dp[i][0] = i
    for j in range(len(str2) + 1):
        dp[0][j] = j
    
    # Fill the matrix
    for i in range(1, len(str1) + 1):
        for j in range(1, len(str2) + 1):
            # If characters match, no operation needed
            if str1[i - 1] == str2[j - 1]:
                dp[i][j] = dp[i - 1][j - 1]
            else:
                # Take the minimum of three operations: insert, delete, substitute
                dp[i][j] = 1 + min(
                    dp[i - 1][j],      # deletion
                    dp[i][j - 1],      # insertion
                    dp[i - 1][j - 1]   # substitution
                )
    
    return dp[len(str1)][len(str2)]

# Example usage
print(edit_distance("jajakarta selat", "jakarta selatan"))  # Output: 3
print(edit_distance("sunday", "saturday"))  # Output: 3

4
3


In [319]:
cities_regencies.filter(pl.col('name').str.contains(r'BAT'))

id,id_provinces,name,name_clean,unigram,bigram,trigram
i64,i64,str,str,object,object,object
1207,12,"""KABUPATEN LABUHAN BATU""","""LABUHAN BATU""","['l', 'a', 'b', 'u', 'h', 'a', 'n', 'b', 'a', 't', 'u']","['la', 'ab', 'bu', 'uh', 'ha', 'an', 'nb', 'ba', 'at', 'tu']","['lab', 'abu', 'buh', 'uha', 'han', 'anb', 'nba', 'bat', 'atu']"
1219,12,"""KABUPATEN BATU BARA""","""BATU BARA""","['b', 'a', 't', 'u', 'b', 'a', 'r', 'a']","['ba', 'at', 'tu', 'ub', 'ba', 'ar', 'ra']","['bat', 'atu', 'tub', 'uba', 'bar', 'ara']"
1222,12,"""KABUPATEN LABUHAN BATU SELATAN""","""LABUHAN BATU SELATAN""","['l', 'a', 'b', 'u', 'h', 'a', 'n', 'b', 'a', 't', 'u', 's', 'e', 'l', 'a', 't', 'a', 'n']","['la', 'ab', 'bu', 'uh', 'ha', 'an', 'nb', 'ba', 'at', 'tu', 'us', 'se', 'el', 'la', 'at', 'ta', 'an']","['lab', 'abu', 'buh', 'uha', 'han', 'anb', 'nba', 'bat', 'atu', 'tus', 'use', 'sel', 'ela', 'lat', 'ata', 'tan']"
1223,12,"""KABUPATEN LABUHAN BATU UTARA""","""LABUHAN BATU UTARA""","['l', 'a', 'b', 'u', 'h', 'a', 'n', 'b', 'a', 't', 'u', 'u', 't', 'a', 'r', 'a']","['la', 'ab', 'bu', 'uh', 'ha', 'an', 'nb', 'ba', 'at', 'tu', 'uu', 'ut', 'ta', 'ar', 'ra']","['lab', 'abu', 'buh', 'uha', 'han', 'anb', 'nba', 'bat', 'atu', 'tuu', 'uut', 'uta', 'tar', 'ara']"
1504,15,"""KABUPATEN BATANG HARI""","""BATANG HARI""","['b', 'a', 't', 'a', 'n', 'g', 'h', 'a', 'r', 'i']","['ba', 'at', 'ta', 'an', 'ng', 'gh', 'ha', 'ar', 'ri']","['bat', 'ata', 'tan', 'ang', 'ngh', 'gha', 'har', 'ari']"
2171,21,"""KOTA BATAM""","""BATAM""","['b', 'a', 't', 'a', 'm']","['ba', 'at', 'ta', 'am']","['bat', 'ata', 'tam']"
3325,33,"""KABUPATEN BATANG""","""BATANG""","['b', 'a', 't', 'a', 'n', 'g']","['ba', 'at', 'ta', 'an', 'ng']","['bat', 'ata', 'tan', 'ang']"
3579,35,"""KOTA BATU""","""BATU""","['b', 'a', 't', 'u']","['ba', 'at', 'tu']","['bat', 'atu']"
5308,53,"""KABUPATEN LEMBATA""","""LEMBATA""","['l', 'e', 'm', 'b', 'a', 't', 'a']","['le', 'em', 'mb', 'ba', 'at', 'ta']","['lem', 'emb', 'mba', 'bat', 'ata']"


In [342]:
re.search(r'badung', 'ho badung')

<re.Match object; span=(3, 9), match='badung'>

In [374]:
def enhance_location(row, threshold = 0.8, all_text_threshold = 0.7):
    if row is None or len(row) == 0:
        return None

    # SPECIAL CASE: BADUNG & BANDUNG
    if re.search(r'(bandung\s?barat)', row):
        return row, [(1.1, 'bandung barat')]
    if re.search(r'(bdg\s?barat)', row):
        return row, [(1.1, 'bdg barat')]
    if re.search(r'bandung', row):
        return row, [(1.1, 'bandung')]
    if re.search(r'(bdg)', row):
        return row, [(1.1, 'bdg')]
    if re.search(r'badung', row):
        return row, [(1.1, 'badung')]

    # word by word
    match = []
    for loc_word in row.split(" "):
        if len(loc_word) == 0:
            continue
        if loc_word[0] not in city_dictionary['full_text']:
            continue
        
        for possible_loc in city_dictionary['full_text'][loc_word[0]]:
            score = jaccard_similarity(possible_loc, loc_word)
            if possible_loc in loc_word:
                score += 0.01

            if score >= threshold:
                match.append((score,loc_word, 'wbw')) # change loc_word to possible_loc for full enhancement
    # single
    if row[0] in city_dictionary['full_text']:
        for possible_loc in city_dictionary['full_text'][row[0]]:
            score = jaccard_similarity(possible_loc, row)
            if score >= all_text_threshold:
                match.append((1.01*score, row ,'sin')) # change row to possible_loc for full enhancement

    return row, sorted(match, key=lambda x: x[0], reverse=True)

In [375]:
%%time
for row in data_labeled['extracted_location'].sample(10):
    print(enhance_location(row))

None
None
('jakarta barat', [(1.01, 'jakarta barat', 'sin'), (0.7575000000000001, 'jakarta barat', 'sin')])
('bekasi', [(1.01, 'bekasi', 'wbw'), (1.01, 'bekasi', 'wbw'), (1.01, 'bekasi', 'sin'), (1.01, 'bekasi', 'sin')])
None
('jakarta pusa', [(1.01, 'jakarta pusa', 'sin'), (0.7855555555555556, 'jakarta pusa', 'sin')])
None
None
('sukabumi kab', [(1.01, 'sukabumi', 'wbw'), (1.01, 'sukabumi', 'wbw'), (0.88375, 'sukabumi kab', 'sin'), (0.88375, 'sukabumi kab', 'sin'), (0.707, 'sukabumi kab', 'sin')])
('bebekasi', [(1.01, 'bebekasi', 'wbw'), (1.01, 'bebekasi', 'wbw'), (1.01, 'bebekasi', 'sin'), (1.01, 'bebekasi', 'sin')])
CPU times: user 4.03 ms, sys: 0 ns, total: 4.03 ms
Wall time: 2.79 ms


In [1]:
enhance_location("jakarta slt")

NameError: name 'enhance_location' is not defined

In [359]:
def create_enhanced_location(row):
    enhanced = enhance_location(row)

    if len(enhanced[1]) == 0:
        return None
    if len(enhanced[1][0]) == 0:
        return None
    
    return enhanced[1][0][1]

In [376]:
%%time
data_labeled = data_labeled.with_columns(
    pl.col('extracted_location').map_elements(lambda x: create_enhanced_location(x), return_dtype=pl.String).alias('cleaned_location')
)

CPU times: user 41.5 s, sys: 94.5 ms, total: 41.6 s
Wall time: 41.4 s


In [377]:
data_labeled

source,merchant_name,merchant_location,online_offline_flag,country_code,word_list,location_index,extracted_location,enhanced_location,cleaned_location
str,str,str,str,str,list[str],list[i32],str,str,str
"""CREDIT_CARD""","""CHOP BUNTUT CAK YO SLEMAN (KAB…",,,"""ID""","[""chop"", ""buntut"", … ""kab""]","[3, 4]","""yo sleman""","""sleman""","""sleman"""
"""CREDIT_CARD""","""TBL* THE SEED LEVEL AMSTERDAM …",,,"""NL""","[""tbl"", ""the"", … ""amsterdam""]",,,,
"""CREDIT_CARD""","""SILOAM CLINIC GRH RAYA TANGERA…",,,"""ID""","[""siloam"", ""clinic"", … ""slt""]","[4, 4]","""tangerang""","""tangerang""","""tangerang"""
"""CREDIT_CARD""","""SWALAYAN ZOOM MBL TANJUNG PINA…",,,"""ID""","[""swalayan"", ""zoom"", … ""pinan""]","[3, 4]","""tanjung pinan""","""tanjung pinang""","""tanjung pinan"""
"""CREDIT_CARD""","""PEDRO SUN PLAZA MEDAN MEDAN KO…",,,"""ID""","[""pedro"", ""sun"", … ""kot""]","[3, 4]","""medan medan""","""medan""","""medan"""
…,…,…,…,…,…,…,…,…,…
"""CREDIT_CARD""","""AUSSIE TOYS ONLINE PAKENHAM AU""",,,"""AU""","[""aussie"", ""toys"", … ""pakenham""]",,,,
"""CREDIT_CARD""","""TOKO EMAS GINBERS MEDAN KOT. I…",,"""OFFLINE""","""ID""","[""toko"", ""emas"", … ""kot""]","[3, 3]","""medan""","""medan""","""medan"""
"""CREDIT_CARD""","""BINTANG SPORT SERANG (KOTA)ID""",,,"""ID""","[""bintang"", ""sport"", … ""kota""]","[2, 2]","""serang""","""serang""","""serang"""
"""CREDIT_CARD""","""ACE HARDWARE MALL PI I JAKARTA…",,,"""ID""","[""ace"", ""hardware"", … ""jakarta""]","[-1, -1]",,,


In [379]:
data_labeled.filter(pl.col('extracted_location').is_not_null() & pl.col('enhanced_location').is_null()).write_json('failed_location_enhancement.json')

In [80]:
jaccard_similarity("JAKARTA SELAT", "JAKARTA SELATAN")

0.9

In [51]:
city_dictionary['full_text']['y']

['yogyakarta', 'yahukimo', 'yalimo']

In [380]:
data_labeled

source,merchant_name,merchant_location,online_offline_flag,country_code,word_list,location_index,extracted_location,enhanced_location,cleaned_location
str,str,str,str,str,list[str],list[i32],str,str,str
"""CREDIT_CARD""","""CHOP BUNTUT CAK YO SLEMAN (KAB…",,,"""ID""","[""chop"", ""buntut"", … ""kab""]","[3, 4]","""yo sleman""","""sleman""","""sleman"""
"""CREDIT_CARD""","""TBL* THE SEED LEVEL AMSTERDAM …",,,"""NL""","[""tbl"", ""the"", … ""amsterdam""]",,,,
"""CREDIT_CARD""","""SILOAM CLINIC GRH RAYA TANGERA…",,,"""ID""","[""siloam"", ""clinic"", … ""slt""]","[4, 4]","""tangerang""","""tangerang""","""tangerang"""
"""CREDIT_CARD""","""SWALAYAN ZOOM MBL TANJUNG PINA…",,,"""ID""","[""swalayan"", ""zoom"", … ""pinan""]","[3, 4]","""tanjung pinan""","""tanjung pinang""","""tanjung pinan"""
"""CREDIT_CARD""","""PEDRO SUN PLAZA MEDAN MEDAN KO…",,,"""ID""","[""pedro"", ""sun"", … ""kot""]","[3, 4]","""medan medan""","""medan""","""medan"""
…,…,…,…,…,…,…,…,…,…
"""CREDIT_CARD""","""AUSSIE TOYS ONLINE PAKENHAM AU""",,,"""AU""","[""aussie"", ""toys"", … ""pakenham""]",,,,
"""CREDIT_CARD""","""TOKO EMAS GINBERS MEDAN KOT. I…",,"""OFFLINE""","""ID""","[""toko"", ""emas"", … ""kot""]","[3, 3]","""medan""","""medan""","""medan"""
"""CREDIT_CARD""","""BINTANG SPORT SERANG (KOTA)ID""",,,"""ID""","[""bintang"", ""sport"", … ""kota""]","[2, 2]","""serang""","""serang""","""serang"""
"""CREDIT_CARD""","""ACE HARDWARE MALL PI I JAKARTA…",,,"""ID""","[""ace"", ""hardware"", … ""jakarta""]","[-1, -1]",,,


# CREATE ENTITY INDEX FROM CLEANED MERCHANT NAME

Description preprocessing (Training pipeline)

1. Extract and remove last 2 characters (Country code)

2. Remove non-alphanumeric

3. Remove extra spaces (should be just 1 space)

4. Enhance extracted location (completion & deletion)

5. Put the enhanced location back to description

6. Extract location starting & ending index

DONE - Go to Training

In [448]:
data_labeled_train = data_labeled[['merchant_name','country_code','enhanced_location','cleaned_location']]

In [313]:
def clean_merchant_name(text):
    remainder = text
    # last two characters are country id
    country_id = remainder[len(text)-2] + remainder[len(text)-1]
    remainder = text[:len(text)-2].lower()

    #return country_id, get_word_ngram(remainder.strip(),1)
    return re.sub(r'[^a-zA-Z0-9\s]','',remainder).strip()

In [450]:
data_labeled_train = data_labeled_train.with_columns(
    pl.col('merchant_name').map_elements(lambda x: clean_merchant_name(x), return_dtype=pl.String).alias('cleaned_merchant_name')
)

In [451]:
data_labeled_train = data_labeled_train[['merchant_name','cleaned_merchant_name','country_code','enhanced_location','cleaned_location']]

In [452]:
data_labeled_train

merchant_name,cleaned_merchant_name,country_code,enhanced_location,cleaned_location
str,str,str,str,str
"""CHOP BUNTUT CAK YO SLEMAN (KAB…","""chop buntut cak yo sleman kab""","""ID""","""sleman""","""sleman"""
"""TBL* THE SEED LEVEL AMSTERDAM …","""tbl the seed level amsterdam""","""NL""",,
"""SILOAM CLINIC GRH RAYA TANGERA…","""siloam clinic grh raya tangera…","""ID""","""tangerang""","""tangerang"""
"""SWALAYAN ZOOM MBL TANJUNG PINA…","""swalayan zoom mbl tanjung pina…","""ID""","""tanjung pinang""","""tanjung pinan"""
"""PEDRO SUN PLAZA MEDAN MEDAN KO…","""pedro sun plaza medan medan ko…","""ID""","""medan""","""medan"""
…,…,…,…,…
"""AUSSIE TOYS ONLINE PAKENHAM AU""","""aussie toys online pakenham""","""AU""",,
"""TOKO EMAS GINBERS MEDAN KOT. I…","""toko emas ginbers medan kot""","""ID""","""medan""","""medan"""
"""BINTANG SPORT SERANG (KOTA)ID""","""bintang sport serang kota""","""ID""","""serang""","""serang"""
"""ACE HARDWARE MALL PI I JAKARTA…","""ace hardware mall pi i jakarta""","""ID""",,


In [434]:
def find_index(row):
    if row['cleaned_location'] is None:
        return None
    
    start = row['cleaned_merchant_name'].index(row['cleaned_location'])
    end = start + len(row['cleaned_location'])
    return [start,end]


In [453]:
# find starting and ending index
data_labeled_train = data_labeled_train.with_columns(
    pl.struct(["cleaned_merchant_name","cleaned_location"]).map_elements(lambda x: find_index(x), return_dtype = pl.List(pl.Int32)).alias('loc_entity_index')
)

In [454]:
data_labeled_train

merchant_name,cleaned_merchant_name,country_code,enhanced_location,cleaned_location,loc_entity_index
str,str,str,str,str,list[i32]
"""CHOP BUNTUT CAK YO SLEMAN (KAB…","""chop buntut cak yo sleman kab""","""ID""","""sleman""","""sleman""","[19, 25]"
"""TBL* THE SEED LEVEL AMSTERDAM …","""tbl the seed level amsterdam""","""NL""",,,[]
"""SILOAM CLINIC GRH RAYA TANGERA…","""siloam clinic grh raya tangera…","""ID""","""tangerang""","""tangerang""","[23, 32]"
"""SWALAYAN ZOOM MBL TANJUNG PINA…","""swalayan zoom mbl tanjung pina…","""ID""","""tanjung pinang""","""tanjung pinan""","[18, 31]"
"""PEDRO SUN PLAZA MEDAN MEDAN KO…","""pedro sun plaza medan medan ko…","""ID""","""medan""","""medan""","[16, 21]"
…,…,…,…,…,…
"""AUSSIE TOYS ONLINE PAKENHAM AU""","""aussie toys online pakenham""","""AU""",,,[]
"""TOKO EMAS GINBERS MEDAN KOT. I…","""toko emas ginbers medan kot""","""ID""","""medan""","""medan""","[18, 23]"
"""BINTANG SPORT SERANG (KOTA)ID""","""bintang sport serang kota""","""ID""","""serang""","""serang""","[14, 20]"
"""ACE HARDWARE MALL PI I JAKARTA…","""ace hardware mall pi i jakarta""","""ID""",,,[]


In [459]:
def add_flag_column(data, condition, flag_column_name, flag_value=None):
    # Create a dictionary with the column name as key and the when-then-otherwise expression as value
    column_expr = {
        flag_column_name: pl.when((pl.col('country_code') == 'ID') & pl.col('enhanced_location').is_null() & condition)
                            .then(pl.lit(flag_value))
                            .otherwise(flag_column_name)
    }
    
    # Add the column to the DataFrame
    return data.with_columns(**column_expr)

In [None]:
data_labeled_train.filter((pl.col('country_code') == 'ID') & pl.col('enhanced_location').is_null()).write_csv("unextracted_location.csv")

In [546]:
data_labeled_train

merchant_name,cleaned_merchant_name,country_code,enhanced_location,cleaned_location,loc_entity_index
str,str,str,str,str,list[i32]
"""CHOP BUNTUT CAK YO SLEMAN (KAB…","""chop buntut cak yo sleman kab""","""ID""","""sleman""","""sleman""","[19, 25]"
"""TBL* THE SEED LEVEL AMSTERDAM …","""tbl the seed level amsterdam""","""NL""",,,[]
"""SILOAM CLINIC GRH RAYA TANGERA…","""siloam clinic grh raya tangera…","""ID""","""tangerang""","""tangerang""","[23, 32]"
"""SWALAYAN ZOOM MBL TANJUNG PINA…","""swalayan zoom mbl tanjung pina…","""ID""","""tanjung pinang""","""tanjung pinan""","[18, 31]"
"""PEDRO SUN PLAZA MEDAN MEDAN KO…","""pedro sun plaza medan medan ko…","""ID""","""medan""","""medan""","[16, 21]"
…,…,…,…,…,…
"""AUSSIE TOYS ONLINE PAKENHAM AU""","""aussie toys online pakenham""","""AU""",,,[]
"""TOKO EMAS GINBERS MEDAN KOT. I…","""toko emas ginbers medan kot""","""ID""","""medan""","""medan""","[18, 23]"
"""BINTANG SPORT SERANG (KOTA)ID""","""bintang sport serang kota""","""ID""","""serang""","""serang""","[14, 20]"
"""ACE HARDWARE MALL PI I JAKARTA…","""ace hardware mall pi i jakarta""","""ID""",,"""jakarta""","[23, 30]"


In [516]:
# extract location with regex
flags = [
    ((pl.col("cleaned_merchant_name").str.contains("jakarta slt")),"jakarta slt"),
    ((pl.col("cleaned_merchant_name").str.contains("jakarta selat")),"jakarta selat"),
    ((pl.col("cleaned_merchant_name").str.contains("jakarta selatan")),"jakarta selatan"),
    ((pl.col("cleaned_merchant_name").str.contains("jaksel")),"jaksel"),
    ((pl.col("cleaned_merchant_name").str.contains("jakarta pusat")),"jakarta pusat"),
    ((pl.col("cleaned_merchant_name").str.contains("jakpus")),"jakpus"),
    ((pl.col("cleaned_merchant_name").str.contains("jakarta barat")),"jakarta barat"),
    ((pl.col("cleaned_merchant_name").str.contains("jakbar")),"jakbar"),
    ((pl.col("cleaned_merchant_name").str.contains("jakarta utara")),"jakarta utara"),
    ((pl.col("cleaned_merchant_name").str.contains("jakut")),"jakut"),
    ((pl.col("cleaned_merchant_name").str.contains("jakarta timur")),"jakarta timur"),
    ((pl.col("cleaned_merchant_name").str.contains("jaktim")),"jaktim"),
    ((pl.col("cleaned_merchant_name").str.contains("makassar")),"makassar"),
    ((pl.col("cleaned_merchant_name").str.contains("bekasi")),"bekasi"),
    ((pl.col("cleaned_merchant_name").str.contains("bogor")),"bogor"),
    ((pl.col("cleaned_merchant_name").str.contains("surabaya")),"surabaya"),
    ((pl.col("cleaned_merchant_name").str.contains("medan")),"medan"),
    ((pl.col("cleaned_merchant_name").str.contains("muara bungo")),"muara bungo"),
    ((pl.col("cleaned_merchant_name").str.contains("sumbawabesar")),"sumbawabesar"),
    ((pl.col("cleaned_merchant_name").str.contains("limboto")),"limboto"),
    ((pl.col("cleaned_merchant_name").str.contains("manado")),"manado"),
    ((pl.col("cleaned_merchant_name").str.contains("subang")),"subang"),
    ((pl.col("cleaned_merchant_name").str.contains("lombok t")),"lombok t"),
    ((pl.col("cleaned_merchant_name").str.contains("cimahi")),"cimahi"),
    ((pl.col("cleaned_merchant_name").str.contains("bdg barat")),"bdg barat"),
    ((pl.col("cleaned_merchant_name").str.contains("bandung barat")),"bandung barat"),
    ((pl.col("cleaned_merchant_name").str.contains("bandungbarat")),"bandungbarat"),
    ((pl.col("cleaned_merchant_name").str.contains("majalengka")),"majalengka"),
    ((pl.col("cleaned_merchant_name").str.contains("majaleng")),"majaleng"),
    ((pl.col("cleaned_merchant_name").str.contains("rantauprapat")),"rantauprapat"),
    ((pl.col("cleaned_merchant_name").str.contains("majaleng")),"majaleng"),
    ((pl.col("cleaned_merchant_name").str.contains("banyuwangi")),"banyuwangi"),
    ((pl.col("cleaned_merchant_name").str.contains("bdr lampung")),"bdr lampung"),
    ((pl.col("cleaned_merchant_name").str.contains("bandar lampung")),"bandar lampung"),
    ((pl.col("cleaned_merchant_name").str.contains("jakarta$")),"jakarta"),
    ((pl.col("cleaned_merchant_name").str.contains("tangerang sel")),"tangerang sel"),
    ((pl.col("cleaned_merchant_name").str.contains("tangerang slt")),"tangerang slt"),
    ((pl.col("cleaned_merchant_name").str.contains("tangerang ka")),"tangerang ka"),
    ((pl.col("cleaned_merchant_name").str.contains("tangerang selatan")),"tangerang selatan"),
    ((pl.col("cleaned_merchant_name").str.contains("tjg pinang")),"tjg pinang"),
    ((pl.col("cleaned_merchant_name").str.contains("semarang")),"semarang"),
    ((pl.col("cleaned_merchant_name").str.contains("depok")),"depok"),
    ((pl.col("cleaned_merchant_name").str.contains("bandung$")),"bandung"),
    ((pl.col("cleaned_merchant_name").str.contains("tangerang$")),"tangerang"),
    ((pl.col("cleaned_merchant_name").str.contains("denpasar")),"denpasar"),
    ((pl.col("cleaned_merchant_name").str.contains("karawang")),"karawang"),
    ((pl.col("cleaned_merchant_name").str.contains("bandung kab")),"bandung kab"),
]

In [517]:
for flag in flags:
    data_labeled_train = add_flag_column(data_labeled_train, flag[0], "cleaned_location", flag[1])

In [518]:
data_labeled_train.filter((pl.col('country_code') == 'ID') & pl.col('cleaned_location').is_null())

merchant_name,cleaned_merchant_name,country_code,enhanced_location,cleaned_location,loc_entity_index
str,str,str,str,str,list[i32]
"""MSIGLIF-REECURRING 37202173558…","""msiglifreecurring 372021735580…","""ID""",,,[]
"""SPBU 7490616 PANGKEP-H PANGKAJ…","""spbu 7490616 pangkeph pangkaje…","""ID""",,,[]
"""IDM TRRH-RAYA SERDANG KRASERAN…","""idm trrhraya serdang kraserang""","""ID""",,,[]
"""ALFAMART AN 2AH2 BEGADUNGKOTA …","""alfamart an 2ah2 begadungkota …","""ID""",,,[]
"""TUKAR TAMBAH BENCOOLEN MABENGK…","""tukar tambah bencoolen mabengk…","""ID""",,,[]
…,…,…,…,…,…
"""AIRASIA_QZ R379PB INDONESIA ID""","""airasiaqz r379pb indonesia""","""ID""",,,[]
"""BSI SBILL TELKOMSEL 0811805228…","""bsi sbill telkomsel 0811805228…","""ID""",,,[]
"""5148 SH EG EX BP PARIS VABANDU…","""5148 sh eg ex bp paris vabandu…","""ID""",,,[]
"""SAGA FRESH CIGOMBONG. MBLJAYAP…","""saga fresh cigombong mbljayapu…","""ID""",,,[]


In [519]:
data_labeled_train[['merchant_name','cleaned_merchant_name','country_code','enhanced_location','cleaned_location']].filter((pl.col('country_code') == 'ID') & pl.col('cleaned_location').is_null()).sort("cleaned_merchant_name").write_csv("unextracted_location.csv")

In [523]:
data_labeled_train

merchant_name,cleaned_merchant_name,country_code,enhanced_location,cleaned_location,loc_entity_index
str,str,str,str,str,list[i32]
"""CHOP BUNTUT CAK YO SLEMAN (KAB…","""chop buntut cak yo sleman kab""","""ID""","""sleman""","""sleman""","[19, 25]"
"""TBL* THE SEED LEVEL AMSTERDAM …","""tbl the seed level amsterdam""","""NL""",,,[]
"""SILOAM CLINIC GRH RAYA TANGERA…","""siloam clinic grh raya tangera…","""ID""","""tangerang""","""tangerang""","[23, 32]"
"""SWALAYAN ZOOM MBL TANJUNG PINA…","""swalayan zoom mbl tanjung pina…","""ID""","""tanjung pinang""","""tanjung pinan""","[18, 31]"
"""PEDRO SUN PLAZA MEDAN MEDAN KO…","""pedro sun plaza medan medan ko…","""ID""","""medan""","""medan""","[16, 21]"
…,…,…,…,…,…
"""AUSSIE TOYS ONLINE PAKENHAM AU""","""aussie toys online pakenham""","""AU""",,,[]
"""TOKO EMAS GINBERS MEDAN KOT. I…","""toko emas ginbers medan kot""","""ID""","""medan""","""medan""","[18, 23]"
"""BINTANG SPORT SERANG (KOTA)ID""","""bintang sport serang kota""","""ID""","""serang""","""serang""","[14, 20]"
"""ACE HARDWARE MALL PI I JAKARTA…","""ace hardware mall pi i jakarta""","""ID""",,"""jakarta""","[23, 30]"


In [521]:
# find starting and ending index
data_labeled_train = data_labeled_train.with_columns(
    pl.struct(["cleaned_merchant_name","cleaned_location"]).map_elements(lambda x: find_index(x), return_dtype = pl.List(pl.Int32)).alias('loc_entity_index')
)

In [529]:
data_labeled_train.write_parquet('data/merchant_ner_labeled_all.parquet')

# NER TRAINING

In [532]:
to_split = data_labeled_train.filter(pl.col('cleaned_location').is_not_null())

In [533]:
to_split

merchant_name,cleaned_merchant_name,country_code,enhanced_location,cleaned_location,loc_entity_index
str,str,str,str,str,list[i32]
"""CHOP BUNTUT CAK YO SLEMAN (KAB…","""chop buntut cak yo sleman kab""","""ID""","""sleman""","""sleman""","[19, 25]"
"""SILOAM CLINIC GRH RAYA TANGERA…","""siloam clinic grh raya tangera…","""ID""","""tangerang""","""tangerang""","[23, 32]"
"""SWALAYAN ZOOM MBL TANJUNG PINA…","""swalayan zoom mbl tanjung pina…","""ID""","""tanjung pinang""","""tanjung pinan""","[18, 31]"
"""PEDRO SUN PLAZA MEDAN MEDAN KO…","""pedro sun plaza medan medan ko…","""ID""","""medan""","""medan""","[16, 21]"
"""AIA FIN 39020926 112 TANGERANG…","""aia fin 39020926 112 tangerang""","""ID""","""tangerang""","""tangerang""","[21, 30]"
…,…,…,…,…,…
"""IDM TRWR-KARANG MULYA BANJARMA…","""idm trwrkarang mulya banjarmas…","""ID""","""banjarmasin""","""banjarmasin""","[21, 32]"
"""TOKO EMAS GINBERS MEDAN KOT. I…","""toko emas ginbers medan kot""","""ID""","""medan""","""medan""","[18, 23]"
"""BINTANG SPORT SERANG (KOTA)ID""","""bintang sport serang kota""","""ID""","""serang""","""serang""","[14, 20]"
"""ACE HARDWARE MALL PI I JAKARTA…","""ace hardware mall pi i jakarta""","""ID""",,"""jakarta""","[23, 30]"


In [331]:
# Alternative implementation using Polars' built-in methods
def split_polars_with_sample(df, train_ratio=0.8, seed=None):
    """
    Split a Polars DataFrame using Polars' sample method.
    
    Parameters:
    -----------
    df : polars.DataFrame
        The input DataFrame to split
    train_ratio : float
        The proportion of data to include in the training set (default: 0.8)
    seed : int or None
        Random seed for reproducibility (default: None)
    
    Returns:
    --------
    tuple of polars.DataFrame
        (train_df, test_df)
    """
    # Sample the training set
    train_df = df.sample(fraction=train_ratio, seed=seed)
    
    # Create a temporary column with row indices
    df_with_idx = df.with_row_index("__idx")
    train_with_idx = train_df.with_row_index("__idx")
    
    # Find indices not in the training set
    test_df = df_with_idx.filter(
        ~pl.col("__idx").is_in(train_with_idx.get_column("__idx"))
    ).drop("__idx")
    
    # Remove the temporary index column from train_df
    train_df = train_with_idx.drop("__idx")
    
    return train_df, test_df

In [330]:
split_train, split_validation = split_polars_with_sample(to_split, train_ratio=0.9, seed=1)

NameError: name 'split_polars_with_sample' is not defined

In [538]:
split_train.write_parquet('data/merchant_ner_labeled_train.parquet')

In [539]:
split_validation.write_parquet('data/merchant_ner_labeled_validation.parquet')

In [332]:
train = pl.read_parquet('data/merchant_ner_labeled_train.parquet')

In [333]:
train

merchant_name,cleaned_merchant_name,country_code,enhanced_location,cleaned_location,loc_entity_index
str,str,str,str,str,list[i32]
"""Z003 ELZATTA SURABAYA BANDUNG …","""z003 elzatta surabaya bandung …","""ID""","""surabaya""","""surabaya""","[13, 21]"
"""SOUR SALLY MALL CENTRAL PJAKAR…","""sour sally mall central pjakar…","""ID""",,"""jakarta barat""","[25, 38]"
"""GUMATI CIREBON CIREBON KAB. ID""","""gumati cirebon cirebon kab""","""ID""","""cirebon""","""cirebon""","[7, 14]"
"""KOBE PETSHOP MBL JAKARTA ID""","""kobe petshop mbl jakarta""","""ID""",,"""jakarta""","[17, 24]"
"""DUNIA LISTRIK TOKO - H MADIUN …","""dunia listrik toko h madiun""","""ID""","""madiun""","""madiun""","[22, 28]"
…,…,…,…,…,…
"""MIE BANDUNGKEJAKSAAN 1 TANGERA…","""mie bandungkejaksaan 1 tangera…","""ID""","""tangerang""","""tangerang""","[23, 32]"
"""GUNUNG GEULIS COUNTRY C BOGOR …","""gunung geulis country c bogor""","""ID""","""bogor""","""bogor""","[24, 29]"
"""SAGA FRESH DOK 9 JAYAPURA (KO …","""saga fresh dok 9 jayapura ko""","""ID""","""jayapura""","""jayapura""","[17, 25]"
"""BILLBOSS BANJARMASIN KID""","""billboss banjarmasin k""","""ID""","""banjarmasin""","""banjarmasin""","[9, 20]"


In [7]:
import spacy
from spacy import displacy

nlp = spacy.blank("id")

In [15]:
nlp

<spacy.lang.id.Indonesian at 0x7fa6dc312650>

In [48]:
train_spacy = []

In [149]:
spacy_data = []

In [150]:
for row in train.iter_rows():
    spacy_data.append((row[1], {"entities": [(row[5][0], row[5][1], "LOC")]}))

In [151]:
import random
random.shuffle(spacy_data)
train_data = spacy_data[:int(len(spacy_data) * 0.8)]
test_data = spacy_data[int(len(spacy_data) * 0.8):]

In [152]:
len(train_data)

224012

In [153]:
def create_model(train_data):
    nlp = spacy.blank("id")  # create blank model (adjust language as needed)
    print("Created blank model")
    
    # Add NER component if it doesn't exist
    if "ner" not in nlp.pipe_names:
        ner = nlp.add_pipe("ner", last=True)
    else:
        ner = nlp.get_pipe("ner")
    
    # Add entity labels
    for _, annotations in train_data:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])
    
    return nlp

In [None]:
from spacy import displacy
displacy.render(doc, style="ent", jupyter=True)

In [83]:
count_span_none = 0
span_none = []
for text, annots in train_data:
    #print(text, annots)
    break_loop = False
    doc = nlp.make_doc(text)
    for start, end, label in annots.get("entities"):
        span = doc.char_span(start, end, label=label)
        if span is None:
            span_none.append((text, annots))


In [None]:
...................|...........
mybluebird 96961057jakarta slt

In [None]:
nlp.make_doc(text)

In [145]:
# 4. Convert data to Spacy format and save
from spacy.tokens import DocBin
from spacy.util import filter_spans

def convert_to_spacy(data, output_path):
    db = DocBin()
    nlp_model = spacy.blank("id")
    
    for text, annots in data:
        doc = nlp_model.make_doc(text)
        ents = []
        for start, end, label in annots.get("entities"):
            span = doc.char_span(start, end, label=label)
            # if span is None:
            #     print(f"Skipping entity in '{text}' - invalid span")
            if span is not None:
                ents.append(span)
        filtered_ents = filter_spans(ents)
        doc.ents = filtered_ents
        db.add(doc)

    db.to_disk(output_path)
    print(f"Saved {len(data)} examples to {output_path}")

In [None]:
train_data

In [237]:
convert_to_spacy(train_data, "data/spacy_training_data.spacy")
convert_to_spacy(test_data, "data/spacy_test_data.spacy")

Saved 224012 examples to data/spacy_training_data.spacy
Saved 56003 examples to data/spacy_test_data.spacy


In [232]:
nlp = create_model(train_data)

Created blank model


In [148]:
!python -m spacy init fill-config base_config.cfg config.cfg

[33mUsage: [0mpython [1;32m-m[0m spacy init fill-config [OPTIONS] BASE_PATH [OUTPUT_FILE]
[2mTry [0m[2;34m'python [0m[1;2;34m-m[0m[2;34m spacy init fill-config [0m[1;2;34m-[0m[1;2;34m-help[0m[2;34m'[0m[2m for help.[0m
[31m╭─[0m[31m Error [0m[31m─────────────────────────────────────────────────────────────────────[0m[31m─╮[0m
[31m│[0m Invalid value for 'BASE_PATH': File 'base_config.cfg' does not exist.        [31m│[0m
[31m╰──────────────────────────────────────────────────────────────────────────────╯[0m


In [240]:
def load_data_from_spacy_format(file_path, nlp=None):
    """
    Load training/evaluation data from a .spacy binary file.
    
    Parameters:
    -----------
    file_path : str
        Path to the .spacy file
    nlp : spacy.Language, optional
        spaCy language model, will create a blank one if not provided
        
    Returns:
    --------
    list
        List of (text, annotations) tuples in the format needed for training
    """
    
    # Create blank model if none provided
    if nlp is None:
        nlp = spacy.blank("id")  # Use appropriate language code
    
    # Load the DocBin
    doc_bin = DocBin().from_disk(file_path)
    
    # Convert to docs
    docs = list(doc_bin.get_docs(nlp.vocab))
    
    # Convert back to the training data format
    training_data = []
    for doc in docs:
        text = doc.text
        entities = []
        for ent in doc.ents:
            entities.append((ent.start_char, ent.end_char, ent.label_))
        
        # Create the annotation dictionary
        annotations = {"entities": entities}
        
        # Add to the training data list
        training_data.append((text, annotations))
    
    print(f"Loaded {len(training_data)} examples from {file_path}")
    return training_data

# Usage
train_data = load_data_from_spacy_format("data/spacy_training_data.spacy")
test_data = load_data_from_spacy_format("data/spacy_test_data.spacy")

Loaded 224012 examples from data/spacy_training_data.spacy
Loaded 56003 examples from data/spacy_test_data.spacy


In [233]:
nlp.pipe_names

['ner']

In [239]:
len(train_data)

224012

In [242]:
# Setup training examples
from spacy.training import Example
train_examples = []
for text, annotations in train_data:
    doc = nlp.make_doc(text)
    example = Example.from_dict(doc, annotations)
train_examples.append(example)

In [159]:
train_examples[0]

{'doc_annotation': {'cats': {}, 'entities': ['O', 'O', 'U-LOC'], 'spans': {}, 'links': {}}, 'token_annotation': {'ORTH': ['mang', 'gangho', 'tangerang'], 'SPACY': [True, True, False], 'TAG': ['', '', ''], 'LEMMA': ['', '', ''], 'POS': ['', '', ''], 'MORPH': ['', '', ''], 'HEAD': [0, 1, 2], 'DEP': ['', '', ''], 'SENT_START': [1, 0, 0]}}

In [177]:
ner = nlp.get_pipe("ner")
#ner.add_label("O")
#ner.add_label("LOC")

0

In [243]:
optimizer = nlp.begin_training()

In [230]:
optimizer.learn_rate = 0.01

In [244]:
nlp.pipe_names
examples[:1]

[{'doc_annotation': {'cats': {}, 'entities': ['O', 'O', 'O', 'B-LOC', 'L-LOC'], 'spans': {}, 'links': {}}, 'token_annotation': {'ORTH': ['terra', 'factory', 'outlet', 'jakarta', 'timur'], 'SPACY': [True, True, True, True, False], 'TAG': ['', '', '', '', ''], 'LEMMA': ['', '', '', '', ''], 'POS': ['', '', '', '', ''], 'MORPH': ['', '', '', '', ''], 'HEAD': [0, 1, 2, 3, 4], 'DEP': ['', '', '', '', ''], 'SENT_START': [1, 0, 0, 0, 0]}}]

In [245]:
nlp.update(examples[:1], drop=0.5, losses=losses)

{'ner': 1293.82568359375}

In [321]:
# Batch up the examples
from spacy.util import minibatch, compounding
n_iter=10
print("Training model...")
for i in range(n_iter):
    #random.shuffle(train_data)
    losses = {}
    
    # Batch the examples
    batches = minibatch(examples, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        nlp.update(batch, drop=0.4, losses=losses)
    
    print(f"Iteration {i+1}, Losses: {losses}")

Training model...
Iteration 1, Losses: {'ner': np.float32(6571.7866)}
Iteration 2, Losses: {'ner': np.float32(6769.374)}
Iteration 3, Losses: {'ner': np.float32(6651.9404)}
Iteration 4, Losses: {'ner': np.float32(6656.863)}
Iteration 5, Losses: {'ner': np.float32(6418.442)}
Iteration 6, Losses: {'ner': np.float32(6274.0923)}
Iteration 7, Losses: {'ner': np.float32(6338.069)}
Iteration 8, Losses: {'ner': np.float32(6130.4077)}
Iteration 9, Losses: {'ner': np.float32(6131.9736)}
Iteration 10, Losses: {'ner': np.float32(6104.8755)}


In [325]:
# 6. Evaluate the model
def evaluate_model(nlp, eval_data):
    scorer = spacy.scorer.Scorer()
    examples = []
    
    for text, annotations in eval_data:
        doc_gold_text = nlp.make_doc(text)
        gold = Example.from_dict(doc_gold_text, annotations)
        pred_value = nlp(text)
        examples.append(Example(pred_value, gold.reference))
    
    scores = scorer.score(examples)
    
    # Print results
    print("Evaluation results:")
    for metric, value in scores.items():
        if metric.startswith("ents"):
            # Handle different types of values
            if isinstance(value, (int, float)):
                print(f"{metric}: {value:.4f}")
            else:
                print(f"{metric}: {value}")

In [256]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [336]:
evaluate_model(nlp, test_data)

Evaluation results:
ents_p: 0.9229
ents_r: 0.9959
ents_f: 0.9580
ents_per_type: {'LOC': {'p': 0.9228676273158507, 'r': 0.9959432832361009, 'f': 0.9580139550872889}}


In [337]:
val = pl.read_parquet('data/merchant_ner_labeled_validation.parquet')
validation_data = []
for row in val.iter_rows():
    validation_data.append((row[1], {"entities": [(row[5][0], row[5][1], "LOC")]}))

In [338]:
evaluate_model(nlp, validation_data)

Evaluation results:
ents_p: 0.9942
ents_r: 0.9957
ents_f: 0.9950
ents_per_type: {'LOC': {'p': 0.9941963509991312, 'r': 0.9957189098882739, 'f': 0.9949570479602129}}


In [342]:
validation_data[:5]

[('cafe tutur bekasi', {'entities': [(11, 17, 'LOC')]}),
 ('idm t430 menanggal surabaya', {'entities': [(9, 18, 'LOC')]}),
 ('rejuve gandaria city 2 jakarta slt', {'entities': [(23, 34, 'LOC')]}),
 ('imperial kitchen  dim kakarawang kab', {'entities': [(22, 32, 'LOC')]}),
 ('alma restoran manggarai bar', {'entities': [(14, 23, 'LOC')]})]

In [348]:
LHMGS-GR1 TANGERANG KOTI : 0/12
test_text = "rejuve gandaria city 2 jakarta slt"

In [349]:
doc = nlp(test_text.lower())
doc
for ent in doc.ents:
    print(f"Entity: {ent.text}, Label: {ent.label_}, Position: ({ent.start_char}, {ent.end_char})")

Entity: jakarta slt, Label: LOC, Position: (23, 34)


In [329]:
# Save to disk
output_dir = "model/indonesian_location_ner_model"
nlp.to_disk(output_dir)

# LOAD THE MODEL

In [None]:
import spacy
# Load the saved model
loaded_nlp = spacy.load("model/indonesian_location_ner_model")

In [374]:
# Test on some text
text = "YASUI MBL BADUNG - BALIID"
text_clean = clean_merchant_name(text)
doc = loaded_nlp(text_clean)

# Print entities
for ent in doc.ents:
    print(f"{ent.text} - {ent.label_}")

badung - LOC


# SANDBOX

In [215]:
import spacy

# Load a model for your language (example with English)
nlp = spacy.load("en_core_web_sm")  # Use Indonesian model if available

def is_location_ner(word, context=""):
    """Use NER to determine if a word is a location."""
    # If we have context, use it
    text = context if context else word
    doc = nlp(text)

    # Check if the word is tagged as a location
    for entity in doc.ents:
        print(entity.text, entity.label_)
        if entity.text == word and entity.label_ == "GPE":  # GPE = Geopolitical Entity
            return True
    
    return False

In [227]:
is_location_ner("KUALA LUMPUR")

KUALA ORG


False

In [217]:
from transformers import pipeline

# Load a pretrained NER pipeline
ner = pipeline("ner", model="cahya/bert-base-indonesian-NER")

def is_location(text):
    """Determine if text contains a location using pretrained NER model."""
    results = ner(text)
    
    # Filter for location entities (labels may vary by model)
    location_entities = [entity for entity in results if entity['entity'] in 
                        ['B-LOC', 'I-LOC', 'B-GPE', 'I-GPE', 'LOC', 'GPE']]
    
    if location_entities:
        return True, location_entities
    return False, []

Some weights of the model checkpoint at cahya/bert-base-indonesian-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


In [225]:
is_location("SHIBUYA")

(False, [])

In [51]:
import spacy
from spacy import displacy

nlp = spacy.blank("id")

In [44]:
text = "The Mars Orbiter Mission (MOM), informally known as Mangalyaan, was launched into Earth orbit on 5 November 2013 by the Indian Space Research Organisation (ISRO) and has entered Mars orbit on 24 September 2014. India thus became the first country to enter Mars orbit on its first attempt. It was completed at a record low cost of $74 million."

In [52]:
nlp.pipe_names

[]

In [53]:
ner_en = spacy.load("en_core_web_sm")

In [54]:
ner_en.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [45]:
text2 = NER(text)