In [179]:
import spacy
import re
import polars as pl
# Load the saved model
loaded_nlp = spacy.load("model/indonesian_location_ner_model")

In [180]:
# Test on some text
def clean_merchant_name(text):
    remainder = text
    # last two characters are country id
    country_id = remainder[len(text)-2] + remainder[len(text)-1]
    remainder = text[:len(text)-2].lower()

    return country_id, re.sub(r'[^a-zA-Z0-9\s]','',remainder).strip()

text = "DIAN KENANGA TOTOK AURA JAKARTA SLT ID"
country_id, text_clean = clean_merchant_name(text)
doc = loaded_nlp(text_clean)

# Print entities
for ent in doc.ents:
    print(f"{country_id}: {ent.text} - {ent.label_}")



ID: jakarta slt - LOC


In [181]:
# text augmentation
import pickle
file = open('data/city_dictionary.pkl', 'rb')
city_dictionary = pickle.load(file)
file.close()

def jaccard_similarity(ngrams1, ngrams2):
    _ngrams1 = set(ngrams1)
    _ngrams2 = set(ngrams2)

    intersection = len(_ngrams1.intersection(_ngrams2))
    union = len(_ngrams1.union(_ngrams2))

    return intersection / union

def enhance_location_from_dict(row, cba_threshold = 0.6, threshold = 0.8, all_text_threshold = 0.7):
    if row is None or len(row) == 0:
        return None

    # SPECIAL CASE: BADUNG & BANDUNG
    if re.search(r'(bandung\s?barat)', row):
        return row, [(1.1, 'bandung barat')]
    if re.search(r'(bdg\s?barat)', row):
        return row, [(1.1, 'bdg barat')]
    if re.search(r'bandung', row):
        return row, [(1.1, 'bandung')]
    if re.search(r'(bdg)', row):
        return row, [(1.1, 'bdg')]
    if re.search(r'badung', row):
        return row, [(1.1, 'badung')]

    #char by all
    match = []
    for loc_char in row:
        if loc_char not in city_dictionary['full_text']:
            continue

        for possible_loc in city_dictionary['full_text'][loc_char]:
            score = jaccard_similarity(possible_loc, row)
            if possible_loc in row:
                score += 0.01

            if score >= cba_threshold:
                match.append((score, possible_loc, 'cba'))

    # word by word

    for loc_word in row.split(" "):
        if len(loc_word) == 0:
            continue
        if loc_word[0] not in city_dictionary['full_text']:
            continue

        for possible_loc in city_dictionary['full_text'][loc_word[0]]:
            score = jaccard_similarity(possible_loc, loc_word)
            if possible_loc in loc_word:
                score += 0.01

            if score >= threshold:
                match.append((score,possible_loc, 'wbw')) # change loc_word to possible_loc for full enhancement
    # all by all
    if row[0] in city_dictionary['full_text']:
        for possible_loc in city_dictionary['full_text'][row[0]]:
            score = jaccard_similarity(possible_loc, row)
            if score >= all_text_threshold:
                match.append((1.01*score, possible_loc ,'aba')) # change row to possible_loc for full enhancement

    return row, sorted(match, key=lambda x: x[0], reverse=True)

def extract_enhanced_location(row):
    enhanced = enhance_location_from_dict(row)
    print(enhanced)

    if len(enhanced[1]) == 0:
        return None
    if len(enhanced[1][0]) == 0:
        return None

    return enhanced[1][0][1]

# Print entities
for ent in doc.ents:
    if ent.label_== "LOC":
        print("Augmented:", extract_enhanced_location(ent.text))


('jakarta slt', [(0.808, 'jakarta selatan', 'aba'), (0.8, 'jakarta selatan', 'cba'), (0.707, 'jakarta pusat', 'aba'), (0.7, 'jakarta pusat', 'cba'), (0.6666666666666666, 'jakarta barat', 'cba'), (0.6666666666666666, 'jakarta utara', 'cba'), (0.625, 'takalar', 'cba'), (0.625, 'takalar', 'cba'), (0.6, 'kolaka utara', 'cba')])
Augmented: jakarta selatan


# location dict

In [21]:
cities_regencies = pl.read_csv('data/regencies.csv', new_columns=['id', 'id_provinces', 'name'])

In [22]:
cities_regencies

id,id_provinces,name
i64,i64,str
1102,11,"""KABUPATEN ACEH SINGKIL"""
1103,11,"""KABUPATEN ACEH SELATAN"""
1104,11,"""KABUPATEN ACEH TENGGARA"""
1105,11,"""KABUPATEN ACEH TIMUR"""
1106,11,"""KABUPATEN ACEH TENGAH"""
…,…,…
9433,94,"""KABUPATEN PUNCAK"""
9434,94,"""KABUPATEN DOGIYAI"""
9435,94,"""KABUPATEN INTAN JAYA"""
9436,94,"""KABUPATEN DEIYAI"""


In [23]:
# ngram
def get_word_ngram(text, n):
    words = text.split()
    if len(words) < n:
        return []
        
    return [tuple(words[i:i+n]) for i in range(len(words) - n + 1)]

def get_word_ngram_from_list(wordlist, n):
    if len(wordlist) < n:
        return []
        
    return [tuple(wordlist[i:i+n]) for i in range(len(wordlist) - n + 1)]

def get_char_ngrams(text, n):
    """Generate character n-grams from text."""
    # Convert text to lowercase and remove spaces for character-level comparison
    text = text.lower().replace(" ", "")
    return [text[i:i+n] for i in range(len(text) - n + 1)]

In [58]:
cities_regencies = cities_regencies.with_columns(
    pl.col('name').str.replace(r'(KABUPATEN|KOTA)','').str.to_lowercase().str.strip_chars().alias('name_clean')
)

In [59]:
cities_regencies

id,id_provinces,name,name_clean,unigram,bigram,trigram
i64,i64,str,str,object,object,object
1102,11,"""KABUPATEN ACEH SINGKIL""","""aceh singkil""","['a', 'c', 'e', 'h', 's', 'i', 'n', 'g', 'k', 'i', 'l']","['ac', 'ce', 'eh', 'hs', 'si', 'in', 'ng', 'gk', 'ki', 'il']","['ace', 'ceh', 'ehs', 'hsi', 'sin', 'ing', 'ngk', 'gki', 'kil']"
1103,11,"""KABUPATEN ACEH SELATAN""","""aceh selatan""","['a', 'c', 'e', 'h', 's', 'e', 'l', 'a', 't', 'a', 'n']","['ac', 'ce', 'eh', 'hs', 'se', 'el', 'la', 'at', 'ta', 'an']","['ace', 'ceh', 'ehs', 'hse', 'sel', 'ela', 'lat', 'ata', 'tan']"
1104,11,"""KABUPATEN ACEH TENGGARA""","""aceh tenggara""","['a', 'c', 'e', 'h', 't', 'e', 'n', 'g', 'g', 'a', 'r', 'a']","['ac', 'ce', 'eh', 'ht', 'te', 'en', 'ng', 'gg', 'ga', 'ar', 'ra']","['ace', 'ceh', 'eht', 'hte', 'ten', 'eng', 'ngg', 'gga', 'gar', 'ara']"
1105,11,"""KABUPATEN ACEH TIMUR""","""aceh timur""","['a', 'c', 'e', 'h', 't', 'i', 'm', 'u', 'r']","['ac', 'ce', 'eh', 'ht', 'ti', 'im', 'mu', 'ur']","['ace', 'ceh', 'eht', 'hti', 'tim', 'imu', 'mur']"
1106,11,"""KABUPATEN ACEH TENGAH""","""aceh tengah""","['a', 'c', 'e', 'h', 't', 'e', 'n', 'g', 'a', 'h']","['ac', 'ce', 'eh', 'ht', 'te', 'en', 'ng', 'ga', 'ah']","['ace', 'ceh', 'eht', 'hte', 'ten', 'eng', 'nga', 'gah']"
…,…,…,…,…,…,…
9433,94,"""KABUPATEN PUNCAK""","""puncak""","['p', 'u', 'n', 'c', 'a', 'k']","['pu', 'un', 'nc', 'ca', 'ak']","['pun', 'unc', 'nca', 'cak']"
9434,94,"""KABUPATEN DOGIYAI""","""dogiyai""","['d', 'o', 'g', 'i', 'y', 'a', 'i']","['do', 'og', 'gi', 'iy', 'ya', 'ai']","['dog', 'ogi', 'giy', 'iya', 'yai']"
9435,94,"""KABUPATEN INTAN JAYA""","""intan jaya""","['i', 'n', 't', 'a', 'n', 'j', 'a', 'y', 'a']","['in', 'nt', 'ta', 'an', 'nj', 'ja', 'ay', 'ya']","['int', 'nta', 'tan', 'anj', 'nja', 'jay', 'aya']"
9436,94,"""KABUPATEN DEIYAI""","""deiyai""","['d', 'e', 'i', 'y', 'a', 'i']","['de', 'ei', 'iy', 'ya', 'ai']","['dei', 'eiy', 'iya', 'yai']"


In [60]:
indonesian_cities = cities_regencies['name_clean'].to_list()

In [105]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [62]:
# Approach 1: Character-level n-gram embeddings
def create_char_ngram_embeddings(cities_list, ngram_range=(2, 3)):
    """Create character-level n-gram embeddings for city names"""
    # Initialize vectorizer for character n-grams
    vectorizer = CountVectorizer(analyzer='char', ngram_range=ngram_range)
    
    # Fit and transform the city names
    embeddings = vectorizer.fit_transform(cities_list)
    
    # Get feature names for debugging
    feature_names = vectorizer.get_feature_names_out()
    
    return embeddings.toarray(), feature_names

In [63]:
char_ngram_embeddings, ngram_features = create_char_ngram_embeddings(indonesian_cities)

In [66]:
ngram_features.shape

(1365,)

In [69]:
vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 3))
    
# Fit and transform the city names
embeddings = vectorizer.fit_transform(indonesian_cities)
    
# Get feature names for debugging
feature_names = vectorizer.get_feature_names_out()

In [217]:
query_vector = vectorizer.transform(["sudirbanyumas"])

In [218]:
query_vector

<1x1365 sparse matrix of type '<class 'numpy.int64'>'
	with 20 stored elements in Compressed Sparse Row format>

In [219]:
%%time
similarities = cosine_similarity(query_vector, embeddings)

CPU times: user 0 ns, sys: 2.72 ms, total: 2.72 ms
Wall time: 1.9 ms


In [220]:
similarities.shape

(1, 513)

In [221]:
np.dot(query_vector, embeddings.T)

<1x513 sparse matrix of type '<class 'numpy.int64'>'
	with 326 stored elements in Compressed Sparse Row format>

In [222]:
query_vector.shape

(1, 1365)

In [223]:
embeddings.shape

(513, 1365)

In [224]:
top_indices = np.where(similarities >= 0.3)

In [227]:
top_indices

(array([0, 0, 0, 0, 0]), array([102, 103, 187, 235, 352]))

In [197]:
top_indices = np.where(similarities >= 0.3)
top_indices = sorted(top_indices, key=lambda i: similarities[i], reverse=True)[:5]
top_indices

[array([102, 103, 187, 235, 352])]

In [189]:
[indonesian_cities[x] for x in top_indices]

['banyumas', 'banyu asin', 'banyuwangi', 'musi banyuasin', 'banjarmasin']

In [174]:
class NGramFuzzyMatcher:
    def __init__(self, reference_list, ngram_range=(2, 3)):
        self.vectorizer = CountVectorizer(analyzer='char', ngram_range=ngram_range)
        self.embeddings = self.vectorizer.fit_transform(reference_list)
        self.feature_names = self.vectorizer.get_feature_names_out()
        self.reference_list = reference_list
        
    def query(self, query, top_n = 1, threshold = 0.5):
        query_vector = self.vectorizer.transform(query)
        similarities = cosine_similarity(query_vector, self.embeddings)
        top_indices = np.where(similarities >= threshold)
        top_indices = sorted(top_indices, key=lambda i: similarities[i], reverse=True)[:top_n]
        return [self.reference_list[x] for x in top_indices]
        

In [175]:
matcher = NGramFuzzyMatcher(indonesian_cities, ngram_range=(1,3))

In [178]:
matcher.query(["jaksel"], top_n = 3, threshold = 0.1)

['jakarta selatan', 'seluma', 'solok selatan']