In [1]:
import pandas as pd
import re

In [2]:
# Load the mock address dataset; the demo loop further down reads its 'Address' column.
df = pd.read_csv('mock_name_address.csv')

# Clean text + Extract some information

This section cleans the raw address text and extracts the following elements:
- Zip Code
- House Number
- Moo (village number)
- list_other: a list of the remaining tokens (province, district, subdistrict, and anything else) that have not been extracted yet

In [23]:
def load_thai_address(file_name='mock_name_address.csv'):
    """Load the address dataset from a CSV file.

    Parameters
    ----------
    file_name : str or path-like, optional
        CSV file to read. Defaults to 'mock_name_address.csv', the path that
        used to be hard-coded, so calling the function with no arguments
        behaves exactly as before.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents, as returned by ``pd.read_csv``.
    """
    return pd.read_csv(file_name)

def clean_text(text):
    """Strip administrative-level markers and normalise Bangkok aliases.

    The markers for soi, thanon, tambon, amphoe and changwat (both the
    abbreviated forms such as 'ต.' and the spelled-out forms such as 'ตำบล')
    are removed so that only the place names remain. Every Bangkok alias
    ('กทม', 'กทม.', 'กรุงเทพ', 'กรุงเทพฯ', 'กรุงเทพมหานครฯ') is rewritten to
    the canonical 'กรุงเทพมหานคร'.

    Parameters
    ----------
    text : str
        Raw address string.

    Returns
    -------
    str
        The cleaned address string.
    """
    # Raw strings keep '\.' a regex escape instead of an invalid Python
    # string escape. The patterns are applied one after another, in the
    # same order as before.
    marker_patterns = (
        r'ซ\.|ซอย',            # soi
        r'ถ\.|ถนน',            # thanon
        r'ต\.|ตำบล|แขวง',      # tambon
        r'อ\.|อำเภอ|เขต',      # amphoe
        r'จ\.|จังหวัด',         # changwat
    )
    for pattern in marker_patterns:
        text = re.sub(pattern, '', text)

    # An alias must be followed by whitespace OR the end of the string; the
    # lookahead leaves that whitespace in place. Previously a literal trailing
    # space was required, so an alias ending the address was never normalised.
    pattern_bkk = r'(?:กทม\.?|กรุงเทพมหานครฯ|กรุงเทพฯ?)(?=\s|$)'
    text = re.sub(pattern_bkk, 'กรุงเทพมหานคร', text)

    return text

def find_dict_pattern(text):
    """Pull the Moo, house number and zip code out of a cleaned address.

    Parameters
    ----------
    text : str
        Address string, normally the output of ``clean_text``.

    Returns
    -------
    dict
        'Zip Code'     : first standalone 5-digit number, or None.
        'Moo'          : 'หมู่' followed by the village number, or None.
        'House Number' : first 'N/N' or 1-3 digit number, or None.
        'list_other'   : remaining whitespace-separated tokens.

    Notes
    -----
    Each extracted pattern is removed from the text (every occurrence, not
    just the first) before the remaining tokens are collected.
    """
    # --- Moo -------------------------------------------------------------
    # 'หมู่' may be glued to the number ('หมู่5') and the match may end the
    # string; 'ม.' still needs whitespace before the number.
    pattern_moo = r'(?:หมู่\s*|ม\.\s+)(\d+)'
    moo = None
    moo_match = re.search(pattern_moo, text)
    if moo_match:
        moo = 'หมู่' + moo_match.group(1)
        # Replace with a space so the neighbouring tokens are not glued together.
        text = re.sub(pattern_moo, ' ', text)

    # --- House number ----------------------------------------------------
    # (?<!\d) stops the 1-3 digit alternative from matching the tail of a
    # longer number (previously '25000 ' lost its last three digits).
    # The lookahead accepts whitespace or end of string without consuming it.
    pattern_house_number = r'(?<!\d)(?:\d+/\d+|\d{1,3})(?=\s|$)'
    house_number = None
    house_match = re.search(pattern_house_number, text)
    if house_match:  # an address without a house number no longer raises IndexError
        house_number = house_match.group(0)
        text = re.sub(pattern_house_number, '', text)

    # --- Zip code --------------------------------------------------------
    # Exactly five digits, not part of a longer digit run.
    pattern_zip_code = r'(?<!\d)\d{5}(?!\d)'
    zip_code = None
    zip_match = re.search(pattern_zip_code, text)
    if zip_match:
        zip_code = zip_match.group(0)
        text = re.sub(pattern_zip_code, ' ', text)

    # Key order is kept as before: Zip Code, Moo, House Number, list_other.
    return {
        'Zip Code': zip_code,
        'Moo': moo,
        'House Number': house_number,
        'list_other': text.split(),
    }

    
def run_all(text):
    """Clean a raw address and return the dict of fields extracted from it."""
    return find_dict_pattern(clean_text(text))

    

In [24]:
# Show each of the first ten raw addresses followed by its parsed fields.
for address in df['Address'].head(10):
    print(address)
    print(run_all(address), '-----------', '', sep='\n')

61 ถนนแนวพญา ต.เขาขาว อ.วังยาง จ.หนองคาย
{'Zip Code': None, 'Moo': None, 'House Number': '61', 'list_other': ['แนวพญา', 'เขาขาว', 'วังยาง', 'หนองคาย']}
-----------

854/11 ถนนดีตพันธุ์ ต.โนนแดงเหนือ อ.ศรีสงคราม จ.เพชรบูรณ์ 25000
{'Zip Code': '25000', 'Moo': None, 'House Number': '854/11', 'list_other': ['ดีตพันธุ์', 'โนนแดงเหนือ', 'ศรีสงคราม', 'เพชรบูรณ์']}
-----------

322/0 ถนนนาคะนคร ต.อิตื้อ อ.สามโคก จ.ยโสธร 25630
{'Zip Code': '25630', 'Moo': None, 'House Number': '322/0', 'list_other': ['นาคะนคร', 'อิตื้อ', 'สามโคก', 'ยโสธร']}
-----------

944/57 หมู่บ้านธมลพรรณ โพนสวรรค์ กรุงเทพฯ 57200
{'Zip Code': '57200', 'Moo': None, 'House Number': '944/57', 'list_other': ['หมู่บ้านธมลพรรณ', 'โพนสวรรค์', 'กรุงเทพมหานคร']}
-----------

3 ถ.เขียวอ่อน อ.เฉลิมพระเกียรติ ลพบุรี 19320
{'Zip Code': '19320', 'Moo': None, 'House Number': '3', 'list_other': ['เขียวอ่อน', 'เฉลิมพระเกียรติ', 'ลพบุรี']}
-----------

78 หมู่ 6 ถ.ตั้งเผ่า ตำบลปากคมใต้ อำเภอซับใหญ่ กำแพงเพชร 17150
{'Zip Code': '17150', 'Moo'

# Autocorrection and search for true elements (Tambon, Amphoe, Changwat)

- This section extracts the administrative elements from the list_other produced in the previous section. First, we figure out which token in list_other is the province, then find the district and the subdistrict in turn.

What about a **MISSPELLED** word? Do we need to handle it?

- The answer is **YES**. As discussed, we proceed level by level: province **>>** district **>>** subdistrict. The key step is using the Levenshtein distance to compare each token in list_other against the reference administrative data.
- First, we calculate the distance between every token and every province name, and take the province that has the lowest distance to any token in list_other.
- Then, we remove the matched token from the list and repeat the same calculation to extract the district and then the subdistrict.

**Example**

*list_other* = ['ทุ่งควายกิน', 'แกลข', 'ระยอง', 'อาคารสรพล']

1. Calculate every token's distance to every province name --> we acquire 'ระยอง' as the province because its distance is zero.
2. Calculate the remaining tokens' distance to every district name in that province --> we acquire 'แกลง' as the district because the distance is 2 (แกลข >> แกลง, replacement cost = 2).
3. Calculate the remaining tokens' distance to every subdistrict name in that district --> we acquire 'ทุ่งควายกิน' as the subdistrict because its distance is zero.
4. The remaining tokens (['อาคารสรพล']) are kept as other information (such as a building name).
5. The result is
    
    {
        'Sub District' : 'ทุ่งควายกิน',
        'District' : 'แกลง',
        'Province' : 'ระยอง',
        'Other' : 'อาคารสรพล',
    }
    
    
6. The result is then merged with the result from the previous section (Zip Code, Moo and House Number).
    

In [25]:
def levenshtein(text1, text2, del_cost=1, ins_cost=1, rep_cost=2):
    """Weighted Levenshtein (minimum edit) distance between two strings.

    Parameters
    ----------
    text1, text2 : str
        Strings to compare, character by character.
    del_cost : number, optional
        Cost of deleting one character of ``text1`` (default 1).
    ins_cost : number, optional
        Cost of inserting one character of ``text2`` (default 1).
    rep_cost : number, optional
        Cost of replacing a character with a different one (default 2).

    Returns
    -------
    float
        Minimum total cost of turning ``text1`` into ``text2``. With the
        default costs the value is identical to the previous implementation.

    Notes
    -----
    Pure-Python dynamic programming that keeps only two rows of the cost
    table, so the function no longer needs numpy (which the notebook never
    imports).
    """
    # previous_row[j]: cost of building the first j characters of text2 from
    # an empty prefix of text1, i.e. j insertions.
    previous_row = [j * ins_cost for j in range(len(text2) + 1)]

    for i, char1 in enumerate(text1, start=1):
        # First column: delete the first i characters of text1.
        current_row = [i * del_cost]
        for j, char2 in enumerate(text2, start=1):
            delete = previous_row[j] + del_cost
            insert = current_row[j - 1] + ins_cost
            # Matching characters carry the diagonal cost over unchanged.
            replace = previous_row[j - 1] + (0 if char1 == char2 else rep_cost)
            current_row.append(min(delete, insert, replace))
        previous_row = current_row

    # Cast to float because the old numpy-backed table returned a float.
    return float(previous_row[-1])

In [33]:
def load_tambon_data(file_name):
    """Load the tambon reference table and strip the level abbreviations.

    Parameters
    ----------
    file_name : str or path-like
        Excel file passed to ``pd.read_excel``. It must contain the columns
        'CHANGWAT_T', 'AMPHOE_T' and 'TAMBON_T'.

    Returns
    -------
    pandas.DataFrame
        The table with 'จ. ', 'อ. ' and 'ต. ' removed from the changwat,
        amphoe and tambon columns respectively.
    """
    tambon_df = pd.read_excel(file_name)

    # Raw strings keep '\.' a regex escape rather than an invalid Python
    # string escape; the patterns themselves are unchanged.
    prefix_by_column = {
        'CHANGWAT_T': r'จ\. ',
        'AMPHOE_T': r'อ\. ',
        'TAMBON_T': r'ต\. ',
    }
    for column, prefix in prefix_by_column.items():
        tambon_df[column] = tambon_df[column].replace({prefix: ''}, regex=True)

    return tambon_df
    

def _extract_best_match(list_other, candidates):
    """Find the (token, candidate) pair with the smallest edit distance.

    The matched token is deleted from ``list_other`` in place, as the three
    public extractors have always done. Ties go to the first pair seen,
    scanning tokens in order and, for each token, candidates in order.

    When ``list_other`` or ``candidates`` is empty there is nothing to match:
    the list is returned untouched together with ''. Previously this case
    raised IndexError on an empty list or silently dropped token 0.
    """
    best_distance = float('inf')
    best_name = ''
    best_index = None

    for index, token in enumerate(list_other):
        for name in candidates:
            distance = levenshtein(token, name)
            if distance < best_distance:
                best_distance, best_index, best_name = distance, index, name
        if best_distance == 0:
            # Exact match: no later pair can be strictly smaller.
            break

    if best_index is not None:
        del list_other[best_index]

    return list_other, best_name


def extract_changwat(list_other, changwats):
    """Pick the changwat closest to any token in ``list_other``.

    Parameters
    ----------
    list_other : list of str
        Remaining address tokens; the matched token is removed in place.
    changwats : iterable of str
        Candidate changwat names.

    Returns
    -------
    tuple
        (list_other, matched changwat name, or '' when nothing could match).
    """
    return _extract_best_match(list_other, changwats)


def extract_amphoe(list_other, amphoes):
    """Pick the amphoe closest to any token in ``list_other``.

    Parameters
    ----------
    list_other : list of str
        Remaining address tokens; the matched token is removed in place.
    amphoes : iterable of str
        Candidate amphoe names.

    Returns
    -------
    tuple
        (list_other, matched amphoe name, or '' when nothing could match).
    """
    return _extract_best_match(list_other, amphoes)


def extract_tambon(list_other, tambons):
    """Pick the tambon closest to any token in ``list_other``.

    Parameters
    ----------
    list_other : list of str
        Remaining address tokens; the matched token is removed in place.
    tambons : iterable of str
        Candidate tambon names.

    Returns
    -------
    tuple
        (list_other, matched tambon name, or '' when nothing could match).
    """
    return _extract_best_match(list_other, tambons)
# extract_tambon(ex, tambon_df[(tambon_df['CHANGWAT_T'] == 'จ. ระยอง') &
#                              (tambon_df['AMPHOE_T'] == 'อ. แกลง')]['TAMBON_T'].replace({'ต\. ' : '', 'ระยอง' : ''}, regex = True).unique())        

def extract_all_v1(text, tambon_df):
    """Parse a raw address into all of its structured fields.

    The address is cleaned and split by ``run_all``. The leftover tokens are
    then matched level by level: changwat first, then amphoe within that
    changwat, then tambon within that amphoe.

    Parameters
    ----------
    text : str
        Raw address string.
    tambon_df : pandas.DataFrame
        Reference table with 'CHANGWAT_T', 'AMPHOE_T' and 'TAMBON_T' columns.

    Returns
    -------
    dict
        The fields from ``run_all`` (without 'list_other') plus
        'Sub District', 'District', 'Province' and 'Other'. A level that has
        no token left to match is returned as ''.
    """
    all_ = run_all(text)
    list_other = all_.pop('list_other')

    changwat = amphoe = tambon = ''

    # Each level runs only while tokens remain, so short addresses no longer
    # crash inside the extractors.
    if list_other:
        changwat_list = tambon_df['CHANGWAT_T'].unique()
        list_other, changwat = extract_changwat(list_other, changwat_list)

    in_changwat = tambon_df['CHANGWAT_T'] == changwat
    if list_other:
        # unique() keeps first-appearance order, so the chosen match is the
        # same as before while repeated names are no longer scored again.
        amphoe_list = tambon_df.loc[in_changwat, 'AMPHOE_T'].unique()
        list_other, amphoe = extract_amphoe(list_other, amphoe_list)

    if list_other:
        in_amphoe = in_changwat & (tambon_df['AMPHOE_T'] == amphoe)
        tambon_list = tambon_df.loc[in_amphoe, 'TAMBON_T'].unique()
        list_other, tambon = extract_tambon(list_other, tambon_list)

    all_['Sub District'] = tambon
    all_['District'] = amphoe
    all_['Province'] = changwat
    all_['Other'] = ' '.join(list_other)

    return all_


In [34]:
tambon_df = load_tambon_data(r'thai_tambon_data.xlsx')

ex1 = '61 หมู่ 5 ถ.มิ่งขวัญ ตำบลเว่อเล็ก อำเภอเทพสถิต ปัตตานี 55850'
ex2 = '90 หมู่ 1 อาคารสรพล หมู่บ้านเขาดิน ตำบลทุ่งควายกิน อำเภอแกลง จังหวัดระยอข'


def _parse_and_show(address):
    """Print an address, a divider and its parsed fields; return the fields."""
    print(address)
    print('----')
    fields = extract_all_v1(address, tambon_df)
    print(fields)
    return fields


all_1 = _parse_and_show(ex1)
print('------------------------------')

all_2 = _parse_and_show(ex2)

61 หมู่ 5 ถ.มิ่งขวัญ ตำบลเว่อเล็ก อำเภอเทพสถิต ปัตตานี 55850
----
{'Zip Code': '55850', 'Moo': 'หมู่5', 'House Number': '61', 'Sub District': 'ตอหลัง', 'District': 'ยะหริ่ง', 'Province': 'ปัตตานี', 'Other': 'เทพสถิต'}
------------------------------
90 หมู่ 1 อาคารสรพล หมู่บ้านเขาดิน ตำบลทุ่งควายกิน อำเภอแกลง จังหวัดระยอข
----
{'Zip Code': None, 'Moo': 'หมู่1', 'House Number': '90', 'Sub District': 'ทุ่งควายกิน', 'District': 'แกลง', 'Province': 'ระยอง', 'Other': 'อาคารสรพล หมู่บ้านเขาดิน'}


In [28]:
all_1

{'Zip Code': '55850',
 'Moo': 'หมู่5',
 'House Number': '61',
 'Sub District': 'ตอหลัง',
 'District': 'ยะหริ่ง',
 'Province': 'ปัตตานี',
 'Other': 'เทพสถิต'}