In [99]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import tensorflow as tf

In [100]:
##### Limit GPU for training ###
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Restrict TensorFlow to only use the fourth GPU
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')

        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

1 Physical GPUs, 1 Logical GPUs


In [3]:
### read hopital names from datasets ###
df = pd.read_csv(r'hospital_v3.csv')

In [4]:
df.head()

Unnamed: 0,name
0,คลินิกกรุงเทพพัทยาสาขาจอมเทียน
1,คลินิกกรุงเทพระยองสาขาบ่อวิน
2,คลินิกกรุงเทพระยองสาขาบ้านฉาง
3,คลินิกกรุงเทพระยองสาขาปลวกแดง
4,คลินิกไทยอินเตอร์การแพทย์(เกาะพงัน)


In [11]:
df_addv3 = pd.read_csv(r'hospital_v3add.csv')

In [12]:
df_addv3.head()

Unnamed: 0,name
0,โรงพยาบาลสินแพทย์ รามอินทรา
1,โรงพยาบาลสินแพทย์ ลำลูกกา
2,โรงพยาบาลอินทรารัตน์
3,โรงพยาบาลกล้วยน้ำไท
4,โรงพยาบาลสมเด็จเจ้าพระยา


In [25]:
df_new = pd.concat([df,df_addv3],axis=0)

In [27]:
df_new.to_csv('hospital_v4.csv',index=False)

In [26]:
df_new

Unnamed: 0,name
0,คลินิกกรุงเทพพัทยาสาขาจอมเทียน
1,คลินิกกรุงเทพระยองสาขาบ่อวิน
2,คลินิกกรุงเทพระยองสาขาบ้านฉาง
3,คลินิกกรุงเทพระยองสาขาปลวกแดง
4,คลินิกไทยอินเตอร์การแพทย์(เกาะพงัน)
...,...
84,โรงพยาบาลทหารผ่านศึก
85,โรงพยาบาลอานันทมหิดล
86,ศูนย์การแพทย์กาญจนาภิเษก
87,โรงพยาบาลตา หู คอ จมูก


In [5]:
from pythainlp import thai_letters

In [6]:
label_map_0 = ["0","1","2","3","4","5","6","7","8","9","-",".","/","(",")","\"","&"]
label_map_1 = list(thai_letters)
char_set = label_map_0+label_map_1

In [7]:
len(char_set)

90

In [8]:
char2int = { char_set[x]:x for x in range(len(char_set)) }
int2char = { char2int[x]:x for x in char_set }

In [9]:
#thresh - 0 to 1
import random
def gen_gibberish(line,thresh=0.15):
    text_true = line
    times = int(random.randrange(1,len(line)) * thresh)
    '''
    Types of replacement:
        1.Delete random character.
        2.Add random character.
        3.Replace a character.
        4.Combination?
    '''
    while times!=0:
        # try to gen noise length times...
        times-=1
        val = random.randrange(0,10)
        if val <= 5:
            #get random index
            val = random.randrange(0,10)
            index = random.randrange(2,len(line))
            if val <= 3 :
                #delete character
                line = line[:index]+line[index+1:]
            else:
                #add character
                insert_index = random.randrange(0,len(char_set))
                line = line[:index] + char_set[insert_index] + line[index:]
        else:
            index = random.randrange(0,len(char_set))
            replace_index = random.randrange(2,len(line))
            line = line[:replace_index] + char_set[index] + line[replace_index+1:]
    return line,text_true

In [53]:
hos_list = list(df_new['name'])
len(hos_list)

1375

In [55]:
b = hos_list[:46]
hos_list[46]

'บ้าน'

In [57]:
b2 = hos_list[47:]
len(b2)

1328

In [58]:
hos_list2= b+b2

In [59]:
len(hos_list2)

1374

In [60]:
if 'บ้าน' in hos_list2:
    print('vao')

In [71]:
gen_data =[]
true_data =[]
for i in range(8):
    for line in hos_list:
        a,b = gen_gibberish(line)
        gen_data.append(a)
        true_data.append(b)

In [72]:
len(gen_data)

11000

In [17]:
'''
a1 = ['บาน']
a2 = ['บ้า']
a3 = ['บ้านน']
a4 = ['ป้าน']
a5 =['ป้านน']
a6 =['บ็าน']
a7= ['ข้าน']
a8 = ['บานน']
a9 = ['ช้าน']
a10 = ['บ๊าน']
'''

In [73]:
a1 = ['ขาน']
a2 = ['ข้า']
a3 = ['บ้น']
a4 = ['ป้า']
a5 =['ช้านน']
a6 =['บ๙าน']
a7= ['ข้าน']
a8 = ['ัาน']
a9 = ['บ้าง']
a10 = ['ปาน']

In [74]:

gen_data = gen_data+a1+a2+a3+a4+a5+a6+a7+a8+a9+a10
true_data = true_data +['บ้าน']*10

In [75]:
'''
gen_data = gen_data+hos_list
true_data = true_data+hos_list
'''

'\ngen_data = gen_data+hos_list\ntrue_data = true_data+hos_list\n'

In [76]:
# using naive method 
# to remove duplicated  
# from list  
res_gen = []
res_true =[] 
for i,key in enumerate(gen_data): 
    if not key  in res_gen: 
        res_gen.append(key)
        res_true.append(true_data[i])

In [77]:
len(res_gen)

8378

In [89]:
res_gen[100],res_true[100]

('โรงพยาบาลกำแพงเพชร', 'โรงพยาบาลกำแพงเพชร')

In [23]:
k = int(len(res_gen)*0.05)

In [24]:
k

7825

In [90]:
'''
test_x = res_gen[len(res_gen)-k:]
test_y=res_true[len(res_true)-k:]
'''
test_x =res_gen
test_y =res_true

In [79]:
len(test_x),len(test_y)

(8378, 8378)

In [32]:
'''
train_x = res_gen[:len(res_gen)-k]
train_y = res_true[:len(res_true)-k]
'''
train_x = res_gen
train_y =res_true

In [33]:
len(train_x),len(train_y)

(11825, 11825)

In [80]:
df6 = pd.read_csv('hospital_augment_train6.csv')

In [81]:
df6_trainx = list(df6['gen_data'])
df6_trainy =list(df6['true_data'])

In [82]:
len(df6_trainx),len(df6_trainy)

(78352, 78352)

In [91]:
test_x2 =[]
test_y2 =[]
for i,val in enumerate(test_x):
    if not val in df6_trainx:
        test_x2.append(test_x[i])
        test_y2.append(test_y[i])

In [92]:
len(test_x2),len(test_y2)

(6475, 6475)

In [95]:
gen_df_test = pd.DataFrame(test_x2)
true_df_test=pd.DataFrame(test_y2)

In [94]:
test_x2[100],test_y2[100]

('โรบพยา4าลเขาย้อย', 'โรงพยาบาลเขาย้อย')

In [96]:
gen_df_test.columns =['gen_data']
true_df_test.columns =['true_data']

In [97]:
df_col_merged_test=pd.concat([gen_df_test,true_df_test],axis=1)

In [98]:
df_col_merged_test.to_csv('hospital_augment_test6.csv',index=False)

In [39]:
df_wrong6 = pd.read_csv('hospital_wrong6.csv')

In [40]:
df6_trainx = list(df_wrong6['gen_data'])
df6_trainy =list(df_wrong6['true_data'])

In [45]:
df6_trainx[19],df6_trainy[19]

('โรงพยาบาลฅูหลวง', 'โรงพยาบาลภูหลวง')

In [46]:
for i,val in enumerate(df6_trainx):
    if not val in train_x:
        train_x.append(df6_trainx[i])
        train_y.append(df6_trainy[i])

In [48]:
len(train_x)

78352

In [49]:
gen_df_train = pd.DataFrame(train_x)
true_df_train = pd.DataFrame(train_y)
#gen_df_test =pd.DataFrame(test_x)
#true_df_test =pd.DataFrame(test_y)

In [50]:
gen_df_train.columns=['gen_data']
true_df_train.columns=['true_data']
#gen_df_test.columns =['gen_data']
#true_df_test.columns =['true_data']

In [51]:
df_col_merged_train =pd.concat([gen_df_train,true_df_train],axis=1)
# =pd.concat([gen_df_test,true_df_test],axis=1)

In [52]:
df_col_merged_train.to_csv('hospital_augment_train6.csv',index=False)
#df_col_merged_test.to_csv('hospital_augment_test5.csv',index=False)

In [None]:
len(res_gen)
len(res_true)

In [78]:
# using naive method 
# to remove duplicated  
# from list  
res_gen = []
res_true =[] 
for i,key in enumerate(gen_data): 
    if key not in res_gen: 
        res_gen.append(key)
        res_true.append(true_data[i])

In [79]:
len(res_gen)
len(res_true)

708832

In [89]:
import random

In [93]:
random.shuffle(res_gen)

In [95]:
random.shuffle(res_true)

In [98]:
train_data_gen =[]
train_data_true =[]
val_data_gen =[]
val_data_true =[]
for i in range(0,len(res_gen),10):
    val_data_gen.append(res_gen[i])
    val_data_true.append(res_true[i])

In [99]:
len(val_data_gen)

70884

In [100]:
for i,key in enumerate(res_gen):
    if not key in val_data_gen:
        train_data_gen.append(res_gen[i])
        train_data_true.append(res_true[i])


In [102]:
test_data_gen =[]
test_data_true=[]
for i in range(0,len(train_data_gen),10):
    test_data_gen.append(train_data_gen[i])
    test_data_true.append(train_data_true[i])

In [104]:
train_gen =[]
train_true = []
for i,key in enumerate(train_data_gen):
    if not key in test_data_gen:
        train_gen.append(train_data_gen[i])
        train_true.append(train_data_true[i])

In [105]:
len(train_gen),len(val_data_gen),len(test_data_gen)

(574153, 70884, 63795)

In [106]:
train_gen_out = train_gen + hos_list
train_true_out = train_true +hos_list
val_gen_out = val_data_gen
val_true_out = val_data_true
test_gen_out = test_data_gen
test_true_out = test_data_true

In [107]:
gen_df_train = pd.DataFrame(train_gen_out)
true_df_train = pd.DataFrame(train_true_out)
gen_df_val =pd.DataFrame(val_gen_out)
true_df_val =pd.DataFrame(val_true_out)
gen_df_test =pd.DataFrame(test_gen_out)
true_df_test =pd.DataFrame(test_true_out)

In [108]:
gen_df_train.columns=['gen_data']
true_df_train.columns=['true_data']
gen_df_val.columns =['gen_data']
true_df_val.columns =['true_data']
gen_df_test.columns =['gen_data']
true_df_test.columns =['true_data']

In [109]:
df_col_merged_train =pd.concat([gen_df_train,true_df_train],axis=1)

In [110]:
df_col_merged_val =pd.concat([gen_df_val,true_df_val],axis=1)
df_col_merged_test =pd.concat([gen_df_test,true_df_test],axis=1)

In [111]:
df_col_merged_train.to_csv('hospital_augment_train4.csv',index=False)
df_col_merged_val.to_csv('hospital_augment_val4.csv',index=False)
df_col_merged_test.to_csv('hospital_augment_test4.csv',index=False)

In [13]:
import random

In [31]:
def gen_delete(text,per=0.15):
    e = np.random.uniform() 
    if e <0.1:
        return text
    r = np.random.randint(len(list(text)))
    text_lst = list(text)
    del text_lst[r]
    return ''.join(text_lst)

In [32]:
def gen_insert(text,per=0.15):
    e = np.random.uniform() 
    if e <0.1:
        return text
    index = np.random.randint(len(list(text)))
    char = random.choice(char_set)
    text_list = list(text)
    text_list.insert(index, char)
    return ''.join(text_list)

In [33]:
def gen_substitution(text,per=0.15):
    e = np.random.uniform() 
    if e <0.1:
        return text
    index = np.random.randint(len(list(text)))
    char = random.choice(char_set)
    text_list = list(text)
    text_list[index] = char
    return ''.join(text_list)

In [34]:
delete_hos = []
delete_hos_true =[]
for i in range(10):
    for p in hos_list:
        delete_hos.append(gen_delete(p))
        delete_hos_true.append(p)

In [35]:
insert_hos = []
insert_hos_true =[]
for i in range(10):
    for p in hos_list:
        insert_hos.append(gen_insert(p))
        insert_hos_true.append(p)

In [36]:
sub_hos = []
sub_hos_true =[]
for i in range(10):
    for p in hos_list:
        sub_hos.append(gen_substitution(p))
        sub_hos_true.append(p)

In [37]:
gen_data2 = delete_hos+ insert_hos+sub_hos
true_data2 = delete_hos_true + insert_hos_true+sub_hos_true

In [38]:
len(gen_data2),len(true_data2)

(38700, 38700)

In [55]:
len(res_gen),len(res_true)

(708628, 708628)

In [56]:
ture_idx = len(res_true) -1290

In [57]:
gen_data = res_gen[:ture_idx]
true_dat = res_true[:ture_idx]

In [68]:
gen_data[2000],true_data[2000]

('โร8พยาบาลร่มฉัตร', 'โรงพยาบาลโพธาราม')

In [65]:
train_data_gen =[]
train_data_true =[]
val_data_gen =[]
val_data_true =[]
for i in range(0,len(gen_data),15):
    val_data_gen.append(gen_data[i])
    val_data_true.append(true_data[i])

In [67]:
len(val_data_gen)

70734

In [46]:
for i,key in enumerate(res_gen):
    if not key in val_data_true:
        train_data_gen.append(key)
        train_data_true.append(res_true[i])

In [47]:
len(train_data_gen),len(val_data_gen),len(train_data_true),len(val_data_true)

(32369, 3353, 32369, 3353)

In [48]:
train_data_gen[500],train_data_true[500]

('โรงพาบาลบ้านลาด', 'โรงพยาบาลบ้านลาด')

In [49]:
test_data_gen =[]
test_data_true =[]
for i in range(0,len(train_data_gen),12):
    test_data_gen.append(train_data_gen[i])
    test_data_true.append(train_data_true[i])

In [50]:
train_data_gen_res =[]
train_data_true_res =[]
for i,key in enumerate(train_data_gen):
    if not key in test_data_gen:
        train_data_gen_res.append(train_data_gen[i])
        train_data_true_res.append(train_data_true[i])

In [51]:
train_data_gen[0],train_data_true[0]

('คลินิกรุงเทพพัทยาสาขาจอมเทียน', 'คลินิกกรุงเทพพัทยาสาขาจอมเทียน')

In [52]:
len(train_data_true_res),len(val_data_gen),len(test_data_gen)

(29671, 3353, 2698)

In [53]:
train_gen = train_data_gen_res + hos_list
train_true = train_data_true_res +hos_list
val_gen = val_data_gen
val_true = val_data_true
test_gen = test_data_gen
test_true = test_data_true

In [54]:
gen_df_train = pd.DataFrame(train_gen)
true_df_train = pd.DataFrame(train_true)
gen_df_val =pd.DataFrame(val_gen)
true_df_val =pd.DataFrame(val_true)
gen_df_test =pd.DataFrame(test_data_gen)
true_df_test =pd.DataFrame(test_data_true)

In [55]:
gen_df_train.columns=['gen_data']
true_df_train.columns=['true_data']
gen_df_val.columns =['gen_data']
true_df_val.columns =['true_data']
gen_df_test.columns =['gen_data']
true_df_test.columns =['true_data']

In [56]:
len(gen_df_train),len(gen_df_val),len(gen_df_test)

(30961, 3353, 2698)

In [57]:
df_col_merged_train =pd.concat([gen_df_train,true_df_train],axis=1)

In [58]:
df_col_merged_val =pd.concat([gen_df_val,true_df_val],axis=1)
df_col_merged_test =pd.concat([gen_df_test,true_df_test],axis=1)

In [59]:
df_col_merged_train.to_csv('hospital_augment_train3.csv',index=False)
df_col_merged_val.to_csv('hospital_augment_val3.csv',index=False)
df_col_merged_test.to_csv('hospital_augment_test3.csv',index=False)

In [87]:
df_col_merged2.to_csv('hospital_augment2_val.csv',index=False)

In [167]:
len(sub_hos)

12900

In [109]:
df_test = pd.read_csv('hospital_augment_test.csv')

In [110]:
df_test_gen = list(df_test['gen_data'])
df_test_true = list(df

In [142]:
df_test_read = pd.read_csv('hospital_augment_test2.csv')

In [148]:
list_test_gen = list(df_test_read['gen_data'])
list_test_true = list(df_test_read['true_data'])

In [149]:
test_gen_res =[]
test_true_res=[]
for i in range(0,len(list_test_gen),2):
    test_gen_res.append(list_test_gen[i])
    test_true_res.append(list_test_true[i])

In [150]:
gen_df_test2=pd.DataFrame(test_gen_res)
true_df_test2 =pd.DataFrame(test_true_res)

In [151]:
len(gen_df_test2)

3831

In [152]:
gen_df_test2.columns =['gen_data']
true_df_test2.columns =['true_data']

In [153]:
df_col_merged_test =pd.concat([gen_df_test2,true_df_test2],axis=1)

In [154]:
df_col_merged_test.to_csv('hospital_augment_test2.csv',index=False)