In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import tensorflow as tf

In [2]:
##### Limit GPU for training ###
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Restrict TensorFlow to only use the fourth GPU
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')

        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

1 Physical GPUs, 1 Logical GPUs


In [3]:
# File loading
### read hopital names from datasets ###
df = pd.read_csv(r'../datasets/hospital_augment_train7.csv')

In [4]:
df.head()

Unnamed: 0,gen_data,true_data
0,โรงพยาบลสินแพทย์รามอินทรา,โรงพยาบาลสินแพทย์ รามอินทรา
1,โรงซพยาบาลสินแพทย์ ลำลูกกา,โรงพยาบาลสินแพทย์ ลำลูกกา
2,โรงพยาบาลอินทรารัตน์,โรงพยาบาลอินทรารัตน์
3,โรงพยาบาลก้วยน้ำไท,โรงพยาบาลกล้วยน้ำไท
4,โรงพยาบาลสมเด็จเจ้โพระยา,โรงพยาบาลสมเด็จเจ้าพระยา


In [5]:
hospital_gen_train = list(df['gen_data'])
hospital_true_train = list(df['true_data'])

In [6]:
df_test = pd.read_csv(r'../datasets/hospital_augment_test6.csv')
df_val = pd.read_csv(r'../datasets/hospital_augment_val7.csv')

In [7]:
hospital_gen_val = list(df_val['gen_data'])
hospital_true_val = list(df_val['true_data'])
hospital_gen_test = list(df_test['gen_data'])
hospital_true_test = list(df_test['true_data'])

In [8]:
len(hospital_gen_train)

85227

In [9]:
# Preprocessing
import re
def process(sent):
    sent=sent.lower()
    sent=re.sub(r'[^0-9a-zA-Z ]','',sent)
    sent=sent.replace('\n','')
    return sent 

In [10]:
from pythainlp import thai_letters

In [11]:
label_map_0 = ["0","1","2","3","4","5","6","7","8","9","-",".","/","(",")","\"","&"]
label_map_1 = list(thai_letters)

In [12]:
char_set = label_map_0+label_map_1

In [13]:
char2int = { char_set[x]:x for x in range(len(char_set)) }
int2char = { char2int[x]:x for x in char_set }

In [14]:
count = len(char_set)
codes = ['\t' ,'\n', ' ']

In [15]:
for i in range(len(codes)):
    code = codes[i]
    char2int[code]=count
    int2char[count]=code
    count+=1

In [16]:
input_texts = []
target_texts = []

In [17]:
hospital_gen_train[:10]

['โรงพยาบลสินแพทย์รามอินทรา',
 'โรงซพยาบาลสินแพทย์ ลำลูกกา',
 'โรงพยาบาลอินทรารัตน์',
 'โรงพยาบาลก้วยน้ำไท',
 'โรงพยาบาลสมเด็จเจ้โพระยา',
 'โรงพยาบาลตากสิน',
 'โรงพยาบาลนันอญ',
 'โรงพยา8าลคลอสามวา',
 'โรงพยาบลเปาโล เกษตร',
 'โรงพยาบาลบางขนเทีโน 1']

In [18]:
for i,line in enumerate(hospital_true_train):
    output_text = '\t' + line + '\n'
    input_texts.append(hospital_gen_train[i].replace('\u200b',''))
    target_texts.append(output_text.replace('\u200b',''))
print("LEN OF SAMPLES:",len(input_texts))

LEN OF SAMPLES: 85227


In [19]:
input_texts[10],target_texts[10]

('โรงพยาบาลบางปะกอก 9ออินเตร์เบั่นแนล',
 '\tโรงพยาบาลบางปะกอก 9 อินเตอร์เนชั่นแนล\n')

In [20]:
max_enc_len = max([len(x)+x.count(' ') for x in input_texts])
max_dec_len = max([len(x)+x.count(' ') for x in target_texts])
print("Max Enc Len:",max_enc_len)
print("Max Dec Len:",max_dec_len)

Max Enc Len: 78
Max Dec Len: 75


In [21]:
num_samples = len(input_texts)
encoder_input_data = np.zeros( (num_samples , max_enc_len , len(char_set)+3),dtype='float32' )
decoder_input_data = np.zeros( (num_samples , max_dec_len , len(char_set)+3),dtype='float32' )
decoder_target_data = np.zeros( (num_samples , max_dec_len , len(char_set)+3),dtype='float32' )
print("CREATED ZERO VECTORS")

CREATED ZERO VECTORS


In [22]:
encoder_input_data.shape,decoder_input_data.shape,decoder_target_data.shape

((85227, 78, 93), (85227, 75, 93), (85227, 75, 93))

In [23]:
input_texts[66],target_texts[66]

('โรงพยาบา)ลเกษมยราษฎร์รามคำแฬหง', '\tโรงพยาบาลเกษมราษฎร์รามคำแหง\n')

In [24]:
#filling in the enc,dec datas
for i,(input_text,target_text) in enumerate(zip(input_texts,target_texts)):
    #print(i)
    for t,char in enumerate(input_text):
        encoder_input_data[ i , t , char2int[char] ] = 1
    for t,char in enumerate(target_text):
        decoder_input_data[ i, t , char2int[char] ] = 1
        if t > 0 :
            decoder_target_data[ i , t-1 , char2int[char] ] = 1
print("COMPLETED...")         

COMPLETED...


In [25]:
len(input_texts),len(target_texts)

(85227, 85227)

In [26]:
val_x = []
val_y = []
for i,line in enumerate(hospital_true_val):
    output_text = '\t' + line + '\n'
    val_x.append(hospital_gen_val[i].replace('\u200b',''))
    val_y.append(output_text.replace('\u200b',''))
print("LEN OF SAMPLES:",len(val_x))

LEN OF SAMPLES: 8180


In [27]:
max_enc_len_val = max([len(x)+x.count(' ') for x in val_x])
max_dec_len_val = max([len(x)+x.count(' ') for x in val_y])
print("Max Enc Len:",max_enc_len_val)
print("Max Dec Len:",max_dec_len_val)

Max Enc Len: 75
Max Dec Len: 75


In [28]:
##### Validation Set #####
num_samples_val = len(val_x)
encoder_input_data_val = np.zeros( (num_samples_val , max_enc_len_val , len(char_set)+3),dtype='float32' )
decoder_input_data_val = np.zeros( (num_samples_val , max_dec_len_val , len(char_set)+3),dtype='float32' )
decoder_target_data_val = np.zeros( (num_samples_val , max_dec_len_val , len(char_set)+3),dtype='float32' )
print("CREATED ZERO VECTORS")

CREATED ZERO VECTORS


In [29]:
#filling in the enc,dec datas
for i,(input_text,target_text) in enumerate(zip(val_x,val_y)):
    for t,char in enumerate(input_text):
        encoder_input_data_val[ i , t , char2int[char] ] = 1
    for t,char in enumerate(target_text):
        decoder_input_data_val[ i, t , char2int[char] ] = 1
        if t > 0 :
            decoder_target_data_val[ i , t-1 , char2int[char] ] = 1
print("COMPLETED...")     

COMPLETED...


In [30]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input,LSTM,Dense,Activation,Attention,Bidirectional,Concatenate
import tensorflow as tf

In [31]:
batch_size = 256
epochs = 100
latent_dim = 256

num_enc_tokens = len(char_set)+3
num_dec_tokens = len(char_set) + 3 # includes \n \t ' '
encoder_inputs = Input(shape=(None,num_enc_tokens))
encoder = Bidirectional(LSTM(latent_dim,return_state=True))
#encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder(encoder_inputs)
#encoder_outputs , state_h, state_c = encoder(encoder_inputs)
#encoder_states = [state_h,state_c]
#encoder_states = [ forward_h, forward_c, backward_h, backward_c]
encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder(encoder_inputs)
state_h = Concatenate()([forward_h, backward_h])
state_c = Concatenate()([forward_c, backward_c])
encoder_states = [state_h, state_c]

In [33]:
decoder_inputs = Input(shape=(None,num_dec_tokens))
decoder_lstm = LSTM(latent_dim*2,return_sequences=True,return_state=True)
decoder_ouputs,_,_ = decoder_lstm(decoder_inputs,initial_state = encoder_states)

In [34]:
num_dec_tokens

93

In [35]:
decoder_dense = Dense(num_dec_tokens, activation='softmax')
decoder_ouputs = decoder_dense(decoder_ouputs)

In [36]:
model = Model([encoder_inputs,decoder_inputs],decoder_ouputs)
#model = Model.add(Attention())
model.compile(optimizer='adam',loss='categorical_crossentropy')
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None, 93)]   0                                            
__________________________________________________________________________________________________
bidirectional (Bidirectional)   [(None, 512), (None, 716800      input_1[0][0]                    
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, None, 93)]   0                                            
__________________________________________________________________________________________________
concatenate (Concatenate)       (None, 512)          0           bidirectional[0][1]              
                                                                 bidirectional[0][3]          

In [37]:
### Model includes only true sequence ###
earlystop_callback = tf.keras.callbacks.EarlyStopping(
  monitor='loss', min_delta=0.0001,
  patience=50)

saveModel_callback = tf.keras.callbacks.ModelCheckpoint(
            filepath= "models/bilstm_hosipital_gendata_v7.h5",
            save_best_only=True,
            monitor='loss',
            save_weights_only = True,
            verbose=1)
annealer = tf.keras.callbacks.LearningRateScheduler(lambda x: 1e-4 * 0.95 ** x)

In [38]:
hist=model.fit([encoder_input_data,decoder_input_data],decoder_target_data
         ,epochs = 200,validation_data=([encoder_input_data_val,decoder_input_data_val],decoder_target_data_val),
          batch_size = batch_size,callbacks=[earlystop_callback,annealer])

Train on 85227 samples, validate on 8180 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
 1280/85227 [..............................] - ETA: 44s - loss: 0.2445

InternalError: GPU sync failed

In [37]:
import matplotlib.pyplot as plt

In [41]:
plt.plot(hist.history['loss'])
plt.plot(hist.history['val_loss'])
plt.grid()

NameError: name 'hist' is not defined

In [39]:
encoder_model = Model(encoder_inputs,encoder_states)

decoder_state_input_h = Input(shape=(latent_dim*2,))
decoder_state_input_c = Input(shape=(latent_dim*2,))
decoder_states_inputs = [decoder_state_input_h,decoder_state_input_c]
decoder_outputs,state_h,state_c = decoder_lstm(
        decoder_inputs,initial_state = decoder_states_inputs
)
decoder_states = [state_h,state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states
)
encoder_model.save('encoder_v7.h5')
decoder_model.save('decoder_v7.h5')

In [40]:
from tensorflow.keras.models import load_model

In [41]:
encoder_model = load_model('encoder_v6.h5',compile=False)

In [42]:
decoder_model = load_model('decoder_v6.h5',compile=False)

In [43]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, num_dec_tokens))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0, char2int['\t']] = 1.

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    decoded_list =[]
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = int2char[sampled_token_index]
        decoded_sentence += sampled_char
        decoded_list.append(sampled_char)

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '\n' or
           len(decoded_sentence) > max_dec_len):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_dec_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        # Update states
        states_value = [h, c]

    return decoded_sentence,decoded_list

In [56]:
pred_sent_train = []
for seq_index in range(num_samples):
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence,lst = decode_sequence(input_seq)
    pred_sent_train.append(lst)
    print('-')
    print('Wrong sentence:', input_texts[seq_index])
    print('Corrected sentence:', decoded_sentence)
    print('Ground Truth:',target_texts[seq_index])

-
Wrong sentence: โรงพยาบาลส่งเสรมสุขภาพ ศูนย์อนามัยที่ 7
Corrected sentence: โรงพยาบาลส่งเสริมสุขภาพ ศูนย์อนามัยที่ 7

Ground Truth: 	โรงพยาบาลส่งเสริมสุขภาพ ศูนย์อนามัยที่ 7

-
Wrong sentence: โรงพยาบาลส่งเสิมสุขภาพ ศูนย์อนามัยที่10
Corrected sentence: โรงพยาบาลส่งเสริมสุขภาพ ศูนย์อนามัยที่10

Ground Truth: 	โรงพยาบาลส่งเสริมสุขภาพ ศูนย์อนามัยที่10

-
Wrong sentence: โรงพยาบาลส่งเสริมสุขภาพ ศูนย์อนามัยที่1
Corrected sentence: โรงพยาบาลส่งเสริมสุขภาพ ศูนย์อนามัยที่11

Ground Truth: 	โรงพยาบาลส่งเสริมสุขภาพ ศูนย์อนามัยที่11

-
Wrong sentence: โรงพยาบาลส่งเสริมสุขภาพ ศูย์อนามัยที่4
Corrected sentence: โรงพยาบาลส่งเสริมสุขภาพ ศูนย์อนามัยที่4

Ground Truth: 	โรงพยาบาลส่งเสริมสุขภาพ ศูนย์อนามัยที่4

-
Wrong sentence: โรงพยาบาลส่งเสริมสุขภาพเชีงใหม่
Corrected sentence: โรงพยาบาลส่งเสริมสุขภาพเชียงใหม่

Ground Truth: 	โรงพยาบาลส่งเสริมสุขภาพเชียงใหม่

-
Wrong sentence: โรงพยบาลสตึก
Corrected sentence: โรงพยาบาลสตึก

Ground Truth: 	โรงพยาบาลสตึก

-
Wrong sentence: โรงพยาบาลสูล
Corrected sente

In [55]:
test_x = []
test_y = []
for i,line in enumerate(hospital_true_test):
    output_text = '\t' + line + '\n'
    test_x.append(hospital_gen_test[i].replace('\u200b',''))
    test_y.append(output_text.replace('\u200b',''))
print("LEN OF SAMPLES:",len(test_y))

LEN OF SAMPLES: 6475


In [56]:
max_enc_len_test = max([len(x)+x.count(' ') for x in test_x])
max_dec_len_test = max([len(x)+x.count(' ') for x in test_y])
print("Max Enc Len:",max_enc_len_test)
print("Max Dec Len:",max_dec_len_test)

Max Enc Len: 75
Max Dec Len: 75


In [57]:
##### Test Set #####
num_samples_test = len(test_x)
encoder_input_data_test = np.zeros( (num_samples_test , max_enc_len_test , len(char_set)+3),dtype='float32' )
decoder_input_data_test = np.zeros( (num_samples_test , max_dec_len_test , len(char_set)+3),dtype='float32' )
decoder_target_data_test = np.zeros( (num_samples_test , max_dec_len_test , len(char_set)+3),dtype='float32' )
print("CREATED ZERO VECTORS")

CREATED ZERO VECTORS


In [58]:
#filling in the enc,dec datas
for i,(input_text,target_text) in enumerate(zip(test_x,test_y)):
    for t,char in enumerate(input_text):
        encoder_input_data_test[ i , t , char2int[char] ] = 1
    for t,char in enumerate(target_text):
        decoder_input_data_test[ i, t , char2int[char] ] = 1
        if t > 0 :
            decoder_target_data_test[ i , t-1 , char2int[char] ] = 1
print("COMPLETED...")  

COMPLETED...


In [49]:
model.evaluate([encoder_input_data_test,decoder_input_data_test],decoder_target_data_test)



[0.0021924177788397816, 0.28835458]

In [61]:
pred_sent_test = []
for seq_index in range(num_samples_test):
    input_seq = encoder_input_data_test[seq_index: seq_index + 1]
    decoded_sentence,lst = decode_sequence(input_seq)
    pred_sent_test.append(lst)
    sent = str(''.join(lst))
    sent_2 = sent.strip('\n')
    print(seq_index)
    '''
    print('-')
    print('Wrong sentence:', input_texts[seq_index])
    print('Corrected sentence:', sent_2)
    print('Ground Truth:',target_texts[seq_index])
    '''

2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674


In [62]:
count =0
wrong_pred =[]
wrong_true =[]
wrong_input = []
for i,key in enumerate(test_y):
    test = key[1:-1] 
    if test == pred_test[i]:
        count += 1
    else:
        wrong_pred.append(pred_test[i])
        wrong_true.append(test)
        wrong_input.append(test_x[i])

IndexError: list index out of range

In [45]:
count

6342

In [46]:
len(test_y)

6588

In [47]:
count - len(test_y)

-246

In [48]:
count/len(test_y)

0.9626593806921676

In [49]:
#### Incorrect sentence ####
for i,value in enumerate(wrong_input):
    print('Input:', wrong_input[i])
    print('Prediction:', wrong_pred[i])
    print('Gound Thrust:', wrong_true[i])
    print('-')


Input: โรงพยาบาลพะสมุทเจดีย์
Prediction: โรงพยาบาลพยุสมเด็จ
Gound Thrust: โรงพยาบาลพระสมุทรเจดีย์
-
Input: โรงพยาบาลจัฒนเวชสุโขทัย
Prediction: โรงพยาบาลปิยะเวชช์บ่อวิน
Gound Thrust: โรงพยาบาลพัฒนเวชสุโขทัย
-
Input: โรงพยาบาลภฃกระดึค
Prediction: โรงพยาบาลสร้าค
Gound Thrust: โรงพยาบาลภูกระดึง
-
Input: โรงพยาบาลษาษีไศล
Prediction: โรงพยาบาลบาลีลกว
Gound Thrust: โรงพยาบาลราษีไศล
-
Input: โรงพยาบาลลาขวัญ
Prediction: โรงพยาบาลหลาขวัญ
Gound Thrust: โรงพยาบาลเลาขวัญ
-
Input: โรงพยาบาลวิงม่วงสัทธรรม
Prediction: โรงพยาบาลม่วงสามสิบ
Gound Thrust: โรงพยาบาลวังม่วงสัทธรรม
-
Input: โรงพยาบฬาลส่งเสริมสุขภาพ ศูนย์อนามัยที่ 2
Prediction: โรงพยาบาลส่งเสริมสุขภาพ ศูนย์อนามัยที่ 2
Gound Thrust: โรงพยาบาลส่งเสริมสุขภาพ ศูนย์อนามัยที่ 12
-
Input: โรงพๅาบาลสฉงเสริมสุขภาพ ศู๋ย์อนาใมัยที่ 3
Prediction: โรงพยาบาลส่งเสริมสุขภาพ ศูนย์อนามัยที่ 6
Gound Thrust: โรงพยาบาลส่งเสริมสุขภาพ ศูนย์อนามัยที่ 3
-
Input: โรงพยาบาลส่งเสิมสุขภาพซศูนย์อนามัยที่ 5
Prediction: โรงพยาบาลส่งเสริมสุขภาพ ศูนย์อนามัยที่ 7
Gound Thrust

In [50]:
wrong_pred[3],wrong_input[3],wrong_true[3]

('โรงพยาบาลโพธิ์ชุ่', 'โรงพยาบาลโุดรัง', 'โรงพยาบาลกุดรัง')

In [51]:
df_wrong_input = pd.DataFrame(wrong_input)
df_wrong_true = pd.DataFrame(wrong_true)

In [52]:
df_wrong_input.columns=['gen_data']
df_wrong_true.columns=['true_data']

In [53]:
df_col_merged =pd.concat([df_wrong_input,df_wrong_true],axis=1)

In [54]:
df_col_merged.to_csv('hospital_wrong6.csv',index=False)

In [55]:
from char_error_rate import get_char_error_rate

In [56]:
def createSpace_for_cer(text):
    re = []
    for t in text :
        re.append(t)
        
    #re.append("<eow>")
    #pad_list = ["<pad>"]*(max_seq-len(re)+index)
    #re_pad = re + pad_list
    return re

In [57]:
pred_test_output = []
for txt in pred_test:
    pred_test_output.append(createSpace_for_cer(txt))

In [58]:
ground_test =[]
for txt in test_y:
    ground_test.append(createSpace_for_cer(txt))

In [59]:
input_test = []
for txt in test_x:
    input_test.append(createSpace_for_cer(txt))

In [60]:
cer = 0
for i,key in enumerate(ground_test):
    cer += get_char_error_rate(ground_test[i],pred_test_output[i])
    #print(get_char_error_rate(ground_test[i],pred_test_output[i]))
cer_mean = cer/len(ground_test)  
print('Character Error Rate for Test Set:{}%'.format(cer_mean))

Character Error Rate for Test Set:10.360453138827248%


In [61]:
cer = 0
for i,key in enumerate(ground_test):
    cer += get_char_error_rate(ground_test[i],input_test[i])
    #print(get_char_error_rate(ground_test[i],pred_test_output[i]))
cer_mean = cer/len(ground_test)  
print('Character Error Rate for Test Set:{}%'.format(cer_mean))

Character Error Rate for Test Set:15.09780709112849%


In [81]:
test_x[50],test_y[50]

('โรงพยาบาพรหมบุรี', '\tโรงพยาบาลพรหมบุรี\n')

In [112]:
pred_sent_test[50]

['โ',
 'ร',
 'ง',
 'พ',
 'ย',
 'า',
 'บ',
 'า',
 'ล',
 'พ',
 'ร',
 'ห',
 'ม',
 'บ',
 'ุ',
 'ร',
 'ี',
 '\n']

In [210]:
text50 = ''.join(pred_sent_test[50])
text50

'โรงพยาบาลพรหมบุรี\n'

In [108]:
input_seq = encoder_input_data_test[50:51]

In [87]:
input_seq.shape

(1, 52, 93)

In [101]:
input_seq[0,0,:]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)

In [109]:
states_value = encoder_model.predict(input_seq)

In [202]:
states_value[0].shape

(1, 256)

In [203]:
states_value[1].shape

(1, 256)

In [110]:
target_seq = np.zeros((1, 1, num_dec_tokens))

In [95]:
target_seq.shape

(1, 1, 93)

In [96]:
target_seq[0, 0, char2int['\t']] = 1

In [97]:
target_seq

array([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]]])

In [105]:
out = [target_seq]+states_value
len(out)

3

In [205]:
output_tokens,h,c = decoder_model.predict([target_seq] + states_value)

In [None]:
sampled_token_index = np.argmax(output_tokens[0, -1, :])

In [208]:
np.argmax(output_tokens[0,-1,:])

91

In [111]:
stop_condition = False
decoded_sentence = ''
decoded_list =[]
while not stop_condition:
    output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

    # Sample a token
    sampled_token_index = np.argmax(output_tokens[0, -1, :])
    sampled_char = int2char[sampled_token_index]
    decoded_sentence += sampled_char
    
    decoded_list.append(sampled_char)
    print('Decoder seq:{}'.format(''.join(decoded_list)))
    print('Character:{}'.format(sampled_char))

    # Exit condition: either hit max length
    # or find stop character.
    if (sampled_char == '\n' or len(decoded_sentence) > max_dec_len):
            stop_condition = True

        # Update the target sequence (of length 1).
    target_seq = np.zeros((1, 1, num_dec_tokens))
    target_seq[0, 0, sampled_token_index] = 1.

        # Update states
    states_value = [h, c]

Decoder seq:โ
Character:โ
Decoder seq:โร
Character:ร
Decoder seq:โรง
Character:ง
Decoder seq:โรงพ
Character:พ
Decoder seq:โรงพย
Character:ย
Decoder seq:โรงพยา
Character:า
Decoder seq:โรงพยาบ
Character:บ
Decoder seq:โรงพยาบา
Character:า
Decoder seq:โรงพยาบาล
Character:ล
Decoder seq:โรงพยาบาลพ
Character:พ
Decoder seq:โรงพยาบาลพร
Character:ร
Decoder seq:โรงพยาบาลพรห
Character:ห
Decoder seq:โรงพยาบาลพรหม
Character:ม
Decoder seq:โรงพยาบาลพรหมบ
Character:บ
Decoder seq:โรงพยาบาลพรหมบุ
Character:ุ
Decoder seq:โรงพยาบาลพรหมบุร
Character:ร
Decoder seq:โรงพยาบาลพรหมบุรี
Character:ี
Decoder seq:โรงพยาบาลพรหมบุรี

Character:

