In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [None]:
# Directory that holds the source/target .npy files. Set the SEQ2SEQ_DATA_DIR
# environment variable to run the notebook on another machine; the default is
# the location the notebook was originally written against.
DATA_DIR = os.environ.get('SEQ2SEQ_DATA_DIR', 'C:/Users/USER/TF_2_6')

# NOTE(security): allow_pickle=True unpickles arbitrary Python objects, which can
# execute code while loading. Only point DATA_DIR at files you trust.
src_dataset = np.load(file=f'{DATA_DIR}/user_source.npy', allow_pickle=True)
tar_dataset = np.load(file=f'{DATA_DIR}/user_target.npy', allow_pickle=True)

In [None]:
# The two arrays are paired row by row, so they must be the same length.
# Failing here gives a clear message instead of NaN rows that break later cells.
if len(src_dataset) != len(tar_dataset):
    raise ValueError(
        'source and target datasets differ in length: '
        f'{len(src_dataset)} != {len(tar_dataset)}'
    )

# Build each column in a single step. Growing a frame with `.loc[i] = ...`
# re-allocates on every row and becomes very slow for large datasets.
# dtype=object keeps every sequence as one cell value.
src_df = pd.DataFrame({'src': pd.Series(list(src_dataset), dtype=object)})
tar_df = pd.DataFrame({'tar': pd.Series(list(tar_dataset), dtype=object)})

df = pd.concat([src_df, tar_df], axis=1)
df.head()

In [None]:
# Disabled experiment, kept for reference only. It tried to wrap each target in
# the text-style markers '\t' (<sos>) and '\n' (<eos>); the next cell adds
# coordinate sentinels to df.tar instead.
# NOTE: the loop indexes df.values (whole rows), not df.tar.values, so it would
# not have modified the target column as intended.
#
# df.tar
#
# for i in range(len(df.tar.values)):
#     df.values[i] = '\t' + str(df.values[i]) + '\n'
#
# df.tar

In [None]:
# Sentinel coordinates marking the start (<sos>) and end (<eos>) of every
# target sequence.
SOS_COORDINATE = (0, 0)
EOS_COORDINATE = (-1, -1)


def _wrap_with_sentinels(path):
    """Return ``path`` as a list with the <sos>/<eos> sentinels around it.

    A path that already starts with <sos> and ends with <eos> is returned
    unchanged, so re-running this cell does not stack extra markers. This
    assumes no real path starts at (0, 0) and ends at (-1, -1).
    """
    # list() makes `+` concatenate even if a row is stored as an ndarray,
    # where `+` would otherwise be element-wise arithmetic.
    points = list(path)
    already_wrapped = (
        len(points) >= 2
        and tuple(points[0]) == SOS_COORDINATE
        and tuple(points[-1]) == EOS_COORDINATE
    )
    if already_wrapped:
        return points
    # Fresh lists per row so no two rows share a sentinel object.
    return [list(SOS_COORDINATE)] + points + [list(EOS_COORDINATE)]


df.tar = df.tar.apply(_wrap_with_sentinels)
# sample() raises when asked for more rows than the frame holds.
df.sample(min(10, len(df)))

In [None]:
# Disabled exploration probes:
# len(df.src[0])
# df.src[0][0]

# Number of target sequences.
len(df.tar)

In [None]:
def _unique_coordinates(paths):
    """Return the distinct coordinates found in ``paths`` as a list of tuples.

    Coordinates are converted to tuples because lists are unhashable.
    First-seen order is kept, so the result is deterministic for a given input.
    """
    return list(dict.fromkeys(tuple(point) for path in paths for point in path))


# coordinate set: every distinct coordinate exactly once.
# Keeping duplicates would size the vocabulary by the total number of points,
# which inflates the one-hot width and the softmax layer.
src_coordinate = _unique_coordinates(df.src)
tar_coordinate = _unique_coordinates(df.tar)

# size of coordinate set; +1 because index 0 is reserved for padding
# (pad_sequences fills with 0 by default)
src_coordinate_size = len(src_coordinate) + 1
tar_coordinate_size = len(tar_coordinate) + 1

# {(lat, lon): 1, (lat, lon): 2, ...} -- indices start at 1
src_to_index = {coordinate: index for index, coordinate in enumerate(src_coordinate, start=1)}
tar_to_index = {coordinate: index for index, coordinate in enumerate(tar_coordinate, start=1)}

In [None]:

# Show the number of entries and a short preview of each mapping. Printing the
# dictionaries whole floods the cell output and bloats the saved notebook.
PREVIEW_SIZE = 5
print(len(src_to_index), list(src_to_index.items())[:PREVIEW_SIZE])
print(len(tar_to_index), list(tar_to_index.items())[:PREVIEW_SIZE])

In [None]:
# Encoder input: replace every source coordinate with its integer index.
# Each coordinate is converted to a tuple first, because a list cannot be used
# as a dictionary key (a tuple can).
encoder_input = [
    [src_to_index[tuple(point)] for point in path]
    for path in df.src
]

In [None]:
# Decoder input: replace every target coordinate with its integer index.
decoder_input = [
    [tar_to_index[tuple(point)] for point in path]
    for path in df.tar
]
print(decoder_input[:5])

In [None]:
# Teacher-forcing target: each target sequence without its first (<sos>)
# entry, i.e. the decoder input shifted left by one step. This is the `y`
# passed to model.fit below, so the cell is required.
decoder_target = [
    [tar_to_index[tuple(point)] for point in path[1:]]
    for path in df.tar
]
print(decoder_target[:5])

In [None]:
# Longest sequence on each side; used as the padding width.
max_src_len = max(map(len, df.src))
max_tar_len = max(map(len, df.tar))
print(max_src_len, max_tar_len, sep='\n')

In [None]:
# Right-pad every index sequence with zeros up to the longest length of its side.
encoder_input = pad_sequences(encoder_input, padding='post', maxlen=max_src_len)
# Both decoder arrays share the target-side width.
decoder_input, decoder_target = (
    pad_sequences(sequences, padding='post', maxlen=max_tar_len)
    for sequences in (decoder_input, decoder_target)
)

In [None]:
# Number of sequences in each padded array, one value per line.
for padded in (encoder_input, decoder_input, decoder_target):
    print(len(padded))

In [None]:
def _one_hot(indices, num_classes):
    """One-hot encode a padded 2-D index matrix into ``num_classes`` classes.

    A 3-D input is treated as already encoded and returned unchanged, so
    re-running this cell does not encode the data a second time.
    """
    indices = np.asarray(indices)
    if indices.ndim == 3:
        return indices
    return to_categorical(indices, num_classes=num_classes)


# one-hot encoding
# num_classes is pinned to the sizes the model's Input/Dense layers are built
# with. If left out, to_categorical infers the width from the largest index
# present, which can silently disagree with the model.
encoder_input = _one_hot(encoder_input, src_coordinate_size)
decoder_input = _one_hot(decoder_input, tar_coordinate_size)
decoder_target = _one_hot(decoder_target, tar_coordinate_size)

In [None]:
# seq2seq model trained with teacher forcing
LATENT_DIM = 512  # number of LSTM units, shared by encoder and decoder

encoder_inputs = Input(shape=(None, src_coordinate_size))
# return_state=True also returns the final states, which are handed to the decoder
encoder_lstm = LSTM(units=LATENT_DIM, return_state=True)
# encoder outputs, hidden state, cell state
encoder_outputs, state_h, state_c = encoder_lstm(encoder_inputs)
encoder_states = [state_h, state_c]

decoder_inputs = Input(shape=(None, tar_coordinate_size))
decoder_lstm = LSTM(units=LATENT_DIM, return_sequences=True, return_state=True)
# use the last encoder state as the decoder's initial state
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
# one output node per target coordinate index
decoder_softmax_layer = Dense(tar_coordinate_size, activation='softmax')
decoder_outputs = decoder_softmax_layer(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

MODEL_DIR = './model/'
# exist_ok=True replaces the separate exists() check and also creates missing parents
os.makedirs(MODEL_DIR, exist_ok=True)

# Checkpoint file name pattern; derived from MODEL_DIR so the two cannot drift apart.
modelpath = os.path.join(MODEL_DIR, '{epoch:02d}-{val_loss:.4f}.hdf5')
checkpointer = ModelCheckpoint(filepath=modelpath, monitor='val_loss', verbose=1, save_best_only=True)

In [None]:
# Train with teacher forcing. batch_size and epochs are the knobs to adjust
# when the model starts to overfit; 20% of the samples are held out for the
# val_loss that the checkpoint callback monitors.
training_inputs = [encoder_input, decoder_input]
model.fit(
    training_inputs,
    decoder_target,
    epochs=50,
    batch_size=64,
    validation_split=0.2,
    callbacks=[checkpointer],
)

In [None]:
# Save the model as it stands after the last epoch. This is separate from the
# checkpoint files written during training.
model.save(filepath='final_project.h5')