In [1]:
import tensorflow as tf

from gensim.models.keyedvectors import KeyedVectors
from konlpy.tag import Mecab
from models import AE
from scipy.stats import ortho_group, wasserstein_distance

# from googletrans import Translator
from models.transformer import * 

import time
import numpy as np

import os
import sys
import urllib.request
import requests
import datetime
import shutil
import pickle
import json

In [2]:
with open('./data/ko_noun_dict(vocab_size-1250).pkl', "rb") as f:
    ko_dict = pickle.load(f)
    
with open('./data/en_noun_dict(vocab_size-1250).pkl', "rb") as f:
    en_dict = pickle.load(f)

In [3]:
with open('./data/ko_noun_dict(vocab_size-1250).pkl', "rb") as f:
    ko_noun_dict = pickle.load(f)
    
with open('./data/en_noun_dict(vocab_size-1250).pkl', "rb") as f:
    en_noun_dict = pickle.load(f)
    
with open('./data/ko_verb_dict(vocab_size-547).pkl', "rb") as f:
    ko_verb_dict = pickle.load(f)
    
with open('./data/en_verb_dict(vocab_size-547).pkl', "rb") as f:
    en_verb_dict = pickle.load(f)
    
with open('./data/ko_v_1.pkl', "rb") as f:
    ko_adjective_dict = pickle.load(f)
    
with open('./data/en_v_1.pkl', "rb") as f:
    en_adjective_dict = pickle.load(f)
    
with open('./data/ko_verb_1.pkl', "rb") as f:
    ko_verb_dict_2 = pickle.load(f)
    
with open('./data/en_verb_1.pkl', "rb") as f:
    en_verb_dict_2 = pickle.load(f)

In [4]:
ko_data = np.array(list(ko_noun_dict.values()) + list(ko_verb_dict.values()) + list(ko_verb_dict_2.values()) + list(ko_adjective_dict.values()))
en_data = np.array(list(en_noun_dict.values()) + list(en_verb_dict.values()) + list(en_verb_dict_2.values()) + list(en_adjective_dict.values())) 
ko_noun_data = np.array(list(ko_noun_dict.values()))
en_noun_data = np.array(list(en_noun_dict.values())) 
ko_verb_data = np.array(list(ko_verb_dict.values()) + list(ko_verb_dict_2.values()))
en_verb_data = np.array(list(en_verb_dict.values()) + list(en_verb_dict_2.values())) 
ko_adjective_data = np.array(list(ko_adjective_dict.values()))
en_adjective_data = np.array(list(en_adjective_dict.values()))

In [5]:
latent_dim = 250
autoencoder = AE.AutoEncoder(latent_dim)

In [6]:
def autoencoder_test(latent_dim, train_data, label_data, ko_data, en_data, num_sample=500, shuffle=True):
    
    autoencoder = AE.AutoEncoder(latent_dim)
    autoencoder.compile(optimizer='adam', loss=tf.losses.MeanSquaredError())
    autoencoder.fit(train_data, label_data, epochs=100, shuffle=True)
    
    wasserstein_error = 0
    for i in range(len(ko_data)):

        wasserstein_error += wasserstein_distance(ko_data[i], autoencoder(en_data)[i])

    print(wasserstein_error / len(ko_data))

In [12]:
def linear_test(latent_dim, train_data, label_data, ko_data, en_data, num_sample=500, shuffle=True):
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Dense(300, bias_initializer='zero'))
    opt = tf.keras.optimizers.Adam()
    model.compile(loss='mse', optimizer=opt, metrics=['accuracy'])
    es = tf.keras.callbacks.EarlyStopping(monitor='loss', mode='min', verbose=1, patience=50)
    model.fit(en_noun_data, ko_noun_data, epochs=500, batch_size=64, callbacks=[es]) 
    
    wasserstein_error = 0
    for i in range(len(ko_data)):

        wasserstein_error += wasserstein_distance(ko_data[i], model(en_data)[i])

    print(wasserstein_error / len(ko_data))

In [8]:
shuffle_idx = np.arange((ko_data.shape[0]))
np.random.shuffle(shuffle_idx)

In [9]:
shuffle_idx

array([2310,  789, 1338, ...,  573, 3010,  243])

In [10]:
autoencoder_test(250, ko_data[shuffle_idx][:500], en_data[shuffle_idx][:500], ko_data, en_data)

Train on 500 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100


Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
0.26684758558800037


In [13]:
linear_test(250, ko_data[shuffle_idx][:500], en_data[shuffle_idx][:500], ko_data, en_data)

Train on 1250 samples
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500

Epoch 78/500
Epoch 79/500
Epoch 80/500
Epoch 81/500
Epoch 82/500
Epoch 83/500
Epoch 84/500
Epoch 85/500
Epoch 86/500
Epoch 87/500
Epoch 88/500
Epoch 89/500
Epoch 90/500
Epoch 91/500
Epoch 92/500
Epoch 93/500
Epoch 94/500
Epoch 95/500
Epoch 96/500
Epoch 97/500
Epoch 98/500
Epoch 99/500
Epoch 100/500
Epoch 101/500
Epoch 102/500
Epoch 103/500
Epoch 104/500
Epoch 105/500
Epoch 106/500
Epoch 107/500
Epoch 108/500
Epoch 109/500
Epoch 110/500
Epoch 111/500
Epoch 112/500
Epoch 113/500
Epoch 114/500
Epoch 115/500
Epoch 116/500
Epoch 117/500
Epoch 118/500
Epoch 119/500
Epoch 120/500
Epoch 121/500
Epoch 122/500
Epoch 123/500
Epoch 124/500
Epoch 125/500
Epoch 126/500
Epoch 127/500
Epoch 128/500
Epoch 129/500
Epoch 130/500
Epoch 131/500
Epoch 132/500
Epoch 133/500
Epoch 134/500
Epoch 135/500
Epoch 136/500
Epoch 137/500
Epoch 138/500
Epoch 139/500
Epoch 140/500
Epoch 141/500
Epoch 142/500
Epoch 143/500
Epoch 144/500
Epoch 145/500
Epoch 146/500
Epoch 147/500
Epoch 148/500
Epoch 149/500
Epoch 150/500


Epoch 154/500
Epoch 155/500
Epoch 156/500
Epoch 157/500
Epoch 158/500
Epoch 159/500
Epoch 160/500
Epoch 161/500
Epoch 162/500
Epoch 163/500
Epoch 164/500
Epoch 165/500
Epoch 166/500
Epoch 167/500
Epoch 168/500
Epoch 169/500
Epoch 170/500
Epoch 171/500
Epoch 172/500
Epoch 173/500
Epoch 174/500
Epoch 175/500
Epoch 176/500
Epoch 177/500
Epoch 178/500
Epoch 179/500
Epoch 180/500
Epoch 181/500
Epoch 182/500
Epoch 183/500
Epoch 184/500
Epoch 185/500
Epoch 186/500
Epoch 187/500
Epoch 188/500
Epoch 189/500
Epoch 190/500
Epoch 191/500
Epoch 192/500
Epoch 193/500
Epoch 194/500
Epoch 195/500
Epoch 196/500
Epoch 197/500
Epoch 198/500
Epoch 199/500
Epoch 200/500
Epoch 201/500
Epoch 202/500
Epoch 203/500
Epoch 204/500
Epoch 205/500
Epoch 206/500
Epoch 207/500
Epoch 208/500
Epoch 209/500
Epoch 210/500
Epoch 211/500
Epoch 212/500
Epoch 213/500
Epoch 214/500
Epoch 215/500
Epoch 216/500
Epoch 217/500
Epoch 218/500
Epoch 219/500
Epoch 220/500
Epoch 221/500
Epoch 222/500
Epoch 223/500
Epoch 224/500
Epoch 

Epoch 229/500
Epoch 230/500
Epoch 231/500
Epoch 232/500
Epoch 233/500
Epoch 234/500
Epoch 235/500
Epoch 236/500
Epoch 237/500
Epoch 238/500
Epoch 239/500
Epoch 240/500
Epoch 241/500
Epoch 242/500
Epoch 243/500
Epoch 244/500
Epoch 245/500
Epoch 246/500
Epoch 247/500
Epoch 248/500
Epoch 249/500
Epoch 250/500
Epoch 251/500
Epoch 252/500
Epoch 253/500
Epoch 254/500
Epoch 255/500
Epoch 256/500
Epoch 257/500
Epoch 258/500
Epoch 259/500
Epoch 260/500
Epoch 261/500
Epoch 262/500
Epoch 263/500
Epoch 264/500
Epoch 265/500
Epoch 266/500
Epoch 267/500
Epoch 268/500
Epoch 269/500
Epoch 270/500
Epoch 271/500
Epoch 272/500
Epoch 273/500
Epoch 274/500
Epoch 275/500
Epoch 276/500
Epoch 277/500
Epoch 278/500
Epoch 279/500
Epoch 280/500
Epoch 281/500
Epoch 00281: early stopping
0.040272055613806056
