In [1]:
# !pip uninstall googletrans

In [2]:
import tensorflow as tf

from gensim.models.keyedvectors import KeyedVectors
from konlpy.tag import Mecab

from googletrans import Translator

import time
import numpy as np

import os
import sys
import urllib.request
import requests
import datetime
import pickle
import json

In [3]:
# Shared Mecab tagger instance; proper_noun() calls m.pos(...) on it to read POS tags.
m = Mecab()

In [4]:
def embedding_load(en_path='./fasttext/wiki.en.vec', ko_path='./fasttext/wiki.ko.vec'):
    """
    Load the pretrained English and Korean word vectors.

    Both files are read with ``KeyedVectors.load_word2vec_format`` using its
    default arguments.

    Args:
        en_path: Path to the English vector file. The default is the location
            that used to be hard-coded here.
        ko_path: Path to the Korean vector file. The default is the location
            that used to be hard-coded here.

    Returns:
        Tuple ``(en_model, ko_model)`` with the two loaded models.
    """
    en_model = KeyedVectors.load_word2vec_format(en_path)
    ko_model = KeyedVectors.load_word2vec_format(ko_path)

    return en_model, ko_model

def get_translate(text, lan, c_id, c_key, timeout=10):
    """
    Translate Korean text through the Naver Papago NMT endpoint.

    Args:
        text: Text to translate. The request always declares source 'ko'.
        lan: Target language code, sent as the ``target`` form field.
        c_id: Value for the ``X-Naver-Client-Id`` header.
        c_key: Value for the ``X-Naver-Client-Secret`` header.
        timeout: Seconds ``requests`` waits for the server before raising.
            Without a timeout a stalled connection would block the whole
            translation loop indefinitely.

    Returns:
        The translated string when the response status is 200; otherwise
        ``None`` after printing the status code.
    """
    data = {'text': text,
            'source': 'ko',
            'target': lan}

    url = "https://openapi.naver.com/v1/papago/n2mt"

    header = {"X-Naver-Client-Id": c_id,
              "X-Naver-Client-Secret": c_key}

    response = requests.post(url, headers=header, data=data, timeout=timeout)
    rescode = response.status_code

    if rescode != 200:
        print("Error Code:", rescode)
        # Explicit so callers can see that failure is signalled with None.
        return None

    t_data = response.json()
    return t_data['message']['result']['translatedText']

def proper_noun(ko_model, method=None, start=1001, stop=3000,
                client_id=None, client_secret=None):
    """
    Sample proper nouns from the Korean vocabulary and translate them to English.

    A vocabulary entry is a candidate when it is longer than one character and
    the first tag Mecab returns for it is 'NNP'. The slice
    ``candidates[start:stop]`` is the sample that gets translated.

    Args:
        ko_model: Korean embedding model; only ``ko_model.vocab`` is read.
        method: "google trans" (googletrans ``Translator``) or "naver papago"
            (``get_translate``). Any other value performs no translation.
        start: Lower slice bound of the sample. The default reproduces the
            previously hard-coded window.
        stop: Upper slice bound of the sample. The default reproduces the
            previously hard-coded window.
        client_id: Papago client id. When omitted, the environment variable
            ``NAVER_CLIENT_ID`` is used.
        client_secret: Papago client secret. When omitted, the environment
            variable ``NAVER_CLIENT_SECRET`` is used.

    Returns:
        ``(ko_words, en_words)``. For both supported methods the two lists
        have the same length and ``en_words[i]`` is the translation of
        ``ko_words[i]``; with Papago, words whose translation failed are left
        out of both lists. For any other ``method`` the full Korean sample is
        returned together with an empty list.

    Raises:
        RuntimeError: ``method`` is "naver papago" and no credentials were
            passed or found in the environment.
    """
    tic = time.time()
    print('Proper_noun extract process start')
    print(f'Method selected : {method}')

    if method == "naver papago":
        # Secrets do not belong in notebook source; resolve them before the
        # slow vocabulary scan so a missing setting fails fast.
        client_id = client_id or os.environ.get('NAVER_CLIENT_ID')
        client_secret = client_secret or os.environ.get('NAVER_CLIENT_SECRET')
        if not client_id or not client_secret:
            raise RuntimeError(
                'Papago credentials missing: pass client_id/client_secret or set '
                'NAVER_CLIENT_ID and NAVER_CLIENT_SECRET.')

    ko_noun = []
    for word in ko_model.vocab:
        if len(word) <= 1:
            continue
        tags = m.pos(word)
        # Guard against an empty tag list before looking at the first tag.
        if tags and tags[0][1] == 'NNP':
            ko_noun.append(word)
    ko_noun_sample = ko_noun[start:stop]

    en_noun = []

    if method == "google trans":
        translator = Translator()
        for word in ko_noun_sample:
            en_noun.append(translator.translate(word, src='ko', dest='en').text)

            time.sleep(1)

    elif method == "naver papago":
        ko_kept = []
        failed = 0
        last_error = None
        for word in ko_noun_sample:
            try:
                translated = get_translate(word, 'en', client_id, client_secret)
            except Exception as exc:  # best effort: one bad request must not stop the run
                translated = None
                last_error = exc
            if translated is None:
                # get_translate returns None on a non-200 answer (e.g. quota
                # exhausted). Skip the Korean word too so both lists stay aligned.
                failed += 1
                continue
            ko_kept.append(word)
            en_noun.append(translated.lower())
        if failed:
            print(f'Skipped {failed} words without a translation (last error: {last_error!r})')
        ko_noun_sample = ko_kept

    tok = time.time()
    tictok = str(datetime.timedelta(seconds=tok-tic))
    print(f'Proper_noun extract process end. Time spend : {tictok}')

    return ko_noun_sample, en_noun

In [6]:
# Load both pretrained embedding models with embedding_load().
en_model, ko_model = embedding_load()

In [10]:
# Create the client in this cell: no earlier cell defines `translator`, so on a
# fresh kernel the call below would otherwise raise NameError.
translator = Translator()
result = translator.translate('안녕하세요.', dest="ja")

In [49]:
# Probe googletrans with a single Korean word (ko -> en) and print the result text.
# NOTE(review): the recorded output is the unchanged Korean input, so this call
# apparently did not translate anything — confirm before relying on this backend.
translator = Translator()
translated_obj = translator.translate('미국의', src='ko', dest='en')
print(translated_obj.text)

미국의


In [31]:
# Same probe with another word; the recorded output is again the untranslated input.
translated_obj = translator.translate('한국의', src='ko', dest='en')
print(translated_obj.text)

한국의


In [38]:
# Translate the first ten entries of `ko_noun` with googletrans (no `dest` is passed here).
# NOTE(review): in the cells shown, `ko_noun` only exists as a local inside proper_noun();
# no visible cell defines it at notebook scope — confirm this survives Restart & Run All.
en_noun = [translator.translate(i, src='ko').text for i in list(ko_noun)[:10]]

In [39]:
# Inspect the ten results; the recorded output still contains Korean text only.
en_noun

['대한민국의', '미국의', '일본', '대한민국', '미국', '올림픽', '등의', '서울', '위키백과', '한국']

In [43]:
# One more single-word probe (ko -> en); the recorded output is the unchanged input.
translator.translate('안녕', src='ko', dest='en').text

'안녕'

In [89]:
# Build the Korean/English proper-noun sample through the Papago branch of proper_noun().
ko_noun_sample, en_noun_sample = proper_noun(ko_model, method='naver papago')

Proper_noun extract process start
Method selected : naver papago
Proper_noun extract process end. Time spend : 0:02:49.452392


In [8]:
# Second Papago run, stored under the *_add1 names.
# NOTE(review): the arguments are identical to the call that fills ko_noun_sample;
# presumably the sample window inside proper_noun() was edited between runs — confirm.
ko_noun_sample_add1, en_noun_sample_add1 = proper_noun(ko_model, method='naver papago')

Proper_noun extract process start
Method selected : naver papago
Proper_noun extract process end. Time spend : 0:05:28.952735


In [15]:
# Show the first 20 Korean words next to the first 20 translations.
ko_noun_sample_add1[:20], en_noun_sample_add1[:20]

(['러일',
  '여우조연상',
  '조반니',
  '피렌체',
  '영국이',
  '순조',
  '제리',
  '아사미',
  '평택시',
  '경기도교육청',
  '에디션',
  '중국을',
  '로열',
  '페리',
  '이탈리아에서',
  '도쿄에서',
  '취리히',
  '고든',
  '레이싱',
  '스톤'],
 ['reil',
  'best supporting actress',
  'giovanni',
  'florence',
  'the united kingdom',
  'smooth sailing',
  'jerry.',
  'asami',
  'pyeongtaek city',
  'gyeonggi-do office of education',
  'edition',
  'china',
  'royal',
  'perry',
  'in italy',
  'in tokyo',
  'zurich',
  '"gordon, california"',
  'racing',
  'stone'])

In [11]:
# Restore the Korean word -> vector dictionary from a pickle file.
# pickle.load can execute arbitrary code stored in the file, so only open files you produced yourself.
# NOTE(review): this reads ./data/ko_noun_dict.pkl, while the dump cell in this notebook writes
# "ko_noun_dict.pkl" into the working directory — confirm where the file is expected to live.
with open('./data/ko_noun_dict.pkl', "rb") as f:
    ko_dict = pickle.load(f)

In [12]:
# Display the loaded dictionary (Korean word -> vector, per the recorded output).
# NOTE(review): this renders every entry; a small slice would keep the notebook readable.
ko_dict

{'미국의': array([-0.2348   ,  0.16632  , -0.42928  , -0.18025  , -0.15354  ,
        -0.16659  , -0.11349  , -0.19434  , -0.27161  , -0.14352  ,
         0.0027943,  0.06654  ,  0.1639   , -0.039142 , -0.051727 ,
         0.0071263,  0.21649  , -0.070307 , -0.042044 , -0.2936   ,
        -0.11884  ,  0.15512  , -0.084749 , -0.24455  , -0.12771  ,
        -0.34986  ,  0.36895  , -0.0077137,  0.13818  , -0.1575   ,
         0.055542 ,  0.047616 ,  0.33078  , -0.67881  , -0.38748  ,
        -0.19999  ,  0.16253  , -0.21847  ,  0.0037893,  0.016311 ,
        -0.18195  , -0.21904  ,  0.056312 , -0.25407  , -0.377    ,
         0.55629  ,  0.19335  , -0.20228  ,  0.21381  ,  0.0072553,
         0.59345  ,  0.29593  , -0.24016  ,  0.038687 ,  0.1417   ,
         0.077105 ,  0.23093  ,  0.167    ,  0.13976  ,  0.077893 ,
         0.25621  ,  0.20661  , -0.4835   , -0.20328  ,  0.16011  ,
         0.048238 , -0.093935 ,  0.37266  ,  0.37774  , -0.2964   ,
         0.34253  , -0.015241 , -0.41455 

In [91]:
# Build word -> vector lookups for the translated pairs. Downstream cells pair
# the two dictionaries by position, so every iteration must add either one
# entry to BOTH dictionaries or none at all.
ko_dict = {}
en_dict = {}
for ko_word, en_word in zip(ko_noun_sample, en_noun_sample):
    if en_word in en_dict:
        # Two Korean words with the same translation would add a second Korean
        # entry while only overwriting the English one, shifting every later pair.
        continue
    try:
        # Resolve both vectors before storing either, so a miss on the Korean
        # side cannot leave a dangling English entry behind.
        en_vector = en_model.get_vector(en_word)
        ko_vector = ko_model.get_vector(ko_word)
    except KeyError:
        # Word (or multi-word translation) is not in the embedding vocabulary.
        continue
    en_dict[en_word] = en_vector
    ko_dict[ko_word] = ko_vector

In [None]:
def pickle_save(save_path=None, obj=None):
    """
    Pickle ``obj`` to ``save_path``.

    The definition previously ended right after the ``if`` line, which made
    the cell a syntax error. ``obj`` is a new optional parameter, so the
    existing call shape ``pickle_save(save_path)`` remains valid.

    Args:
        save_path: Destination file path. When ``None`` nothing is written.
        obj: Object to serialise with ``pickle.dump``.

    Returns:
        None.
    """
    if save_path is not None:
        with open(save_path, "wb") as fp:
            pickle.dump(obj, fp)

In [99]:
# Number of English words that received a vector.
len(en_dict)

576

In [101]:
# Persist both lookup tables as pickle files in the working directory.
for _pkl_path, _pkl_payload in (("ko_noun_dict.pkl", ko_dict),
                                ("en_noun_dict.pkl", en_dict)):
    with open(_pkl_path, "wb") as fp:
        pickle.dump(_pkl_payload, fp)

In [106]:
# Additionally store both dictionaries through np.save.
# NOTE(review): a dict is not an array; np.save presumably pickles it inside an object
# array, so reading it back would need np.load(..., allow_pickle=True) — confirm, and
# consider whether these files are needed next to the pickle files written by this notebook.
np.save('ko_dict.npy', ko_dict)
np.save('en_dict.npy', en_dict)

In [153]:
# Peek at the first vector stored in ko_dict.
list(ko_dict.values())[0]

array([-0.2348   ,  0.16632  , -0.42928  , -0.18025  , -0.15354  ,
       -0.16659  , -0.11349  , -0.19434  , -0.27161  , -0.14352  ,
        0.0027943,  0.06654  ,  0.1639   , -0.039142 , -0.051727 ,
        0.0071263,  0.21649  , -0.070307 , -0.042044 , -0.2936   ,
       -0.11884  ,  0.15512  , -0.084749 , -0.24455  , -0.12771  ,
       -0.34986  ,  0.36895  , -0.0077137,  0.13818  , -0.1575   ,
        0.055542 ,  0.047616 ,  0.33078  , -0.67881  , -0.38748  ,
       -0.19999  ,  0.16253  , -0.21847  ,  0.0037893,  0.016311 ,
       -0.18195  , -0.21904  ,  0.056312 , -0.25407  , -0.377    ,
        0.55629  ,  0.19335  , -0.20228  ,  0.21381  ,  0.0072553,
        0.59345  ,  0.29593  , -0.24016  ,  0.038687 ,  0.1417   ,
        0.077105 ,  0.23093  ,  0.167    ,  0.13976  ,  0.077893 ,
        0.25621  ,  0.20661  , -0.4835   , -0.20328  ,  0.16011  ,
        0.048238 , -0.093935 ,  0.37266  ,  0.37774  , -0.2964   ,
        0.34253  , -0.015241 , -0.41455  , -0.67799  , -0.2364

In [154]:
def get_noun_data(ko_dict, en_dict):
    """
    Yield ``(ko_vector, en_vector)`` pairs matched by insertion position.

    The i-th value of ``ko_dict`` is paired with the i-th value of
    ``en_dict``; the keys are not consulted. Iteration runs over the length
    of ``ko_dict``, so surplus entries in ``en_dict`` are ignored.

    Args:
        ko_dict: Mapping whose values are the Korean vectors.
        en_dict: Mapping whose values are the English vectors.

    Yields:
        Tuples ``(ko_vector, en_vector)``.

    Raises:
        IndexError: during iteration, if ``en_dict`` has fewer entries than
            ``ko_dict``.
    """
    # Materialise the values once. Rebuilding both lists inside the loop made
    # the generator quadratic in the number of entries.
    ko_vectors = list(ko_dict.values())
    en_vectors = list(en_dict.values())

    for i in range(len(ko_vectors)):
        yield (ko_vectors[i], en_vectors[i])
    
def get_noun_data_2(ko_vec, en_vec):
    """
    Walk two indexable sequences in parallel and yield their items pairwise.

    The number of pairs is set by ``ko_vec``; ``en_vec`` is indexed at the
    same position for every item.
    """
    for position, ko_item in enumerate(ko_vec):
        yield ko_item, en_vec[position]

In [155]:
# Create the pair generator from the two dictionaries' values (paired by position).
gen = get_noun_data_2(list(ko_dict.values()), list(en_dict.values()))

In [156]:
# Pull the first (ko_vector, en_vector) pair to check what the generator emits.
next(gen)

(array([-0.2348   ,  0.16632  , -0.42928  , -0.18025  , -0.15354  ,
        -0.16659  , -0.11349  , -0.19434  , -0.27161  , -0.14352  ,
         0.0027943,  0.06654  ,  0.1639   , -0.039142 , -0.051727 ,
         0.0071263,  0.21649  , -0.070307 , -0.042044 , -0.2936   ,
        -0.11884  ,  0.15512  , -0.084749 , -0.24455  , -0.12771  ,
        -0.34986  ,  0.36895  , -0.0077137,  0.13818  , -0.1575   ,
         0.055542 ,  0.047616 ,  0.33078  , -0.67881  , -0.38748  ,
        -0.19999  ,  0.16253  , -0.21847  ,  0.0037893,  0.016311 ,
        -0.18195  , -0.21904  ,  0.056312 , -0.25407  , -0.377    ,
         0.55629  ,  0.19335  , -0.20228  ,  0.21381  ,  0.0072553,
         0.59345  ,  0.29593  , -0.24016  ,  0.038687 ,  0.1417   ,
         0.077105 ,  0.23093  ,  0.167    ,  0.13976  ,  0.077893 ,
         0.25621  ,  0.20661  , -0.4835   , -0.20328  ,  0.16011  ,
         0.048238 , -0.093935 ,  0.37266  ,  0.37774  , -0.2964   ,
         0.34253  , -0.015241 , -0.41455  , -0.6

In [157]:
# Feed the positional pair generator into tf.data. Each element is declared as
# two float64 vectors of length 300; the value lists are handed over via `args`.
dataset = tf.data.Dataset.from_generator(
    get_noun_data_2,
    output_types=(tf.float64, tf.float64),
    output_shapes=(tf.TensorShape([300]), tf.TensorShape([300])),
    args=(list(ko_dict.values()), list(en_dict.values())),
)

In [158]:
# Show the dataset repr to confirm the declared shapes and dtypes.
dataset

<FlatMapDataset shapes: ((300,), (300,)), types: (tf.float64, tf.float64)>

In [189]:
from model.transformer import * 

In [190]:
# Instantiate a one-layer Encoder (presumably supplied by the wildcard import from model.transformer).
# NOTE(review): input_vocab_size=0 and maximum_position_encoding=0 look like placeholders, while the
# input built in the next cell holds ids in [0, 200) over 62 positions — confirm what Encoder expects.
sample_enc = Encoder(num_layers=1, d_model=64, num_heads=8, dff=512, input_vocab_size=0, maximum_position_encoding=0)

In [191]:
# Smoke test: push a random (64, 62) batch of int64 ids in [0, 200) through the encoder
# and print the shape of the result.
# NOTE(review): the recorded run ends with "AttributeError: 'Encoder' object has no attribute
# 'embedding'", so this cell does not currently execute — the Encoder definition needs checking.
temp_input = tf.random.uniform((64, 62), dtype=tf.int64, minval=0, maxval=200)
sample_encoder_output = sample_enc(temp_input, training=False, mask=None)
print(sample_encoder_output.shape)

AttributeError: 'Encoder' object has no attribute 'embedding'