In [1]:
# !pip uninstall googletrans

In [2]:
import tensorflow as tf

from gensim.models.keyedvectors import KeyedVectors
from konlpy.tag import Mecab

from googletrans import Translator

import time
import numpy as np

import os
import sys
import urllib.request
import requests
import datetime
import pickle
import json

In [3]:
m = Mecab()

In [73]:
def embedding_load():
    """
    Load pretrained embedding vec
    """
    en_model = KeyedVectors.load_word2vec_format('./fasttext/wiki.en.vec')
    ko_model = KeyedVectors.load_word2vec_format('./fasttext/wiki.ko.vec')
    
    return en_model, ko_model

def get_translate(text, lan, c_id, c_key):
    data = {'text' : text,
            'source' : 'ko',
            'target': lan}

    url = "https://openapi.naver.com/v1/papago/n2mt"

    header = {"X-Naver-Client-Id":c_id,
              "X-Naver-Client-Secret":c_key}

    response = requests.post(url, headers=header, data= data)
    rescode = response.status_code

    if(rescode==200):
        t_data = response.json()
        return t_data['message']['result']['translatedText']
    else:
        print("Error Code:" , rescode)

def proper_noun(ko_model, method=None):
    """
    
    """
    tic = time.time()
    print(f'Proper_noun extract process start')
    print(f'Method selected : {method}')
    
    ko_noun = [i for i in list(ko_model.vocab) if m.pos(i)[0][1] == 'VV' and len(i) > 1]
    ko_noun_sample = ko_noun[:3000]
    
    en_noun = []
    
    if method == "google trans":
        translator = Translator()
        for i in ko_noun_sample:
            en_noun.append(translator.translate(i, src='ko', dest='en').text)

            time.sleep(1)
    
    elif method == "naver papago":
        client_id = 'Np0XYEpo30dZjQ4CwKjo'
        client_secret = 'l6CQ7zor3F'
        
        for i in ko_noun_sample:
            """
            if API requests exceeds the allowed, return None
            """
            try: 
                en_noun.append(get_translate(i, 'en', client_id, client_secret).lower())
            except:
                pass
    
    tok = time.time()
    tictok = str(datetime.timedelta(seconds=tok-tic))
    print(f'Proper_noun extract process end. Time spend : {tictok}')
    
    return ko_noun_sample, en_noun

def add_vocab(source, ko_dict, en_dict, method=None):
    
    
    if method == "google trans":
        translator = Translator()
        for i in ko_noun_sample:
            en_noun.append(translator.translate(i, src='ko', dest='en').text)

            time.sleep(1)
    
    elif method == "naver papago":
        client_id = 'Np0XYEpo30dZjQ4CwKjo'
        client_secret = 'l6CQ7zor3F'
        
        for i in source:
            """
            if API requests exceeds the allowed, return None
            """
            try: 
                en_noun.append(get_translate(i, 'en', client_id, client_secret).lower())
            except:
                pass
    

In [6]:
en_model, ko_model = embedding_load()

In [10]:
result = translator.translate('안녕하세요.', dest="ja")

In [31]:
translated_obj = translator.translate('한국의', src='ko', dest='en')
print(translated_obj.text)

한국의


In [38]:
en_noun = [translator.translate(i, src='ko').text for i in list(ko_noun)[:10]]

In [39]:
en_noun

['대한민국의', '미국의', '일본', '대한민국', '미국', '올림픽', '등의', '서울', '위키백과', '한국']

In [43]:
translator.translate('안녕', src='ko', dest='en').text

'안녕'

In [50]:
ko_noun_sample, en_noun_sample = proper_noun(ko_model, method='naver papago')

Proper_noun extract process start
Method selected : naver papago
Proper_noun extract process end. Time spend : 0:06:19.501213


In [56]:
len(ko_noun_sample), len(en_noun_sample)

(1999, 1999)

In [49]:
ko_dict = {}
en_dict = {}

In [8]:
# ko_noun_sample_add1, en_noun_sample_add1 = proper_noun(ko_model, method='naver papago')

Proper_noun extract process start
Method selected : naver papago
Proper_noun extract process end. Time spend : 0:05:28.952735


In [16]:
with open('./data/ko_noun_dict.pkl', "rb") as f:
    ko_dict = pickle.load(f)
    
with open('./data/en_noun_dict.pkl', "rb") as f:
    en_dict = pickle.load(f)

In [70]:
if (len(ko_dict) and len(en_dict)):
    print('add')
    for i in range(len(ko_noun_sample_add1)):
        try:
            en_dict[en_noun_sample_add1[i]] = en_model.get_vector(en_noun_sample_add1[i])
            ko_dict[ko_noun_sample_add1[i]] = ko_model.get_vector(ko_noun_sample_add1[i])
            
        except:
            pass
    
else:
    ko_dict = {}
    en_dict = {}
    for i in range(len(en_noun_sample)):
        try: 
            tmp = len(en_dict)
            en_dict[en_noun_sample[i]] = en_model.get_vector(en_noun_sample[i])
            if len(en_dict) == tmp:
                print(en_noun_sample[i])
            else:
                ko_dict[ko_noun_sample[i]] = ko_model.get_vector(ko_noun_sample[i])

        except:
            pass

matsumoto
kobayashi
china
korean
northwest
albert
cloud
russia
hideyoshi
kazakhstan
adam
england
kimura
joseon
lawrence
solidarity
hitler
china
dragons
takeshi
organizing
uzbekistan
girls
valley
russia
wiki
robert
times
manhattan
cameroon
benjamin
alfred
laos
india
joseph
silla
dragons
albert
fighters
bangladesh
matsuda
samuel
lebanon
montenegro
japanese
tachibana
jonathan
union
matsui
guatemala
kikuchi
bird
northwest
margaret
venice
italy
angola
venus
forefather


In [71]:
len(en_dict), len(ko_dict)

(1250, 1250)

In [69]:
ko_dict = {}
en_dict = {}

In [72]:
with open(f'./data/ko_noun_dict(vocab_size-{len(ko_dict)}).pkl', "wb") as fp:
    pickle.dump(ko_dict, fp)

with open(f'./data/en_noun_dict(vocab_size-{len(en_dict)}).pkl', "wb") as fp:
    pickle.dump(en_dict, fp)

In [106]:
np.save('ko_dict.npy', ko_dict)
np.save('en_dict.npy', en_dict)

In [153]:
list(ko_dict.values())[0]

array([-0.2348   ,  0.16632  , -0.42928  , -0.18025  , -0.15354  ,
       -0.16659  , -0.11349  , -0.19434  , -0.27161  , -0.14352  ,
        0.0027943,  0.06654  ,  0.1639   , -0.039142 , -0.051727 ,
        0.0071263,  0.21649  , -0.070307 , -0.042044 , -0.2936   ,
       -0.11884  ,  0.15512  , -0.084749 , -0.24455  , -0.12771  ,
       -0.34986  ,  0.36895  , -0.0077137,  0.13818  , -0.1575   ,
        0.055542 ,  0.047616 ,  0.33078  , -0.67881  , -0.38748  ,
       -0.19999  ,  0.16253  , -0.21847  ,  0.0037893,  0.016311 ,
       -0.18195  , -0.21904  ,  0.056312 , -0.25407  , -0.377    ,
        0.55629  ,  0.19335  , -0.20228  ,  0.21381  ,  0.0072553,
        0.59345  ,  0.29593  , -0.24016  ,  0.038687 ,  0.1417   ,
        0.077105 ,  0.23093  ,  0.167    ,  0.13976  ,  0.077893 ,
        0.25621  ,  0.20661  , -0.4835   , -0.20328  ,  0.16011  ,
        0.048238 , -0.093935 ,  0.37266  ,  0.37774  , -0.2964   ,
        0.34253  , -0.015241 , -0.41455  , -0.67799  , -0.2364

In [78]:
ko_dict = {}
en_dict = {}
for i in range(len(en_noun_sample)):
    try: 
        tmp = len(en_dict)
        en_dict[en_verb_sample[i]] = en_model.get_vector(en_verb_sample[i])
        if len(en_dict) == tmp:
            print(en_noun_sample[i])
        else:
            ko_dict[ko_verb_sample[i]] = ko_model.get_vector(ko_verb_sample[i])

    except:
        pass

stone
ecuador
hamilton
columbia
rio de janeiro
manhattan
namwon city
hokuriku
digimon
obama
italian descent
sicily
miyuki
creative
benedictine
soccer
layuku
arabia
roy
yamazaki
victor
do a
old
joseph
wolfgang
dokdo
mokpo city
thrown away
films
yoshino
roberto
eva!
northeast
yokosuka
reign
raphael
mexico city
jessica
times
carter
yunnan
in china
to france
lyon
boys
korean professional baseball
play
arsenal
mr. jang
stopover
lawrence
troy
arkansas
alexandria
ogawa
batman
anycall
organizing
korean
kim kyu-sik
northwest
bangladesh
daily
dreamworks
gag concert.
uruguayan
ikebukuro
lena
kenyan
wiz
newton
wow
in korea,
emmy award
son dong-kwon
lucy
place as well
malta
wimbledon
edinburgh
mr. shin
albert
litres
ayaka
hideyoshi
larry
kid
it's
division
joshu
mega
europe
choice
storm
lobby
people's army of korea
yusuke
lithuanian
julia.
united arab emirates
soviet army
scandinavia
champions
morioca
matsuyama
cozima
shinagawa
ronald
disputed
shimonoseki
hideyoshi
platinum
cube
kazakhstan
zagreb
ky

In [80]:
len(ko_dict), len(en_dict)

(547, 547)

In [154]:
def get_noun_data(ko_dict, en_dict):
    for i in range(len(ko_dict)):
        ko_vector = list(ko_dict.values())[i]
        en_vector = list(en_dict.values())[i]
        
#         yield (ko_word, ko_vector, en_word, en_vector)
        yield (ko_vector, en_vector)
    
def get_noun_data_2(ko_vec, en_vec):
    for i in range(len(ko_vec)):
        ko_vector = ko_vec[i]
        en_vector = en_vec[i]
        
        yield ko_vector, en_vector

In [155]:
gen = get_noun_data_2(list(ko_dict.values()), list(en_dict.values()))

In [156]:
next(gen)

(array([-0.2348   ,  0.16632  , -0.42928  , -0.18025  , -0.15354  ,
        -0.16659  , -0.11349  , -0.19434  , -0.27161  , -0.14352  ,
         0.0027943,  0.06654  ,  0.1639   , -0.039142 , -0.051727 ,
         0.0071263,  0.21649  , -0.070307 , -0.042044 , -0.2936   ,
        -0.11884  ,  0.15512  , -0.084749 , -0.24455  , -0.12771  ,
        -0.34986  ,  0.36895  , -0.0077137,  0.13818  , -0.1575   ,
         0.055542 ,  0.047616 ,  0.33078  , -0.67881  , -0.38748  ,
        -0.19999  ,  0.16253  , -0.21847  ,  0.0037893,  0.016311 ,
        -0.18195  , -0.21904  ,  0.056312 , -0.25407  , -0.377    ,
         0.55629  ,  0.19335  , -0.20228  ,  0.21381  ,  0.0072553,
         0.59345  ,  0.29593  , -0.24016  ,  0.038687 ,  0.1417   ,
         0.077105 ,  0.23093  ,  0.167    ,  0.13976  ,  0.077893 ,
         0.25621  ,  0.20661  , -0.4835   , -0.20328  ,  0.16011  ,
         0.048238 , -0.093935 ,  0.37266  ,  0.37774  , -0.2964   ,
         0.34253  , -0.015241 , -0.41455  , -0.6

In [157]:
dataset = tf.data.Dataset.from_generator(get_noun_data_2, 
                              (tf.float64, tf.float64),
                              (tf.TensorShape([300]), tf.TensorShape([300])),
                               args=(list(ko_dict.values()), list(en_dict.values())))

In [74]:
ko_verb_sample, en_verb_sample = proper_noun(ko_model, method='naver papago')

Proper_noun extract process start
Method selected : naver papago
Error Code: 504
Error Code: 504
Error Code: 504
Proper_noun extract process end. Time spend : 0:10:23.827025


In [76]:
ko_verb_sample, en_verb_sample

(['되었다',
  '하는',
  '보기',
  '살아있는',
  '하고',
  '하였다',
  '하지만',
  '되어',
  '가지고',
  '않고',
  '않은',
  '되는',
  '받았다',
  '하여',
  '않았다',
  '않는다',
  '따르면',
  '되고',
  '하지',
  '하며',
  '들어',
  '받아',
  '위하여',
  '받은',
  '대하여',
  '보다',
  '받고',
  '보고',
  '하다',
  '되었고',
  '두고',
  '보면',
  '갖고',
  '않습니다',
  '알고',
  '되지',
  '죽은',
  '만들어',
  '의하여',
  '주는',
  '불리는',
  '않다',
  '하면',
  '않아',
  '만드는',
  '의하면',
  '맡았다',
  '가는',
  '남아',
  '하기',
  '가리키는',
  '나오는',
  '쓰는',
  '더불어',
  '하는데',
  '하게',
  '보는',
  '보이는',
  '되었으며',
  '사는',
  '통하여',
  '되면',
  '받는',
  '인하여',
  '하면서',
  '주었다',
  '닫는',
  '만들었다',
  '자는',
  '살고',
  '맡고',
  '되면서',
  '이끄는',
  '쓰이는',
  '쓰고',
  '타고',
  '받지',
  '부르는',
  '하였고',
  '이끌고',
  '나타내는',
  '이르는',
  '하자',
  '않으며',
  '않으면',
  '되며',
  '잇는',
  '받을',
  '알게',
  '가서',
  '가지는',
  '않게',
  '얻었다',
  '하였으나',
  '남은',
  '받게',
  '넘는',
  '되자',
  '둘러보기',
  '않았고',
  '말았다',
  '듣고',
  '보아',
  '하였으며',
  '보이지',
  '높이는',
  '놓고',
  '갖는',
  '않지만',
  '받는다',
  '들고',
  '만들고',
  '들면',
  '주고',
  '얻을',
  '번길',
  '얻은',
  '

In [83]:
with open(f'./data/ko_verb_dict(vocab_size-{len(ko_dict)}).pkl', "wb") as fp:
    pickle.dump(ko_dict, fp)

with open(f'./data/en_verb_dict(vocab_size-{len(en_dict)}).pkl', "wb") as fp:
    pickle.dump(en_dict, fp)

In [84]:
ko_dict

{'하는': array([-0.094249 ,  0.42494  , -0.20604  ,  0.12892  , -0.054444 ,
         0.18967  ,  0.037641 ,  0.076383 , -0.0017118, -0.055959 ,
        -0.33836  ,  0.25018  , -0.02257  , -0.030147 ,  0.31057  ,
         0.29077  ,  0.10817  ,  0.33446  , -0.13597  , -0.15454  ,
        -0.12388  , -0.027503 ,  0.079178 , -0.18426  ,  0.32628  ,
         0.078985 ,  0.26358  , -0.054113 , -0.03416  ,  0.33462  ,
        -0.22894  ,  0.20006  ,  0.1299   , -0.35966  , -0.26735  ,
        -0.0025038, -0.15737  ,  0.082583 , -0.011795 , -0.17548  ,
         0.088322 , -0.16427  , -0.36592  , -0.034449 ,  0.16082  ,
        -0.39021  ,  0.08214  , -0.095953 ,  0.018573 , -0.27027  ,
         0.15739  ,  0.14677  , -0.1527   , -0.25613  , -0.10433  ,
        -0.21966  , -0.40082  ,  0.12182  , -0.33762  , -0.064375 ,
         0.27708  ,  0.17801  ,  0.14884  , -0.095727 ,  0.35548  ,
         0.12912  ,  0.28289  ,  0.22598  , -0.29775  , -0.31164  ,
        -0.19424  ,  0.1869   , -0.16078  