In [1]:
import urllib.request
import zipfile
from lxml import etree
import re
import tqdm

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
nltk.download('punkt')

import gensim
from gensim.models import KeyedVectors, Word2Vec

# nltk : 3.2.5  |  gensim : 3.6.0
print(f'\n>>> nltk : {nltk.__version__}  |  gensim : {gensim.__version__}') 

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.

>>> nltk : 3.2.5  |  gensim : 3.6.0


## 1. 영어 Word2Vec 만들기 (Word2Vec in English)

In [2]:
%%time

# donwload corpus
urllib.request.urlretrieve("https://raw.githubusercontent.com/GaoleMeng/RNN-and-FFNN-textClassification/master/ted_en-20160408.xml", filename="ted_en-20160408.xml")

# get text data from corpus
with open('ted_en-20160408.xml') as fp:
  target_text = etree.parse(fp)
text_list = target_text.xpath('//content/text()')

# check text data
print(len(text_list), 'texts \n')
for idx, line in enumerate(text_list[:5]):
  print(f'>>> Line {idx}. {line[:500]}...\n')

2085 texts 

>>> Line 0. Here are two reasons companies fail: they only do more of the same, or they only do what's new.
To me the real, real solution to quality growth is figuring out the balance between two activities: exploration and exploitation. Both are necessary, but it can be too much of a good thing.
Consider Facit. I'm actually old enough to remember them. Facit was a fantastic company. They were born deep in the Swedish forest, and they made the best mechanical calculators in the world. Everybody used them. A...

>>> Line 1. So there are lands few and far between on Earth itself that are hospitable to humans by any measure, but survive we have. Our primitive ancestors, when they found their homes and livelihood endangered, they dared to make their way into unfamiliar territories in search of better opportunities. And as the descendants of these explorers, we have their nomadic blood coursing through our own veins. But at the same time, distracted by our bread and circuses an

In [3]:
%%time

# pre-process text data
parse_text = '\n'.join(text_list)
content_text = re.sub(r'\([^)]*\)', '', parse_text)  # (Audio), (Laughter) 등의 배경음 제거 - 괄호로 구성된 내용 제거
sent_text = sent_tokenize(content_text)  # 문장 토큰화

result = []
for sentence in tqdm.tqdm_notebook(sent_text):
  sentence = re.sub(r"[^a-z0-9]+", " ", sentence)  # 구두점 제거
  tokens = word_tokenize(sentence.lower())  # 단어 토큰화
  result.append(tokens)

print(len(result), result[0])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=273424.0), HTML(value='')))


273424 ['ere', 'are', 'two', 'reasons', 'companies', 'fail', 'they', 'only', 'do', 'more', 'of', 'the', 'same', 'or', 'they', 'only', 'do', 'what', 's', 'new']
CPU times: user 47 s, sys: 762 ms, total: 47.8 s
Wall time: 48.1 s


In [4]:
%%time
# train Word2Vec model
model = Word2Vec(sentences=result, size=100, window=5, min_count=5, workers=4, sg=0)

# check model
display(model.wv.most_similar('man'))

[('woman', 0.8687087297439575),
 ('guy', 0.8160133361816406),
 ('lady', 0.7881927490234375),
 ('girl', 0.7770200967788696),
 ('boy', 0.757689356803894),
 ('gentleman', 0.7470839023590088),
 ('soldier', 0.7310497760772705),
 ('kid', 0.7228984832763672),
 ('person', 0.6674720644950867),
 ('friend', 0.6444056630134583)]

CPU times: user 1min 1s, sys: 372 ms, total: 1min 1s
Wall time: 35.1 s


In [5]:
%%time
# save & load trained model
model.wv.save_word2vec_format(fname='eng_w2v')
loaded_model = KeyedVectors.load_word2vec_format(fname='eng_w2v')

# check loaded model
display(loaded_model.wv.most_similar('man'))

  


[('woman', 0.8687087297439575),
 ('guy', 0.8160133361816406),
 ('lady', 0.7881927490234375),
 ('girl', 0.7770200967788696),
 ('boy', 0.757689356803894),
 ('gentleman', 0.7470839023590088),
 ('soldier', 0.7310497760772705),
 ('kid', 0.7228984832763672),
 ('person', 0.6674720644950867),
 ('friend', 0.6444056630134583)]

CPU times: user 4.38 s, sys: 217 ms, total: 4.6 s
Wall time: 4.47 s


## 2. 한국어 Word2Vec 만들기(네이버 영화 리뷰)

## 3. 한국어 Word2Vec 만들기(위키피디아)

## 4. 사전 훈련된 Word2Vec 임베딩(Pre-trained Word2Vec embedding) 소개