# 영어 Word2Vec 만들기

In [1]:
import nltk
# Download NLTK's 'punkt' tokenizer package, used by the sentence/word tokenization later on.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
import urllib.request
import zipfile
from lxml import etree
import re
from nltk.tokenize import word_tokenize, sent_tokenize

## 훈련 데이터 이해하기

In [3]:
# Download the TED transcript XML and store it under the same name
# in the current working directory.
urllib.request.urlretrieve(
    url="https://raw.githubusercontent.com/GaoleMeng/RNN-and-FFNN-textClassification/master/ted_en-20160408.xml",
    filename="ted_en-20160408.xml",
)

('ted_en-20160408.xml', <http.client.HTTPMessage at 0x7f63c873cad0>)

In [5]:
# Check that the download succeeded: show the file's size and timestamp.
!ls -l ted_en-20160408.xml


-rw-r--r-- 1 root root 74533638 Sep  3 04:17 ted_en-20160408.xml


In [6]:
# Page through the beginning of the raw XML to inspect its structure.
!more ted_en-20160408.xml

<?xml version="1.0" encoding="UTF-8"?>
<xml language="en"><file id="1">
  <head>
    <url>http://www.ted.com/talks/knut_haanaes_two_reasons_companies_fail_and_ho
w_to_avoid_them</url>
    <pagesize>72832</pagesize>
    <dtime>Fri Apr 01 00:57:03 CEST 2016</dtime>
    <encoding>UTF-8</encoding>
    <content-type>text/html; charset=utf-8</content-type>
    <keywords>talks, business, creativity, curiosity, goal-setting, innovation, 
motivation, potential, success, work</keywords>
    <speaker>Knut Haanaes</speaker>
    <talkid>2470</talkid>
    <videourl>http://download.ted.com/talks/KnutHaanaes_2015S.mp4</videourl>
    <videopath>talks/KnutHaanaes_2015S.mp4</videopath>
    <date>2015/06/30</date>
    <title>Knut Haanaes: Two reasons companies fail -- and how to avoid them</ti
tle>
    <description>TED Talk Subtitles and Transcript: Is it possible to run a comp
any and reinvent it at the same time? For business strategist Knut Haanaes, the 
ability to innovate after becoming successful is

## 훈련 데이터 전처리하기

In [7]:
# Parse the downloaded XML file into an lxml element tree.
# The `with` block guarantees the file handle is closed as soon as parsing
# is done, so it is not leaked for the rest of the session.
with open('ted_en-20160408.xml', 'r', encoding='UTF8') as targetXML:
    target_text = etree.parse(targetXML)

In [8]:
# Keep only the text found between <content> and </content> in the XML,
# then join the extracted pieces with newlines into a single string.
content_nodes = target_text.xpath('//content/text()')
parse_text = '\n'.join(content_nodes)

In [9]:
# Remove background-sound annotations such as (Audio) or (Laughter) that
# appear inside the content: every parenthesised span is deleted.
paren_pattern = re.compile(r'\([^)]*\)')
content_text = paren_pattern.sub('', parse_text)

In [10]:
# Split the input corpus into sentences using NLTK's sentence tokenizer.
sent_text = sent_tokenize(content_text)

In [11]:
# Number of sentences produced by the sentence tokenizer.
len(sent_text)

273424

In [12]:
# Normalize every sentence: lowercase it, then replace each run of characters
# other than a-z / 0-9 (punctuation, whitespace, ...) with a single space.
# The pattern is compiled once, outside the per-sentence work.
non_alnum = re.compile(r"[^a-z0-9]+")
normalized_text = [non_alnum.sub(" ", sentence.lower()) for sentence in sent_text]

In [13]:
# Word-tokenize each normalized sentence with NLTK, giving one token list
# per sentence.
result = list(map(word_tokenize, normalized_text))

In [14]:
# Report the total number of tokenized samples.
print(f'총 샘플의 개수 : {len(result)}')

총 샘플의 개수 : 273424


In [15]:
# Print only the first three samples, one token list per line.
for line in result[:3]:
    print(line)

['here', 'are', 'two', 'reasons', 'companies', 'fail', 'they', 'only', 'do', 'more', 'of', 'the', 'same', 'or', 'they', 'only', 'do', 'what', 's', 'new']
['to', 'me', 'the', 'real', 'real', 'solution', 'to', 'quality', 'growth', 'is', 'figuring', 'out', 'the', 'balance', 'between', 'two', 'activities', 'exploration', 'and', 'exploitation']
['both', 'are', 'necessary', 'but', 'it', 'can', 'be', 'too', 'much', 'of', 'a', 'good', 'thing']


## Word2Vec 훈련시키기

In [16]:
import gensim
from gensim.models import Word2Vec

# gensim 4.0 renamed the embedding-dimension keyword from `size` to
# `vector_size`; passing the wrong one raises
# "TypeError: __init__() got an unexpected keyword argument ...".
# Pick the name the installed release accepts so this cell runs on both.
dim_kwarg = 'vector_size' if int(gensim.__version__.split('.')[0]) >= 4 else 'size'

# Train on the tokenized sentences: 100-dimensional vectors, window of 5,
# min_count of 5, 4 workers, sg=0 (CBOW per the gensim documentation).
model = Word2Vec(sentences=result, window=5, min_count=5, workers=4, sg=0,
                 **{dim_kwarg: 100})

In [17]:
# Use model.wv.most_similar to print the words most similar to the given word.
model_result = model.wv.most_similar("man")
print(model_result)

[('woman', 0.8605440855026245), ('guy', 0.8309746980667114), ('lady', 0.7980598211288452), ('boy', 0.7555812001228333), ('gentleman', 0.7455815076828003), ('girl', 0.7417638301849365), ('kid', 0.7129889726638794), ('soldier', 0.7039347887039185), ('friend', 0.6773735284805298), ('poet', 0.6741325855255127)]


In [19]:
# Similarity query with 'man' and 'girl' as positive terms and 'boy' as the
# negative term; only the top 3 matches are returned.
model.wv.most_similar(positive=['man','girl'],negative=['boy'], topn=3)

[('woman', 0.8575938940048218),
 ('guy', 0.7422807216644287),
 ('lady', 0.728862464427948)]

## Word2Vec 모델 저장하고 로드하기

In [20]:
from gensim.models import KeyedVectors
# Write the trained word vectors to the file 'eng_w2v', then read them back
# from that file as a KeyedVectors object.
model.wv.save_word2vec_format('eng_w2v') # save the model
loaded_model = KeyedVectors.load_word2vec_format("eng_w2v") # load the model

In [22]:
# List the working directory to confirm that 'eng_w2v' was written.
!ls -l

total 97772
-rw-r--r-- 1 root root 25578430 Sep  3 04:42 eng_w2v
drwxr-xr-x 1 root root     4096 Sep  1 19:26 sample_data
-rw-r--r-- 1 root root 74533638 Sep  3 04:17 ted_en-20160408.xml


In [23]:
# Print the words most similar to "man" again, this time from the loaded model.
model_result = loaded_model.most_similar("man")
print(model_result)

[('woman', 0.8605440855026245), ('guy', 0.8309746980667114), ('lady', 0.7980598211288452), ('boy', 0.7555812001228333), ('gentleman', 0.7455815076828003), ('girl', 0.7417638301849365), ('kid', 0.7129889726638794), ('soldier', 0.7039347887039185), ('friend', 0.6773735284805298), ('poet', 0.6741325855255127)]


In [25]:
# Print the words most similar to "beauty" using the loaded model.
model_result = loaded_model.most_similar("beauty")
print(model_result)

[('compassion', 0.7689740657806396), ('joy', 0.7659393548965454), ('meaning', 0.752810537815094), ('nature', 0.7437452077865601), ('importance', 0.7273917198181152), ('complexity', 0.7227903604507446), ('consciousness', 0.7138163447380066), ('mystery', 0.7022892236709595), ('simplicity', 0.6917059421539307), ('humor', 0.6905499696731567)]


In [27]:
# Similarity query with 'soccer' as the positive term and 'study' as the
# negative term. The call goes through model.wv because most_similar on the
# Word2Vec model object itself is deprecated in gensim 3.x and removed in
# gensim 4.
model.wv.most_similar(positive=['soccer'], negative=['study'])

  """Entry point for launching an IPython kernel.


[('tennis', 0.6521077752113342),
 ('polo', 0.639400064945221),
 ('dirt', 0.5886974930763245),
 ('wet', 0.5872987508773804),
 ('golf', 0.5836280584335327),
 ('puppy', 0.5831645131111145),
 ('dining', 0.5817233324050903),
 ('helmet', 0.5763764977455139),
 ('recorder', 0.5678306818008423),
 ('johnny', 0.5677043199539185)]