# Word2Vec 실습 ( English )

## 1. Import libraries & data

In [1]:
import re
from lxml import etree
from nltk.tokenize import word_tokenize,sent_tokenize

In [2]:
# Parse the TED XML dump. The context manager closes the file handle as soon
# as parsing finishes (the original left it open for the whole session).
with open('ted_en-20160408.xml', 'r', encoding='UTF8') as targetXML:
    target_text = etree.parse(targetXML)

# Collect the text of every <content> ... </content> element, joined by newlines.
parse_text = '\n'.join(target_text.xpath('//content/text()'))

## 2. Data Preprocessing

In [6]:
# Strip parenthesised background-sound annotations such as (Audio) or (Laughter).
content_text = re.compile(r'\([^)]*\)').sub('', parse_text)

In [7]:
# Sentence tokenization: split the cleaned text into a list of sentences.
sent_text = sent_tokenize(content_text)

In [10]:
# Inspect one tokenized sentence as a quick sanity check.
sent_text[7]

'Everybody used them.'

In [11]:
# Remove punctuation and lowercase: every run of characters other than
# a-z / 0-9 (punctuation, whitespace, symbols) is collapsed into one space.
# Note that apostrophes are replaced too, so "what's" becomes "what s".
normalized_text = [
    re.sub(r'[^a-z0-9]+', ' ', sentence.lower())
    for sentence in sent_text
]

In [12]:
# Word-tokenize each normalized sentence; `result` is a list of token lists.
result = list(map(word_tokenize, normalized_text))

In [13]:
# Report how many tokenized sentences were produced.
print(' Number of samples : ', len(result))

 Number of samples :  273424


In [14]:
# Example: print the first 3 tokenized sentences.
for line in result[:3]:
    print(line)

['here', 'are', 'two', 'reasons', 'companies', 'fail', 'they', 'only', 'do', 'more', 'of', 'the', 'same', 'or', 'they', 'only', 'do', 'what', 's', 'new']
['to', 'me', 'the', 'real', 'real', 'solution', 'to', 'quality', 'growth', 'is', 'figuring', 'out', 'the', 'balance', 'between', 'two', 'activities', 'exploration', 'and', 'exploitation']
['both', 'are', 'necessary', 'but', 'it', 'can', 'be', 'too', 'much', 'of', 'a', 'good', 'thing']


## 3. Word2Vec
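- size = dimension of the embedding vectors (renamed `vector_size` in gensim ≥ 4.0)
- window = size of the context window
- min_count = minimum word frequency; words that occur fewer times are ignored
- workers = number of worker threads used for training
- sg=0 : CBOW /  sg=1 : Skip-Gram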

![image.png](attachment:image.png)

In [15]:
from gensim.models import Word2Vec

# Train the embedding on the tokenized sentences.
# NOTE: `size` is the gensim 3.x keyword; gensim >= 4.0 renamed it to `vector_size`.
model = Word2Vec(
    sentences=result,
    size=100,     # dimension of the embedded vectors
    window=5,     # size of the context window
    min_count=5,  # minimum word frequency kept in the vocabulary
    workers=4,    # number of worker threads
    sg=0,         # 0 = CBOW, 1 = Skip-Gram
)



In [16]:
# Look up the words most similar to 'man' in the trained embedding and display them.
model_result = model.wv.most_similar('man')
model_result

  if np.issubdtype(vec.dtype, np.int):


[('woman', 0.8488868474960327),
 ('guy', 0.7953285574913025),
 ('lady', 0.7730017304420471),
 ('boy', 0.744835376739502),
 ('gentleman', 0.7383678555488586),
 ('girl', 0.7321925163269043),
 ('soldier', 0.7277054786682129),
 ('poet', 0.690149188041687),
 ('kid', 0.6736900806427002),
 ('rabbi', 0.6579049825668335)]

## 4. Save & Load Word2Vec Model

In [17]:
from gensim.models import KeyedVectors
# Export only the word vectors in word2vec format, then read them back.
# The reloaded object is a KeyedVectors instance, not a full Word2Vec model.
model.wv.save_word2vec_format('w2v_english')
loaded_model = KeyedVectors.load_word2vec_format('w2v_english')

In [18]:
# `loaded_model` is already a KeyedVectors instance, so query it directly.
# Going through `.wv` here is deprecated in gensim 3.x (it emits a
# DeprecationWarning) and raises AttributeError on gensim >= 4.0.
model_result2 = loaded_model.most_similar('man')
model_result2

  """Entry point for launching an IPython kernel.


[('woman', 0.8488868474960327),
 ('guy', 0.7953285574913025),
 ('lady', 0.7730017304420471),
 ('boy', 0.744835376739502),
 ('gentleman', 0.7383678555488586),
 ('girl', 0.7321925163269043),
 ('soldier', 0.7277054786682129),
 ('poet', 0.690149188041687),
 ('kid', 0.6736900806427002),
 ('rabbi', 0.6579049825668335)]