In [38]:
import torch
import nltk
import urllib.request
import zipfile
from lxml import etree
import re
from nltk.tokenize import word_tokenize, sent_tokenize
from gensim.models import Word2Vec, KeyedVectors

In [6]:
# Create one-hot vector
dog = torch.FloatTensor([1, 0, 0, 0, 0])
cat = torch.FloatTensor([0, 1, 0, 0, 0])
computer = torch.FloatTensor([0, 0, 1, 0, 0])
netbook = torch.FloatTensor([0, 0, 0, 1, 0])
book = torch.FloatTensor([0, 0, 0, 0, 1])

## Cosine similiarity between vectors

In [8]:
print(torch.cosine_similarity(dog, cat, dim=0))
print(torch.cosine_similarity(cat, computer, dim=0))
print(torch.cosine_similarity(computer, netbook, dim=0))
print(torch.cosine_similarity(netbook, book, dim=0))

tensor(0.)
tensor(0.)
tensor(0.)
tensor(0.)


The smiliarity comes out as same values. This is critical to NLP.

## Word2Vec

In [10]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/ihoney/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/GaoleMeng/RNN-and-FFNN-textClassification/master/ted_en-20160408.xml", filename="ted_en-20160408.xml")

('ted_en-20160408.xml', <http.client.HTTPMessage at 0x12b2e2250>)

In [18]:
targetXML=open('../dataset/punkt/ted_en-20160408.xml', 'r', encoding='UTF8')

In [19]:
# xml 파일로부터 <content>와 </content> 사이의 내용만 가져온다.
target_text = etree.parse(targetXML)
parse_text = '\n'.join(target_text.xpath('//content/text()'))

In [20]:
# 정규 표현식의 sub 모듈을 통해 content 중간에 등장하는 (Audio), (Laughter) 등의 배경음 부분을 제거.
# 해당 코드는 괄호로 구성된 내용을 제거.
content_text = re.sub(r'\([^)]*\)', '', parse_text)

In [21]:
# 입력 코퍼스에 대해서 NLTK를 이용하여 문장 토큰화를 수행.
sent_text = sent_tokenize(content_text)

In [22]:
# 각 문장에 대해서 구두점을 제거하고, 대문자를 소문자로 변환.
normalized_text = []
for string in sent_text:
     tokens = re.sub(r"[^a-z0-9]+", " ", string.lower())
     normalized_text.append(tokens)

In [23]:
# 각 문장에 대해서 NLTK를 이용하여 단어 토큰화를 수행.
result = []
result = [word_tokenize(sentence) for sentence in normalized_text]

In [24]:
print('Total number of samples : {}'.format(len(result)))

Total number of samples : 273424


In [25]:
for line in result[:3]: # Get 3 samples
    print(line)

['here', 'are', 'two', 'reasons', 'companies', 'fail', 'they', 'only', 'do', 'more', 'of', 'the', 'same', 'or', 'they', 'only', 'do', 'what', 's', 'new']
['to', 'me', 'the', 'real', 'real', 'solution', 'to', 'quality', 'growth', 'is', 'figuring', 'out', 'the', 'balance', 'between', 'two', 'activities', 'exploration', 'and', 'exploitation']
['both', 'are', 'necessary', 'but', 'it', 'can', 'be', 'too', 'much', 'of', 'a', 'good', 'thing']


3 sentences are tokenized successfully.

## Training Word2Vec

In [39]:
model = Word2Vec(sentences=result, vector_size=100, window=5, min_count=5, workers=4, sg=0)

In [40]:
model_result = model.wv.most_similar("man")
print(model_result)

[('woman', 0.8622720837593079), ('guy', 0.8266342878341675), ('boy', 0.7745452523231506), ('lady', 0.773535430431366), ('girl', 0.7556682825088501), ('gentleman', 0.7357664108276367), ('kid', 0.7097716927528381), ('soldier', 0.6983750462532043), ('poet', 0.6854435801506042), ('son', 0.6542476415634155)]


In [41]:
model.wv.save_word2vec_format('../dataset/punkt/eng_w2v') # Save model
loaded_model = KeyedVectors.load_word2vec_format('../dataset/punkt/eng_w2v') # Load model

In [42]:
model_result = loaded_model.most_similar("man")
print(model_result)

[('woman', 0.8622720837593079), ('guy', 0.8266342878341675), ('boy', 0.7745452523231506), ('lady', 0.773535430431366), ('girl', 0.7556682825088501), ('gentleman', 0.7357664108276367), ('kid', 0.7097716927528381), ('soldier', 0.6983750462532043), ('poet', 0.6854435801506042), ('son', 0.6542476415634155)]


## Creating tsv files

In [44]:
!python -m gensim.scripts.word2vec2tensor --input '../dataset/punkt/eng_w2v' --output '../dataset/punkt/eng_w2v'

2021-05-23 17:30:26,351 - word2vec2tensor - INFO - running /Users/ihoney/.pyenv/versions/3.7.8/lib/python3.7/site-packages/gensim/scripts/word2vec2tensor.py --input ../dataset/punkt/eng_w2v --output ../dataset/punkt/eng_w2v
2021-05-23 17:30:26,351 - keyedvectors - INFO - loading projection weights from ../dataset/punkt/eng_w2v
2021-05-23 17:30:27,835 - utils - INFO - KeyedVectors lifecycle event {'msg': 'loaded (21613, 100) matrix of type float32 from ../dataset/punkt/eng_w2v', 'binary': False, 'encoding': 'utf8', 'datetime': '2021-05-23T17:30:27.768020', 'gensim': '4.0.1', 'python': '3.7.8 (default, Oct  5 2020, 18:13:32) \n[Clang 12.0.0 (clang-1200.0.32.2)]', 'platform': 'Darwin-20.1.0-x86_64-i386-64bit', 'event': 'load_word2vec_format'}
2021-05-23 17:30:29,494 - word2vec2tensor - INFO - 2D tensor file saved to ../dataset/punkt/eng_w2v_tensor.tsv
2021-05-23 17:30:29,494 - word2vec2tensor - INFO - Tensor metadata file saved to ../dataset/punkt/eng_w2v_metadata.tsv
2021-05-23 17:30:29,

## Visualization

Link: https://projector.tensorflow.org/