## 영어 Word2Vec 실습


### word2vec 학습시키기

In [1]:
# Download the NLTK data this notebook needs.
import nltk

nltk.download("abc")  # the "abc" corpus read via nltk.corpus.abc below
nltk.download("punkt")  # Punkt sentence tokenizer models (older NLTK releases)
# Newer NLTK releases (3.9 and later) load the sentence tokenizer from
# "punkt_tab" instead of the pickled "punkt" models, so fetch it as well to
# keep abc.sents() working regardless of the installed NLTK version.
nltk.download("punkt_tab")

[nltk_data] Downloading package abc to
[nltk_data]     C:\Users\zzoon\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\abc.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\zzoon\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [1]:
# Load the downloaded corpus as a sequence of sentences and report how many
# sentences it contains.
from nltk.corpus import abc

corpus = abc.sents()
print("코퍼스의 크기 :", len(corpus))

코퍼스의 크기 : 29059


In [2]:
from gensim.models import Word2Vec

# Train a Word2Vec model on the ABC corpus sentences.
model = Word2Vec(
    sentences=corpus,
    vector_size=100,  # dimensionality of each word vector
    window=5,  # maximum distance between the current and the predicted word
    min_count=5,  # ignore words that occur fewer than 5 times
    workers=4,  # number of worker threads used for training
    sg=0,  # 0 = CBOW, 1 = skip-gram
)

In [7]:
# Look up the words whose vectors are closest to "man"; the result is a list
# of (word, similarity score) pairs.
model_result = model.wv.most_similar("man")
print(model_result)

[('woman', 0.9233290553092957), ('skull', 0.9110273122787476), ('Bang', 0.9056601524353027), ('asteroid', 0.905209481716156), ('third', 0.9020007848739624), ('baby', 0.8994045853614807), ('dog', 0.8985887765884399), ('bought', 0.8975103497505188), ('rally', 0.8912405967712402), ('disc', 0.8889071345329285)]


### save model

In [4]:
import os

from gensim.models import KeyedVectors

# save_word2vec_format() does not create missing parent directories, so make
# sure "model/" exists before writing into it.
os.makedirs("model", exist_ok=True)

# Write only the word vectors in the (text) word2vec format, then read them
# back as a standalone KeyedVectors object.
model.wv.save_word2vec_format("model/w2v")
loaded_model = KeyedVectors.load_word2vec_format("model/w2v")

In [5]:
# Run the same "man" query against the vectors reloaded from disk to check
# that the save/load round trip worked.
model_result = loaded_model.most_similar("man")
print(model_result)

[('woman', 0.9233290553092957), ('skull', 0.9110273122787476), ('Bang', 0.9056601524353027), ('asteroid', 0.905209481716156), ('third', 0.9020007848739624), ('baby', 0.8994045853614807), ('dog', 0.8985887765884399), ('bought', 0.8975103497505188), ('rally', 0.8912405967712402), ('disc', 0.8889071345329285)]


### 사전 임베딩 활용

In [None]:
# Load pretrained GoogleNews word2vec vectors from a local file.
# NOTE(review): the file must already be present at this path; nothing in this
# notebook downloads it.
w2v = KeyedVectors.load_word2vec_format(
    "model/GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin",
    binary=True,  # the file is in the binary word2vec format
    limit=500000,  # read at most the first 500,000 vectors from the file
)

In [None]:
# Inspect the loaded embeddings
print(
    len(w2v.index_to_key)
)  # Since Gensim 4.0, index_to_key can be used to get the vocabulary size.
print(len(w2v["I"]))  # Alternatively, index by a word to get its vector directly.
print(w2v.vectors.shape)  # shape of the full embedding matrix

### 임베딩 시각화

아래 셀에서 생성되는 `w2v_tensor.tsv`와 `w2v_metadata.tsv` 파일을 https://projector.tensorflow.org/ 에 업로드하여 임베딩을 시각화합니다.

In [8]:
# Generate w2v_metadata.tsv and w2v_tensor.tsv from the saved vectors
# (the leading "!" runs the line as a shell command from the notebook).
! python -m gensim.scripts.word2vec2tensor --input model/w2v --output model/w2v


2024-07-04 11:09:00,872 - word2vec2tensor - INFO - running c:\Users\zzoon\anaconda3\envs\aiffel\lib\site-packages\gensim\scripts\word2vec2tensor.py --input model/w2v --output model/w2v
2024-07-04 11:09:00,872 - keyedvectors - INFO - loading projection weights from model/w2v
2024-07-04 11:09:01,851 - utils - INFO - KeyedVectors lifecycle event {'msg': 'loaded (10363, 100) matrix of type float32 from model/w2v', 'binary': False, 'encoding': 'utf8', 'datetime': '2024-07-04T11:09:01.840316', 'gensim': '4.3.2', 'python': '3.9.19 | packaged by conda-forge | (main, Mar 20 2024, 12:38:46) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22631-SP0', 'event': 'load_word2vec_format'}
2024-07-04 11:09:02,623 - word2vec2tensor - INFO - 2D tensor file saved to model/w2v_tensor.tsv
2024-07-04 11:09:02,624 - word2vec2tensor - INFO - Tensor metadata file saved to model/w2v_metadata.tsv
2024-07-04 11:09:02,624 - word2vec2tensor - INFO - finished running word2vec2tensor.py


## FastText

In [10]:
# Train a FastText model on the corpus
from gensim.models import FastText

fasttext_model = FastText(
    sentences=corpus,
    window=5,
    min_count=5,
    workers=4,
    sg=1,  # 1 = skip-gram
)

In [11]:
# Robust to typos: "memoryy" is deliberately misspelled, yet the query still
# returns neighbours.
fasttext_model.wv.most_similar("memoryy")

[('memory', 0.9474899768829346),
 ('mechanisms', 0.8646126985549927),
 ('mechanism', 0.8622226715087891),
 ('musical', 0.8615105748176575),
 ('basic', 0.8544692397117615),
 ('imagine', 0.853622317314148),
 ('mechanical', 0.8525650501251221),
 ('technical', 0.842690646648407),
 ('visual', 0.8370323181152344),
 ('intelligence', 0.8364635705947876)]

## GloVe

In [13]:
# Load a pretrained model through the gensim downloader
import gensim.downloader as api

glove_model = api.load("glove-wiki-gigaword-50")  # download the GloVe vectors



In [14]:
glove_model.most_similar("dog")  # find the words most similar to 'dog'

[('cat', 0.9218004941940308),
 ('dogs', 0.8513158559799194),
 ('horse', 0.7907583713531494),
 ('puppy', 0.7754920721054077),
 ('pet', 0.7724708318710327),
 ('rabbit', 0.7720814347267151),
 ('pig', 0.7490062117576599),
 ('snake', 0.7399188876152039),
 ('baby', 0.7395570278167725),
 ('bite', 0.7387937307357788)]