# [NLP-5] Word Embedding

## package install

In [None]:
# $ pip install konlpy

In [1]:
import re
from konlpy.tag import Okt
from collections import Counter

In [7]:
# Sample Korean sentence used as the toy corpus for the preprocessing,
# tokenizing and one-hot encoding cells below.
text = "임금님 귀는 당나귀 귀! 임금님 귀는 당나귀 귀! 실컷~ 소리치고 나니 속이 확 뚫려 살 것 같았어."

## preprocessing

In [8]:
# Anything that is not a Hangul jamo (ㄱ-ㅎ, ㅏ-ㅣ), a Hangul syllable (가-힣)
# or a space is removed, which strips the punctuation ("!", "~", ".").
reg = re.compile("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]")
text = re.sub(reg, "", text)

In [9]:
print(text)

임금님 귀는 당나귀 귀 임금님 귀는 당나귀 귀 실컷 소리치고 나니 속이 확 뚫려 살 것 같았어


In [5]:
# Preview of whitespace tokenization. The result is bound to its own name
# instead of overwriting `text`: the later cells call okt.morphs(text),
# text.split() and t.texts_to_sequences([text]), which all need `text` to
# still be the cleaned string. Rebinding `text` to a list here would break a
# top-to-bottom run of the notebook.
words = text.split()

## tokenizing

In [10]:
# Morpheme-level tokenization with KoNLPy's Okt analyzer: particles are split
# off from their nouns (e.g. "귀는" becomes "귀" + "는").
okt = Okt()
tokens = okt.morphs(text)
print(tokens)

['임금님', '귀', '는', '당나귀', '귀', '임금님', '귀', '는', '당나귀', '귀', '실컷', '소리', '치고', '나니', '속이', '확', '뚫려', '살', '것', '같았어']


In [12]:
# Whitespace tokenization of the same sentence, kept as a baseline to compare
# against the Okt morphemes in `tokens`.
token_s = text.split()
print(token_s)

['임금님', '귀는', '당나귀', '귀', '임금님', '귀는', '당나귀', '귀', '실컷', '소리치고', '나니', '속이', '확', '뚫려', '살', '것', '같았어']


## vocab

In [18]:
# Token -> occurrence count, once per tokenization scheme
# (vocab: Okt morphemes, vocab_s: whitespace tokens).
vocab = Counter(tokens)
vocab_s = Counter(token_s)

In [19]:
print(vocab['임금님'])
print(vocab_s['임금님'])

2
2


In [20]:
# Keep only the `vocab_size` most frequent tokens of each vocabulary.
# NOTE: `vocab` / `vocab_s` are rebound here from Counter objects to lists of
# (word, count) pairs, so running this cell twice in a row fails (a list has
# no most_common method).
vocab_size = 5
vocab = vocab.most_common(vocab_size)
vocab_s = vocab_s.most_common(vocab_size)
print(vocab)
print(vocab_s)

[('귀', 4), ('임금님', 2), ('는', 2), ('당나귀', 2), ('실컷', 1)]
[('임금님', 2), ('귀는', 2), ('당나귀', 2), ('귀', 2), ('실컷', 1)]


In [21]:
# Assign every kept word a 1-based index following the order of the
# (word, count) list, so the first (most frequent) entry gets index 1.
word2idx = {token: rank for rank, (token, _) in enumerate(vocab, start=1)}
word2idx_s = {token: rank for rank, (token, _) in enumerate(vocab_s, start=1)}

{'귀': 1, '임금님': 2, '는': 3, '당나귀': 4, '실컷': 5}


In [22]:
print(word2idx)
print(word2idx_s)

{'귀': 1, '임금님': 2, '는': 3, '당나귀': 4, '실컷': 5}
{'임금님': 1, '귀는': 2, '당나귀': 3, '귀': 4, '실컷': 5}


## 원-핫 벡터 만들기

In [23]:
def one_hot_encoding(word, word2index):
    """Return the one-hot vector of ``word`` as a list of 0/1 integers.

    Args:
        word: The token to encode; it must be a key of ``word2index``.
        word2index: Mapping from token to a 1-based integer index. The
            vector length equals the number of entries in this mapping.

    Returns:
        A list of ``len(word2index)`` integers that is 0 everywhere except
        for a 1 at position ``word2index[word] - 1``.

    Raises:
        KeyError: If ``word2index`` is a dict and ``word`` is not one of
            its keys.
    """
    size = len(word2index)
    # Stored indices start at 1, list positions start at 0.
    position = word2index[word] - 1
    encoded = [0 for _ in range(size)]
    encoded[position] = 1
    return encoded
print("슝=3")

슝=3


In [25]:
# The same word lands on a different position depending on the vocabulary:
# index 2 in the Okt vocabulary, index 1 in the whitespace vocabulary.
print(one_hot_encoding("임금님", word2idx))
print(one_hot_encoding("임금님", word2idx_s))

[0, 1, 0, 0, 0]
[1, 0, 0, 0, 0]


In [26]:
#keras one-hot

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
print("임포트 완료")

임포트 완료


In [39]:
# Build the Keras vocabulary from the whitespace tokens; each element of
# token_s is passed as its own text. The resulting word_index uses integer
# ids that start at 1.
t = Tokenizer()
t.fit_on_texts(token_s)
print(t.word_index)

{'임금님': 1, '귀는': 2, '당나귀': 3, '귀': 4, '실컷': 5, '소리치고': 6, '나니': 7, '속이': 8, '확': 9, '뚫려': 10, '살': 11, '것': 12, '같았어': 13}


In [40]:
# +1 because word_index ids start at 1: the one-hot vectors built below need
# an extra column for index 0, which no word uses.
# NOTE: this rebinds the earlier vocab_size (5) to the full Keras vocabulary size.
vocab_size = len(t.word_index) + 1
print("슝=3")

슝=3


In [41]:
# Wrap the cleaned sentence in a list so it is encoded as a single text;
# `encoded` then holds one sequence of integer word ids.
encoded = t.texts_to_sequences([text])

In [42]:
# Expand every word id into a one-hot row of width vocab_size
# (one row per token of the sentence).
one_hot = to_categorical(encoded, num_classes = vocab_size)
print(one_hot)

[[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]]


## Word2Vec

In [None]:
# ! pip install nltk
# ! pip install gensim    

In [43]:
import nltk
# Fetch the NLTK data packages used below: the 'abc' corpus and 'punkt'
# (presumably required for sentence splitting by abc.sents() — TODO confirm).
nltk.download('abc')
nltk.download('punkt')

[nltk_data] Downloading package abc to /aiffel/nltk_data...
[nltk_data]   Package abc is already up-to-date!
[nltk_data] Downloading package punkt to /aiffel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [44]:
from nltk.corpus import abc
# Load the corpus as a sequence of sentences, each one a list of word tokens.
corpus = abc.sents()
print("슝~")

슝~


In [47]:
type(corpus)

nltk.corpus.reader.util.ConcatenatedCorpusView

In [48]:
print(corpus[:3])

[['PM', 'denies', 'knowledge', 'of', 'AWB', 'kickbacks', 'The', 'Prime', 'Minister', 'has', 'denied', 'he', 'knew', 'AWB', 'was', 'paying', 'kickbacks', 'to', 'Iraq', 'despite', 'writing', 'to', 'the', 'wheat', 'exporter', 'asking', 'to', 'be', 'kept', 'fully', 'informed', 'on', 'Iraq', 'wheat', 'sales', '.'], ['Letters', 'from', 'John', 'Howard', 'and', 'Deputy', 'Prime', 'Minister', 'Mark', 'Vaile', 'to', 'AWB', 'have', 'been', 'released', 'by', 'the', 'Cole', 'inquiry', 'into', 'the', 'oil', 'for', 'food', 'program', '.'], ['In', 'one', 'of', 'the', 'letters', 'Mr', 'Howard', 'asks', 'AWB', 'managing', 'director', 'Andrew', 'Lindberg', 'to', 'remain', 'in', 'close', 'contact', 'with', 'the', 'Government', 'on', 'Iraq', 'wheat', 'sales', '.']]


In [49]:
print('코퍼스의 크기 :',len(corpus))

코퍼스의 크기 : 29059


In [50]:
from gensim.models import Word2Vec

# Train word vectors on the tokenized sentences of the ABC corpus.
model = Word2Vec(
    sentences=corpus,
    vector_size=100,  # dimensionality of each word vector
    window=5,         # maximum distance between target and context word
    min_count=5,      # words seen fewer than 5 times are ignored
    workers=4,        # number of training threads
    sg=0,             # 0 selects CBOW, 1 would select skip-gram
)
print("모델 학습 완료!")

모델 학습 완료!


In [51]:
# Nearest neighbours of "man" in the trained vector space, returned as
# (word, similarity) pairs.
model_result = model.wv.most_similar("man")
print(model_result)

[('woman', 0.9233373999595642), ('skull', 0.911032497882843), ('Bang', 0.9056490063667297), ('asteroid', 0.9051957130432129), ('third', 0.9020178318023682), ('baby', 0.8993921279907227), ('dog', 0.8985978364944458), ('bought', 0.8975234031677246), ('rally', 0.8912491798400879), ('disc', 0.8888981342315674)]


In [52]:
import os

from gensim.models import KeyedVectors

# Resolve "~" explicitly and make sure the target directory exists, so that
# saving does not depend on the I/O backend expanding the home directory or
# on the folder having been created beforehand.
w2v_path = os.path.expanduser('~/aiffel/word_embedding/w2v')
os.makedirs(os.path.dirname(w2v_path), exist_ok=True)

# Export only the word vectors in word2vec format, then read them back as a
# standalone KeyedVectors object.
model.wv.save_word2vec_format(w2v_path)
loaded_model = KeyedVectors.load_word2vec_format(w2v_path)
print("모델  load 완료!")

모델  load 완료!


In [53]:
# Same query against the vectors reloaded from disk; the result should match
# the in-memory model's neighbours for "man".
model_result = loaded_model.most_similar("man")
print(model_result)

[('woman', 0.9233373999595642), ('skull', 0.911032497882843), ('Bang', 0.9056490063667297), ('asteroid', 0.9051957130432129), ('third', 0.9020178318023682), ('baby', 0.8993921279907227), ('dog', 0.8985978364944458), ('bought', 0.8975234031677246), ('rally', 0.8912491798400879), ('disc', 0.8888981342315674)]


In [54]:
# Don't be alarmed if this raises an error: "overacting" is not in the
# Word2Vec vocabulary, so the lookup fails with a KeyError.
loaded_model.most_similar('overacting')

KeyError: "Key 'overacting' not present"

In [55]:
# A misspelled word ("memorry") is not in the vocabulary either, so this
# also raises a KeyError.
loaded_model.most_similar('memorry')

KeyError: "Key 'memorry' not present"

In [61]:
# An in-vocabulary word returns its nearest neighbours as usual.
loaded_model.most_similar('computer')

[('alternative', 0.919630765914917),
 ('create', 0.9108300805091858),
 ('software', 0.9041945934295654),
 ('fit', 0.8978995680809021),
 ('car', 0.8967701196670532),
 ('drive', 0.8963003158569336),
 ('search', 0.8937243819236755),
 ('model', 0.892130970954895),
 ('measure', 0.8913665413856506),
 ('network', 0.8895030617713928)]

## FastText

In [62]:
from gensim.models import FastText
# Train FastText on the same corpus, passed positionally as the sentences.
# NOTE(review): sg=1 is used here, whereas the Word2Vec model above was
# trained with sg=0, so the two models differ in training mode as well as in
# architecture — confirm this is intended for the comparison.
fasttext_model = FastText(corpus, window=5, min_count=5, workers=4, sg=1)
print("FastText 학습 완료!")

FastText 학습 완료!


In [63]:
# The word that raised a KeyError with the Word2Vec vectors returns
# neighbours here (FastText composes vectors from character n-grams, per the
# gensim documentation).
fasttext_model.wv.most_similar('overacting')

[('extracting', 0.9433786869049072),
 ('declining', 0.9374690651893616),
 ('lifting', 0.9372662305831909),
 ('fluctuating', 0.9369775056838989),
 ('attracting', 0.9337378740310669),
 ('weakening', 0.9320513606071472),
 ('malting', 0.93160480260849),
 ('resolving', 0.9312703013420105),
 ('debilitating', 0.9311731457710266),
 ('shooting', 0.9300152063369751)]

In [66]:
# The misspelling "memorry" also resolves; its closest neighbour is the
# correctly spelled "memory".
fasttext_model.wv.most_similar('memorry')

[('memory', 0.9387076497077942),
 ('xenotourism', 0.9215632081031799),
 ('tertiary', 0.9198737144470215),
 ('terrorism', 0.9171770215034485),
 ('counterintuitive', 0.915630042552948),
 ('consciousness', 0.9123710989952087),
 ('tourism', 0.907857358455658),
 ('memorandum', 0.901509165763855),
 ('emphasis', 0.9001284837722778),
 ('happiness', 0.9000794291496277)]

## GloVe

In [67]:
import gensim.downloader as api
glove_model = api.load("glove-wiki-gigaword-50")  # download the pretrained GloVe vectors
glove_model.most_similar("dog")  # find the words most similar to 'dog'

[('cat', 0.9218004941940308),
 ('dogs', 0.8513158559799194),
 ('horse', 0.7907583713531494),
 ('puppy', 0.7754920721054077),
 ('pet', 0.7724708318710327),
 ('rabbit', 0.7720814347267151),
 ('pig', 0.7490062117576599),
 ('snake', 0.7399188876152039),
 ('baby', 0.7395570278167725),
 ('bite', 0.7387937307357788)]

In [68]:
# "overacting" is present in the pretrained GloVe vocabulary, so this lookup
# succeeds.
glove_model.most_similar('overacting')

[('impudence', 0.7842012047767639),
 ('puerile', 0.7816032767295837),
 ('winningly', 0.7644237875938416),
 ('grossness', 0.7576098442077637),
 ('deconstructions', 0.748936653137207),
 ('over-the-top', 0.7460805773735046),
 ('buffoonery', 0.746045708656311),
 ('impetuosity', 0.7415392398834229),
 ('sophomoric', 0.736961841583252),
 ('zaniness', 0.7353197336196899)]

In [69]:
# A misspelled word is not in the GloVe vocabulary, so this raises a KeyError.
glove_model.most_similar('memoryy')

KeyError: "Key 'memoryy' not present"