In [1]:
from konlpy.tag import Mecab
from konlpy.tag import Okt

In [2]:
# Stock KoNLPy exports `Mecab` from `konlpy.tag` as the tagger class itself, in
# which case `Mecab.Mecab()` raises AttributeError. The recorded output shows the
# nested form worked in the author's environment, so resolve the class either
# way: use the nested attribute when it exists, otherwise the imported name.
mecab_cls = getattr(Mecab, 'Mecab', Mecab)
mecab = mecab_cls()
okt = Okt()

In [3]:
# Sanity check for the MeCab install: split a short sentence into morphemes
# and leave one blank line after the printed result.
words = '드디어 설치 완료'
print(mecab.morphs(words), end='\n\n')

['드디어', '설치', '완료']



# 사전훈련모델
## 1. Word2Vec (w2v)

In [1]:
from models.word_eval import WordEmbeddingEvaluator

In [2]:
import os

# Root directory of the pretrained embeddings. Defaults to the original
# author's local layout; set the EMBEDDING_DIR environment variable to run the
# notebook on a machine with a different layout.
EMBEDDING_DIR = os.environ.get('EMBEDDING_DIR',
                               '/Users/user/Desktop/2023_NLP/word-embeddings')

# Load the Word2Vec vectors (dim=100) with the MeCab tokenizer.
# NOTE: the name `model` is rebound to the fastText-jamo evaluator further down
# in this notebook, so re-run this cell before re-executing any Word2Vec cell.
model = WordEmbeddingEvaluator(f'{EMBEDDING_DIR}/word2vec/word2vec',
                               method='word2vec', dim=100,
                               tokenizer_name="mecab")

In [21]:
# Five nearest neighbours of '비료' (fertilizer).
# NOTE(review): this cell's execution count (In [21]) is out of sequence, and
# the recorded output is identical to the first five rows of the fastText-jamo
# result for the same word later in the notebook. `model` had probably already
# been rebound when this ran — restart the kernel and run top to bottom to get
# the actual Word2Vec neighbours.
model.most_similar('비료',topn=5)

[('원료', 0.8198371383920555),
 ('석유화학공업', 0.8138662623617606),
 ('공업품', 0.7933079173052008),
 ('화학제품', 0.7791531508633451),
 ('석탄', 0.765735925204949)]

In [4]:
# Print the five nearest neighbours for each of two single-syllable words.
for query in ('물', '불'):
    print(model.most_similar(query, topn=5))

[('버무렸', 0.724304), ('망처', 0.72103703), ('유리컵', 0.7150936), ('부어도', 0.7124469), ('탱글탱글', 0.71235305)]
[('가능', 0.70852953), ('확실', 0.6523798), ('지펴', 0.6465125), ('해지므로', 0.63637966), ('충분', 0.6193005)]


In [5]:
# Visualise the words from the analogy test file (relative path, so it is
# resolved against the current working directory). The recorded cell output
# reports a file being saved to /notebooks/embedding/words.png.
# NOTE(review): presumably a 2-D projection plot — confirm in models.word_eval.
model.visualize_words("kor2vec/test_dataset/kor_analogy_semantic.txt")



save @ /notebooks/embedding/words.png


In [32]:
# Vector for '비료', the anchor word compared against the industry terms below.
# NOTE(review): the execution counts of these cells (32-35) are higher than the
# fastText-jamo load (In [7]), so `model` may not have been the Word2Vec
# evaluator when this ran — re-run top to bottom to confirm which model
# produced the similarities shown.
a = model.get_word_vector('비료')

In [33]:
# Vectors for three industry terms (agriculture, manufacturing, fisheries),
# looked up in that order and bound to b, c and d respectively.
b, c, d = (model.get_word_vector(term) for term in ('농업', '제조업', '수산업'))

In [34]:
from numpy import dot
from numpy.linalg import norm

def cos_sim(A, B):
    """Return the cosine similarity between two vectors.

    Args:
        A: First vector (array-like of numbers).
        B: Second vector, same length as ``A``.

    Returns:
        ``dot(A, B) / (norm(A) * norm(B))``. When either vector has zero norm
        the similarity is undefined; ``0.0`` is returned instead of producing
        ``nan`` together with a divide-by-zero RuntimeWarning.
    """
    denominator = norm(A) * norm(B)
    if denominator == 0:
        # A zero vector has no direction; treat it as "not similar".
        return 0.0
    return dot(A, B) / denominator

In [35]:
# Cosine similarity of the anchor vector `a` against each industry vector,
# printed one per line in the order b, c, d.
for industry_vec in (b, c, d):
    print(cos_sim(a, industry_vec))

0.65530974
0.7020238
0.7016469


## 2. fastText

### 2-1. fastText with 자음 모음

In [6]:
from preprocess import jamo_sentence, get_tokenizer
# Demonstrate the jamo preprocessing: tokenize a sentence into morphemes with
# MeCab, join them with single spaces, then print the jamo-level rendering.
tokenizer = get_tokenizer('mecab')
morphemes = tokenizer.morphs('나는 학교에 간다')
tokens = " ".join(morphemes)
print(jamo_sentence(tokens))

ㄴㅏ- ㄴㅡㄴ ㅎㅏㄱㄱㅛ- ㅇㅔ- ㄱㅏㄴㄷㅏ-


In [7]:
import os

# Root directory of the pretrained embeddings. Defaults to the original
# author's local layout; set the EMBEDDING_DIR environment variable to run the
# notebook on a machine with a different layout.
EMBEDDING_DIR = os.environ.get('EMBEDDING_DIR',
                               '/Users/user/Desktop/2023_NLP/word-embeddings')
# Shared path prefix of the fastText-jamo text (.vec) and binary (.bin) files.
FASTTEXT_JAMO_PREFIX = f'{EMBEDDING_DIR}/fasttext-jamo/fasttext-jamo'

# Load the jamo-level fastText vectors (dim=100) with the MeCab tokenizer.
# NOTE: this rebinds `model`, which previously held the Word2Vec evaluator;
# cells in the Word2Vec section re-run after this point will use fastText-jamo.
model = WordEmbeddingEvaluator(
    vecs_txt_fname=f'{FASTTEXT_JAMO_PREFIX}.vec',
    vecs_bin_fname=f'{FASTTEXT_JAMO_PREFIX}.bin',
    method='fasttext-jamo', dim=100,
    tokenizer_name="mecab")



In [8]:
# Same query word as in the Word2Vec section, now against the fastText-jamo
# evaluator; with no `topn` argument the recorded output lists ten neighbours.
model.most_similar('비료')

[('원료', 0.8198371383920555),
 ('석유화학공업', 0.8138662623617606),
 ('공업품', 0.7933079173052008),
 ('화학제품', 0.7791531508633451),
 ('석탄', 0.765735925204949),
 ('석유', 0.7600546328941455),
 ('원료당', 0.7566586423317696),
 ('쇳물', 0.753974018364821),
 ('중금속', 0.7539163930915063),
 ('합금철', 0.7514068973326935)]

In [10]:
# NOTE(review): '비비료' looks like a deliberately malformed, probably
# out-of-vocabulary variant of '비료', presumably to show that the subword
# model still returns neighbours — confirm intent.
model.most_similar('비비료')

[('원료당', 0.6944668306135714),
 ('원료', 0.6942631233930889),
 ('석유화학공업', 0.6922026412091582),
 ('전기료', 0.6852845969385832),
 ('질산은', 0.6794218106472224),
 ('화학차', 0.6737999503018886),
 ('석유', 0.6706594940145969),
 ('공업품', 0.6691117240971816),
 ('비', 0.6690784028391463),
 ('텅비', 0.6680756734331119)]

In [65]:
# Multi-word query (three space-separated words). How the evaluator combines
# the words into one query vector is defined in models.word_eval and is not
# visible in this notebook.
model.most_similar('화학 비료 혼합물')

[('화합물', 0.8709017266692427),
 ('착화합물', 0.8708513858764897),
 ('질산암모늄', 0.8665625783891215),
 ('탄산암모늄', 0.8631371776414304),
 ('혼합물', 0.8614903008517234),
 ('화학제품', 0.8604260007781857),
 ('수산화물', 0.8601242225638144),
 ('화학', 0.8573555852467648),
 ('고분자화합물', 0.8559497465500914),
 ('탄화칼슘', 0.851602213254578)]

### 2-2. facebook fastText

In [17]:
from gensim.models.fasttext import load_facebook_model
# Load the fastText binary `cc.ko.300.bin` via gensim. The path is relative, so
# the file must be present in the current working directory.
# NOTE(review): presumably Facebook's pretrained Korean Common Crawl vectors,
# which have to be downloaded separately — confirm the source and version.
ft_model = load_facebook_model('cc.ko.300.bin')

In [21]:
# For each query word, list its five nearest neighbours in the Facebook
# fastText model as "word: similarity", one per line.
for query in ['물', '불']:
    neighbours = ft_model.wv.similar_by_word(query, 5)
    for word, score in neighbours:
        print(f'{word}: {score}')

勿: 0.6441857814788818
긷는: 0.6348398923873901
듀파: 0.6208381652832031
물과: 0.6106287837028503
물을: 0.6070572733879089
누비이: 0.6207259893417358
고흔: 0.6122905015945435
불을: 0.5806412696838379
弗: 0.5779265761375427
시팅: 0.5665968656539917


In [31]:
# Similarity of '비료' (fertilizer) to several related terms, printed one per
# line in the listed order.
for term in ('농업', '제조업', '수산업', '화학', '사료'):
    print(ft_model.wv.similarity(term, '비료'))

0.4532578
0.26094204
0.33700103
0.43837753
0.42647022


In [51]:
# Pairwise similarities involving '불' (fire), printed one per line. The pairs
# are kept in their original argument order.
word_pairs = [
    ('물', '불'),
    ('불', '불꽃'),
    ('불', '화재'),
    ('불', '꽃'),
    ('불', '를'),
]
for left, right in word_pairs:
    print(ft_model.wv.similarity(left, right))

0.41942394
0.44357845
0.38826478
0.26769632
0.16419122
