In [28]:
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
import codecs
from konlpy.tag import Twitter
from gensim.models import word2vec
from konlpy.utils import pprint
from sklearn.manifold import TSNE
from konlpy.tag import Okt

In [9]:
# Toy corpus: four pre-tokenized sentences, each a list of word strings.
sentences = [
    ['this', 'is', 'a',   'good',      'product'],
    ['it',   'is', 'a',   'excellent', 'product'],
    ['it',   'is', 'a',   'bad',       'product'],
    ['that', 'is', 'the', 'worst',     'product'],
]

# Build the vocabulary and train word vectors from the sentences.
# min_count must be 1 here: most words ('this', 'the', 'good', ...) occur only
# once in the corpus, so min_count=2 would drop them from the vocabulary and
# the lookups in the following cells (e.g. word_vectors["this"]) would raise
# KeyError.
# NOTE(review): `size` is the gensim 3.x keyword (renamed `vector_size` in
# gensim 4) — confirm the installed version.
model = Word2Vec(sentences, size=5, window=3, min_count=1)

In [10]:
# Get the word vectors (the `wv` attribute of the trained model).
word_vectors = model.wv

In [11]:
# Show the learned vector of two corpus words.
for word in ("this", "the"):
    print(word_vectors[word])

[-0.03467268 -0.02284146 -0.05520487  0.09286432 -0.07780986]
[-0.08328155  0.09069994  0.02411258 -0.03890152  0.00899796]


In [12]:
# List the words in the toy model's vocabulary.
# NOTE(review): `.vocab` looks like the gensim 3.x API — confirm the installed
# version before upgrading gensim.
vocabs = word_vectors.vocab.keys()
print(vocabs)

dict_keys(['this', 'is', 'a', 'good', 'product', 'it', 'excellent', 'bad', 'that', 'the', 'worst'])


In [13]:
# Collect the vector of every vocabulary word, in vocabulary order.
word_vectors_list = []
for vocab_word in vocabs:
    word_vectors_list.append(word_vectors[vocab_word])
print(word_vectors_list)

[array([-0.03467268, -0.02284146, -0.05520487,  0.09286432, -0.07780986],
      dtype=float32), array([-0.07090758,  0.01385371, -0.07457203,  0.03248972, -0.00561957],
      dtype=float32), array([-0.06244394, -0.02554464,  0.07506856,  0.05721265,  0.00461775],
      dtype=float32), array([-0.0053714 , -0.03891823,  0.07421409,  0.0246501 ,  0.09653606],
      dtype=float32), array([ 0.02971243,  0.05420927, -0.05840307, -0.09615362, -0.07145891],
      dtype=float32), array([-0.06298639,  0.06858068, -0.05275228, -0.0385045 , -0.00345044],
      dtype=float32), array([ 0.02155421,  0.04375936,  0.03116042, -0.06709851,  0.08650323],
      dtype=float32), array([-0.06046551, -0.07812412,  0.09120128,  0.01563613, -0.08869836],
      dtype=float32), array([0.04990771, 0.04529371, 0.01736359, 0.01791992, 0.07686808],
      dtype=float32), array([-0.08328155,  0.09069994,  0.02411258, -0.03890152,  0.00899796],
      dtype=float32), array([-0.0255856 , -0.01889793, -0.04528805, -0.01208

In [19]:
# Similarity score between the vectors of 'it' and 'the'.
print(model.wv.similarity(w1='it', w2='the'))

0.778262


In [20]:
# Rank the other vocabulary words by similarity to 'it' (displayed as the cell result).
model.wv.most_similar('it')

[('the', 0.7782620191574097),
 ('is', 0.6549232602119446),
 ('product', 0.5309462547302246),
 ('worst', 0.40462198853492737),
 ('excellent', 0.1621410995721817),
 ('this', 0.014013886451721191),
 ('that', -0.15937910974025726),
 ('a', -0.30257630348205566),
 ('bad', -0.36147594451904297),
 ('good', -0.5077275633811951)]

In [23]:
def read_data(filename):
    """Read a UTF-8, tab-separated text file and return its rows without the header.

    Args:
        filename: Path of the file to read.

    Returns:
        A list with one entry per line after the first line; each entry is the
        list of strings obtained by splitting that line on tab characters.
        An empty or header-only file yields an empty list.
    """
    with open(filename, encoding='utf-8', mode='r') as f:
        # Iterate over the file object instead of calling str.splitlines():
        # splitlines() also breaks on characters such as form feed or U+2028,
        # which can appear inside a text field and would cut one row into
        # several fragments.
        next(f, None)   # skip the header line (no-op for an empty file)
        return [line.rstrip('\n').split('\t') for line in f]

# Load the rows of ratings_train.txt; read_data() drops the header line.
ratings_train = read_data('ratings_train.txt')

In [25]:
# Peek at the first row; the recorded output shows three fields
# (presumably id, review text and label — verify against the data file).
print(ratings_train[0])

['9976970', '아 더빙.. 진짜 짜증나네요 목소리', '0']


In [29]:
# Okt tagger from konlpy, created once and reused by tokens().
okt = Okt()


def tokens(doc):
    """Return the POS-tagged tokens of `doc` as 'word/tag' strings.

    `doc` is passed to okt.pos() with norm=True and stem=True, and every
    (word, tag) pair it returns is joined with '/'.
    """
    tagged_pairs = okt.pos(doc, norm=True, stem=True)
    return ['/'.join(pair) for pair in tagged_pairs]

In [30]:
# Try the tokenizer on the review text of the first training row.
tokens('아 더빙.. 진짜 짜증나네요 목소리')

['아/Exclamation',
 '더빙/Noun',
 '../Punctuation',
 '진짜/Noun',
 '짜증나다/Adjective',
 '목소리/Noun']

In [31]:
# Keep only the movie review text, i.e. the second field of every row.
docs = [fields[1] for fields in ratings_train]
print(docs[:10])

['아 더빙.. 진짜 짜증나네요 목소리', '흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나', '너무재밓었다그래서보는것을추천한다', '교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정', '사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 던스트가 너무나도 이뻐보였다', '막 걸음마 뗀 3세부터 초등학교 1학년생인 8살용영화.ㅋㅋㅋ...별반개도 아까움.', '원작의 긴장감을 제대로 살려내지못했다.', '별 반개도 아깝다 욕나온다 이응경 길용우 연기생활이몇년인지..정말 발로해도 그것보단 낫겟다 납치.감금만반복반복..이드라마는 가족도없다 연기못하는사람만모엿네', '액션이 없는데도 재미 있는 몇안되는 영화', '왜케 평점이 낮은건데? 꽤 볼만한데.. 헐리우드식 화려함에만 너무 길들여져 있나?']


In [32]:
# Tokenize every review; each element of `data` is one review's list of 'word/tag' tokens.
data = list(map(tokens, docs))

In [42]:
# Train 100-dimensional word vectors on the tokenized reviews with a context window of 3.
# NOTE(review): `size` is the gensim 3.x keyword — confirm the installed version.
w2v_model = word2vec.Word2Vec(data, size=100, window=3)

In [34]:
# Persist the trained model to 'naver.model' (path relative to the working directory).
w2v_model.save('naver.model')

In [37]:
# List the vocabulary of the review model.
# NOTE(review): this rebinds `vocabs`, which earlier held the toy model's vocabulary.
vocabs = w2v_model.wv.vocab.keys()
print(vocabs)



In [39]:
# Analogy-style query: the tokens of the phrase 'man actress' contribute
# positively and the token of 'actor' negatively; show the top 5 matches.
print(w2v_model.wv.most_similar(positive=tokens(u'남자 여배우'), 
        negative=tokens(u'배우'), topn=5))

[('여자/Noun', 0.8482788801193237), ('아빠/Noun', 0.7352777123451233), ('여자애/Noun', 0.7297647595405579), ('엄마/Noun', 0.7072813510894775), ('할아버지/Noun', 0.7058411836624146)]


In [50]:
# Words most similar to the token of the Korean word for 'mom'.
print(w2v_model.wv.most_similar(tokens('엄마')))

[('아빠/Noun', 0.9358000755310059), ('할머니/Noun', 0.8420028686523438), ('아버지/Noun', 0.8368518948554993), ('여자친구/Noun', 0.8254244327545166), ('친구/Noun', 0.8077598214149475), ('할아버지/Noun', 0.7785784006118774), ('아내/Noun', 0.7759729027748108), ('딸/Noun', 0.7678540349006653), ('언니/Noun', 0.7663094997406006), ('남자/Noun', 0.7553814649581909)]


In [51]:
# Number of entries in the review model's vocabulary.
vocabs = w2v_model.wv.vocab.keys()
print(len(vocabs))

15409


In [63]:
# Materialize the vocabulary as a list so it can be indexed and sliced later.
# NOTE(review): this prints every vocabulary entry; consider printing only a slice.
vocab = list(w2v_model.wv.vocab)
print(vocab)

['아/Exclamation', '더빙/Noun', '../Punctuation', '진짜/Noun', '짜증나다/Adjective', '목소리/Noun', '흠/Noun', '.../Punctuation', '포스터/Noun', '보고/Noun', '초딩/Noun', '영화/Noun', '줄/Noun', '..../Punctuation', '오버/Noun', '연기/Noun', '조차/Josa', '가볍다/Adjective', '않다/Verb', '너/Modifier', '다그/Noun', '래서/Noun', '보다/Verb', '추천/Noun', '한/Josa', '다/Adverb', '교도소/Noun', '이야기/Noun', '구먼/Noun', '솔직하다/Adjective', '재미/Noun', '는/Josa', '없다/Adjective', '평점/Noun', '조정/Noun', '사이/Modifier', '그/Determiner', '의/Noun', '익살스럽다/Adjective', '가/Josa', '돋보이다/Verb', '!/Punctuation', '스파이더맨/Noun', '에서/Josa', '늙다/Verb', '보이다/Verb', '하다/Verb', '너무나도/Adverb', '이쁘다/Adjective', '막/Noun', '걸음/Noun', '마/Noun', '떼다/Verb', '3/Number', '세/Noun', '부터/Josa', '초등학교/Noun', '1/Number', '학년/Noun', '생인/Noun', '8/Number', '살다/Verb', './Punctuation', 'ㅋㅋㅋ/KoreanParticle', '별/Modifier', '반개/Noun', '도/Josa', '아깝다/Adjective', '움/Noun', '원작/Noun', '의/Josa', '긴장감/Noun', '을/Josa', '제대로/Noun', '살리다/Verb', '별/Noun', '욕/Noun', '나오다/Verb', '이응경/Noun', '연/Modi

In [66]:
# Stack the vector of every vocabulary entry into one matrix (one row per
# entry of `vocab`, in the same order).
# The lookup goes through `w2v_model.wv`: indexing the Word2Vec model object
# itself is deprecated in gensim 3.x (it emits a DeprecationWarning) and is
# not supported in gensim 4.
X = w2v_model.wv[vocab]
print(X.shape)
print(X)

(15409, 100)
[[-0.14769846 -0.91828215 -0.4038392  ...  1.7311978  -0.49790764
  -0.21951273]
 [-0.06262728 -0.5316614  -1.0069822  ...  0.40733385 -0.43701482
   0.5897199 ]
 [-0.34626752 -0.05545708  0.833893   ... -0.21195911 -0.36753264
   0.39599028]
 ...
 [ 0.02022634 -0.01104259  0.03470806 ...  0.03384475 -0.04041724
   0.04163665]
 [ 0.04520236 -0.05706167  0.0470134  ...  0.0196708   0.03033257
   0.07188033]
 [-0.200063    0.01553888  0.17639294 ...  0.0632912  -0.01531237
   0.3455771 ]]


  """Entry point for launching an IPython kernel.


In [None]:
# Project the first 100 word vectors to 2-D with t-SNE for plotting.
# t-SNE is stochastic; fixing random_state makes the layout reproducible
# across kernel restarts.
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X[:100, :])

In [None]:
import os

import matplotlib as mpl
from matplotlib import font_manager, rc

# Use a font that can render the Korean labels. The path below only exists on
# Windows, so on other systems keep matplotlib's default font instead of
# failing before anything is drawn.
KOREAN_FONT_PATH = "c:/Windows/Fonts/malgun.ttf"
if os.path.isfile(KOREAN_FONT_PATH):
    font_name = font_manager.FontProperties(fname=KOREAN_FONT_PATH).get_name()
    rc('font', family=font_name)
else:
    print('Font file not found: ' + KOREAN_FONT_PATH
          + ' - Korean labels may not render correctly.')

# Draw minus signs with the ASCII hyphen (presumably because the Korean font
# lacks the Unicode minus glyph — confirm).
mpl.rcParams['axes.unicode_minus'] = False

# Scatter the 2-D t-SNE coordinates.
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c='red')

# Label only the first 10 points; row i of X_tsne belongs to vocab[i].
words = vocab[:10]
for i, word in enumerate(words):
    plt.text(X_tsne[i, 0], X_tsne[i, 1], word, fontsize=8)
plt.savefig('out.png', dpi=200)
