<a href="https://colab.research.google.com/github/sudht/NLP/blob/master/week8_practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /gdrive so later cells can read/write under
# '/gdrive/My Drive/...'. This import only exists inside Google Colab.
# NOTE(review): force_remount=True presumably re-mounts even when the drive
# is already mounted — confirm against the Colab documentation.
from google.colab import drive
drive.mount("/gdrive", force_remount=True)

Mounted at /gdrive


<h2>Skip-gram을 이용한 Word2Vec</h2>

![실습 그림](http://nlp.kangwon.ac.kr/~nlpdemo/skip_gram.png)  


In [2]:
import os
import numpy as np
import tensorflow as tf
from tqdm import tqdm

# Base Google Drive folder for this exercise; the driver cell joins it with
# "model" to form the checkpoint directory.
# NOTE(review): this name shadows Python's builtin dir(); it is kept as-is
# because a later cell reads it under this name.
dir = '/gdrive/My Drive/colab/week8'

In [3]:
import numpy as np

# Vocabulary lookups: idx2word maps a 0-based index to its word, and
# word2idx is the exact inverse, derived from idx2word so the two mappings
# cannot drift apart.
idx2word = dict(enumerate(
    ["One", "Two", "Three", "Four", "Five", "Six", "Seven", "Eight", "Nine"]))
word2idx = {word: idx for idx, word in idx2word.items()}

def make_data(num_instances=10000, window_size=1):
  """Generate random skip-gram training pairs over the nine-word vocabulary.

  For each of `num_instances` iterations two three-word "sentences" are
  sampled with np.random.choice: one from the odd numbers 1..9 and one from
  the even numbers 2..8 (each number n is rendered as idx2word[n-1]). Every
  word of every sentence is then paired with each neighbour that lies within
  `window_size` positions of it.

  Args:
    num_instances: number of (odd sentence, even sentence) pairs to sample.
      Defaults to 10000.
    window_size: how many positions to the left and right count as context.
      Defaults to 1, i.e. only the immediately adjacent words.

  Returns:
    A list of (center_idx, [context_idx]) tuples holding word2idx indices,
    e.g. the sentence "One Five Nine" contributes
    (0, [4]), (4, [0]), (4, [8]), (8, [4]).
  """
  odd_numbers = range(1, 10, 2)
  even_numbers = range(2, 10, 2)

  gen_datas = []
  for _ in range(num_instances):
    # ex)
    #   odd_instance : "One Five Nine"
    #   even_instance : "Four Eight Two"
    odd_instance = " ".join(idx2word[e-1] for e in np.random.choice(odd_numbers, size=3))
    even_instance = " ".join(idx2word[e-1] for e in np.random.choice(even_numbers, size=3))

    gen_datas.append(odd_instance)
    gen_datas.append(even_instance)

  datas = []
  for sentence in tqdm(gen_datas):
    # Convert the sentence to vocabulary indices once, up front.
    ids = [word2idx[token] for token in sentence.split()]

    for pos, center in enumerate(ids):
      # Nearest neighbours first; for each distance the left neighbour is
      # emitted before the right one.
      for offset in range(1, window_size + 1):
        if pos - offset >= 0:
          datas.append((center, [ids[pos - offset]]))
        if pos + offset < len(ids):
          datas.append((center, [ids[pos + offset]]))
  return datas

def get_batch(datas, batch_size, drop_remainder=True):
  """Group (input_idx, output_idx) pairs into fixed-size batches.

  Args:
    datas: iterable of (input_idx, output_idx) pairs, e.g. (5, [4]).
    batch_size: number of pairs per batch.
    drop_remainder: when True (the default), trailing pairs that do not fill
      a whole batch are discarded. Pass False to return them as a final,
      shorter batch instead of silently losing them.

  Returns:
    A list of (inputs, outputs) tuples, where `inputs` is a list of input
    indices and `outputs` the list of matching output entries, e.g.
    [([5, 1, 4, ...], [[4], [1], [6], ...]), ...].
  """
  batches, inputs, outputs = [], [], []

  for input_idx, output_idx in datas:
    inputs.append(input_idx)
    outputs.append(output_idx)

    if len(inputs) == batch_size:
      batches.append((inputs, outputs))
      # Start fresh lists so the batch just stored is not mutated.
      inputs, outputs = [], []

  # Whatever is left here is an incomplete batch.
  if inputs and not drop_remainder:
    batches.append((inputs, outputs))
  return batches

if __name__ == "__main__":
  # Quick sanity check of the data pipeline: generate the pairs, report how
  # many there are, batch them by 64, then show a small sample of each.
  datas = make_data()
  print(np.shape(datas))

  batches = get_batch(datas, 64)
  print(np.shape(batches))

  print(datas[:10])
  print(batches[0])

100%|██████████| 20000/20000 [00:00<00:00, 69537.16it/s]


(80000, 2)
(1250, 2, 64)
[(2, [8]), (8, [2]), (8, [4]), (4, [8]), (7, [1]), (1, [7]), (1, [1]), (1, [1]), (2, [8]), (8, [2])]
([2, 8, 8, 4, 7, 1, 1, 1, 2, 8, 8, 0, 1, 1, 1, 1, 6, 4, 4, 6, 1, 7, 7, 5, 8, 2, 2, 6, 7, 3, 3, 3, 8, 4, 4, 2, 5, 5, 5, 5, 2, 6, 6, 8, 5, 1, 1, 3, 4, 2, 2, 6, 1, 1, 1, 5, 8, 8, 8, 2, 3, 7, 7, 5], [[8], [2], [4], [8], [1], [7], [1], [1], [8], [2], [0], [8], [1], [1], [1], [1], [4], [6], [6], [4], [7], [1], [5], [7], [2], [8], [6], [2], [3], [7], [3], [3], [4], [8], [2], [4], [5], [5], [5], [5], [6], [2], [8], [6], [1], [5], [3], [1], [2], [4], [6], [2], [1], [1], [5], [1], [8], [8], [2], [8], [7], [3], [5], [7]])


![Word2Vec 설명 그림](http://nlp.kangwon.ac.kr/~nlpdemo/word2vec.PNG)

In [0]:
class Word2Vec:
  """Skip-gram Word2Vec model built with the TensorFlow 1.x graph API.

  Constructing an instance adds the placeholders, the embedding matrix, the
  NCE parameters, the loss and the Adam training op to the default graph.

  Attributes set on the instance:
    inputs: int32 placeholder of shape [None], named 'input_idx'.
    labels: int32 placeholder of shape [None, 1], named 'output_idx'.
    embeddings: variable of shape [vocab_size, embedding_size].
    lookup_input: rows of `embeddings` selected by `inputs`.
    loss: mean of tf.nn.nce_loss over the fed examples.
    train_op: Adam update minimising `loss`.
  """

  def __init__(self, flags):
    """Store the hyper-parameters and build the graph.

    Args:
      flags: dict of hyper-parameters; the keys read here are
        'embedding_size', 'vocab_size', 'learning_rate' and 'num_samples'.
    """
    # Placeholders for the center-word indices and their context-word labels.
    self.inputs = tf.placeholder(tf.int32, shape=[None], name='input_idx')
    self.labels = tf.placeholder(tf.int32, shape=[None, 1], name='output_idx')

    # Hyper-parameters.
    self.embedding_size = flags['embedding_size']
    self.vocab_size = flags['vocab_size']
    self.learning_rate = flags['learning_rate']
    self.num_samples = flags['num_samples']

    # Build the graph; the steps run in this order because each one uses
    # what the previous one created.
    for build_step in (self._embedding_init, self._input_init, self._train_init):
      build_step()

  def _embedding_init(self):
    """Create the projection (embedding) matrix under the 'embeddings' scope."""
    with tf.name_scope('embeddings'):
      initial_value = tf.random_uniform(
          [self.vocab_size, self.embedding_size], minval=-1.0, maxval=1.0)
      self.embeddings = tf.Variable(initial_value)

  def _input_init(self):
    """Look up the embedding row of every index fed through `inputs`."""
    self.lookup_input = tf.nn.embedding_lookup(self.embeddings, self.inputs)

  def _train_init(self):
    """Create the NCE weights and biases, the mean NCE loss and the Adam op."""
    weight_shape = [self.vocab_size, self.embedding_size]
    nce_weights = tf.Variable(tf.random_uniform(weight_shape, minval=-1.0, maxval=1.0))
    nce_biases = tf.Variable(tf.zeros([self.vocab_size]))

    per_example_loss = tf.nn.nce_loss(
        weights=nce_weights,
        biases=nce_biases,
        labels=self.labels,
        inputs=self.lookup_input,
        num_sampled=self.num_samples,
        num_classes=self.vocab_size)
    self.loss = tf.reduce_mean(per_example_loss)

    optimizer = tf.train.AdamOptimizer(self.learning_rate)
    self.train_op = optimizer.minimize(self.loss)


In [0]:
import random
import numpy as np

def train(flags):
  """Train a Word2Vec model on freshly generated data, saving checkpoints.

  Args:
    flags: dict of hyper-parameters. This function reads 'epoch',
      'batch_size', 'checkpoint_step' (print the loss every N steps),
      'checkpoint_batch' (save a checkpoint every N epochs) and 'save_dir';
      the whole dict is also handed to Word2Vec.
  """
  # Generate the training pairs, then build the model graph.
  train_datas = make_data()
  model = Word2Vec(flags)

  # Session configuration.
  gpu_options = tf.GPUOptions(allow_growth=True)
  sess_config = tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options)

  with tf.Session(config=sess_config) as sess:
    # Initialise every variable in the graph.
    sess.run(tf.global_variables_initializer())
    # Saver used to write the checkpoint files.
    saver = tf.train.Saver()

    for epoch_num in tqdm(range(1, flags['epoch'] + 1)):
      # Reshuffle the pairs (in place) and re-batch them at every epoch.
      random.shuffle(train_datas)
      train_batches = get_batch(train_datas, flags['batch_size'])
      total_steps = len(train_batches)

      for step_num, (train_input, train_label) in enumerate(train_batches, start=1):
        feed = {model.inputs: train_input, model.labels: train_label}
        loss, _ = sess.run([model.loss, model.train_op], feed_dict=feed)

        if step_num % flags['checkpoint_step'] == 0:
          print("Epoch : {}\tStep : {} / {}\tCurrent Loss : {}".format(epoch_num, step_num, total_steps, loss))

      if epoch_num % flags['checkpoint_batch'] == 0:
        filename = os.path.join(flags["save_dir"], "model_{}.ckpt".format(epoch_num))
        saver.save(sess, filename)
        
    

In [0]:
import numpy as np
import operator

def _cosine_similarity(vec_a, vec_b):
  """Return the cosine of the angle between two vectors (0.0 if either is all zeros)."""
  denominator = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
  if denominator == 0:
    # Cosine is undefined for a zero vector; treat it as "not similar".
    return 0.0
  return float(np.dot(vec_a, vec_b) / denominator)


def test(flags, word, top_k=3):
  """Print the vocabulary words whose embeddings are most similar to `word`.

  Rebuilds the model, restores the most recent checkpoint found in
  flags["save_dir"], and ranks every other vocabulary word by the cosine
  similarity between its embedding and the embedding of `word`.

  Args:
    flags: dict of hyper-parameters; 'save_dir' is read here and the whole
      dict is handed to Word2Vec.
    word: query word; must be a key of word2idx.
    top_k: how many of the most similar words to print. Defaults to 3.

  Raises:
    KeyError: if `word` is not in word2idx.
  """
  model = Word2Vec(flags)
  sess_config = tf.ConfigProto(allow_soft_placement=True,
                               gpu_options=tf.GPUOptions(allow_growth=True))

  with tf.Session(config=sess_config) as sess:
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()

    # Load the weights from the most recent checkpoint in the save directory.
    checkpoint_path = tf.train.latest_checkpoint(flags["save_dir"])
    print("Read from : " + str(checkpoint_path))
    saver.restore(sess, checkpoint_path)

    # Fetch the trained embedding matrix (one row per vocabulary word).
    word_embedding = sess.run(model.embeddings)
    input_idx = word2idx[word]
    input_vector = word_embedding[input_idx]

    # (vocab index, similarity) for every word except the query itself,
    # e.g. [(0, 0.5), (1, 0.8), ...]. Cosine similarity is used instead of a
    # raw dot product so that the ranking does not favour embeddings that
    # merely have a large norm.
    similarity_list = [
        (idx, _cosine_similarity(vector, input_vector))
        for idx, vector in enumerate(word_embedding)
        if idx != input_idx
    ]

    # Most similar first.
    similarity_list.sort(key=operator.itemgetter(1), reverse=True)

    # Print the top_k most similar words.
    for idx, _ in similarity_list[:top_k]:
      print(idx2word[idx])

In [9]:
if __name__ == "__main__":
  # Checkpoints are written to <dir>/model; create the folder on first use.
  save_dir = os.path.join(dir, "model")
  if not os.path.exists(save_dir):
    os.makedirs(save_dir)

  # Hyper-parameters and run settings shared by train() and test().
  flags = dict(
      mode="train",
      save_dir=save_dir,
      batch_size=64,
      epoch=10,
      learning_rate=0.01,
      embedding_size=50,
      vocab_size=9,
      num_samples=8,
      checkpoint_step=100,
      checkpoint_batch=5,
  )

  # Start from an empty default graph so re-running this cell does not
  # stack a second model on top of the previous one.
  tf.reset_default_graph()
  if flags["mode"] == "train":
    train(flags)
  elif flags["mode"] == "test":
    flags.update(batch_size=1, keep_prob=1.0)
    test(flags, 'Five')
  else:
    print("Unknown mode")
    exit(0)

100%|██████████| 20000/20000 [00:00<00:00, 96209.23it/s]
  0%|          | 0/10 [00:00<?, ?it/s]

Epoch : 1	Step : 100 / 1250	Current Loss : 2.576582193374634
Epoch : 1	Step : 200 / 1250	Current Loss : 2.5796260833740234
Epoch : 1	Step : 300 / 1250	Current Loss : 2.60315203666687
Epoch : 1	Step : 400 / 1250	Current Loss : 2.6017422676086426
Epoch : 1	Step : 500 / 1250	Current Loss : 2.5668158531188965
Epoch : 1	Step : 600 / 1250	Current Loss : 2.6019721031188965
Epoch : 1	Step : 700 / 1250	Current Loss : 2.6208906173706055
Epoch : 1	Step : 800 / 1250	Current Loss : 2.57877516746521
Epoch : 1	Step : 900 / 1250	Current Loss : 2.606841802597046
Epoch : 1	Step : 1000 / 1250	Current Loss : 2.5264673233032227
Epoch : 1	Step : 1100 / 1250	Current Loss : 2.511291265487671
Epoch : 1	Step : 1200 / 1250	Current Loss : 2.5020642280578613


 10%|█         | 1/10 [00:01<00:11,  1.33s/it]

Epoch : 2	Step : 100 / 1250	Current Loss : 2.4854350090026855
Epoch : 2	Step : 200 / 1250	Current Loss : 2.4490857124328613
Epoch : 2	Step : 300 / 1250	Current Loss : 2.4568326473236084
Epoch : 2	Step : 400 / 1250	Current Loss : 2.572140693664551
Epoch : 2	Step : 500 / 1250	Current Loss : 2.504519462585449
Epoch : 2	Step : 600 / 1250	Current Loss : 2.5103964805603027
Epoch : 2	Step : 700 / 1250	Current Loss : 2.62953519821167
Epoch : 2	Step : 800 / 1250	Current Loss : 2.495878219604492
Epoch : 2	Step : 900 / 1250	Current Loss : 2.479996919631958
Epoch : 2	Step : 1000 / 1250	Current Loss : 2.4314682483673096
Epoch : 2	Step : 1100 / 1250	Current Loss : 2.544127941131592
Epoch : 2	Step : 1200 / 1250	Current Loss : 2.5064282417297363


 20%|██        | 2/10 [00:02<00:10,  1.30s/it]

Epoch : 3	Step : 100 / 1250	Current Loss : 2.5119829177856445
Epoch : 3	Step : 200 / 1250	Current Loss : 2.560089111328125
Epoch : 3	Step : 300 / 1250	Current Loss : 2.486729621887207
Epoch : 3	Step : 400 / 1250	Current Loss : 2.5939793586730957
Epoch : 3	Step : 500 / 1250	Current Loss : 2.5137686729431152
Epoch : 3	Step : 600 / 1250	Current Loss : 2.5074546337127686
Epoch : 3	Step : 700 / 1250	Current Loss : 2.4948513507843018
Epoch : 3	Step : 800 / 1250	Current Loss : 2.616584300994873
Epoch : 3	Step : 900 / 1250	Current Loss : 2.527844190597534
Epoch : 3	Step : 1000 / 1250	Current Loss : 2.562437057495117
Epoch : 3	Step : 1100 / 1250	Current Loss : 2.4972145557403564
Epoch : 3	Step : 1200 / 1250	Current Loss : 2.578514575958252


 30%|███       | 3/10 [00:03<00:08,  1.28s/it]

Epoch : 4	Step : 100 / 1250	Current Loss : 2.439741611480713
Epoch : 4	Step : 200 / 1250	Current Loss : 2.6071994304656982
Epoch : 4	Step : 300 / 1250	Current Loss : 2.514604091644287
Epoch : 4	Step : 400 / 1250	Current Loss : 2.512619733810425
Epoch : 4	Step : 500 / 1250	Current Loss : 2.5071873664855957
Epoch : 4	Step : 600 / 1250	Current Loss : 2.526927947998047
Epoch : 4	Step : 700 / 1250	Current Loss : 2.58247971534729
Epoch : 4	Step : 800 / 1250	Current Loss : 2.48861026763916
Epoch : 4	Step : 900 / 1250	Current Loss : 2.5583577156066895
Epoch : 4	Step : 1000 / 1250	Current Loss : 2.4899370670318604
Epoch : 4	Step : 1100 / 1250	Current Loss : 2.5404529571533203
Epoch : 4	Step : 1200 / 1250	Current Loss : 2.5748353004455566


 40%|████      | 4/10 [00:05<00:07,  1.28s/it]

Epoch : 5	Step : 100 / 1250	Current Loss : 2.5413126945495605
Epoch : 5	Step : 200 / 1250	Current Loss : 2.5613417625427246
Epoch : 5	Step : 300 / 1250	Current Loss : 2.5253634452819824
Epoch : 5	Step : 400 / 1250	Current Loss : 2.5540192127227783
Epoch : 5	Step : 500 / 1250	Current Loss : 2.520552158355713
Epoch : 5	Step : 600 / 1250	Current Loss : 2.540661573410034
Epoch : 5	Step : 700 / 1250	Current Loss : 2.4728081226348877
Epoch : 5	Step : 800 / 1250	Current Loss : 2.491978645324707
Epoch : 5	Step : 900 / 1250	Current Loss : 2.502162456512451
Epoch : 5	Step : 1000 / 1250	Current Loss : 2.4418091773986816
Epoch : 5	Step : 1100 / 1250	Current Loss : 2.495913028717041
Epoch : 5	Step : 1200 / 1250	Current Loss : 2.513103485107422


 50%|█████     | 5/10 [00:06<00:06,  1.30s/it]

Epoch : 6	Step : 100 / 1250	Current Loss : 2.489645004272461
Epoch : 6	Step : 200 / 1250	Current Loss : 2.5282704830169678
Epoch : 6	Step : 300 / 1250	Current Loss : 2.504115104675293
Epoch : 6	Step : 400 / 1250	Current Loss : 2.5245299339294434
Epoch : 6	Step : 500 / 1250	Current Loss : 2.4327616691589355
Epoch : 6	Step : 600 / 1250	Current Loss : 2.481529712677002
Epoch : 6	Step : 700 / 1250	Current Loss : 2.603593587875366
Epoch : 6	Step : 800 / 1250	Current Loss : 2.5373055934906006
Epoch : 6	Step : 900 / 1250	Current Loss : 2.4136013984680176
Epoch : 6	Step : 1000 / 1250	Current Loss : 2.4926064014434814
Epoch : 6	Step : 1100 / 1250	Current Loss : 2.5948421955108643
Epoch : 6	Step : 1200 / 1250	Current Loss : 2.550572395324707


 60%|██████    | 6/10 [00:07<00:05,  1.28s/it]

Epoch : 7	Step : 100 / 1250	Current Loss : 2.609574556350708
Epoch : 7	Step : 200 / 1250	Current Loss : 2.530120849609375
Epoch : 7	Step : 300 / 1250	Current Loss : 2.5141329765319824
Epoch : 7	Step : 400 / 1250	Current Loss : 2.5133414268493652
Epoch : 7	Step : 500 / 1250	Current Loss : 2.5334134101867676
Epoch : 7	Step : 600 / 1250	Current Loss : 2.5364904403686523
Epoch : 7	Step : 700 / 1250	Current Loss : 2.480271100997925
Epoch : 7	Step : 800 / 1250	Current Loss : 2.5663108825683594
Epoch : 7	Step : 900 / 1250	Current Loss : 2.531285285949707
Epoch : 7	Step : 1000 / 1250	Current Loss : 2.508761405944824
Epoch : 7	Step : 1100 / 1250	Current Loss : 2.504831552505493
Epoch : 7	Step : 1200 / 1250	Current Loss : 2.552656650543213


 70%|███████   | 7/10 [00:08<00:03,  1.26s/it]

Epoch : 8	Step : 100 / 1250	Current Loss : 2.547469139099121
Epoch : 8	Step : 200 / 1250	Current Loss : 2.531250476837158
Epoch : 8	Step : 300 / 1250	Current Loss : 2.5002291202545166
Epoch : 8	Step : 400 / 1250	Current Loss : 2.5091867446899414
Epoch : 8	Step : 500 / 1250	Current Loss : 2.5915489196777344
Epoch : 8	Step : 600 / 1250	Current Loss : 2.5402321815490723
Epoch : 8	Step : 700 / 1250	Current Loss : 2.539841890335083
Epoch : 8	Step : 800 / 1250	Current Loss : 2.5321407318115234
Epoch : 8	Step : 900 / 1250	Current Loss : 2.525205135345459
Epoch : 8	Step : 1000 / 1250	Current Loss : 2.528102397918701
Epoch : 8	Step : 1100 / 1250	Current Loss : 2.5278332233428955
Epoch : 8	Step : 1200 / 1250	Current Loss : 2.506202459335327


 80%|████████  | 8/10 [00:10<00:02,  1.26s/it]

Epoch : 9	Step : 100 / 1250	Current Loss : 2.5127315521240234
Epoch : 9	Step : 200 / 1250	Current Loss : 2.507315158843994
Epoch : 9	Step : 300 / 1250	Current Loss : 2.5484097003936768
Epoch : 9	Step : 400 / 1250	Current Loss : 2.511488914489746
Epoch : 9	Step : 500 / 1250	Current Loss : 2.5173988342285156
Epoch : 9	Step : 600 / 1250	Current Loss : 2.5136427879333496
Epoch : 9	Step : 700 / 1250	Current Loss : 2.581486940383911
Epoch : 9	Step : 800 / 1250	Current Loss : 2.5482120513916016
Epoch : 9	Step : 900 / 1250	Current Loss : 2.521885395050049
Epoch : 9	Step : 1000 / 1250	Current Loss : 2.5422661304473877
Epoch : 9	Step : 1100 / 1250	Current Loss : 2.505429744720459
Epoch : 9	Step : 1200 / 1250	Current Loss : 2.4855856895446777


 90%|█████████ | 9/10 [00:11<00:01,  1.25s/it]

Epoch : 10	Step : 100 / 1250	Current Loss : 2.530263900756836
Epoch : 10	Step : 200 / 1250	Current Loss : 2.536910057067871
Epoch : 10	Step : 300 / 1250	Current Loss : 2.536465883255005
Epoch : 10	Step : 400 / 1250	Current Loss : 2.4859542846679688
Epoch : 10	Step : 500 / 1250	Current Loss : 2.4987032413482666
Epoch : 10	Step : 600 / 1250	Current Loss : 2.5173709392547607
Epoch : 10	Step : 700 / 1250	Current Loss : 2.4975779056549072
Epoch : 10	Step : 800 / 1250	Current Loss : 2.555988311767578
Epoch : 10	Step : 900 / 1250	Current Loss : 2.5574698448181152
Epoch : 10	Step : 1000 / 1250	Current Loss : 2.5444741249084473
Epoch : 10	Step : 1100 / 1250	Current Loss : 2.5752358436584473
Epoch : 10	Step : 1200 / 1250	Current Loss : 2.495375633239746


100%|██████████| 10/10 [00:12<00:00,  1.27s/it]
