In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader , Subset, random_split
from torch.utils.tensorboard import SummaryWriter

In [2]:
import sys
import re
import numpy as np
import pandas as pd
import random
from itertools import chain

import matplotlib.pyplot as plt

In [3]:
import import_ipynb
from preprocessor import *

importing Jupyter notebook from preprocessor.ipynb


In [4]:
from konlpy.tag import *

In [5]:
mecab = Mecab()

## Raw Data

In [6]:
dir_path = '../../Data/'
data = pd.read_excel(dir_path + '한국어_대화체_번역.xlsx' , engine='openpyxl')

In [7]:
data_size = len(data)
data[['원문','번역문']].head()

Unnamed: 0,원문,번역문
0,이번 신제품 출시에 대한 시장의 반응은 어떤가요?,How is the market's reaction to the newly rele...
1,판매량이 지난번 제품보다 빠르게 늘고 있습니다.,The sales increase is faster than the previous...
2,그렇다면 공장에 연락해서 주문량을 더 늘려야겠네요.,"Then, we'll have to call the manufacturer and ..."
3,"네, 제가 연락해서 주문량을 2배로 늘리겠습니다.","Sure, I'll make a call and double the volume o..."
4,지난 회의 마지막에 논의했던 안건을 다시 볼까요?,Shall we take a look at the issues we discusse...


In [8]:
kor_data = data['원문']

In [9]:
kor_encoder = Preprocessor(kor_data, mecab.morphs, th=5)

kor_encoder.build_dict()

In [10]:
kor_idx2word = kor_encoder.get_idx2word()

In [11]:
kor_df = pd.DataFrame({'INDEX' : list(kor_idx2word.keys()), 
                       'TOKEN' : list(kor_idx2word.values())})

In [12]:
kor_df.head()

Unnamed: 0,INDEX,TOKEN
0,0,<PAD>
1,1,<UNK>
2,2,<SOS>
3,3,<EOS>
4,4,서요


In [13]:
kor_df.to_csv('./Embedding/csv/kor_idx2word.csv')

## Preprocessing

In [14]:
kor_idx_data = kor_encoder.encode()

In [15]:
def make_data(idx_data, window_size=9)  :
    
    context_data = []
    
    for i in range(len(idx_data)) :
        idx_list = idx_data[i]
        if len(idx_list) < window_size :
            continue
        for j in range(len(idx_list) - window_size) :
            context_data.append(idx_list[j:j+window_size])
    
    context_data = np.array(context_data)
    random.shuffle(context_data)
    
    mid_point = int(window_size/2)
    cen_data = context_data[:,mid_point]
    neighbor_data = np.hstack([context_data[:,:mid_point], context_data[:,mid_point+1:]])
            
    return cen_data, neighbor_data

In [16]:
window_size = 13
cen_data, neigh_data = make_data(kor_idx_data, window_size)

## Dataset & Dataloader

In [17]:
class EmbeddingDataset(Dataset) :

    def __init__(self, cen_data, neighbor_data, val_ratio=0.1) :
        super(EmbeddingDataset , self).__init__()
        self.c_data = cen_data
        self.n_data = neighbor_data
        self.val_ratio = val_ratio

    def __len__(self) :
        return len(self.c_data)

    def __getitem__(self , idx) :
        return self.c_data[idx], self.n_data[idx]
    
    def split(self) :
        n_val = int(len(self) * self.val_ratio)
        n_train = len(self) - n_val
        train_set, val_set = random_split(self, [n_train, n_val])
        
        return train_set, val_set

In [18]:
batch_size = 1024

dataset = EmbeddingDataset(cen_data, neigh_data)
train_data, val_data = dataset.split()

In [19]:
train_loader = DataLoader(train_data,
                          num_workers=4,
                          shuffle=True,
                          batch_size=batch_size)

val_loader = DataLoader(val_data,
                        num_workers=4,
                        shuffle=False,
                        batch_size=batch_size)

## Device & Seed

In [20]:
use_cuda = torch.cuda.is_available()
random.seed(20210906)
torch.cuda.manual_seed_all(20210906)
device = torch.device("cuda" if use_cuda else "cpu") 

## Model

In [21]:
class SkipGram(nn.Module) :
    
    def __init__(self, em_size, v_size, window_size) :
        super(SkipGram, self).__init__()
        self.em_size = em_size
        self.v_size = v_size
        self.window_size = window_size
    
        self.embedding = nn.Embedding(num_embeddings=v_size,
                                      embedding_dim=em_size,
                                      padding_idx=0)
        self.o_layer = nn.Linear(em_size, v_size*(window_size-1))
        
        self.init_param()
        
    def init_param(self) :
        nn.init.normal_(self.embedding.weight, mean=0.0, std=0.1)
        
        nn.init.xavier_normal_(self.o_layer.weight)
        nn.init.zeros_(self.o_layer.bias)
        
    def forward(self, in_tensor) :
        in_tensor = in_tensor.unsqueeze(1)
        em_tensor = self.embedding(in_tensor)
        
        o_tensor = self.o_layer(em_tensor)
        o_tensor = torch.reshape(o_tensor, (-1,self.window_size-1,self.v_size))
        
        return o_tensor
    

In [22]:
class CBOW(nn.Module) :
    
    def __init__(self, em_size, v_size) :
        super(CBOW, self).__init__()
        self.em_size = em_size
        self.v_size = v_size
    
        self.embedding = nn.Embedding(num_embeddings=v_size,
                                      embedding_dim=em_size,
                                      padding_idx=0)
        self.o_layer = nn.Linear(em_size, v_size)
        
        self.init_param()
        
    def init_param(self) :
        nn.init.normal_(self.embedding.weight, mean=0.0, std=0.1)
        
        nn.init.xavier_normal_(self.o_layer.weight)
        nn.init.zeros_(self.o_layer.bias)
        
    def forward(self, in_tensor) :
        em_tensor = self.embedding(in_tensor)
        h_tensor = torch.mean(em_tensor, dim=1)
        
        o_tensor = self.o_layer(h_tensor)
        
        return o_tensor
    

In [23]:
em_dim = 512
v_size = len(kor_df)

w2v_model = SkipGram(em_dim, v_size, window_size).to(device)

## Optimizer & Scheduler

In [24]:
epoch_size = 20
init_lr = 1e-4

In [25]:
def linear_fn(epoch,lr) :
    
    decay_lr = lr*(epoch/epoch_size)
    
    return (lr-decay_lr)/lr

In [26]:
optimizer = optim.Adam(w2v_model.parameters(), lr=init_lr)
scheduler = optim.lr_scheduler.LambdaLR(optimizer, 
                                        lr_lambda = lambda epoch: linear_fn(epoch, init_lr))

## Acc & Loss Function

In [27]:
def acc_fn(y_output , y_label) :
    y_arg = torch.argmax(y_output, dim=-1)
    y_acc = (y_arg == y_label).float()
    y_acc = torch.mean(y_acc)
    return y_acc

In [28]:
loss_fn = nn.CrossEntropyLoss().to(device)

## Training

In [29]:
min_loss = np.inf
stop_count = 0

In [30]:
def progressLearning(epoch, value, endvalue, bar_length=50):
    percent = float(value + 1) / endvalue
    arrow = '-' * int(round(percent * bar_length)-1) + '>'
    spaces = ' ' * (bar_length - len(arrow))

    sys.stdout.write("\rEpoch [{0}] : [{1}] {2}/{3}".format(epoch, 
                                                            arrow + spaces, 
                                                            value+1 , 
                                                            endvalue))
    sys.stdout.flush()

In [31]:
def evaluate(model, test_loader) :
    with torch.no_grad() :
        model.eval()
        loss_eval = 0.0
        acc_eval = 0.0
    
        for cen_data, neigh_data in test_loader :
            cen_data = cen_data.long().to(device)
            neigh_data = neigh_data.long().to(device)

            neigh_out = model(cen_data)
            
            neigh_out = torch.reshape(neigh_out, (-1,v_size))
            neigh_data = torch.reshape(neigh_data, (-1,))
            
            loss_eval += loss_fn(neigh_out , neigh_data)
            acc_eval += acc_fn(neigh_out , neigh_data)

        model.train()
        loss_eval /= len(test_loader)
        acc_eval /= len(test_loader)
        
    return loss_eval , acc_eval  

In [32]:
for epoch in range(epoch_size) :
    idx = 0

    for cen_data, neigh_data in train_loader :
        cen_data = cen_data.long().to(device)
        neigh_data = neigh_data.long().to(device)

        neigh_out = w2v_model(cen_data)
        
        neigh_out = torch.reshape(neigh_out, (-1,v_size))
        neigh_data = torch.reshape(neigh_data, (-1,))
            
        loss = loss_fn(neigh_out , neigh_data)
        acc = acc_fn(neigh_out , neigh_data)
        
        loss.backward()
        optimizer.step()
        
        progressLearning(epoch, idx, len(train_loader))
        idx += 1

    val_loss, val_acc = evaluate(w2v_model, val_loader)
    
    if val_loss < min_loss :
        min_loss = val_loss
        torch.save({'epoch' : (epoch) ,  
                    'model_state_dict' : w2v_model.state_dict() , 
                    'loss' : val_loss.item() , 
                    'acc' : val_acc.item()} , 
                    f'./Embedding/model/checkpoint_w2v_korean.pt')        
        stop_count = 0 
    else :
        stop_count += 1
        if stop_count >= 5 :      
            print('\tTraining Early Stopped')
            break
            
    scheduler.step()
    print('\tVal Loss : %.3f \t Val Accuracy : %.3f' %(val_loss, val_acc))




Epoch [0] : [------------------------------------------------->] 413/413	Val Loss : 6.369 	 Val Accuracy : 0.064
Epoch [1] : [------------------------------------------------->] 413/413	Val Loss : 6.321 	 Val Accuracy : 0.069
Epoch [2] : [------------------------------------------------->] 413/413	Val Loss : 6.075 	 Val Accuracy : 0.061
Epoch [3] : [------------------------------------------------->] 413/413	Val Loss : 5.741 	 Val Accuracy : 0.074
Epoch [4] : [------------------------------------------------->] 413/413	Val Loss : 5.585 	 Val Accuracy : 0.090
Epoch [5] : [------------------------------------------------->] 413/413	Val Loss : 5.470 	 Val Accuracy : 0.094
Epoch [6] : [------------------------------------------------->] 413/413	Val Loss : 5.380 	 Val Accuracy : 0.101
Epoch [7] : [------------------------------------------------->] 413/413	Val Loss : 5.319 	 Val Accuracy : 0.104
Epoch [8] : [------------------------------------------------->] 413/413	Val Loss : 5.288 	 Val 

In [33]:
em_weight = w2v_model.embedding.weight
o_weight = w2v_model.o_layer.weight.view(-1,window_size-1,em_dim)
o_weight = torch.mean(o_weight, dim=1)

o_bias = w2v_model.o_layer.bias.view(-1,window_size-1)
o_bias = torch.mean(o_bias, dim=1)

In [34]:
kor_bias = o_bias.detach().cpu().numpy()

kor_weight = (em_weight + o_weight)/2
kor_weight = kor_weight.detach().cpu().numpy()
kor_weight[0] = 0.0

In [35]:
np.save('./Embedding/array/kor_weight.npy', kor_weight)
np.save('./Embedding/array/kor_bias.npy', kor_bias)