In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader , Subset, random_split
from torch.utils.tensorboard import SummaryWriter

In [2]:
import sys
import re
import numpy as np
import pandas as pd
import random
from itertools import chain

import matplotlib.pyplot as plt

In [3]:
import import_ipynb
from preprocessor import *

importing Jupyter notebook from preprocessor.ipynb


In [4]:
from nltk.tokenize import word_tokenize

## Raw Data

In [5]:
dir_path = '../../Data/'
data = pd.read_excel(dir_path + '한국어_대화체_번역.xlsx' , engine='openpyxl')

In [6]:
data_size = len(data)
data[['원문','번역문']].head()

Unnamed: 0,원문,번역문
0,이번 신제품 출시에 대한 시장의 반응은 어떤가요?,How is the market's reaction to the newly rele...
1,판매량이 지난번 제품보다 빠르게 늘고 있습니다.,The sales increase is faster than the previous...
2,그렇다면 공장에 연락해서 주문량을 더 늘려야겠네요.,"Then, we'll have to call the manufacturer and ..."
3,"네, 제가 연락해서 주문량을 2배로 늘리겠습니다.","Sure, I'll make a call and double the volume o..."
4,지난 회의 마지막에 논의했던 안건을 다시 볼까요?,Shall we take a look at the issues we discusse...


In [7]:
en_data = data['번역문']

In [8]:
en_encoder = Preprocessor(en_data, word_tokenize, th=3)

en_encoder.build_dict()

In [9]:
en_idx2word = en_encoder.get_idx2word()

In [10]:
en_df = pd.DataFrame({'INDEX' : list(en_idx2word.keys()), 
                      'TOKEN' : list(en_idx2word.values())})

In [11]:
en_df.head()

Unnamed: 0,INDEX,TOKEN
0,0,<PAD>
1,1,<UNK>
2,2,<SOS>
3,3,<EOS>
4,4,customer


In [12]:
en_df.to_csv('./Embedding/csv/en_idx2word.csv')

## Preprocessing

In [13]:
en_idx_data = en_encoder.encode()

In [14]:
def make_data(idx_data, window_size=9)  :
    
    context_data = []
    
    for i in range(len(idx_data)) :
        idx_list = idx_data[i]
        if len(idx_list) < window_size :
            continue
        for j in range(len(idx_list) - window_size) :
            context_data.append(idx_list[j:j+window_size])
    
    context_data = np.array(context_data)
    random.shuffle(context_data)
    
    mid_point = int(window_size/2)
    cen_data = context_data[:,mid_point]
    neighbor_data = np.hstack([context_data[:,:mid_point], context_data[:,mid_point+1:]])
            
    return cen_data, neighbor_data

In [15]:
window_size = 13
cen_data, neigh_data = make_data(en_idx_data, 13)

## Dataset & Dataloader

In [16]:
class EmbeddingDataset(Dataset) :

    def __init__(self, cen_data, neighbor_data, val_ratio=0.1) :
        super(EmbeddingDataset , self).__init__()
        self.c_data = cen_data
        self.n_data = neighbor_data
        self.val_ratio = val_ratio

    def __len__(self) :
        return len(self.c_data)

    def __getitem__(self , idx) :
        return self.c_data[idx], self.n_data[idx]
    
    def split(self) :
        n_val = int(len(self) * self.val_ratio)
        n_train = len(self) - n_val
        train_set, val_set = random_split(self, [n_train, n_val])
        
        return train_set, val_set

In [17]:
batch_size = 1024

dataset = EmbeddingDataset(cen_data, neigh_data)
train_data, val_data = dataset.split()

In [18]:
train_loader = DataLoader(train_data,
                          num_workers=4,
                          shuffle=True,
                          batch_size=batch_size)

val_loader = DataLoader(val_data,
                        num_workers=4,
                        shuffle=False,
                        batch_size=batch_size)

## Device & Seed

In [19]:
use_cuda = torch.cuda.is_available()
random.seed(20210906)
torch.cuda.manual_seed_all(20210906)
device = torch.device("cuda" if use_cuda else "cpu") 

## Model

In [32]:
class SkipGram(nn.Module) :
    
    def __init__(self, em_size, v_size, window_size) :
        super(SkipGram, self).__init__()
        self.em_size = em_size
        self.v_size = v_size
        self.window_size = window_size
    
        self.embedding = nn.Embedding(num_embeddings=v_size,
                                      embedding_dim=em_size,
                                      padding_idx=0)
        self.o_layer = nn.Linear(em_size, v_size*(window_size-1))
        
        self.init_param()
        
    def init_param(self) :
        nn.init.normal_(self.embedding.weight, mean=0.0, std=0.1)
        
        nn.init.xavier_normal_(self.o_layer.weight)
        nn.init.zeros_(self.o_layer.bias)
        
    def forward(self, in_tensor) :
        in_tensor = in_tensor.unsqueeze(1)
        em_tensor = self.embedding(in_tensor)
        
        o_tensor = self.o_layer(em_tensor)
        o_tensor = torch.reshape(o_tensor, (-1,self.window_size-1,self.v_size))
        
        return o_tensor
    

In [33]:
class CBOW(nn.Module) :
    
    def __init__(self, em_size, v_size) :
        super(CBOW, self).__init__()
        self.em_size = em_size
        self.v_size = v_size
    
        self.embedding = nn.Embedding(num_embeddings=v_size,
                                      embedding_dim=em_size,
                                      padding_idx=0)
        self.o_layer = nn.Linear(em_size, v_size)
        
        self.init_param()
        
    def init_param(self) :
        nn.init.normal_(self.embedding.weight, mean=0.0, std=0.1)
        
        nn.init.xavier_normal_(self.o_layer.weight)
        nn.init.zeros_(self.o_layer.bias)
        
    def forward(self, in_tensor) :
        em_tensor = self.embedding(in_tensor)
        h_tensor = torch.mean(em_tensor, dim=1)
        
        o_tensor = self.o_layer(h_tensor)
        
        return o_tensor
    

In [34]:
em_dim = 512
v_size = len(en_df)

w2v_model = SkipGram(em_dim, v_size, window_size).to(device)

## Optimizer & Scheduler

In [35]:
epoch_size = 20
init_lr = 1e-4

In [36]:
def linear_fn(epoch,lr) :
    
    decay_lr = lr*(epoch/epoch_size)
    
    return (lr-decay_lr)/lr

In [37]:
optimizer = optim.Adam(w2v_model.parameters(), lr=init_lr)
scheduler = optim.lr_scheduler.LambdaLR(optimizer, 
                                        lr_lambda = lambda epoch: linear_fn(epoch, init_lr))

## Acc & Loss Function

In [38]:
def acc_fn(y_output , y_label) :
    y_arg = torch.argmax(y_output, dim=-1)
    y_acc = (y_arg == y_label).float()
    y_acc = torch.mean(y_acc)
    return y_acc

In [39]:
loss_fn = nn.CrossEntropyLoss().to(device)

## Training

In [40]:
min_loss = np.inf
stop_count = 0

In [41]:
def progressLearning(epoch, value, endvalue, bar_length=50):
    percent = float(value + 1) / endvalue
    arrow = '-' * int(round(percent * bar_length)-1) + '>'
    spaces = ' ' * (bar_length - len(arrow))

    sys.stdout.write("\rEpoch [{0}] : [{1}] {2}/{3}".format(epoch, 
                                                            arrow + spaces, 
                                                            value+1 , 
                                                            endvalue))
    sys.stdout.flush()

In [42]:
def evaluate(model, test_loader) :
    with torch.no_grad() :
        model.eval()
        loss_eval = 0.0
        acc_eval = 0.0
    
        for cen_data, neigh_data in test_loader :
            cen_data = cen_data.long().to(device)
            neigh_data = neigh_data.long().to(device)

            neigh_out = model(cen_data)
            
            neigh_out = torch.reshape(neigh_out, (-1,v_size))
            neigh_data = torch.reshape(neigh_data, (-1,))
            
            loss_eval += loss_fn(neigh_out , neigh_data)
            acc_eval += acc_fn(neigh_out , neigh_data)

        model.train()
        loss_eval /= len(test_loader)
        acc_eval /= len(test_loader)
        
    return loss_eval , acc_eval  

In [43]:
for epoch in range(epoch_size) :
    idx = 0

    for cen_data, neigh_data in train_loader :
        cen_data = cen_data.long().to(device)
        neigh_data = neigh_data.long().to(device)

        neigh_out = w2v_model(cen_data)
        
        neigh_out = torch.reshape(neigh_out, (-1,v_size))
        neigh_data = torch.reshape(neigh_data, (-1,))
            
        loss = loss_fn(neigh_out , neigh_data)
        acc = acc_fn(neigh_out , neigh_data)
        
        loss.backward()
        optimizer.step()
        
        progressLearning(epoch, idx, len(train_loader))
        idx += 1

    val_loss, val_acc = evaluate(w2v_model, val_loader)
    
    if val_loss < min_loss :
        min_loss = val_loss
        torch.save({'epoch' : (epoch) ,  
                    'model_state_dict' : w2v_model.state_dict() , 
                    'loss' : val_loss.item() , 
                    'acc' : val_acc.item()} , 
                    f'./Embedding/model/checkpoint_w2v_english.pt')        
        stop_count = 0 
    else :
        stop_count += 1
        if stop_count >= 5 : 
            print('\tTraining Early Stopped')
            break
            
    scheduler.step()
    print('\tVal Loss : %.3f \t Val Accuracy : %.3f' %(val_loss, val_acc))



Epoch [0] : [------------------------------------------------->] 379/379	Val Loss : 6.357 	 Val Accuracy : 0.075
Epoch [1] : [------------------------------------------------->] 379/379	Val Loss : 6.127 	 Val Accuracy : 0.086
Epoch [2] : [------------------------------------------------->] 379/379	Val Loss : 5.873 	 Val Accuracy : 0.079
Epoch [3] : [------------------------------------------------->] 379/379	Val Loss : 5.648 	 Val Accuracy : 0.070
Epoch [4] : [------------------------------------------------->] 379/379	Val Loss : 5.433 	 Val Accuracy : 0.098
Epoch [5] : [------------------------------------------------->] 379/379	Val Loss : 5.330 	 Val Accuracy : 0.108
Epoch [6] : [------------------------------------------------->] 379/379	Val Loss : 5.225 	 Val Accuracy : 0.110
Epoch [7] : [------------------------------------------------->] 379/379	Val Loss : 5.150 	 Val Accuracy : 0.112
Epoch [8] : [------------------------------------------------->] 379/379	Val Loss : 5.105 	 Val 

In [44]:
em_weight = w2v_model.embedding.weight
o_weight = w2v_model.o_layer.weight.view(-1,window_size-1,em_dim)
o_weight = torch.mean(o_weight, dim=1)
en_weight = (em_weight + o_weight)/2

o_bias = w2v_model.o_layer.bias.view(-1,window_size-1)
o_bias = torch.mean(o_bias, dim=1)

In [46]:
en_bias = o_bias.detach().cpu().numpy()
en_weight = en_weight.detach().cpu().numpy() 
en_weight[0] = 0.0

np.save('./Embedding/array/en_weight.npy', en_weight)
np.save('./Embedding/array/en_bias.npy', en_bias)