In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
from torch.utils.data import Dataset, DataLoader , Subset, random_split

In [3]:
from torch.utils.tensorboard import SummaryWriter

In [4]:
import sys
import numpy as np
import pandas as pd
import random
import collections

In [5]:
import re

In [6]:
import import_ipynb
import preprocessor

importing Jupyter notebook from preprocessor.ipynb


In [7]:
from nltk.tokenize import word_tokenize

In [8]:
from tqdm import tqdm

## Data

In [9]:
# Directory that holds the dataset, relative to this notebook's location.
dir_path = '../../../Data/'
# Load the Korean/English conversational translation workbook into a DataFrame.
# The preview in the next cell shows the '원문' (Korean source) and '번역문'
# (English translation) columns that the rest of the notebook relies on.
data = pd.read_excel(dir_path + '한국어_대화체_번역.xlsx' , engine='openpyxl')

In [10]:
# Preview the first rows of the source ('원문') and translation ('번역문') columns.
data[['원문','번역문']].head()

Unnamed: 0,원문,번역문
0,이번 신제품 출시에 대한 시장의 반응은 어떤가요?,How is the market's reaction to the newly rele...
1,판매량이 지난번 제품보다 빠르게 늘고 있습니다.,The sales increase is faster than the previous...
2,그렇다면 공장에 연락해서 주문량을 더 늘려야겠네요.,"Then, we'll have to call the manufacturer and ..."
3,"네, 제가 연락해서 주문량을 2배로 늘리겠습니다.","Sure, I'll make a call and double the volume o..."
4,지난 회의 마지막에 논의했던 안건을 다시 볼까요?,Shall we take a look at the issues we discusse...


In [11]:
# Only the English translation column is used to train the embeddings below.
en_data = data['번역문']

In [12]:
# Build the token inventory for the English sentences using NLTK's word_tokenize.
# NOTE(review): VocabSet is defined in preprocessor.ipynb (not visible here);
# the exact structure of `en_tokens` should be confirmed there.
vs = preprocessor.VocabSet(word_tokenize)
en_tokens = vs.tokens(en_data)

In [13]:
# Encode every English sentence with the token inventory built above.
# `en_encoded` is later iterated sentence by sentence and sliced into windows,
# so it is presumably a sequence of per-sentence index sequences — confirm in
# preprocessor.ipynb.
en_encoder = preprocessor.Encoder(en_data , word_tokenize , en_tokens)
en_encoded = en_encoder.encode()

## Context Data

In [14]:
# Width of each sliding window in tokens: with 11, the middle token is the
# centre word and the 5 tokens on either side are its context.
con_size = 11

# Accumulator for the extracted windows (filled by the next cell).
en_word2vec = []

In [15]:
# Slide a window of `con_size` tokens over every encoded sentence and collect
# each window as one training sample.
#
# The list is rebuilt from scratch here so that re-running this cell does not
# append duplicate windows (and still works after a later cell has replaced
# `en_word2vec` with a NumPy array).
#
# Sentences shorter than `con_size` contribute nothing: the inner range is
# empty when len(sen) - con_size + 1 <= 0, so no explicit length guard is needed.
en_word2vec = [
    sen[j:j+con_size]
    for sen in tqdm(en_encoded)
    for j in range(len(sen) - con_size + 1)
]

# random.shuffle() shuffles in place and returns None, so its result must not
# be assigned to a variable.
# NOTE(review): the RNG is only seeded in the later "Device" cell, so this
# shuffle is not reproducible across kernel restarts.
random.shuffle(en_word2vec)

100%|██████████| 100000/100000 [00:01<00:00, 74237.27it/s]


In [16]:
# Stack the windows into a 2-D array: one row per window, `con_size` columns.
en_word2vec = np.array(en_word2vec)

# The middle column holds the centre word; the columns on either side of it
# are concatenated to form the context (neighbour) words.
mid_col = int(con_size/2)
en_cen = en_word2vec[:, mid_col]
left_context = en_word2vec[:, :mid_col]
right_context = en_word2vec[:, mid_col + 1:]
en_neigh = np.hstack([left_context, right_context])

In [17]:
# Sanity check: one centre word per window and con_size - 1 neighbours per window.
print('Data Shape \n')
print('Center Shape : {}'.format(en_cen.shape))
print('Neighbor Shape : {}'.format(en_neigh.shape))

Data Shape 

Center Shape : (704707,)
Neighbor Shape : (704707, 10)


## Device

In [18]:
# Single seed shared by every random number generator used in this notebook.
SEED = 20210906

USE_CUDA = torch.cuda.is_available()

# Seed Python's RNG, the torch CPU generator and all CUDA generators.
# torch.manual_seed() is required in addition to torch.cuda.manual_seed_all():
# random_split(), DataLoader shuffling and parameter initialisation (the model
# is built on the CPU before .to(device)) all draw from the CPU generator.
random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)  # silently ignored when CUDA is unavailable

device = torch.device("cuda" if USE_CUDA else "cpu")

## DataSet

In [19]:
class Word2VecDataset(Dataset) :
    """Pairs each centre-word index with the indices of its context words.

    Args:
        center: indexable sequence with one centre-word index per sample.
        neighbor: indexable sequence of the same length; item ``i`` holds the
            context-word indices that belong to ``center[i]``.
        val_ratio: fraction of the samples placed in the validation subset by
            ``split_dataset``; must lie in ``[0, 1]``.

    Raises:
        ValueError: if ``center`` and ``neighbor`` differ in length, or if
            ``val_ratio`` is outside ``[0, 1]``.

    NOTE(review): the default ``val_ratio = 0.9`` puts 90% of the data in the
    validation subset and only 10% in the training subset. That looks inverted
    (0.1 was probably intended) but is kept so existing callers behave the same.
    """

    def __init__(self , center , neighbor , val_ratio = 0.9) :

        super(Word2VecDataset , self).__init__()

        if len(center) != len(neighbor) :
            raise ValueError('center and neighbor must have the same length, got {} and {}'.format(len(center) , len(neighbor)))

        # A ratio outside [0, 1] would produce a negative split length whose
        # sum still equals len(self), so random_split would not reject it and
        # would silently return wrongly sized subsets.
        if not 0.0 <= val_ratio <= 1.0 :
            raise ValueError('val_ratio must be between 0 and 1, got {}'.format(val_ratio))

        self.center = center
        self.neighbor = neighbor
        self.val_ratio = val_ratio

    def __len__(self) :
        """Number of (centre, neighbours) samples."""

        return len(self.center)

    def __getitem__(self , idx) :
        """Return ``(centre index, neighbour indices)`` for sample ``idx``."""

        cen_idx = self.center[idx]
        neigh_idx = self.neighbor[idx]

        return cen_idx , neigh_idx

    def split_dataset(self) :
        """Randomly split into ``(train_set, val_set)``.

        The validation subset receives ``int(len(self) * val_ratio)`` samples
        and the training subset receives the remainder.
        """

        n_val = int(len(self) * self.val_ratio)
        n_train = len(self) - n_val
        train_set, val_set = random_split(self, [n_train, n_val])
        return train_set, val_set

In [20]:
# Wrap the centre/neighbour arrays; val_ratio is left at the class default.
dataset = Word2VecDataset(en_cen , en_neigh)

# Random train/validation split (sizes are determined by the dataset's val_ratio).
train_data , val_data = dataset.split_dataset()

In [21]:
batch_size = 256

# Settings common to both loaders. drop_last discards the final incomplete
# batch, so every batch holds exactly `batch_size` samples.
shared_loader_args = dict(batch_size = batch_size,
                          num_workers = 4,
                          drop_last = True)

# Training batches are reshuffled every epoch; validation order stays fixed.
train_loader = DataLoader(train_data , shuffle = True , **shared_loader_args)
test_loader = DataLoader(val_data , shuffle = False , **shared_loader_args)

## Model Parameter

In [22]:
# Embedding dimension handed to SkipGram.
h_size = 512

# Vocabulary size reported by the encoder; used as the number of embeddings
# and as the size of each per-position output distribution.
# NOTE(review): get_size() is defined in preprocessor.ipynb — confirm that it
# counts the padding index 0 as well.
v_size = en_encoder.get_size()

In [23]:
class SkipGram(nn.Module) :
    """Skip-gram model that predicts every context word from a centre word.

    The centre-word index is embedded into ``h_size`` dimensions and passed
    through a single linear layer that emits one ``v_size``-way distribution
    for each of the ``con_size - 1`` context positions.

    Args:
        v_size: vocabulary size (number of embedding rows and output classes).
        h_size: embedding dimension.
        con_size: window width including the centre word.
    """

    def __init__(self , v_size , h_size , con_size) :

        super(SkipGram , self).__init__()
        self.v_size = v_size
        self.h_size = h_size
        self.con_size = con_size

        # Index 0 is the padding index: its row starts at zero and gets no gradient.
        self.em = nn.Embedding(num_embeddings=v_size,
                               embedding_dim=h_size,
                               padding_idx=0)
        self.out = nn.Linear(h_size, v_size*(con_size-1))

    def init_param(self) :
        """Re-initialise the embedding and output-layer parameters in place."""
        # nn.init has no `random_normal_`; the normal initialiser is `normal_`.
        nn.init.normal_(self.em.weight , mean=0.0, std=0.1)
        # normal_ overwrites every row, including the padding row, so reset it.
        with torch.no_grad() :
            self.em.weight[self.em.padding_idx].zero_()

        nn.init.xavier_normal_(self.out.weight)
        nn.init.zeros_(self.out.bias)

    def get_em(self) :
        """Return the embedding matrix as a NumPy array of shape (v_size, h_size).

        The array is a copy, so zeroing the padding row (or any later edit by
        the caller) never writes through to the live model parameter. Without
        the copy, a CPU tensor and its ``.numpy()`` view share memory.
        """
        em_weight = self.em.weight.detach().cpu().numpy().copy()
        em_weight[0] = 0.0

        return em_weight

    def forward(self , in_tensor) :
        """Compute context-word probabilities for a batch of centre words.

        Args:
            in_tensor: integer tensor of centre-word indices; assumed to be
                1-D with shape (batch,) — TODO confirm against callers.

        Returns:
            Tensor of shape (batch, con_size - 1, v_size) whose last dimension
            is a softmax distribution over the vocabulary.
        """
        in_tensor = in_tensor.unsqueeze(1)
        em_tensor = self.em(in_tensor)
        o_tensor = self.out(em_tensor)
        # Split the flat output into one block of v_size logits per context position.
        o_tensor = torch.reshape(o_tensor, [-1,(self.con_size-1),self.v_size])
        p_tensor = F.softmax(o_tensor, dim=-1)

        return p_tensor
        

## Training

In [33]:
epoch_size = 20    # maximum number of training epochs
min_loss = 1e+7    # best validation loss seen so far (large sentinel to start)
init_lr = 0.025    # initial SGD learning rate
early_count = 0    # consecutive epochs without a validation-loss improvement
log_count = 0      # step counter for the train/* TensorBoard scalars

# Build the model on the selected device and optimise it with SGD + momentum.
skipgram = SkipGram(v_size , h_size , con_size).to(device)
optimizer = optim.SGD(skipgram.parameters(), lr = init_lr , momentum=0.9)

In [34]:
def schedule_fn(epoch , lr) :
    """Linear-decay multiplier for the learning rate.

    Returns the fraction of ``lr`` that remains after ``epoch`` out of the
    module-level ``epoch_size`` epochs: 1.0 at epoch 0, falling linearly
    towards 0 at ``epoch_size``. ``lr`` must be non-zero.
    """
    progress = epoch / epoch_size
    remaining_lr = lr - lr * progress

    return remaining_lr / lr

In [35]:
# LambdaLR scales the optimizer's initial learning rate by the factor that
# schedule_fn returns for the current epoch.
scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda = lambda epoch : schedule_fn(epoch, init_lr))

In [36]:
# TensorBoard writer; the training loop logs train/* and test/* scalars under this directory.
writer = SummaryWriter('runs/skipgram/')

In [37]:
def acc_fn(y_output , y_label) :
    """Mean top-1 accuracy of the predicted context words.

    Args:
        y_output: scores or probabilities whose last dimension ranges over
            the vocabulary.
        y_label: integer targets with the shape of ``y_output`` minus its
            last dimension.

    Returns:
        0-dim float tensor: the fraction of positions whose arg-max class
        equals the label, averaged first over the last label dimension and
        then over everything that remains.
    """
    predicted = y_output.argmax(dim=-1)
    hits = predicted.eq(y_label).float()

    return hits.mean(dim=-1).mean()

In [38]:
def loss_fn(y_output , y_label) :
    """Mean negative log-likelihood of the target context words.

    Args:
        y_output: probabilities (already soft-maxed) whose last dimension
            ranges over the vocabulary.
        y_label: integer targets; a trailing axis is added at position 2 so
            they can index the last dimension of ``y_output``.

    Returns:
        0-dim tensor holding the average of ``-log(p_target + 1e-20)``.
    """
    # Pick out the probability assigned to each target word.
    target_prob = y_output.gather(-1, y_label.unsqueeze(2))

    # The small epsilon keeps log() finite when a target probability is 0.
    nll = -torch.log(target_prob+1e-20)

    return nll.mean(dim=-1).mean()

In [39]:
def progressLearning(value, endvalue, loss , acc , bar_length=50):
    """Redraw a one-line text progress bar on stdout.

    Args:
        value: zero-based index of the step that just finished.
        endvalue: total number of steps.
        loss: loss value to show (formatted to 3 decimals).
        acc: accuracy value to show (formatted to 3 decimals).
        bar_length: width of the bar in characters.
    """
    step = value + 1
    percent = float(step) / endvalue

    # Dashes up to the current position, terminated by an arrow head, then
    # padded with spaces to the full bar width.
    arrow = '-' * int(round(percent * bar_length)-1) + '>'
    bar = arrow.ljust(bar_length)

    # The leading carriage return rewrites the same console line on every call.
    sys.stdout.write("\rPercent: [{0}] {1}/{2} \t Loss : {3:.3f} , Acc : {4:.3f}".format(bar, step , endvalue , loss , acc))
    sys.stdout.flush()

In [40]:
def evaluate(model , test_loader , device) :
    """Average the loss and accuracy of ``model`` over ``test_loader``.

    Args:
        model: module mapping a batch of centre-word indices to context-word
            probabilities.
        test_loader: iterable of ``(centre, neighbours)`` batches that also
            supports ``len()``.
        device: device the batches are moved to before the forward pass.

    Returns:
        ``(loss, acc)``: the per-batch values from ``loss_fn`` / ``acc_fn``
        averaged over the number of batches (0-dim tensors when those
        functions return tensors).

    Raises:
        ZeroDivisionError: if ``test_loader`` yields no batches.

    The model is switched to eval mode for the duration of the pass and then
    restored to whatever mode it was in before the call, even if an exception
    interrupts the loop, rather than being forced into train mode.
    """

    loss = 0.0
    acc = 0.0

    was_training = model.training
    model.eval()

    try :
        # No gradients are needed for evaluation.
        with torch.no_grad() :

            for cen_data , neigh_data in test_loader :

                cen = cen_data.to(device)
                neigh = neigh_data.long().to(device)

                output = model(cen)

                loss += loss_fn(output , neigh)
                acc += acc_fn(output , neigh)
    finally :
        model.train(was_training)

    loss /= len(test_loader)
    acc /= len(test_loader)

    return loss , acc

In [41]:
# Main training loop: one pass over train_loader per epoch, followed by a
# validation pass, best-model checkpointing, early stopping and an LR update.
for epoch in range(epoch_size) :

    idx = 0
    print('Epoch : %d , Learning Rate : %e' %(epoch , optimizer.param_groups[0]['lr']))
    
    for cen_data , neigh_data in train_loader : 

        cen = cen_data.to(device)    
        # Labels are cast to int64 because loss_fn uses them as gather indices.
        neigh = neigh_data.long().to(device)
        
        optimizer.zero_grad()
        
        output = skipgram(cen) 
        
        loss = loss_fn(output , neigh)
        acc = acc_fn(output , neigh)
        
        loss.backward()
        optimizer.step()
        
        progressLearning(idx , len(train_loader) , loss.item() , acc.item())

        # Log the training metrics to TensorBoard once every 10 batches.
        if (idx + 1) % 10 == 0 :
            
            writer.add_scalar('train/loss' , loss.item() , log_count)
            writer.add_scalar('train/acc' , acc.item() , log_count)
            log_count += 1

        idx += 1 

    # Validation pass at the end of every epoch.
    test_loss, test_acc = evaluate(skipgram, test_loader, device) 
    
    writer.add_scalar('test/loss' , test_loss.item() , epoch)
    writer.add_scalar('test/acc' , test_acc.item() , epoch)
    
    # Checkpoint whenever the validation loss improves and reset the patience counter.
    # NOTE(review): nothing in this notebook creates the './Model/' directory;
    # it presumably has to exist before the first save — confirm.
    if test_loss < min_loss :
        min_loss = test_loss
        torch.save({'epoch' : (epoch) ,  
                    'model_state_dict' : skipgram.state_dict() , 
                    'loss' : test_loss.item() , 
                    'acc' : test_acc.item()} , 
                    f'./Model/checkpoint_skipgram.pt')        
        early_count = 0 
        
    else :
        # Early stopping: give up after 5 consecutive epochs without improvement.
        # The break skips the scheduler step and the summary print below.
        early_count += 1
        if early_count >= 5 :      
            print('\nTraining Stopped')
            break

    # Advance the learning-rate schedule once per epoch, after the optimizer steps.
    scheduler.step()
    print('\nValidation Loss : %.4f \t Validation Acc : %.4f\n' %(test_loss , test_acc))



Epoch : 0 , Learning Rate : 2.500000e-02
Percent: [------------------------------------------------->] 275/275 	 Loss : 8.315 , Acc : 0.066
Validation Loss : 8.2008 	 Validation Acc : 0.0656

Epoch : 1 , Learning Rate : 2.375000e-02
Percent: [------------------------------------------------->] 275/275 	 Loss : 7.868 , Acc : 0.077
Validation Loss : 7.7996 	 Validation Acc : 0.0828

Epoch : 2 , Learning Rate : 2.250000e-02
Percent: [------------------------------------------------->] 275/275 	 Loss : 7.458 , Acc : 0.100
Validation Loss : 7.5545 	 Validation Acc : 0.0910

Epoch : 3 , Learning Rate : 2.125000e-02
Percent: [------------------------------------------------->] 275/275 	 Loss : 7.240 , Acc : 0.098
Validation Loss : 7.3822 	 Validation Acc : 0.0958

Epoch : 4 , Learning Rate : 2.000000e-02
Percent: [------------------------------------------------->] 275/275 	 Loss : 7.072 , Acc : 0.115
Validation Loss : 7.2528 	 Validation Acc : 0.0989

Epoch : 5 , Learning Rate : 1.875000e-02