# word2vec (skip gram)

In [16]:
# 环境配置
%cd /playground/sgd_deep_learning/sgd_nlp/
import sys 
sys.path.append('./python')

/playground/sgd_deep_learning/sgd_nlp


In [17]:
from sgd_nlp.embedding import SkipGram, CorpusFactorySkipGram

from torch.optim.lr_scheduler import ExponentialLR
import torch
import os
import time
import pickle

## 语料处理

In [18]:
# 获取语料数据
def load_corpus(corpus_dir_path, load_obj=False, obj_file_name=None):
    
    if load_obj and os.path.isfile(obj_file_name):
        with open(obj_file_name, 'rb') as fin:
            print("!!! load corpus factory success !!!")
            return pickle.load(fin)
    else:
        print('CURRENT PATH:\t', corpus_dir_path)

        corpus_factory = CorpusFactorySkipGram(corpus_dir_path)  # new obj from origin corpus file path
        corpus_factory.vocab.log_info()
        with open(obj_file_name, 'wb') as fout:
            pickle.dump(corpus_factory, fout)
        return corpus_factory

## Training Loop

In [19]:
def train(corpus_factory, model, optimizer, scheduler, config):
    all_words_num = corpus_factory.vocab.corpus_word_count  # 文档中的总词数
    epoch = int(config.corpus_run_loop * all_words_num / config.batch_size)  # 总共需要迭代几个epoch
    global_min_loss = 1e6

    # training loop
    for i in range(epoch):
        t1 = time.time()
        optimizer.zero_grad()
        # forward
        batch_data = corpus_factory.training_batch(batch_num=config.batch_size,
                                                   device=config.device,
                                                   win_width=config.win_width,
                                                   neg_k=config.neg_k)
        y = model.forward(batch_data)

        # objective function (loss function)
        j_theta = torch.sum(y, dim=[1, 2]).mean()  # maximize objective
        nj_theta = -1 * j_theta  # minimize objective

        # backward and update weight
        nj_theta.backward()
        optimizer.step()

        if epoch % config.scheduler_step == 0:
            scheduler.step()

        # output info
        tmp_t = time.time() - t1
        # avg_time = avg_time * 0.9 + 0.1 * tmp_t
        if i % config.log_step == 0:
            print('epoch:{}/{}, loss:{}, cost_time: {}'.format(i, epoch, nj_theta, tmp_t))

        # save best model
        if nj_theta < global_min_loss:
            global_min_loss = nj_theta
            torch.save(model.state_dict(), config.model_weights_obj_path)
            print('new bset loss: {}'.format(nj_theta))

### 训练参数设置

In [20]:
class config:
    # 文本语料路径
    data_home = r'./data'
    sub_dir = r'friends/season10'
    corpus_dir_path = os.path.join(data_home, sub_dir)
    
    # 直接加载对象
    SAVE_HOME = r'./apps/embedding/save/'
    model_name = r'skipgram'
    
    load_corpus_obj = True # 训练前修改！
    corpus_obj_path = os.path.join(SAVE_HOME, model_name, r'corpus_obj.cf') # 加载预处理语料  default:None
     
    load_model_weight_obj = True # 训练前修改！
    model_weights_obj_path = os.path.join(SAVE_HOME, model_name, r'skipgram_weights.path') # 加载预训练模型参数 default:None
    
    # 语料预处理参数
    win_width = 11  # context 窗口大小（前5-中间词-后5）
    neg_k = 10  # 负采样数
    
    # 模型参数
    device = torch.device('cuda')
    emb_dim = 300
    
    # 训练参数
    lr = 1e-1 # 初始学习率
    corpus_run_loop = 2  # 看n遍文本
    batch_size = 2048   # 每个batch的大小
    scheduler_step = 20
    log_step = 50
    


### main

In [21]:
def app():
    # class obj
    corpus_factory = load_corpus(corpus_dir_path=config.corpus_dir_path,
                                 load_obj=config.load_corpus_obj,
                                 obj_file_name=config.corpus_obj_path)
    
    model = SkipGram(emb_dim = config.emb_dim,
                     token_num = corpus_factory.token_num(),
                     sparse_emb = True).to(config.device)

    # load weight
    if config.load_model_weight_obj and os.path.isfile(config.model_weights_obj_path):
        model.load_state_dict(torch.load(config.model_weights_obj_path))
        print("!!! Load model weights success !!!")

    # optimizer
    optimizer = torch.optim.SparseAdam(params=model.parameters(), lr=config.lr)
    scheduler = ExponentialLR(optimizer, gamma=0.9)

    train(corpus_factory=corpus_factory,
          model=model,
          optimizer=optimizer,
          scheduler=scheduler,
          config=config,
          )

In [22]:
app()

!!! load corpus factory success !!!
!!! Load model weights success !!!
epoch:0/866, loss:0.3701174259185791, cost_time: 0.1502513885498047
new bset loss: 0.3701174259185791
new bset loss: 0.2875667214393616
new bset loss: 0.0001586870930623263
epoch:100/866, loss:19.208141326904297, cost_time: 0.06616735458374023
new bset loss: 4.4968953716306714e-07
new bset loss: 2.46299508566139e-13
epoch:200/866, loss:0.32338595390319824, cost_time: 0.07891416549682617
new bset loss: 2.4970805909020142e-17
epoch:300/866, loss:10.71897029876709, cost_time: 0.06830739974975586
epoch:400/866, loss:0.0521174818277359, cost_time: 0.06322121620178223
epoch:500/866, loss:2.894324779510498, cost_time: 0.06169605255126953
new bset loss: 1.3585918309745046e-24
epoch:600/866, loss:6.7027587890625, cost_time: 0.07483291625976562
epoch:700/866, loss:4.0759196281433105, cost_time: 0.08409428596496582
epoch:800/866, loss:6.246488094329834, cost_time: 0.06651139259338379


## playground

In [23]:
import random
res = random.choices([0,1,2,3], weights=[1,1,3,1], k=10)
print(type(res))

<class 'list'>
