# word2vec (Cbow)

In [1]:
# Environment setup: change into the project directory and add ./python to the import search path.
%cd /playground/sgd_deep_learning/sgd_nlp/
import sys 
sys.path.append('./python')

/playground/sgd_deep_learning/sgd_nlp


In [2]:
from sgd_nlp.embedding import Cbow, CorpusFactoryCbow

from torch.optim.lr_scheduler import ExponentialLR
import torch
import os
import time
import pickle

## 语料处理

In [3]:
# Obtain the corpus data (from a pickled cache when available, otherwise from the raw files)
def load_corpus(corpus_dir_path, load_obj=False, obj_file_name=None):
    """Return a corpus factory, loading it from a pickle cache or building it anew.

    Args:
        corpus_dir_path: Path handed to ``CorpusFactoryCbow`` when the factory
            has to be built from the original corpus files.
        load_obj: When true, try to load a previously pickled factory from
            ``obj_file_name`` before falling back to a rebuild.
        obj_file_name: Path of the pickle cache. ``None`` (the default)
            disables both loading from and saving to the cache.

    Returns:
        The unpickled object, or a freshly built ``CorpusFactoryCbow``.
    """
    has_cache_path = obj_file_name is not None

    if load_obj and has_cache_path and os.path.isfile(obj_file_name):
        # NOTE(security): pickle.load can execute arbitrary code; only point
        # obj_file_name at cache files this project produced itself.
        with open(obj_file_name, 'rb') as fin:
            corpus_factory = pickle.load(fin)
        # Report success only after the object has actually been loaded.
        print("!!! load corpus factory success !!!")
        return corpus_factory

    print('CURRENT PATH:\t', corpus_dir_path)

    corpus_factory = CorpusFactoryCbow(corpus_dir_path)  # new obj from origin corpus file path
    corpus_factory.vocab.log_info()

    if has_cache_path:
        # Create the cache directory on first use so the dump cannot fail on a missing folder.
        cache_dir = os.path.dirname(obj_file_name)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)
        with open(obj_file_name, 'wb') as fout:
            pickle.dump(corpus_factory, fout)
    return corpus_factory

## Training Loop

In [4]:
def train(corpus_factory, model, optimizer, scheduler, config):
    """Run the training loop and checkpoint the weights with the lowest batch loss.

    Args:
        corpus_factory: Object exposing ``vocab.corpus_word_count`` and
            ``training_batch(batch_num, device, win_width, neg_k)``.
        model: Callable module; its output is summed over dims 1 and 2 and
            averaged over dim 0 to form the objective.
        optimizer: Optimizer updating the model parameters.
        scheduler: Learning-rate scheduler, stepped once every
            ``config.scheduler_step`` iterations.
        config: Object providing ``corpus_run_loop``, ``batch_size``,
            ``device``, ``win_width``, ``neg_k``, ``scheduler_step``,
            ``log_step`` and ``model_weights_obj_path``.
    """
    all_words_num = corpus_factory.vocab.corpus_word_count  # total number of words in the corpus
    # Number of mini-batch iterations needed to pass over the corpus `corpus_run_loop` times.
    total_steps = int(config.corpus_run_loop * all_words_num / config.batch_size)
    global_min_loss = 1e6

    # training loop
    for step in range(total_steps):
        t1 = time.time()
        optimizer.zero_grad()

        # forward
        batch_data = corpus_factory.training_batch(batch_num=config.batch_size,
                                                   device=config.device,
                                                   win_width=config.win_width,
                                                   neg_k=config.neg_k)
        y = model(batch_data)

        # objective function (loss function)
        j_theta = torch.sum(y, dim=[1, 2]).mean()  # maximize objective
        nj_theta = -1 * j_theta  # minimize objective

        # backward and update weight
        nj_theta.backward()
        optimizer.step()

        # Decay the learning rate every `scheduler_step` iterations. The check must use
        # the running step index; testing the constant total never (or always) fires.
        if (step + 1) % config.scheduler_step == 0:
            scheduler.step()

        # output info
        tmp_t = time.time() - t1
        # Keep a plain float so the best-loss bookkeeping does not hold on to a tensor.
        loss_value = nj_theta.item()
        if step % config.log_step == 0:
            print('epoch:{}/{}, loss:{}, cost_time: {}'.format(step, total_steps, loss_value, tmp_t))

        # save best model
        if loss_value < global_min_loss:
            global_min_loss = loss_value
            torch.save(model.state_dict(), config.model_weights_obj_path)
            print('new bset loss: {}'.format(loss_value))

### 训练参数设置

In [11]:
class config:
    """Static settings for corpus loading, the model and the training run."""

    # Location of the raw text corpus
    data_home = r'./data'
    sub_dir = r'friends/season10'
    corpus_dir_path = os.path.join(data_home, sub_dir)

    # Where cached objects (pickled corpus, model weights) are stored
    SAVE_HOME = r'./apps/embedding/save/'
    model_name = r'cbow'

    load_corpus_obj = True  # review before training!
    corpus_obj_path = os.path.join(SAVE_HOME, model_name, r'corpus_obj.cf')  # pre-processed corpus cache  default:None

    load_model_weight_obj = True  # review before training!
    model_weights_obj_path = os.path.join(SAVE_HOME, model_name, r'cbow_weights.path')  # pre-trained model weights  default:None

    # Corpus pre-processing parameters
    win_width = 11  # context window size (5 before - center word - 5 after)
    neg_k = 10  # number of negative samples

    # Model parameters
    # Fall back to CPU so the notebook still runs on machines without a CUDA device.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    emb_dim = 300

    # Training parameters
    lr = 1e-2  # initial learning rate
    corpus_run_loop = 2  # number of passes over the corpus
    batch_size = 1024  # size of each batch

    scheduler_step = 20
    log_step = 50

### main

In [9]:
def app():
    """Build the corpus factory, model and optimizer from ``config`` and run training."""
    # class obj
    corpus_factory = load_corpus(corpus_dir_path=config.corpus_dir_path,
                                 load_obj=config.load_corpus_obj,
                                 obj_file_name=config.corpus_obj_path)

    model = Cbow(emb_dim=config.emb_dim,
                 token_num=corpus_factory.token_num(),
                 win_width=config.win_width,
                 sparse_emb=True).to(config.device)

    # load weight
    if config.load_model_weight_obj and os.path.isfile(config.model_weights_obj_path):
        # map_location lets a checkpoint saved on one device be restored onto config.device.
        state_dict = torch.load(config.model_weights_obj_path, map_location=config.device)
        model.load_state_dict(state_dict)
        print("!!! Load model weights success !!!")

    # optimizer
    optimizer = torch.optim.SparseAdam(params=model.parameters(), lr=config.lr)
    scheduler = ExponentialLR(optimizer, gamma=0.9)

    train(corpus_factory=corpus_factory,
          model=model,
          optimizer=optimizer,
          scheduler=scheduler,
          config=config,
          )

In [13]:
# Run the pipeline defined by app(); its printed log follows.
app()

!!! load corpus factory success !!!
!!! Load model weights success !!!
epoch:0/1732, loss:2.555410623550415, cost_time: 0.03940176963806152
new bset loss: 2.555410623550415
new bset loss: 1.6718511581420898
new bset loss: 1.6290907859802246
new bset loss: 1.6108152866363525
new bset loss: 1.5397448539733887
epoch:50/1732, loss:2.9481301307678223, cost_time: 0.03519773483276367
new bset loss: 1.5265324115753174
new bset loss: 1.30142343044281
new bset loss: 1.1666502952575684
new bset loss: 1.0776259899139404
epoch:100/1732, loss:2.0683443546295166, cost_time: 0.03653311729431152
epoch:150/1732, loss:1.9276400804519653, cost_time: 0.0360560417175293
epoch:200/1732, loss:1.3475074768066406, cost_time: 0.03653550148010254
epoch:250/1732, loss:2.508596897125244, cost_time: 0.034174203872680664
epoch:300/1732, loss:2.3538296222686768, cost_time: 0.03573465347290039
epoch:350/1732, loss:2.806239366531372, cost_time: 0.03628849983215332
epoch:400/1732, loss:5.390937805175781, cost_time: 0.035