In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import torchtext
from torchtext.datasets import TranslationDataset, Multi30k
from torchtext.data import Field, BucketIterator

from pdb import set_trace
import random
import math
import os
import time

In [2]:
# Seed Python's `random` module and PyTorch from one constant so that
# re-running the notebook from a fresh kernel repeats the same random draws.
SEED = 666666
random.seed(SEED)
torch.manual_seed(SEED)
# Ask cuDNN to restrict itself to deterministic algorithms.
torch.backends.cudnn.deterministic = True

def tokenize_py(text):
    """
    Split a source-side ("py") string into tokens.

    The string is cut at every single space character, so consecutive
    spaces produce empty-string tokens and an empty input yields [''].

    Returns:
        list of str
    """
    separator = ' '
    pieces = text.split(separator)
    return pieces
def tokenize_ch(text):
    """
    Split a target-side string into one token per character.

    Every character of `text` (including spaces, if any) becomes its own
    list element; an empty string yields an empty list.

    Returns:
        list of str
    """
    return [char for char in text]
# SRC tokenizes with tokenize_py (split on spaces); TRG tokenizes with
# tokenize_ch (one token per character). Both fields wrap every example in
# <sos>/<eos>, lowercase the text, and are configured with batch_first=True,
# so batches come out shaped [batch, sequence].
SRC = Field(tokenize=tokenize_py, init_token='<sos>', eos_token='<eos>', lower=True, batch_first=True)
TRG = Field(tokenize=tokenize_ch, init_token='<sos>', eos_token='<eos>', lower=True, batch_first=True)

In [5]:
# Each split is a pair of parallel files that share a path prefix: the '.py'
# file is parsed with SRC and the '.han' file with TRG.
train_data = TranslationDataset('./data/ai_shell_train',('.py','.han'),(SRC,TRG))
valid_data = TranslationDataset('./data/ai_shell_dev',('.py','.han'),(SRC,TRG))
test_data = TranslationDataset('./data/ai_shell_test',('.py','.han'),(SRC,TRG))
# Vocabularies are built from the training split only; min_freq=2 leaves
# tokens seen fewer than twice out of the vocabulary.
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)


#train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'), fields=(SRC, TRG))

In [9]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [10]:
# One BucketIterator per split, all using the same batch size and placing
# their tensors on `device`.
BATCH_SIZE = 32
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), batch_size=BATCH_SIZE,
     device=device)


In [31]:

class Model(nn.Module):
    """
    Per-position sequence labeller.

    Pipeline: embedding -> bidirectional LSTM -> concat(LSTM output, embedding)
    -> linear(512) -> two single-channel Conv2d/LeakyReLU/BatchNorm2d stages
    -> linear(output_dim) -> log-softmax.

    The network emits exactly one distribution per input position, so the
    output sequence length always equals the input sequence length.

    Args:
        input_dim:  number of rows in the embedding table (source vocab size).
        emb_dim:    embedding width.
        hid_dim:    LSTM hidden size per direction.
        output_dim: number of output classes (target vocab size).
        n_layers:   number of stacked LSTM layers.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, output_dim,n_layers):
        super().__init__()

        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.output_dim = output_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(input_dim, emb_dim)

        # batch_first=True is required: forward() receives [batch, seq] index
        # tensors (dim 0 is also used as the Conv2d batch axis below). Without
        # it the LSTM would treat the batch axis as time and run its recurrence
        # across unrelated sentences.
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers,
                           dropout=0, bidirectional=True, batch_first=True)

        # Bidirectional output (2 * hid_dim) is concatenated with the embedding.
        self.fc = nn.Linear(hid_dim*2+emb_dim, 512)

        # Single-channel 2-D convolutions over the (sequence, feature) plane;
        # the padding keeps both spatial sizes unchanged.
        self.conv1 = nn.Conv2d(1,1,(7,3),padding=(3,1))
        self.relu1 = nn.LeakyReLU()
        self.bn1 = nn.BatchNorm2d(1)
        self.conv2 = nn.Conv2d(1,1,(7,3),padding=(3,1))
        self.relu2 = nn.LeakyReLU()
        self.bn2 = nn.BatchNorm2d(1)

        self.fc2 = nn.Linear(512, output_dim)
        self.log_softmax = nn.LogSoftmax(dim=-1)

    def forward(self, src):
        """
        Args:
            src: integer tensor of token indices, shape [batch, seq].

        Returns:
            Log-probabilities of shape [batch, seq, output_dim].
        """
        embedded = self.embedding(src)                    # [batch, seq, emb_dim]

        outputs, _ = self.rnn(embedded)                   # [batch, seq, 2*hid_dim]
        features = torch.cat([outputs, embedded], -1)     # [batch, seq, 2*hid_dim+emb_dim]

        out = self.fc(features)                           # [batch, seq, 512]
        out = torch.unsqueeze(out, 1)                     # [batch, 1, seq, 512]

        out = self.conv1(out)
        out = self.relu1(out)
        out = self.bn1(out)

        out = self.conv2(out)
        out = self.relu2(out)
        out = self.bn2(out)

        # Remove only the channel axis. A bare squeeze() would also drop the
        # batch or sequence axis whenever either happens to have size 1.
        out = torch.squeeze(out, 1)                       # [batch, seq, 512]
        out = self.fc2(out)                               # [batch, seq, output_dim]
        out = self.log_softmax(out)

        return out
    

In [33]:
# Vocabulary sizes: the source vocab sizes the embedding table, the target
# vocab sizes the output layer.
py_vocab_size = len(SRC.vocab)
ch_vocab_size = len(TRG.vocab)

# Network width and depth.
emb_dim = 512
hidden_dim = 512
n_layers = 2

model = Model(
    input_dim=py_vocab_size,
    emb_dim=emb_dim,
    hid_dim=hidden_dim,
    output_dim=ch_vocab_size,
    n_layers=n_layers,
)
model = model.to(device)

In [34]:
def init_weights(m):
    """
    Overwrite every parameter reachable from module `m` in place with values
    drawn uniformly from [-0.08, 0.08].

    All parameters are treated alike, whatever their name or role.
    """
    low, high = -0.08, 0.08
    for _name, weights in m.named_parameters():
        nn.init.uniform_(weights.data, low, high)
        
# Module.apply calls init_weights on every submodule and on the model itself.
# NOTE(review): init_weights walks named_parameters(), which recurses by
# default, so the root call alone already reaches every parameter; the
# per-submodule calls re-draw the same tensors again.
model = model.apply(init_weights)

In [16]:
def count_parameters(model):
    """
    Count the scalar elements held in the model's trainable parameters.

    Parameters whose `requires_grad` flag is False are skipped.

    Returns:
        int: total number of trainable elements.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable-parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 13,798,087 trainable parameters


In [17]:
import numpy as np

In [18]:
# Adam over all model parameters. 3e-4 is only the starting learning rate:
# the training loop multiplies it by 0.95 after every epoch.
optimizer = optim.Adam(model.parameters(),lr=3e-4)

In [19]:
# Index of the padding token in the target vocabulary.
PAD_IDX = TRG.vocab.stoi['<pad>']

# The model ends in LogSoftmax, so it is paired with NLLLoss; padded target
# positions are excluded from the loss.
criterion = nn.NLLLoss(ignore_index = PAD_IDX)
import tqdm

# Special tokens that compare_pre_target strips before scoring/printing.
tok = ['<eos>','<unk>','<sos>','<pad>']

In [20]:
def compare_pre_target(output, target, show_txt=True, itos=None, special_tokens=None):
    """
    Score a batch of predictions against its targets.

    The arg-max over the last axis of `output` is compared with `target`
    position by position. Only positions whose *target* token is a real
    token (not in `special_tokens`) are scored, so a stray special token
    in the prediction cannot shift the alignment. Each sentence gets
    `correct / scored_positions`, and the function returns the mean of
    those per-sentence accuracies over the whole batch.

    Args:
        output: tensor [batch, seq, vocab] of scores or log-probabilities.
        target: integer tensor [batch, seq] of gold token indices.
        show_txt: when True, print the predicted and gold text of the first
            sentence in the batch.
        itos: index-to-token list; defaults to `TRG.vocab.itos`.
        special_tokens: tokens excluded from scoring; defaults to the
            module-level `tok` list.

    Returns:
        float: mean per-sentence accuracy in [0, 1]. Sentences with no
        scorable position are skipped; 0.0 is returned when nothing in the
        batch could be scored (including an empty batch).
    """
    if itos is None:
        itos = TRG.vocab.itos
    if special_tokens is None:
        special_tokens = tok
    special = set(special_tokens)

    pred_rows = torch.argmax(output, -1).tolist()
    target_rows = target.tolist()

    sentence_accs = []
    for row, (pred_ids, target_ids) in enumerate(zip(pred_rows, target_rows)):
        # Keep (predicted token, gold token) pairs at real-token positions only.
        pairs = [
            (itos[p], itos[t])
            for p, t in zip(pred_ids, target_ids)
            if itos[t] not in special
        ]

        if row == 0 and show_txt:
            print('pred:', ''.join(p for p, _ in pairs))
            print('true:', ''.join(t for _, t in pairs))

        if not pairs:
            # Nothing to score for this sentence; avoid dividing by zero.
            continue

        n_correct = sum(1 for p, t in pairs if p == t)
        sentence_accs.append(n_correct / len(pairs))

    if not sentence_accs:
        return 0.0
    return sum(sentence_accs) / len(sentence_accs)
        



In [29]:
def evaluate(model, iterator, criterion):
    """
    Measure the model's accuracy over every batch of `iterator`.

    The model is switched to eval mode and run without gradient tracking.
    Accuracy for each batch comes from `compare_pre_target`; the value
    returned is the running mean of those per-batch accuracies. The
    predicted/gold text of a sample sentence is printed every 64 batches.

    Args:
        model: the network to evaluate; left in eval mode on return.
        iterator: yields batches exposing `.src` and `.trg`.
        criterion: accepted for call-site compatibility; no loss is
            computed here.

    Returns:
        Mean per-batch accuracy (0 when the iterator yields nothing).
    """
    model.eval()

    print('evaluating ....')
    val_acc = 0
    # The context managers guarantee the progress bar is closed even when
    # evaluation is interrupted.
    with torch.no_grad(), tqdm.tqdm_notebook(total=len(iterator)) as pbar:
        for i, batch in enumerate(iterator):
            pbar.update(1)

            output = model(batch.src)
            acc = compare_pre_target(output.detach(), batch.trg.detach(), i % 64 == 0)

            # Incremental mean over the batches seen so far.
            val_acc = (val_acc*i + acc)/(i+1)
            pbar.set_description_str('val acc: {:.3}'.format(val_acc))
    print('done')
    return val_acc

val_acc = evaluate(model, valid_iterator, criterion)

evaluating ....


HBox(children=(IntProgress(value=0, max=885), HTML(value='')))

pred: 并称今天
true: 并称今天
pred: 年龄或大获小的孩子
true: 年龄或大或小的孩子
pred: 一位商场场内部人士称
true: 一位商场场内部人士称
pred: 而不是单独的一家来垄断
true: 而不是单独的一家来垄断
pred: 京华时报讯记者张然昨天
true: 京华时报讯记者张然昨天
pred: 房贷的发放需要一定的周期
true: 房贷的发放需要一定的周期
pred: 这次毕竟是在家门口备战比赛
true: 这次毕竟是在家门口备战比赛
pred: 向他确认了这是一款工程测试机
true: 向他确认了这是一款工程测试机
pred: 警方认定他的死是被李某殴打所致
true: 警方认定他的死是被李某殴打所致
pred: 孙河板块现在共有四个别墅项目在售
true: 孙河板块现在共有四个别墅项目在售
pred: 合作包括你使用一亿元向唐人影视增资
true: 合作包括拟使用一亿元向唐人影视增资
pred: 同时都都宝城市一卡通网上充付平台上线
true: 同时都都宝城市一卡通网上充付平台上线
pred: 这名运动员曾获得二零零零年悉尼奥运会金牌
true: 这名运动员曾获得二零零零年悉尼奥运会金牌
pred: 把住宅交给万科万科聚交三好住宅和城市配套服务
true: 把住宅交给万科万科聚焦三好住宅和城市配套服务
done


In [24]:
# Training schedule.
n_epoch = 100
grad_clip = 1.0

# Checkpoint location: files are written under SAVE_DIR with this prefix.
SAVE_DIR = 'models'
MODEL_SAVE_PATH = os.path.join(SAVE_DIR, 'py_to_han')

best_valid_loss = float('inf')

# Create the checkpoint directory if it is not there yet.
os.makedirs(SAVE_DIR, exist_ok=True)

In [30]:
# Best validation accuracy seen so far. It has to exist before the first
# comparison at the end of epoch 0; starting at -inf means the first epoch
# always counts as an improvement.
best_val_acc = float('-inf')

for epoch in range(n_epoch):

    model.train()
    epoch_loss = 0.0
    epoch_acc = 0.0
    n_steps = 0  # batches that actually produced an optimisation step

    # The context manager closes the bar even if training is interrupted.
    with tqdm.tqdm_notebook(total=len(train_iterator)) as bar:
        for i, batch in enumerate(train_iterator):
            bar.update(1)
            src = batch.src
            trg = batch.trg
            if trg.numel() == 0:
                # Nothing to learn from an empty batch.
                continue

            optimizer.zero_grad()

            output = model(src)

            # Print a sample prediction every 128 batches.
            show_txt = (i % 128 == 0)
            acc = compare_pre_target(output.detach(), trg.detach(), show_txt)

            # Flatten to [batch * seq, vocab] and [batch * seq] for NLLLoss.
            loss = criterion(output.reshape(-1, output.shape[-1]), trg.reshape(-1))

            loss.backward()
            # NOTE(review): grad_clip is defined in the config cell but
            # gradient clipping is not applied here.
            optimizer.step()

            # Running means over the batches that were really processed, so
            # skipped batches do not distort the averages.
            n_steps += 1
            epoch_loss += (loss.item() - epoch_loss) / n_steps
            epoch_acc += (acc - epoch_acc) / n_steps

            msg = 'loss:{:.5},acc:{:.5}'.format(epoch_loss,epoch_acc)
            bar.set_description_str(msg)

    # Model selection must use held-out data, not the training split.
    val_acc = evaluate(model, valid_iterator, criterion)

    # Exponential learning-rate decay, applied once per epoch.
    optimizer.param_groups[0]['lr'] *= 0.95
    print('lr:',optimizer.param_groups[0]['lr'])
    if val_acc > best_val_acc:
        model_path = MODEL_SAVE_PATH + 'val_acc{:.3}.pth'.format(val_acc)

        print('validation acc increased from {} to {},saving model to {}'.format(best_val_acc,val_acc,
              model_path))

        best_val_acc = val_acc
        # Write the checkpoint that the message above announces.
        torch.save(model.state_dict(), model_path)
    


HBox(children=(IntProgress(value=0, max=3098), HTML(value='')))

pred: 逆向车辆行驶方向跑
true: 逆向车辆行驶方向跑
pred: 经济参考报记者近日在多地走访了解到
true: 经济参考报记者近日在多地走访了解到
pred: 苹果和三星这对冤家
true: 苹果和三星这对冤家
pred: 在基金托管职责履行内控制度建设方面
true: 在基金托管职责履行内控制度建设方面
pred: 一月公寓豪宅共成交三百套
true: 一月公寓豪宅共成交三百套
pred: 学校教室确实存在甲醛和氨不同程度超标的情况
true: 学校教室确实存在甲醛和氨不同程度超标的情况
pred: 由于绿地还未整体上市
true: 由于绿地还未整体上市
pred: 转而做生意岂料杀出新血路
true: 转而做生意岂料杀出新血路
pred: 而今天这么多带有类似生活理念的人齐聚一堂
true: 而今天这么多带有类似生活理念的人齐聚一堂


KeyboardInterrupt: 