In [1]:
import json
from tqdm import tqdm
import torch
with open('w2id++.json', 'r') as f:
    w2id = json.load(f)
with open('id2w++.json', 'r') as f:
    id2w = json.load(f)
    
data_list = []
with open('data_splited++.jl', 'r') as f:
    for l in f:
        data_list.append(json.loads(l))
embedding = []
with open('embedding++.jl', 'r') as f:
    for l in f:
        embedding.append(json.loads(l))
        
batch_size = 32
data_workers = 4
learning_rate = 0.01
gradient_accumulation_steps = 1
max_train_epochs = 60
warmup_proportion = 0.05
weight_decay=0.01
max_grad_norm=1.0 

device = torch.device('cuda')

In [2]:
dlx = [[] for _ in range(5)]
for d in data_list:
    dlx[len(d[0]) - 5].append(d)

In [3]:
import torch
class MyDataSet(torch.utils.data.Dataset):
    def __init__(self, examples):
        self.examples = examples
    def __len__(self):
        return len(self.examples)
    def __getitem__(self, index):
        example = self.examples[index]
        s1 = example[0]
        s2 = example[1]
        return s1, s2, index
def str2id(s):
    ids = []
    for ch in s:
        if ch in w2id:
            ids.append(w2id[ch])
        else:
            ids.append(0)
    return ids
def the_collate_fn(batch):
    s1x = []
    s2x = []
    for b in batch:
        s1 = str2id(b[0])
        s2 = str2id(b[1])
        s1x.append(s1)
        s2x.append(s2)
    indexs = [b[2] for b in batch]
    s1 = torch.LongTensor(s1x)
    s2 = torch.LongTensor(s2x)
    return s1, s2, indexs



dldx = []
for d in dlx:
    ds = MyDataSet(d)
    dld = torch.utils.data.DataLoader(
        ds,
        batch_size=batch_size,
        shuffle = True,
        num_workers=data_workers,
        collate_fn=the_collate_fn,
    )
    dldx.append(dld)

In [4]:
import torch.nn as nn
import torch.nn.functional as F
class LSTMModel(nn.Module):
    def __init__(self, device, word_size, embedding_dim=256, hidden_dim=256):
        super(LSTMModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.device = device
        self.embedding = nn.Embedding(word_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=4, bidirectional=True, batch_first=True)
        self.out = nn.Linear(hidden_dim*2, word_size)
        
    def forward(self, s1, s2=None):
        batch_size, length = s1.shape[:2]
        b = self.embedding(s1)
        l = self.lstm(b)[0]
        r = self.out(l)
        r = F.log_softmax(r, dim=1)
        if s2 is not None:
            criterion = nn.NLLLoss()
            loss = criterion(r.view(batch_size*length, -1), s2.view(batch_size*length))
            return loss
        return r

In [5]:
model = LSTMModel(device, len(w2id), 300)
model.to(device)

LSTMModel(
  (embedding): Embedding(5629, 300)
  (lstm): LSTM(300, 256, num_layers=4, batch_first=True, bidirectional=True)
  (out): Linear(in_features=512, out_features=5629, bias=True)
)

In [6]:
import numpy as np
pretrained_weight = np.array(embedding)
model.embedding.weight.data.copy_(torch.from_numpy(pretrained_weight))

tensor([[-0.2564, -0.4880, -0.4565,  ..., -0.0330, -0.4634,  0.1318],
        [ 0.4903,  0.3327, -0.3386,  ...,  0.0285, -0.9027,  0.0091],
        [-0.1047, -0.1974,  0.3769,  ...,  0.4282, -0.0082,  0.3296],
        ...,
        [ 0.3012,  0.8523, -0.5348,  ..., -0.4911,  0.0212, -0.6289],
        [ 0.2072, -0.1480,  0.0041,  ...,  0.3521, -0.0441, -0.1644],
        [ 0.3139,  0.0951, -0.3184,  ..., -0.2843, -0.2507, -0.1536]],
       device='cuda:0')

In [7]:
def t2s(t):
    l = t.cpu().tolist()
    r = [id2w[x] for x in l[0]]
    return ''.join(r)

def get_next(s):
    ids = torch.LongTensor(str2id(s))
    print(s)
    ids = ids.unsqueeze(0).to(device)
    with torch.no_grad():
        r = model(ids)
        r = r.argmax(dim=2)
        return t2s(r)
def print_cases():
    print(get_next('好好学习') + '\n')
    print(get_next('白日依山尽') + '\n')
    print(get_next('学而时习之') + '\n')
    print(get_next('人之初性本善') + '\n')

In [8]:
from transformers import AdamW, get_linear_schedule_with_warmup


t_total = len(data_list) // gradient_accumulation_steps * max_train_epochs + 1
num_warmup_steps = int(warmup_proportion * t_total)

print('warmup steps : %d' % num_warmup_steps)

no_decay = ['bias', 'LayerNorm.weight'] # no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
param_optimizer = list(model.named_parameters())
optimizer_grouped_parameters = [
    {'params':[p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],'weight_decay': weight_decay},
    {'params':[p for n, p in param_optimizer if any(nd in n for nd in no_decay)],'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=t_total)


warmup steps : 3350217


In [9]:
loss_list = []
for e in range(max_train_epochs):
    print(e)
    loss_sum = 0
    c = 0
    xxx = [x.__iter__() for x in dldx]
    j = 0
    for i in tqdm(range((len(data_list)//batch_size) + 5)):
        if len(xxx) == 0:
            break
        j = j % len(xxx)
        try:
            batch = xxx[j].__next__()
        except StopIteration:
            xxx.pop(j)
            continue
        j += 1
        s1, s2, index = batch
        s1 = s1.to(device)
        s2 = s2.to(device)
        loss = model(s1, s2)
        loss_sum += loss.item()
        c += 1
        loss.backward()
        optimizer.step()
        scheduler.step() 
        optimizer.zero_grad()
    print_cases()
    print(loss_sum / c)
    loss_list.append(loss_sum / c)

0


100%|██████████| 34903/34903 [10:27<00:00, 55.62it/s]

好好学习
虽蹬嵯湄

白日依山尽
翛帆慰穗痊

学而时习之
譬蹬恣嵯湄

人之初性本善
譬蹬聊兮逡湄

1.5098877490145788
1



100%|██████████| 34903/34903 [11:44<00:00, 49.54it/s]

好好学习
譬啻嵯湄

白日依山尽
朱蕖冒屿赊

学而时习之
譬啻恣嵯湄

人之初性本善
譬者仅兮嵯湄

1.283264813827865
2



100%|██████████| 34903/34903 [11:29<00:00, 50.60it/s]

好好学习
蹭啻嵯湄

白日依山尽
沧柑冒屿赊

学而时习之
蹭蹬恣嵯湄

人之初性本善
譬者冻恣嵯湄

1.223697599587514
3



100%|██████████| 34903/34903 [11:48<00:00, 49.28it/s]

好好学习
孰啻嵯湄

白日依山尽
沧鹂掠屿赊

学而时习之
譬啻恣嵯湄

人之初性本善
譬蹬协恣嵯湄

1.183729660144156
4



100%|██████████| 34903/34903 [11:28<00:00, 50.68it/s]


好好学习
恍啻嵯湄

白日依山尽
朱鹂冒屿赊

学而时习之
譬啻恣嵯湄

人之初性本善
譬者媲心嵯湄

1.1583524771486675
5


100%|██████████| 34903/34903 [11:29<00:00, 50.62it/s]

好好学习
迨啻绸湄

白日依山尽
朱萸掠岫赊

学而时习之
譬啻迭嵯湄

人之初性本善
譬之迭狮绸湄

1.1407968451233559
6



100%|██████████| 34903/34903 [11:35<00:00, 50.17it/s]

好好学习
矧啻绸湄

白日依山尽
沧鹂匝槛赊

学而时习之
譬啻狮嵯湄

人之初性本善
譬蹬迭非葳湄

1.1284846344640826
7



100%|██████████| 34903/34903 [11:34<00:00, 50.27it/s]

好好学习
虽啻绸陲

白日依山尽
沧萸掠屿赊

学而时习之
譬啻恣缱湄

人之初性本善
譬之钜疹锱湄

1.1220363174684718
8



 36%|███▌      | 12491/34903 [04:09<07:15, 51.46it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

100%|██████████| 34903/34903 [11:48<00:00, 49.26it/s]

好好学习
宛蹬肺荪

白日依山尽
沧鹂冒冕赊

学而时习之
矧蹬恣嵯湄

人之初性本善
譬之硬狮锱湄

1.1227452730038756
10



100%|██████████| 34903/34903 [11:24<00:00, 50.96it/s]


好好学习
虽啻麒湄

白日依山尽
扁萸冒槛赊

学而时习之
譬者互嵯湄

人之初性本善
譬者缓供锱湄

1.126403855214799
11


100%|██████████| 34903/34903 [11:31<00:00, 50.50it/s]

好好学习
讵费缙湄

白日依山尽
沧砂冒屿赊

学而时习之
矧蹬恣缱湄

人之初性本善
譬者黯竟锱湄

1.1304866279629078
12



  3%|▎         | 1138/34903 [00:22<10:59, 51.23it/s]


KeyboardInterrupt: 