In [None]:
import transformers
import torch
import os
import json
import random
import numpy as np
import argparse
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime
from tqdm import tqdm
from torch.nn import DataParallel
from tokenizations.bpe_tokenizer import get_encoder


# 預處理資料集
- 把初始句和回答結合成一個文本 (data point)
- 只保留回答的情緒 (在 prediction 時該情緒可以用來引導情緒回答)
- 載入 tokenizer 然後加入新的情緒 token
- 用 tokenizer 把文本轉換成 ids 並存檔
- 存檔加入新情緒 token 的 tokenizer
- dataset : https://www.biendata.xyz/ccf_tcci2018/datasets/ecg/

In [None]:
"""
s2t.json Simplified Chinese to Traditional Chinese 簡體到繁體
t2s.json Traditional Chinese to Simplified Chinese 繁體到簡體
s2tw.json Simplified Chinese to Traditional Chinese (Taiwan Standard) 簡體到臺灣正體
tw2s.json Traditional Chinese (Taiwan Standard) to Simplified Chinese 臺灣正體到簡體
s2hk.json Simplified Chinese to Traditional Chinese (Hong Kong variant) 簡體到香港繁體
hk2s.json Traditional Chinese (Hong Kong variant) to Simplified Chinese 香港繁體到簡體
s2twp.json Simplified Chinese to Traditional Chinese (Taiwan Standard) with Taiwanese idiom 簡體到繁體（臺灣正體標準）並轉換爲臺灣常用詞彙
tw2sp.json Traditional Chinese (Taiwan Standard) to Simplified Chinese with Mainland Chinese idiom 繁體（臺灣正體標準）到簡體並轉換爲中國大陸常用詞彙
t2tw.json Traditional Chinese (OpenCC Standard) to Taiwan Standard 繁體（OpenCC 標準）到臺灣正體
hk2t.json Traditional Chinese (Hong Kong variant) to Traditional Chinese 香港繁體到繁體（OpenCC 標準）
t2hk.json Traditional Chinese (OpenCC Standard) to Hong Kong variant 繁體（OpenCC 標準）到香港繁體
t2jp.json Traditional Chinese Characters (Kyūjitai) to New Japanese Kanji (Shinjitai) 繁體（OpenCC 標準，舊字體）到日文新字體
jp2t.json New Japanese Kanji (Shinjitai) to Traditional Chinese Characters (Kyūjitai) 日文新字體到繁體（OpenCC 標準，舊字體）
tw2t.json Traditional Chinese (Taiwan standard) to Traditional Chinese 臺灣正體到繁體（OpenCC 標準）
"""

In [None]:
import json

import opencc

# Numeric emotion label of a reply -> bracketed tag inserted before the reply text.
emotion_dict = {
    0: '[其他]',
    1: '[喜歡]',
    2: '[悲傷]',
    3: '[噁心]',
    4: '[憤怒]',
    5: '[喜樂]',
}

# 's2t.json' is the OpenCC profile for Simplified -> Traditional Chinese.
converter = opencc.OpenCC('s2t.json')

with open("data/ecg_train_data.json", encoding="utf-8") as f:
    lines = json.load(f)

# Each record is indexed as record[0][0] (post text), record[1][0] (reply text)
# and record[1][1] (reply emotion label). The post's own emotion label is dropped.
# One training text is: converted post + emotion tag of the reply + converted reply.
processed = [
    converter.convert(record[0][0]).strip()
    + emotion_dict[record[1][1]]
    + converter.convert(record[1][0]).strip()
    for record in lines
]

with open("data/ecg_train_data_processed.json", 'w') as fi:
    json.dump(processed, fi)


In [None]:
# Sanity check: reload the processed corpus and show only its first record.
with open("data/ecg_train_data_processed.json", encoding="utf-8") as f:
    lines = json.load(f)

for line in lines[:1]:
    print(line)

In [None]:
# 資料轉換成 token ids 並儲存
def build_files(data_path, tokenized_data_path, num_pieces, full_tokenizer, min_length):
    """Tokenize a JSON list of texts and write the token ids to shard files.

    The corpus is split into ``num_pieces`` consecutive shards of equal size;
    the remainder is appended to the last shard. Every kept text is wrapped as
    ``[MASK] <token ids> [CLS]`` and each shard is written to
    ``<tokenized_data_path>/tokenized_train_<i>.txt`` as ids separated (and
    followed) by a single space.

    Args:
        data_path: Path of a UTF-8 JSON file containing a list of strings.
        tokenized_data_path: Output directory. It is created (including
            parents) when missing; a trailing slash is optional.
        num_pieces: Number of shard files to produce.
        full_tokenizer: Object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)`` (accepting a single token
            string as well as a list of tokens).
        min_length: Texts whose character length is not strictly greater than
            this value are skipped.
    """
    with open(data_path, 'r', encoding='utf8') as src:
        print('reading lines')
        lines = json.load(src)
    # Newlines inside a text are represented by the [SEP] token.
    lines = [line.replace('\n', ' [SEP] ') for line in lines]
    all_len = len(lines)

    if tokenized_data_path:
        # makedirs also handles nested output directories and is a no-op when
        # the directory already exists.
        os.makedirs(tokenized_data_path, exist_ok=True)

    # [MASK] marks the start of a text and [CLS] marks its end.
    start_id = full_tokenizer.convert_tokens_to_ids('[MASK]')
    end_id = full_tokenizer.convert_tokens_to_ids('[CLS]')

    for i in tqdm(range(num_pieces)):
        piece_size = all_len // num_pieces
        begin = piece_size * i
        # The last shard also takes the leftover texts at the tail.
        end = all_len if i == num_pieces - 1 else piece_size * (i + 1)

        full_line = []
        for line in lines[begin:end]:
            if len(line) <= min_length:
                continue
            tokens = full_tokenizer.tokenize(line)
            full_line.append(start_id)
            full_line.extend(full_tokenizer.convert_tokens_to_ids(tokens))
            full_line.append(end_id)

        # os.path.join keeps the path correct whether or not the directory
        # argument ends with a separator.
        out_path = os.path.join(tokenized_data_path, 'tokenized_train_{}.txt'.format(i))
        with open(out_path, 'w') as out:
            out.write(''.join('{} '.format(token_id) for token_id in full_line))
    print('finish')


In [None]:

## Register the emotion tags as additional special tokens
from tokenizations import tokenization_bert_word_level as tokenization_bert

# New special tokens go under the 'additional_special_tokens' key; built-in
# ones such as [CLS] already have their own dedicated keys.
added_tokens = {'additional_special_tokens': ['[其他]', '[喜歡]', '[悲傷]', '[噁心]', '[憤怒]', '[喜樂]']}

full_tokenizer = tokenization_bert.BertTokenizer(vocab_file='pretrained_model/vocab.txt')
# add_special_tokens() returns the NUMBER of tokens added. The original cell
# assigned that count to full_tokenizer.max_len, which shrank the tokenizer's
# maximum sequence length to 6; keep the count in its own variable instead.
num_added_tokens = full_tokenizer.add_special_tokens(added_tokens)

# Reminder: once the model is loaded, its embedding matrix must be resized to
# the enlarged vocabulary:
#model.resize_token_embeddings(len(full_tokenizer))

In [None]:
## Sanity-check the added special tokens: print the tokens, then their vocabulary ids
for attribute in ('additional_special_tokens', 'additional_special_tokens_ids'):
    print(getattr(full_tokenizer, attribute))

In [1]:
# Save the tokenizer (now including the emotion tokens) so later cells can
# reload it without rebuilding it.
# Requires `full_tokenizer` from the "add special tokens" cell above.
import os

import torch

# torch.save does not create missing parent directories.
os.makedirs("manmade", exist_ok=True)
# NOTE: this pickles the whole tokenizer object; only load this file from a
# trusted location.
torch.save(full_tokenizer, "manmade/tokenizer.ckpt")


  from .autonotebook import tqdm as notebook_tqdm


NameError: name 'full_tokenizer' is not defined

In [2]:
## Tokenize the processed corpus into shard files used for training
## (requires `build_files` and `full_tokenizer` from the cells above)

import os
from tqdm import tqdm

build_files(
    data_path="data/ecg_train_data_processed.json",
    tokenized_data_path="data/",
    num_pieces=100,
    full_tokenizer=full_tokenizer,
    min_length=0,
)

NameError: name 'build_files' is not defined

# Load Model
- load pretrained model
- resize the token embedding matrix so it covers the added special tokens
- then proceed to the Training section

In [3]:
import torch

# Reload the tokenizer saved earlier, mapping any stored tensors onto the CPU.
# NOTE: torch.load unpickles arbitrary objects; only load trusted files.
cpu_device = torch.device('cpu')
full_tokenizer = torch.load("manmade/tokenizer.ckpt", map_location=cpu_device)

To use data.metrics please install scikit-learn. See https://scikit-learn.org/stable/index.html


Model loaded succeed


In [10]:
import transformers

# Directory holding the pretrained GPT-2 weights, config and vocab.
pretrained_model_loc = "./pretrained_model/"
model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(pretrained_model_loc)

# Grow the embedding matrix to the tokenizer's enlarged vocabulary (the added
# emotion tokens). Left as the last expression so the resized embedding is displayed.
model.resize_token_embeddings(len(full_tokenizer))

Embedding(21134, 768)

In [17]:
# Read the pretrained model's configuration and keep its context length,
# which the training cell uses as the sample window size.
config_path = "./pretrained_model/config.json"
model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(config_path)
config_json = model_config.to_json_string()
print('config:\n' + config_json)
n_ctx = model_config.n_ctx
print(f'n_ctx: {n_ctx}')

config:
{
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "embd_pdrop": 0.1,
  "finetuning_task": null,
  "gradient_checkpointing": false,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "num_labels": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "resid_pdrop": 0.1,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 400
    }
  },
  "tokenizer_class": "BertTokenizer",
  "torchscript": false,
  "use_bfloat16": false,
  "vocab_size": 21128
}

n_ctx: 1024


## TODO:
- model 的 vocab (embedding) 必須先擴充，因為 tokenizer 的辭典已經加入了新的情緒 token
- 擴充後的 vocab 也要另外存檔，這樣重新 init 的 tokenizer 才能讀取到新的 token

In [11]:
# Switch the model to training mode before the training loop runs.
# As the last expression of the cell this also displays the module structure.
model.train()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(21134, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): Laye

# Training

In [None]:
# ---- Training configuration ----
epochs = 10
num_pieces = 100  # number of tokenized_train_<i>.txt shards to read
batch_size = 8
stride = 768  # step between the starts of consecutive training windows
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
multi_gpu = False
gradient_accumulation = 1  # number of micro-batches per optimizer step
tokenized_data_path = 'data/'  # where build_files wrote the shards
# NOTE(review): output_dir was used but never defined in the original cell;
# 'model/' is a placeholder, adjust to the desired checkpoint location.
output_dir = 'model/'
lr = 1.5e-4
warmup_steps = 2000
fp16 = False  # mixed precision is not wired up in this cell
log_step = 1  # report the loss every N steps; use a multiple of gradient_accumulation
max_grad_norm = 1.0


def _read_token_ids(piece_index):
    """Return the token ids stored in one tokenized shard file as a list of ints."""
    shard_path = os.path.join(tokenized_data_path, 'tokenized_train_{}.txt'.format(piece_index))
    with open(shard_path, 'r') as shard:
        return [int(token) for token in shard.read().strip().split()]


# Total number of training tokens; the original cell used `full_len` without
# ever computing it.
full_len = sum(len(_read_token_ids(piece_index)) for piece_index in range(num_pieces))
total_steps = int(full_len / stride * epochs / batch_size / gradient_accumulation)

# The batches are moved to `device`, so the model has to live there as well.
model = model.to(device)
model.train()

optimizer = transformers.AdamW(model.parameters(), lr=lr, correct_bias=True)
if hasattr(transformers, 'WarmupLinearSchedule'):
    scheduler = transformers.WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps,
                                                  t_total=total_steps)
else:
    # Newer transformers releases replaced WarmupLinearSchedule with this factory.
    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)

overall_step = 0
running_loss = 0  # was read before assignment in the original cell
optimizer.zero_grad()

for epoch in range(epochs):
    print('epoch {}'.format(epoch + 1))
    now = datetime.now()
    print('time: {}'.format(now))

    # Visit the shards in a fresh random order every epoch.
    piece_order = list(range(num_pieces))
    random.shuffle(piece_order)

    for piece_num, piece_index in enumerate(piece_order):
        tokens = _read_token_ids(piece_index)

        # Cut the shard into windows of n_ctx tokens, `stride` apart, plus one
        # final window aligned to the end of the shard.
        samples = []
        start_point = 0
        while start_point < len(tokens) - n_ctx:
            samples.append(tokens[start_point: start_point + n_ctx])
            start_point += stride
        if start_point < len(tokens):
            samples.append(tokens[-n_ctx:])
        random.shuffle(samples)

        for step in range(len(samples) // batch_size):  # drop the last partial batch
            #  prepare data
            batch = samples[step * batch_size: (step + 1) * batch_size]
            batch_inputs = torch.tensor(batch).long().to(device)

            #  forward pass: the inputs double as labels for language modelling
            outputs = model(input_ids=batch_inputs, labels=batch_inputs)
            loss = outputs[0]

            #  get loss
            if multi_gpu:
                loss = loss.mean()
            if gradient_accumulation > 1:
                loss = loss / gradient_accumulation

            #  loss backward
            loss.backward()

            #  optimizer step
            if (overall_step + 1) % gradient_accumulation == 0:
                running_loss += loss.item()
                # Clip once, on the fully accumulated gradient, right before stepping.
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()
            if (overall_step + 1) % log_step == 0:
                print('now time: {}:{}. Step {} of piece {} of epoch {}, loss {}'.format(
                    datetime.now().hour,
                    datetime.now().minute,
                    step + 1,
                    piece_num,
                    epoch + 1,
                    running_loss * gradient_accumulation / (log_step / gradient_accumulation)))
                running_loss = 0
            overall_step += 1

    print('saving model for epoch {}'.format(epoch + 1))
    epoch_dir = os.path.join(output_dir, 'model_epoch{}'.format(epoch + 1))
    os.makedirs(epoch_dir, exist_ok=True)
    # Unwrap a DataParallel-style wrapper (anything exposing .module) before saving.
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(epoch_dir)
    print('epoch {} finished'.format(epoch + 1))

    then = datetime.now()
    print('time: {}'.format(then))
    print('time for one epoch: {}'.format(then - now))

print('training finished')