In [1]:
import torch
import torch.nn.functional as F
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config
from torch.utils.data import DataLoader,Dataset
from torch.nn.utils.rnn import pad_sequence
from torch.optim import Adam
from tqdm import tqdm
from torch.nn import CrossEntropyLoss

In [67]:
import numpy as np
import copy

In [2]:
data_path = '/root/private/class14/'

In [3]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
weights = torch.load(data_path + 'small_ft.pkl')
# medium_config = GPT2Config(n_embd = 1024,n_layer = 24, n_head = 16)
small_config = GPT2Config(n_embd = 768,n_layer = 12, n_head = 12)
model = GPT2LMHeadModel(small_config)

weights['lm_head.weight'] = weights['lm_head.decoder.weight']
weights.pop('lm_head.decoder.weight',None)

model.load_state_dict(weights)
model.train()
model.to('cuda')


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): Laye

In [4]:
tokenizer.encode("   I  am a good boy .  ")

[40, 220, 716, 257, 922, 2933, 764]

In [6]:
tokenizer.decode([40])

'I'

In [7]:
def read_data(file):
    with open(file,'r',encoding='utf8') as data:
        lines = [l.strip() for l in data]
    dials = []
    for l in lines:
        dials += [[i.strip() for i in l.strip().split("__eou__") if i != ""]]
    return dials

In [8]:
train_data1 = data_path + 'lesson_13代码数据1.txt'
train_data2 = data_path + 'lesson_13代码数据2.txt'
train_data3 = data_path + 'lesson_13代码数据3.txt'
dials1 = read_data(train_data1)
dials2 = read_data(train_data2)
dials3 = read_data(train_data3)

In [9]:
len(dials1), len(dials2), len(dials3)

(1000, 11118, 1000)

In [10]:
class InputFeature(object):
    def __init__(self,input_ids,position_ids,token_type_ids,
                lm_labels=None,input_len=None):
        
        self.input_ids = input_ids
        self.position_ids = position_ids
        self.token_type_ids = token_type_ids
        self.lm_lanels =lm_labels
        if input_len is None:
            self.input_len = len(input_ids)
        else:
            self.input_len = input_len

In [20]:
class GPT2Dataset(Dataset):
    
    def __init__(self,dials,max_len = 1024):
        self.max_len = max_len
        self.features = build_input_feature(dials)
    
    def __getitem__(slef,i):
        feat_dict = self.features[i]
        if self.max_len is not None and feat_dict['input_len'] > self.max_len:
            feat_dict['input_ids'] = feat_dict['input_dis'][-self.max_len:]
            feat_dict['position_ids'] = feat_dict['position_ids'][-self.max_len:]
            feat_dict['token_type_ids'] = feat_dict['token_type_ids'][-self.max_len:]
            feat_dict['lm_labels'] = feat_dict['lm_labels'][-self.max_len:]
        feat = InputFeaturet(**feat_dict)
        return feat
    
    def __len__(self):
        return len(self.features)
    
    @staticmethod
    def build_input_feature(dials,end_text='<|endoftext|>'):
        '''
        此函数将对话文本信息输入，然后输出数字化的特征信息，包括input_id,position_id,token_type_id,lm_label,input_len
        '''
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        feature = []
        for dial in dials:
            '''
            对于每一个篇对话：
            
            inputs：        把第一句到倒数第二句话中所有token的index当作inputs输入，
            lm_labels：     lm_标注，inputs为-1，outputs为每个词语的token_id
            token_type_ids：token类型标注，inputs为0，outputs为1
            input_ids：     输入ids：输入token_ids + 输出token_ids（中间带<|endoftext|>分隔符）
            input_len：     输入ids的长度
            position_ids：  位置id（0 - n）
            '''
            inputs = sum([tokenizer.encode(u) for u in dial[:-1]],[]) # sum的start参数可以作为一个起始类型，[]则代表列表相加
            lm_labels = [-1]*len(inputs) + tokenizer.encode(dial[-1] + end_text) 
            token_type_ids = [0] * len(inputs) + [1.0] * (len(tokenizer.encode(dial[-1] + end_text)))
            input_ids = inputs + tokenizer.encode(end_text + dial[-1])
            input_len = len(input_ids)
            position_ids = list(range(len(input_ids)))
            
            feat_dict = {'input_ids':input_ids,
                        'position_ids':position_ids,
                        'token_type_ids':token_type_ids,
                        'lm_labels':lm_labels,
                        'input_len':input_len}
            feature.append(feat_dict)
        return feature
    
    @staticmethod
    def collate(features):
        '''
        转换为torch.tensor并做padding
        '''
        inputs_ids = pad_sequence([torch.tensor(f['input_ids'], dtype=torch.long,device=device)
                                  for f in features],batch_first=True,padding_value=0)
        
        position_ids = pad_sequence([torch.tensor(f['position_ids'], dtype=torch.long,device=device)
                                   for f in features],batch_first=True,padding_value=0)
        
        token_type_ids = pad_sequence([torch.tensor(f['token_type_ids'], dtype=torch.long,device=device)
                                      for f in features],batch_first=True,padding_value=0)
        
        labels = pad_sequence([torch.tensor(f['lm_labels'], dtype=torch.long,device=device)
                              for f in features],batch_first=True,padding_value=-1)
        
        return (inputs_ids,position_ids,token_type_ids,labels)

In [21]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [26]:
dials = dials1 + dials2 + dials3

In [13]:
dataset = GPT2Dataset.build_input_feature(dials)
loader = DataLoader(dataset,collate_fn=GPT2Dataset.collate,batch_size=1)

In [57]:
def run(model,train_dataloader,learning_rate,epoches):
    
    optimizer = Adam(model.parameters(),lr=learning_rate)
    step = 0
    epoch = 0 
    
    pbar = tqdm(enumerate(train_dataloader),total=len(train_dataloader))    # 定义进度条，pbar又同时是enumerate后的data_loader对象
    loss_function  = CrossEntropyLoss(ignore_index=-1,reduction='mean')
    while epoch < epoches:
        running_loss = 0
        try:
            with tqdm(enumerate(train_dataloader),total=len(train_dataloader)) as pbar:
                for i,batch in pbar:
                    input_ids,position_ids,token_type_ids,label_ids = batch
                    '''
                    推理函数
                    '''
                    logits = model(input_ids=input_ids.to('cuda'),position_ids=position_ids.to('cuda'),token_type_ids=token_type_ids.to('cuda'))
                    
                    lm_logits= logits[0]
            
                    loss = loss_function(lm_logits.view(-1,lm_logits.size(-1)).to('cuda'),label_ids.view(-1).to('cuda'))
                    running_loss += loss.item()
                    pbar.set_description('Train (Epoch{}):{:.4f}'.format(epoch,running_loss/(step+1)))
                    optimizer.zero_grad() # dw = 0
                    loss.backward()
                    optimizer.step() # w = w +dw
                    step += 1
                epoch += 1
        except KeyboardInterrupt:
            pbar.close()
            raise
        pbar.close()
    torch.save(model,  data_path + 'model_10.pkl')
    torch.save({'model':model.state_dict(),
               'epoch':epoch}, data_path + 'train_model_10.pth')

In [58]:
run(model,loader,1e-4,10)

  0%|          | 0/13118 [00:00<?, ?it/s]
  0%|          | 0/13118 [00:00<?, ?it/s][A
  0%|          | 0/13118 [00:00<?, ?it/s]
Train (Epoch0):1.8919: 100%|██████████| 13118/13118 [22:16<00:00,  9.82it/s]
Train (Epoch1):0.6648: 100%|██████████| 13118/13118 [22:05<00:00,  9.90it/s]
Train (Epoch2):0.3114: 100%|██████████| 13118/13118 [22:04<00:00,  9.91it/s]
Train (Epoch3):0.1716: 100%|██████████| 13118/13118 [21:45<00:00, 10.05it/s]
Train (Epoch4):0.1054: 100%|██████████| 13118/13118 [21:58<00:00,  9.95it/s]
Train (Epoch5):0.0710: 100%|██████████| 13118/13118 [23:01<00:00,  9.50it/s]
Train (Epoch6):0.0508: 100%|██████████| 13118/13118 [22:14<00:00,  9.83it/s]
Train (Epoch7):0.0381: 100%|██████████| 13118/13118 [21:59<00:00,  9.94it/s]
Train (Epoch8):0.0301: 100%|██████████| 13118/13118 [22:03<00:00,  9.91it/s]
Train (Epoch9):0.0239: 100%|██████████| 13118/13118 [22:03<00:00,  9.91it/s]


In [55]:
checkpoint = torch.load(data_path + 'train_model_1.pth')
start_epoch = checkpoint['epoch'] + 1
weights = checkpoint['model']

In [53]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
weights =checkpoint['model']
small_config = GPT2Config(n_embd = 768,n_layer = 12, n_head = 12)
model = GPT2LMHeadModel(small_config)

# weights['lm_head.weight'] = weights['lm_head.decoder.weight']
# weights.pop('lm_head.decoder.weight',None)

model.load_state_dict(weights)
model.train()
model.to('cuda')


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): Laye

In [59]:
model = torch.load(data_path + 'model_10.pkl')

In [65]:
model.to('cpu')

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): Laye

In [61]:
def top_n_filtering(logits, top_n = 10, filter_value = -float('Inf')):
    assert logits.dim() == 1
    sorted_logits, sorted_indices = torch.sort(logits, descending = True)

    sorted_indices_to_remove = sorted_indices >= top_n
    sorted_indices_to_remove[..., :top_n] = 0 # set top n probabilities as 0
    sorted_indices_to_remove[..., top_n:] = 1

    indices_to_remove = sorted_indices[sorted_indices_to_remove]
    logits[indices_to_remove] = filter_value
    return logits

In [62]:
TOKEN_LEN = tokenizer.vocab_size
def predict(input_words_list):
    # Given previous words, return the probability of the next word.
    input_words = " ".join(input_words_list)
    input_ids = torch.tensor(tokenizer.encode(input_words, add_special_tokens=True)).unsqueeze(0)
    with torch.no_grad():
        outputs = model(input_ids=input_ids)
    prediction_score = outputs[0]
    logits = prediction_score[0, -1, :]
    filtered_logits = top_n_filtering(logits)
    scores = F.softmax(filtered_logits, dim=-1)
    return scores

In [63]:

    
def question_answer(question, beam_size=5):
    inputs_words = question.split(" ") + ['<|endoftext|>']
    
    final_beam_result = []
    
    # The beam search result, 
    # format: [(word_list_1, log_prob_1), (word_list_2, log_prob_2), ... ,(word_list_beam_size, log_prob_beam_size)]
    # Initial beam_result's word list is empty, means all beam searches start with the the same sentence (the input question + '<|endoftext|>')
    beam_result = [([], 0.0)] 
    
    quit = False
    while not quit:
        sum_scores = [] # Save the result of each beam_result
        for result in beam_result:
            scores = predict(inputs_words + result[0])
            sum_scores.append(torch.log(scores) + result[1]) # Use log probability to avoid too small value
        sum_scores = torch.cat(sum_scores) # Size: len(beam_result) * vocab_size
        
        # Find the top beam_size scores from all beam_result
        topk_value, topk_id = torch.topk(sum_scores, beam_size)

        # Update beam_result
        new_beam_result = []
        cnt = 0
        for v, idx in zip(topk_value, topk_id):
            idx_word = idx / TOKEN_LEN # belongs to which beam words list
            idx_item = idx % TOKEN_LEN # word's index

            predicted_words = copy.deepcopy(beam_result[idx_word][0]) # copy words list
            next_word = tokenizer.decode([idx_item]).strip()
            predicted_words.append(next_word)
            new_beam_result.append((predicted_words, v))
        
        # Filter the completed sentence in beam_result, and judge whether to quit
        # First, find the results end with '<|endoftext|>', add them to final_beam_result, and the rest add to beam_result.
        beam_result = []
        for result in new_beam_result:
            if result[0][-1] == '<|endoftext|>':
                final_beam_result.append(result) # stop to generate words, save and wait to output
                beam_size -= 1
            else:
                beam_result.append(result) # continue to generate words
        
        # Then, if there is no result in beam_result, or all results are longer than 100, quit while loop.
        if len(beam_result) == 0:
            quit = True
        for i in beam_result:
            if(len(i[0]) > 100):
                quit = True
    
    # Sort the final_beam_result by the socre which has been normalized with each result's length. 
    answer = sorted(final_beam_result, key = lambda x: x[1] / len(x[0]))
    # Find the max one
    answer = answer[-1]
    
    # Output answer
    result = " ".join(answer[0])
    if (result.endswith('<|endoftext|>')):
        result = result[:-len('<|endoftext|>')]
    print(result)
    

In [68]:
question_answer("Does money buy happiness ?")
question_answer("What is the best way to buy happiness ?")
question_answer("What is the meaning of a good life ?")
question_answer("How to be a good person ?")

At a very high school , a lot of money would be a lot of difference from a house full grown in a house house . 
You can make a good ending for a drop in your pocket , so you can get a nice economy and enjoy your goods . 
You must be familiar with the plants . 
You can say that again . 


In [69]:
question_answer("Does money buy happiness ?")
question_answer("What is the best way to buy happiness ?")
question_answer("What is the meaning of a good life ?")
question_answer("How to be a good person ?")

Sure . 
You can � down what you want to find the best way to find the best way to let your bag . 
You must be joking ! 
Take a look . 
