In [1]:
import pandas as pd
import numpy as np
import torch
from importlib import reload
from tqdm.notebook import tqdm
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor
from transformers import GPT2Config, GPT2Tokenizer, GPT2LMHeadModel, AutoTokenizer

# Fix the global random seed through Lightning so that runs are repeatable;
# the call returns the seed it applied (42).
pl.seed_everything(42)

42

In [2]:
# Load the pretrained DialoGPT-small weights directly through transformers.
# NOTE(review): `model` is rebound to an ErfBot instance in the "model" section
# before this object is used anywhere in the visible notebook -- confirm
# whether this load is still needed.
model = GPT2LMHeadModel.from_pretrained('microsoft/DialoGPT-small')

## config and tokenizer

In [3]:
import yaml

# Read the experiment configuration (model name plus the data / model / opt
# sections). The context manager closes the file handle as soon as parsing is
# done instead of leaving an open handle behind for the garbage collector.
# NOTE(review): yaml.Loader is the full loader and can construct arbitrary
# Python objects. That is acceptable for a trusted file inside the repository;
# switch to yaml.safe_load if the config can ever come from an untrusted source.
with open('configs/config.yaml') as config_file:
    config = yaml.load(config_file, Loader=yaml.Loader)
config

{'model_name': 'microsoft/DialoGPT-small',
 'data': {'batch_size': 4,
  'max_len': 256,
  'csv_file': 'data/processed.csv',
  'val_frac': 0.1},
 'model': {'n_positions': 256,
  'n_ctx': 256,
  'n_embd': 512,
  'n_layer': 4,
  'n_head': 8,
  'n_inner': 768},
 'opt': {'lr': 5e-05, 'max_epochs': 10, 'weight_decay': 0.001}}

In [4]:
# Load the tokenizer that belongs to the pretrained checkpoint named in the config.
tokenizer = AutoTokenizer.from_pretrained(config['model_name'])
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [5]:
# Copy the tokenizer-dependent values (vocabulary size and special-token ids)
# into the model section of the config, then display the merged result.
config['model'].update(
    vocab_size=tokenizer.vocab_size,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
    bos_token_id=tokenizer.bos_token_id,
)
config

{'model_name': 'microsoft/DialoGPT-small',
 'data': {'batch_size': 4,
  'max_len': 256,
  'csv_file': 'data/processed.csv',
  'val_frac': 0.1},
 'model': {'n_positions': 256,
  'n_ctx': 256,
  'n_embd': 512,
  'n_layer': 4,
  'n_head': 8,
  'n_inner': 768,
  'vocab_size': 50257,
  'pad_token_id': 50256,
  'eos_token_id': 50256,
  'bos_token_id': 50256},
 'opt': {'lr': 5e-05, 'max_epochs': 10, 'weight_decay': 0.001}}

## data

In [6]:
# Load the processed dataset from the CSV path given in config['data']['csv_file'].
df = pd.read_csv(config['data']['csv_file'])

In [7]:
# Build sliding windows over the 'text' column: every record holds the current
# response followed by the n utterances that precede it, most recent first
# (n + 1 values per record, matching the n + 1 column names below).
n = 7

# Extract the column once as a plain list. Looking each element up through
# df['text'][j] inside the nested loop repeats a label lookup on the Series for
# every single value; positional access on a list returns the same values as
# long as df still carries the default integer index from read_csv.
texts = df['text'].tolist()

contexted = [
    texts[i - n:i + 1][::-1]  # texts[i], texts[i - 1], ..., texts[i - n]
    for i in tqdm(range(n, len(texts)))
]

columns = ['response', 'context'] + ['context/' + str(k) for k in range(n - 1)]

# NOTE: `df` is rebound to the windowed frame here, which no longer has a
# 'text' column -- re-run the read_csv cell before re-running this one.
df = pd.DataFrame.from_records(contexted, columns=columns)

HBox(children=(FloatProgress(value=0.0, max=103656.0), HTML(value='')))




In [8]:
import src.dataset
reload(src.dataset)  # pick up edits to src/dataset.py without restarting the kernel
from src.dataset import get_dataloaders

# Build the train / validation loaders from the windowed frame, using the
# sequence length, batch size and validation fraction from the data config.
train_loader, val_loader = get_dataloaders(
    tokenizer,
    df,
    max_len=config['data']['max_len'],
    batch_size=config['data']['batch_size'],
    val_frac=config['data']['val_frac'],
)

train dataset has 93291 samples and val dataset has 10365 samples


In [9]:
# Sanity check: look at the first training batch only and print the shape of
# every entry it contains.
for batch in train_loader:
    for field_name, field_value in batch.items():
        print(field_name, field_value.shape)
    break

input_ids torch.Size([4, 256])
attention_mask torch.Size([4, 256])


## model

In [10]:
import src.model
reload(src.model)  # pick up edits to src/model.py without restarting the kernel
from src.model import ErfBot


# Start from the pretrained checkpoint named in the config and forward the
# `opt` section (lr, max_epochs, weight_decay) as keyword arguments.
# Disabled alternative -- build from the custom model config instead:
#     config=GPT2Config(**config['model'])
model = ErfBot(
    pretrained=config['model_name'],
    **config['opt'],
)
model

ErfBot(
  (model): GPT2LMHeadModel(
    (transformer): GPT2Model(
      (wte): Embedding(50257, 768)
      (wpe): Embedding(1024, 768)
      (drop): Dropout(p=0.1, inplace=False)
      (h): ModuleList(
        (0): Block(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): Attention(
            (c_attn): Conv1D()
            (c_proj): Conv1D()
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (resid_dropout): Dropout(p=0.1, inplace=False)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): MLP(
            (c_fc): Conv1D()
            (c_proj): Conv1D()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (1): Block(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): Attention(
            (c_attn): Conv1D()
            (c_proj): Conv1D()
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (res

In [11]:
# model = ErfBot.load_from_checkpoint('weights/gpt2/model.ckpt')

In [12]:
# Display the parameter count reported by ErfBot.count_parameters().
model.count_parameters()

124439808

## trainer

In [13]:
# TensorBoard logger configured with save_dir='logs/' and name='gpt2_logs'.
logger = TensorBoardLogger(save_dir='logs/', name='gpt2_logs')

# Checkpoint callback: monitors val_loss, keeps the top 1 checkpoint and puts
# the epoch and val_loss into the file name under weights/gpt2.
checkpoint = ModelCheckpoint(
    dirpath='weights/gpt2',
    filename='{epoch}-{val_loss:.2f}',
    monitor='val_loss',
    save_top_k=1,
    period=1,
)

# Learning-rate monitor with a per-step logging interval.
lr_logger = LearningRateMonitor(logging_interval='step')

## defining trainer
# NOTE(review): `period=` (ModelCheckpoint) and `gpus=` (Trainer) are accepted
# by the Lightning release this notebook was run with; later releases renamed
# these arguments -- check before upgrading.
trainer = pl.Trainer(
    benchmark=True,
    gpus=1,
    logger=logger,
    max_epochs=config['opt']['max_epochs'],
    callbacks=[checkpoint, lr_logger],
)

GPU available: True, used: True
GPU available: True, used: True
TPU available: None, using: 0 TPU cores
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


In [None]:
# Start training: the trainer is given the model together with the training
# and validation loaders built in the data section.
trainer.fit(model, train_loader, val_loader)


  | Name  | Type            | Params
------------------------------------------
0 | model | GPT2LMHeadModel | 124 M 
------------------------------------------
124 M     Trainable params
0         Non-trainable params
124 M     Total params

  | Name  | Type            | Params
------------------------------------------
0 | model | GPT2LMHeadModel | 124 M 
------------------------------------------
124 M     Trainable params
0         Non-trainable params
124 M     Total params


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

In [None]:
# trainer.save_checkpoint('weights/gpt2/model.ckpt')

## chat

In [None]:
# Interactive chat: ten user turns. The full token history returned by
# generate() is fed back in as context on every following turn.
chat_model = model.model

# The module may still be in training mode after trainer.fit; switch to eval
# mode so dropout is disabled while generating replies.
chat_model.eval()

# Follow the device the weights actually live on instead of hard-coding
# .cuda(), so the cell also works when the model sits on the CPU.
chat_device = next(chat_model.parameters()).device

for step in range(10):
    new_user_input_ids = tokenizer.encode(
        input(">> User:") + tokenizer.eos_token,
        return_tensors='pt',
    ).to(chat_device)

    # First turn: the prompt is the user's message alone; afterwards it is
    # appended to the running history along the sequence dimension.
    if step > 0:
        bot_input_ids = torch.cat(
            [chat_history_ids.to(chat_device), new_user_input_ids], dim=-1
        )
    else:
        bot_input_ids = new_user_input_ids

    chat_history_ids = chat_model.generate(
        bot_input_ids,
        max_length=1000,
        top_p=0.9,
        num_beams=8,
        do_sample=True,
        temperature=0.8,
        pad_token_id=tokenizer.eos_token_id,
    )

    # pretty print last output tokens from bot (everything after the prompt)
    reply_ids = chat_history_ids[:, bot_input_ids.shape[-1]:][0]
    print("DialoGPT: {}".format(tokenizer.decode(reply_ids, skip_special_tokens=True)))

In [None]:
# Display the device the Lightning module currently reports.
model.device