In [3]:
import pandas as pd
import numpy as np
import torch
from importlib import reload
from tqdm.notebook import tqdm
import pytorch_lightning as pl
from transformers import GPT2Config, GPT2Tokenizer

# Seed the random number generators through Lightning's helper so runs are
# repeatable; the call returns the seed it applied (shown as the cell output).
pl.seed_everything(42)

42

## config and tokenizer

In [4]:
import yaml

# Read the experiment configuration (data / model / opt sections).
# yaml.safe_load is used instead of a bare yaml.load(): calling load() without
# a Loader emits a warning on PyYAML 5.x and is an error on PyYAML 6, and the
# safe loader only ever builds plain Python types. The context manager makes
# sure the file handle is closed once parsing is done.
with open('configs/config.yaml') as config_file:
    config = yaml.safe_load(config_file)
config

  This is separate from the ipykernel package so we can avoid doing imports until


{'data': {'batch_size': 32,
  'max_len': 256,
  'csv_file': 'data/processed.csv',
  'tokenizer_name': 'microsoft/DialoGPT-small',
  'val_frac': 0.1},
 'model': {'n_positions': 256,
  'n_ctx': 256512,
  'n_embd': 768,
  'n_layer': 6,
  'n_head': 12,
  'n_inner': 1024},
 'opt': {'lr': 5e-05, 'max_epochs': 100, 'weight_decay': 0.001}}

In [98]:
# Load the pretrained tokenizer whose name is given in the config under
# data.tokenizer_name.
tokenizer = GPT2Tokenizer.from_pretrained(config['data']['tokenizer_name'])
# Use the end-of-sequence token for padding as well, so that
# pad_token_id ends up equal to eos_token_id.
tokenizer.pad_token = tokenizer.eos_token

In [115]:
# Copy the tokenizer-derived sizes and special-token ids into the model
# section of the config, then display the resulting config.
config['model'].update(
    vocab_size=tokenizer.vocab_size,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
    bos_token_id=tokenizer.bos_token_id,
)
config

{'data': {'batch_size': 32,
  'max_len': 256,
  'csv_file': 'processed.csv',
  'tokenizer_name': 'microsoft/DialoGPT-small',
  'val_frac': 0.1},
 'model': {'n_positions': 256,
  'n_ctx': 256512,
  'n_embd': 768,
  'n_layer': 6,
  'n_head': 12,
  'n_inner': 1024,
  'vocab_size': 50257,
  'pad_token_id': 50256,
  'eos_token_id': 50256,
  'bos_token_id': 50256},
 'opt': {'lr': 5e-05, 'max_epochs': 100, 'weight_decay': 0.001},
 'exp': {'save_dir': 'weights/',
  'gradient_accumulation_steps': 1,
  'max_grad_norm': 1.0}}

## data

In [5]:
# Load the CSV whose path is configured under data.csv_file; the next cell
# reads its 'text' column.
df = pd.read_csv(config['data']['csv_file'])

In [6]:
# Build a sliding-window dialogue frame: every row holds the current response
# followed by the n responses that precede it, most recent first.
n = 7  # number of previous responses kept as context for each row

# Pull the column out once as a plain list: looking up df['text'][j] element by
# element inside a nested loop is needlessly slow over the whole corpus.
texts = df['text'].tolist()

contexted = []
for i in tqdm(range(n, len(texts))):
    # texts[i - n:i + 1] is the window ordered oldest -> newest (n + 1 items);
    # reversing it puts the current response first and the context after it.
    contexted.append(texts[i - n:i + 1][::-1])

columns = ['response', 'context'] + ['context/' + str(k) for k in range(n - 1)]
# NOTE: `df` is rebound here, so the frame loaded from the CSV is gone after
# this cell runs; reload the CSV before running this cell a second time.
df = pd.DataFrame.from_records(contexted, columns=columns)

  0%|          | 0/103656 [00:00<?, ?it/s]

In [109]:
# Re-import the local `dataset` module so edits made to it on disk are picked
# up without restarting the kernel.
import dataset
reload(dataset)
from dataset import get_dataloaders

# max_len, batch_size and val_frac are taken straight from the data section
# of the config and forwarded as keyword arguments.
train_loader, val_loader = get_dataloaders(
    tokenizer,
    df,
    **{key: config['data'][key] for key in ('max_len', 'batch_size', 'val_frac')}
)

train dataset has 93291 samples and val dataset has 10365 samples


## model

In [None]:
# Import the local module under an alias: the name `model` is reserved for the
# ErfBot instance created below, so the module and the instance never share a
# name (previously a failed construction left `model` bound to the module).
import model as model_module
reload(model_module)  # pick up on-disk edits to model.py without a kernel restart
from model import ErfBot


# The model section of the config becomes the GPT2Config; the opt section is
# forwarded to ErfBot as keyword arguments.
model = ErfBot(config=GPT2Config(**config['model']),
               **config['opt'])
model

In [None]:
# count_parameters is defined on ErfBot in model.py (not shown here);
# presumably it reports the number of model parameters - confirm there.
model.count_parameters()

## trainer

In [None]:
# None of these names are imported in the visible cells of this notebook, so
# import them here; otherwise this cell raises NameError on a fresh kernel.
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

# TensorBoard logs go under logs/gpt2_logs.
logger = TensorBoardLogger(
    save_dir='logs/',
    name='gpt2_logs'
)

# Keep only the top-ranked checkpoint for the monitored `val_loss`.
# The former `period=1` argument is omitted: it matched the default (check
# every epoch) and was deprecated and later removed from ModelCheckpoint.
checkpoint = ModelCheckpoint(dirpath='weights/gpt2',
                             filename='{epoch}-{val_loss:.2f}',
                             monitor='val_loss',
                             save_top_k=1)

# Log the learning rate at every optimizer step.
lr_logger = LearningRateMonitor(logging_interval='step')

## defining trainer
# NOTE(review): `gpus=` was superseded by `accelerator`/`devices` in later
# Lightning releases - confirm against the installed version before upgrading.
trainer = Trainer(benchmark=True,
                  gpus=1,
                  logger=logger,
                  max_epochs=config['opt']['max_epochs'],
                  callbacks=[checkpoint, lr_logger])

In [None]:
# Start training: `train_loader` supplies the training batches and
# `val_loader` is passed as the validation data.
trainer.fit(model, train_loader, val_loader)