In [1]:
# #Run this code if you are in google colab
# !git clone https://github.com/rahulbharti5/college-chatbot-gpt2 .
# !pip install datasets

### 1. Imporinting All Library

In [2]:
import yaml
import torch
import nltk
from glob import glob
from transformers import GPT2Tokenizer, GPT2LMHeadModel

from chatbot_files.data import Dialogues
from chatbot_files.utils import set_seed

  from .autonotebook import tqdm as notebook_tqdm


### 2. Downloading the NLTK Libraries

In [3]:
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/rahulbharti/Preojects/college-chatbot-
[nltk_data]     gpt2/venv/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/rahulbharti/Preojects/college-chatbot-
[nltk_data]     gpt2/venv/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/rahulbharti/Preojects/college-chatbot-
[nltk_data]     gpt2/venv/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/rahulbharti/Preojects/college-chatbot-
[nltk_data]     gpt2/venv/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

### 3. Opening The Model Configuration
and Setting Seeds Values

In [4]:
### Make Sure the seed is imported
# from utils import set_seed

args = yaml.safe_load(open('config.yml'))
set_seed(args['seed']) 
args

{'structure_dataset_dir': './process_data/structred_data',
 'corpus_dataset_dir': './process_data/corpus_data',
 'train_frac': 0.85,
 'model_name': 'gpt2',
 'seed': 8459,
 'lr': 2e-05,
 'warmup_ratio': 0.1,
 'batch_size': 1,
 'num_epochs': 10,
 'max_len': 100,
 'max_history': 5,
 'models_dir': './models',
 'stop_command': 'bye',
 'top_p': 0.9,
 'top_k': 50,
 'temperature': 0.9,
 'mode': 'train',
 'checkpoint': 'None',
 'model_dir': './models'}

### 4. Opening Loading the Model With GPU

#### 4.1 Load Tokenizer

In [5]:
def load_tokenizer(args):
    tokenizer = GPT2Tokenizer.from_pretrained(args['model_name'])
    special_tokens = ['<speaker1>', '<speaker2>']
    tokenizer.add_special_tokens({
        'bos_token': '<bos>',
        'additional_special_tokens': special_tokens
    })

    # add new token ids to args
    special_tokens += ['<bos>', '<eos>']
    sp1_id, sp2_id, bos_id, eos_id = tokenizer.encode(special_tokens)
    args['sp1_id'] = sp1_id
    args['sp2_id'] = sp2_id
    args['bos_id'] = bos_id
    args['eos_id'] = eos_id

    return tokenizer

#### 4.2 Load Model 

In [6]:
def load_model(args, tokenizer, device):
    model = GPT2LMHeadModel.from_pretrained(args["model_name"]).to(device)
    model.resize_token_embeddings(len(tokenizer))
    return model

#### 4.3 Loding Model and Tokenizeer

In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
args['device'] = device

print("--"*50)
print(f'Using device: {device}')
print("--"*50)

tokenizer = load_tokenizer(args)
model = load_model(args, tokenizer, device)

----------------------------------------------------------------------------------------------------
Using device: cpu
----------------------------------------------------------------------------------------------------


In [8]:
from chatbot_files.data import CorpusDataSet
args['max_length'] = args['max_len']
corapus_train = CorpusDataSet("train",args)

In [9]:
from chatbot_files.utils import PadCollateCorpus

pad = PadCollateCorpus(args)

from torch.utils.data import DataLoader
dataloader1 =    DataLoader(corapus_train,
                            collate_fn=pad,
                            shuffle=True,
                            batch_size=args['batch_size'],
                            num_workers=1,
                            pin_memory=True)

In [10]:
from tqdm.auto import tqdm
for i, batch in enumerate(tqdm(dataloader1)):
    print(len(batch))
    break

  0%|          | 0/44 [00:00<?, ?it/s]

2





In [11]:
from chatbot_files.train import Corpus_Trainer
trainer = Corpus_Trainer(model, args)
trainer.train()

Loading the optimizer...


Loading train & valid data...
Can't find the specified checkpoint
Launch training...
--------------------------------------------------
Epoch: 1
--------------------------------------------------


100%|██████████| 44/44 [03:22<00:00,  4.60s/it]


Train loss: 51.07001772793856 
Train perplexity: 4.431107226863684e+35
Launch validation...


100%|██████████| 8/8 [00:02<00:00,  3.14it/s]


Checkpoint saved: ./models/model_best_17.6164.h5
Best valid loss: 17.6164470911026
Valid loss: 17.6164470911026 
Valid perplexity: 1438511230.7337646
--------------------------------------------------
Epoch: 2
--------------------------------------------------


100%|██████████| 44/44 [01:10<00:00,  1.60s/it]


Train loss: 7.461894994432276 
Train perplexity: 57944.3537338647
Launch validation...


100%|██████████| 8/8 [00:02<00:00,  3.26it/s]


Checkpoint saved: ./models/model_best_7.7663.h5
Best valid loss: 7.766345262527466
Valid loss: 7.766345262527466 
Valid perplexity: 11050.851471424103
--------------------------------------------------
Epoch: 3
--------------------------------------------------


100%|██████████| 44/44 [01:07<00:00,  1.54s/it]


Train loss: 5.664989693598314 
Train perplexity: 773.3494353294373
Launch validation...


100%|██████████| 8/8 [00:02<00:00,  3.34it/s]


Checkpoint saved: ./models/model_best_7.0646.h5
Best valid loss: 7.06463423371315
Valid loss: 7.06463423371315 
Valid perplexity: 3078.763620853424
--------------------------------------------------
Epoch: 4
--------------------------------------------------


100%|██████████| 44/44 [01:04<00:00,  1.48s/it]


Train loss: 5.419533079320734 
Train perplexity: 540.6225720752369
Launch validation...


100%|██████████| 8/8 [00:02<00:00,  3.17it/s]


Checkpoint saved: ./models/model_best_6.7155.h5
Best valid loss: 6.715542197227478
Valid loss: 6.715542197227478 
Valid perplexity: 2067.341737985611
--------------------------------------------------
Epoch: 5
--------------------------------------------------


100%|██████████| 44/44 [01:18<00:00,  1.78s/it]


Train loss: 5.32859448411248 
Train perplexity: 468.2930775555697
Launch validation...


100%|██████████| 8/8 [00:02<00:00,  3.30it/s]


RuntimeError: [enforce fail at inline_container.cc:603] . unexpected pos 718434752 vs 718434644