### 1. Importing All Libraries

In [1]:
import yaml
import torch
import nltk
from glob import glob
from transformers import GPT2Tokenizer, GPT2LMHeadModel

from chatbot_files.data import Dialogues
from chatbot_files.utils import set_seed

  from .autonotebook import tqdm as notebook_tqdm


### 2. Downloading the NLTK Libraries

In [2]:
# NLTK resources fetched for this notebook, listed once so the set is easy to audit.
NLTK_PACKAGES = ('wordnet', 'omw-1.4', 'punkt', 'punkt_tab')

# nltk.download() signals failure through its boolean return value, so collect
# the packages that did not download instead of silently ignoring the result.
failed_packages = [name for name in NLTK_PACKAGES if not nltk.download(name)]
if failed_packages:
    print(f'Warning: failed to download NLTK packages: {failed_packages}')

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/rahulbharti/Preojects/college-chatbot-
[nltk_data]     gpt2/venv/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/rahulbharti/Preojects/college-chatbot-
[nltk_data]     gpt2/venv/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/rahulbharti/Preojects/college-chatbot-
[nltk_data]     gpt2/venv/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/rahulbharti/Preojects/college-chatbot-
[nltk_data]     gpt2/venv/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
# Same NLTK setup, routed through the project helper. The recorded output of
# this cell lists the same four packages (wordnet, omw-1.4, punkt, punkt_tab)
# as the explicit nltk.download() calls in the previous cell.
# NOTE(review): the two cells look redundant — confirm and keep only one.
import chatbot_files.utils as utils
utils.nltk_lib()

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/rahulbharti/Preojects/college-chatbot-
[nltk_data]     gpt2/venv/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/rahulbharti/Preojects/college-chatbot-
[nltk_data]     gpt2/venv/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/rahulbharti/Preojects/college-chatbot-
[nltk_data]     gpt2/venv/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/rahulbharti/Preojects/college-chatbot-
[nltk_data]     gpt2/venv/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


### 3. Loading the Model Configuration
and Setting the Seed Value

In [3]:
# set_seed is imported from chatbot_files.utils in the imports cell.

# Read the run configuration. The context manager closes config.yml as soon as
# it has been parsed instead of leaving the handle open until garbage collection.
with open('config.yml') as config_file:
    args = yaml.safe_load(config_file)

# Apply the configured seed before any model or data work below.
set_seed(args['seed'])

# Last expression of the cell: display the loaded configuration.
args

{'structure_dataset_dir': './process_data/structred_data',
 'corpus_dataset_dir': './process_data/corpus_data',
 'train_frac': 0.85,
 'model_name': 'gpt2',
 'seed': 8459,
 'lr': 2e-05,
 'warmup_ratio': 0.1,
 'batch_size': 1,
 'num_epochs': 10,
 'max_len': 100,
 'max_history': 5,
 'models_dir': './models',
 'stop_command': 'bye',
 'top_p': 0.9,
 'top_k': 50,
 'temperature': 0.9,
 'mode': 'train',
 'checkpoint': './models/model_best_6.2074.h5',
 'model_dir': './models'}

### 4. Loading the Model (GPU if Available)

#### 4.1 Load Tokenizer

In [4]:
def load_tokenizer(args):
    """Build the GPT-2 tokenizer and record the dialogue token ids in ``args``.

    Registers ``<bos>`` as the BOS token and ``<speaker1>``/``<speaker2>`` as
    additional special tokens, then encodes the four dialogue markers and
    stores their ids in ``args``.

    Args:
        args: Configuration mapping. ``args['model_name']`` selects the
            pretrained tokenizer. Updated in place with the keys ``sp1_id``,
            ``sp2_id``, ``bos_id`` and ``eos_id``.

    Returns:
        The tokenizer with the special tokens added.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(args['model_name'])

    speaker_tokens = ['<speaker1>', '<speaker2>']
    tokenizer.add_special_tokens({
        'bos_token': '<bos>',
        'additional_special_tokens': speaker_tokens,
    })

    # Build a separate list for the id lookup. Extending `speaker_tokens` in
    # place would also change the list object that was just handed to the
    # tokenizer, should the tokenizer keep a reference to it.
    # NOTE(review): '<eos>' is never passed to add_special_tokens above, so its
    # id is whatever encode() yields for an unregistered token — confirm this
    # is the intended end-of-sequence id.
    marker_tokens = speaker_tokens + ['<bos>', '<eos>']
    sp1_id, sp2_id, bos_id, eos_id = tokenizer.encode(marker_tokens)

    # Expose the ids to the rest of the pipeline through the shared config.
    args['sp1_id'] = sp1_id
    args['sp2_id'] = sp2_id
    args['bos_id'] = bos_id
    args['eos_id'] = eos_id

    return tokenizer

#### 4.2 Load Model 

In [5]:
def load_model(args, tokenizer, device):
    """Load the pretrained GPT-2 LM-head model and fit it to the tokenizer.

    Args:
        args: Configuration mapping; ``args["model_name"]`` names the
            pretrained weights to load.
        tokenizer: Tokenizer whose length sets the size the token embeddings
            are resized to.
        device: Device the model is moved to before the resize.

    Returns:
        The model on ``device`` with its token embeddings resized to
        ``len(tokenizer)``.
    """
    pretrained = GPT2LMHeadModel.from_pretrained(args["model_name"])
    model = pretrained.to(device)

    # The tokenizer may hold extra tokens, so match the embedding size to it.
    vocab_size = len(tokenizer)
    model.resize_token_embeddings(vocab_size)
    return model

#### 4.3 Loading the Model and Tokenizer

In [6]:
# Prefer CUDA when PyTorch reports it as available; otherwise run on the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
args['device'] = device

# Banner showing which device was selected.
separator = "--" * 50
print(separator, f'Using device: {device}', separator, sep='\n')

# The tokenizer is built first because load_model takes it as an argument.
tokenizer = load_tokenizer(args)
model = load_model(args, tokenizer, device)

----------------------------------------------------------------------------------------------------
Using device: cpu
----------------------------------------------------------------------------------------------------


### 5. Training the Model

#### 5.1 Loading the Dataset

In [7]:
def dataset_is_missing(args):
    """Report whether the structured dataset directory has no pickle files.

    Args:
        args: Configuration mapping; ``args["structure_dataset_dir"]`` is the
            directory searched for ``*.pickle`` files.

    Returns:
        True when no ``*.pickle`` file matches, False otherwise.
    """
    pattern = f'{args["structure_dataset_dir"]}/*.pickle'
    # glob() returns a list, and an empty list means nothing matched.
    return not glob(pattern)

In [8]:
# Dialogues is imported from chatbot_files.data in the imports cell.
# The train/valid splits are built only when no pickled dataset exists yet.
if dataset_is_missing(args):
    print("Dataset is missing")
    print('Creating dataset...')

    dialogues = Dialogues(tokenizer, args)
    train_dataset, valid_dataset = dialogues.load()
    print(f"len(train_dataset): {len(train_dataset)}")

    # Hand each split to dialogues.save together with its name.
    splits = {'train': train_dataset, 'valid': valid_dataset}
    for split_name, split_data in splits.items():
        dialogues.save(split_name, tokenizer, split_data)

    print('Dataset created')

#### 5.2 Loading the Trainer and Starting Model Training

In [9]:
# Trainer is imported in this cell rather than in the imports cell at the top.
# Per the recorded output of this cell, constructing it loads the optimizer,
# the train/valid data and the checkpoint named in args.

from chatbot_files.train import Trainer
trainer = Trainer(model, args)
# Training itself is disabled in this run; uncomment the call to start it.
# trainer.train()

Loading the optimizer...


Loading train & valid data...


100%|██████████| 27/27 [00:00<00:00, 26835.59it/s]
100%|██████████| 5/5 [00:00<00:00, 24556.81it/s]

Loading checkpoint...



  checkpoint = torch.load(path, map_location=self.args['device'])


The training restarts with the specified checkpoint: model_best_6.2074.h5


### 6. Interacting with the Chatbot

In [11]:
# Chatbot is imported in this cell rather than in the imports cell at the top.

from chatbot_files.interact import Chatbot
# Same checkpoint path as the one displayed in the loaded configuration;
# presumably pinned here so the cell is explicit about which weights it uses.
args["checkpoint"] = "./models/model_best_6.2074.h5"
chatbot = Chatbot(model, tokenizer, args)
# Starts the interactive session; per the recorded output it stops when the
# user types the "bye" command.
chatbot.run()

Loading checkpoint...


Found checkpoint file: model_best_6.2074.h5
Launching the chatbot...
If you want to stop, type the "bye" command
Bot: Good bye.
