In [None]:
# Mount Google Drive at /content/drive; the paths used later in this notebook
# (input CSV, cache and output directories) all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

# Set both CUDA selection variables in one go: visible device list "0" and
# device ordering "PCI_BUS_ID".
os.environ.update({
    'CUDA_VISIBLE_DEVICES': '0',
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
})

In [None]:
! pip -q install transformers

[K     |████████████████████████████████| 5.5 MB 8.8 MB/s 
[K     |████████████████████████████████| 163 kB 66.1 MB/s 
[K     |████████████████████████████████| 7.6 MB 49.1 MB/s 
[?25h

In [None]:
!pip -q install tensorboardX

[?25l[K     |██▋                             | 10 kB 27.2 MB/s eta 0:00:01[K     |█████▎                          | 20 kB 5.6 MB/s eta 0:00:01[K     |███████▉                        | 30 kB 7.9 MB/s eta 0:00:01[K     |██████████▌                     | 40 kB 5.4 MB/s eta 0:00:01[K     |█████████████                   | 51 kB 5.3 MB/s eta 0:00:01[K     |███████████████▊                | 61 kB 6.3 MB/s eta 0:00:01[K     |██████████████████▎             | 71 kB 6.7 MB/s eta 0:00:01[K     |█████████████████████           | 81 kB 6.1 MB/s eta 0:00:01[K     |███████████████████████▌        | 92 kB 6.7 MB/s eta 0:00:01[K     |██████████████████████████▏     | 102 kB 6.2 MB/s eta 0:00:01[K     |████████████████████████████▊   | 112 kB 6.2 MB/s eta 0:00:01[K     |███████████████████████████████▍| 122 kB 6.2 MB/s eta 0:00:01[K     |████████████████████████████████| 125 kB 6.2 MB/s 
[?25h

## Import Libraries

In [None]:
import pickle, random, re, torch, pandas as pd, numpy as np
from typing import Dict, List, Tuple
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from tqdm.notebook import tqdm, trange
from pathlib import Path
from transformers import (AutoConfig,
                          AutoModelForCausalLM,
                          AutoTokenizer,
                          PreTrainedModel,
                          PreTrainedTokenizer,
                          get_linear_schedule_with_warmup)
from tensorboardX import SummaryWriter
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from pathlib import Path
# Size suffix of the DialoGPT checkpoint; it is interpolated into the
# checkpoint name and the output directory by the Args class.
model_size = "small" 

## Defining Arguments

In [None]:
# Args to allow for easy conversion of python script to notebook
class Args:
    """Plain attribute container holding every setting used for training.

    It plays the role that parsed command-line arguments would play in a
    script: other code reads (and occasionally adds) attributes on an instance.
    """

    def __init__(self):
        # One checkpoint identifier is shared by the model, config and tokenizer.
        checkpoint = f'microsoft/DialoGPT-{model_size}'

        # --- locations and model identity ---
        self.output_dir = f'/content/drive/MyDrive/ChatBotProject/output/output-{model_size}'
        self.model_type = 'gpt2'
        self.model_name_or_path = checkpoint
        self.config_name = checkpoint
        self.tokenizer_name = checkpoint
        self.cache_dir = '/content/drive/MyDrive/ChatBotProject/cached'

        # --- batching ---
        self.block_size = 512
        self.per_gpu_train_batch_size = 4
        self.gradient_accumulation_steps = 1

        # --- optimisation ---
        self.learning_rate = 5e-5
        self.weight_decay = 0.0
        self.adam_epsilon = 1e-8
        self.max_grad_norm = 1.0

        # --- schedule ---
        self.num_train_epochs = 50
        self.max_steps = -1
        self.warmup_steps = 0

        # --- logging, checkpointing, reproducibility ---
        self.logging_steps = 1000
        self.save_total_limit = None
        self.seed = 42
        self.local_rank = -1


args = Args()

## Data Loading

In [None]:
# Read the customer-question / chatbot-response pairs from Drive and preview
# the first rows.
data = pd.read_csv("/content/drive/MyDrive/ChatBotProject/input/Chatbot_custom_data.csv")
data.head()

Unnamed: 0,Questions By Customer,Response By Chatbot
0,hi bot are you there?,"Hello, I am Adam and i am here to assist you t..."
1,hi are you there?,"Hi, I am Adam and welcome to our restaurant"
2,are you there?,"Hi, I am Adam and welcome to our restaurant"
3,there?,"Hi, I am Adam and welcome to our restaurant"
4,hey how are you,I am feeling very good What about you?


In [None]:
# Pull the two text columns out as arrays and report how many entries each has.
ques, anses = (
    data[column].values
    for column in ('Questions By Customer', 'Response By Chatbot')
)
print("Size of Questions : {} - Size of Answers : {}".format(len(ques),len(anses)))

Size of Questions : 378 - Size of Answers : 378


In [None]:
# Interleave every question with its answer: [q0, a0, q1, a1, ...].
conversations = [utterance for pair in zip(ques, anses) for utterance in pair]

## Creating Contexts

In [None]:
# Number of preceding utterances kept as context for each response.
n = 7
# Each row holds one utterance followed by the n utterances before it,
# most recent first (a reversed window of n + 1 consecutive utterances).
contexted = [
    conversations[end - n:end + 1][::-1]
    for end in range(n, len(conversations))
]
columns = ['response'] + ['context ' + str(k) for k in range(1, n + 1)]
df = pd.DataFrame.from_records(contexted, columns=columns)
df.head()

Unnamed: 0,response,context 1,context 2,context 3,context 4,context 5,context 6,context 7
0,"Hi, I am Adam and welcome to our restaurant",there?,"Hi, I am Adam and welcome to our restaurant",are you there?,"Hi, I am Adam and welcome to our restaurant",hi are you there?,"Hello, I am Adam and i am here to assist you t...",hi bot are you there?
1,hey how are you,"Hi, I am Adam and welcome to our restaurant",there?,"Hi, I am Adam and welcome to our restaurant",are you there?,"Hi, I am Adam and welcome to our restaurant",hi are you there?,"Hello, I am Adam and i am here to assist you t..."
2,I am feeling very good What about you?,hey how are you,"Hi, I am Adam and welcome to our restaurant",there?,"Hi, I am Adam and welcome to our restaurant",are you there?,"Hi, I am Adam and welcome to our restaurant",hi are you there?
3,hey i hope you are doing good,I am feeling very good What about you?,hey how are you,"Hi, I am Adam and welcome to our restaurant",there?,"Hi, I am Adam and welcome to our restaurant",are you there?,"Hi, I am Adam and welcome to our restaurant"
4,i am great what about you,hey i hope you are doing good,I am feeling very good What about you?,hey how are you,"Hi, I am Adam and welcome to our restaurant",there?,"Hi, I am Adam and welcome to our restaurant",are you there?


In [None]:
def construct_conv(row, tokenizer, eos = True):
    """Flatten one row of utterances into a single list of token ids.

    The utterances are concatenated in the REVERSE of the order in which
    ``row`` yields them, each one encoded with ``tokenizer.encode``.

    Args:
        row: iterable of utterance strings.
        tokenizer: object providing ``encode(text) -> list[int]`` and
            ``eos_token_id``.
        eos: when True (default) ``tokenizer.eos_token_id`` is appended after
            every utterance; when False the utterances are joined without a
            separator. (Previously this flag was accepted but ignored.)

    Returns:
        A flat ``list`` of token ids; empty if ``row`` is empty.
    """
    separator = [tokenizer.eos_token_id] if eos else []
    conv = []
    for utterance in reversed(list(row)):
        conv.extend(tokenizer.encode(utterance))
        conv.extend(separator)
    return conv

def load_and_cache_examples(args, tokenizer, df_trn):
    """Build the ``ConversationDataset`` used for training.

    Args:
        args: settings object handed to the dataset. Its ``block_size``
            attribute, when present, is now forwarded to the dataset instead of
            being ignored; 512 (the dataset's own default) is used otherwise.
        tokenizer: tokenizer used to encode every utterance.
        df_trn: DataFrame with one conversation per row.

    Returns:
        The constructed ``ConversationDataset``.
    """
    block_size = getattr(args, "block_size", 512)
    return ConversationDataset(tokenizer, args, df_trn, block_size=block_size)

def set_seed(args):
    """Seed Python's ``random``, NumPy and PyTorch with ``args.seed``."""
    seed = args.seed
    # Same three generators, seeded in the same order.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

class ConversationDataset(Dataset):
    """Dataset of tokenized conversations, one flat token-id sequence per row.

    On construction every row of ``df`` is converted with ``construct_conv``
    and the resulting list is also pickled into ``args.cache_dir``. The pickle
    is only written here, never read back.
    """

    def __init__(self, tokenizer: PreTrainedTokenizer, args, df, block_size=512):
        """Tokenize ``df`` row by row and write the examples to the cache file.

        Args:
            tokenizer: tokenizer passed through to ``construct_conv``.
            args: settings object; ``cache_dir`` and ``model_type`` are read.
            df: DataFrame whose rows are conversations.
            block_size: only used to name the cache file (after subtracting the
                difference between ``tokenizer.model_max_length`` and
                ``tokenizer.max_len_single_sentence``); sequences are not
                truncated to it.
        """
        block_size = block_size - (tokenizer.model_max_length - tokenizer.max_len_single_sentence)
        directory = args.cache_dir
        cached_features_file = os.path.join(directory, args.model_type + "_cached_lm_" + str(block_size))

        self.examples = [construct_conv(row, tokenizer) for _, row in df.iterrows()]

        # The cache directory is not guaranteed to exist yet; create it so that
        # opening the file for writing cannot fail with FileNotFoundError.
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(cached_features_file, "wb") as handle:
            pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def __len__(self):
        """Number of conversations in the dataset."""
        return len(self.examples)

    def __getitem__(self, item):
        """Return conversation ``item`` as a ``torch.long`` tensor of token ids."""
        return torch.tensor(self.examples[item], dtype=torch.long)

In [None]:
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float, float]:
    """Fine-tune ``model`` on ``train_dataset`` with AdamW and a linear LR schedule.

    Args:
        args: settings object. Reads ``local_rank``, ``per_gpu_train_batch_size``,
            ``n_gpu``, ``gradient_accumulation_steps``, ``num_train_epochs``,
            ``weight_decay``, ``learning_rate``, ``adam_epsilon``,
            ``warmup_steps``, ``max_grad_norm``, ``logging_steps``, ``seed`` and
            ``device``; sets ``train_batch_size``.
        train_dataset: dataset whose items are token-id tensors that
            ``pad_sequence`` can batch.
        model: causal language model; called as ``model(inputs, labels=labels)``
            and expected to return the loss as the first output.
        tokenizer: tokenizer matching ``model``; supplies the padding id and the
            vocabulary size used to resize the embeddings.

    Returns:
        ``(global_step, mean_loss, 1 - mean_loss)``. The third element is what
        the caller reports as "accuracy"; it is simply one minus the mean
        training loss, not a measured prediction accuracy. If no optimizer step
        was taken, both loss values are ``nan`` instead of raising
        ``ZeroDivisionError``.
    """
    is_main_process = args.local_rank in [-1, 0]
    # Only the main process logs; keep the name bound either way so the final
    # close() cannot raise NameError.
    tb_writer = SummaryWriter() if is_main_process else None
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    # Public accessor instead of the private ``_pad_token`` attribute; 0 is the
    # value ``pad_sequence`` pads with by default.
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0

    def collate(examples: List[torch.Tensor]):
        """Right-pad a batch; padded label positions are set to -100 so they are ignored by the loss."""
        inputs = pad_sequence(examples, batch_first=True, padding_value=pad_id)
        labels = pad_sequence(examples, batch_first=True, padding_value=-100)
        return inputs, labels

    if args.local_rank == -1:
        train_sampler = RandomSampler(train_dataset)
    else:
        # Imported here because the notebook's import cell does not provide it.
        from torch.utils.data.distributed import DistributedSampler
        train_sampler = DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate, drop_last = True)
    t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Unwrap a DataParallel/DistributedDataParallel-style wrapper if present.
    model = model.module if hasattr(model, "module") else model
    model.resize_token_embeddings(len(tokenizer))

    # Biases and LayerNorm weights are excluded from weight decay.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)

    global_step, epochs_trained = 0, 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=not is_main_process)
    set_seed(args)
    # Nothing below switches the model to eval mode, so one call is enough.
    model.train()
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=not is_main_process)
        for step, batch in enumerate(epoch_iterator):
            inputs, labels = batch
            # Skip over-long batches. NOTE(review): 1024 is presumably the
            # model's maximum context length - confirm against model.config.
            if inputs.shape[1] > 1024: continue
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            outputs = model(inputs, labels=labels)
            loss = outputs[0]
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            loss.backward()
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1
                if is_main_process and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    tb_writer.add_scalar("lr", scheduler.get_last_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

    if tb_writer is not None:
        tb_writer.close()
    # Guard against an empty dataloader / zero epochs.
    mean_loss = tr_loss / global_step if global_step > 0 else float("nan")
    return global_step, mean_loss, 1 - mean_loss

In [None]:
def main(df_trn):
    """Fine-tune the configured checkpoint on ``df_trn`` and save the result.

    Builds a fresh ``Args``, loads config/tokenizer/model from the configured
    names, trains, then writes the model, tokenizer and the settings object to
    ``args.output_dir``.

    Args:
        df_trn: DataFrame with one conversation per row.

    Returns:
        ``(overall_loss, overall_accuracy)`` - the second and third values
        returned by ``train``.
    """
    args = Args()
    # Use the GPU when one is available, otherwise fall back to CPU instead of
    # failing when the model is moved to a non-existent CUDA device.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()
    args.device = device
    set_seed(args)

    config = AutoConfig.from_pretrained(args.config_name, cache_dir=args.cache_dir)
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, cache_dir=args.cache_dir)
    model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, from_tf=False, config=config, cache_dir=args.cache_dir)
    model.to(args.device)

    train_dataset = load_and_cache_examples(args, tokenizer, df_trn)
    global_step, overall_loss, overall_accuracy = train(args, train_dataset, model, tokenizer)

    # Persist everything needed to reload the fine-tuned model later.
    os.makedirs(args.output_dir, exist_ok=True)
    model_to_save = (model.module if hasattr(model, "module") else model)
    model_to_save.save_pretrained(args.output_dir)
    tokenizer.save_pretrained(args.output_dir)
    torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
    return overall_loss, overall_accuracy

## Model Training

In [None]:
# Run the whole fine-tuning pipeline on the context DataFrame.
# Note: the value printed as "accuracy" is computed in train() as
# 1 - (mean training loss); it is not a measured prediction accuracy.
overall_loss, overall_accuracy = main(df)
print("Overall loss : ",overall_loss," --- ","Overall accuracy : ",overall_accuracy)

Downloading:   0%|          | 0.00/641 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/351M [00:00<?, ?B/s]

Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Iteration:   0%|          | 0/187 [00:00<?, ?it/s]

Overall loss :  0.09596039436080239  ---  Overall accuracy :  0.9040396056391976


## Thank You