In [2]:
# Run this cell only if you are working in Google Colab
# !git clone https://github.com/rahulbharti5/college-chatbot-gpt2 .
# !pip install datasets

### 1. Importing All Libraries

In [3]:
import yaml
import torch
import nltk
from glob import glob
from transformers import GPT2Tokenizer, GPT2LMHeadModel

from chatbot_files.data import Dialogues
from chatbot_files.utils import set_seed

  from .autonotebook import tqdm as notebook_tqdm


### 2. Downloading the NLTK Data Packages

In [4]:
# NLTK data packages this notebook fetches, downloaded in the listed order.
NLTK_PACKAGES = ('wordnet', 'omw-1.4', 'punkt', 'punkt_tab')
download_status = [nltk.download(package) for package in NLTK_PACKAGES]

# Show the result of the last download as the cell output.
download_status[-1]

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/rahulbharti/Preojects/college-chatbot-
[nltk_data]     gpt2/venv/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/rahulbharti/Preojects/college-chatbot-
[nltk_data]     gpt2/venv/nltk_data...


[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/rahulbharti/Preojects/college-chatbot-
[nltk_data]     gpt2/venv/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/rahulbharti/Preojects/college-chatbot-
[nltk_data]     gpt2/venv/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

### 3. Opening the Model Configuration
and Setting the Seed Value

In [None]:
# `set_seed` must already be imported (the imports cell pulls it from
# `chatbot_files.utils`). When the helper modules sit next to the notebook
# instead, use:
# from utils import set_seed

# Read the run configuration. The context manager closes the file handle
# deterministically instead of leaving it open until garbage collection.
with open('config.yml') as config_file:
    args = yaml.safe_load(config_file)

# Apply the configured seed before any data or model work.
# NOTE(review): presumably seeds the Python/NumPy/torch RNGs - confirm in utils.
set_seed(args['seed'])

# Display the loaded configuration as the cell output.
args

{'structure_dataset_dir': './process_data/structred_data',
 'corpus_dataset_dir': './process_data/corpus_data',
 'train_frac': 0.85,
 'model_name': 'gpt2',
 'seed': 8459,
 'lr': 2e-05,
 'warmup_ratio': 0.1,
 'batch_size': 1,
 'num_epochs': 10,
 'max_len': 100,
 'max_history': 5,
 'models_dir': './models',
 'stop_command': 'bye',
 'top_p': 0.9,
 'top_k': 50,
 'temperature': 0.9,
 'mode': 'train',
 'checkpoint': 'None',
 'model_dir': './models'}

### 4. Loading the Model (on GPU if Available)

#### 4.1 Load Tokenizer

In [6]:
def load_tokenizer(args):
    """Load the GPT-2 tokenizer and register the dialogue special tokens.

    The tokenizer named by ``args['model_name']`` is loaded, ``'<bos>'`` is
    registered as its beginning-of-sequence token, and ``'<speaker1>'`` /
    ``'<speaker2>'`` are registered as additional special tokens.

    Args:
        args: Mutable configuration mapping. Must contain ``'model_name'``.
            It is updated in place with the integer ids ``'sp1_id'``,
            ``'sp2_id'``, ``'bos_id'`` and ``'eos_id'``.

    Returns:
        The configured ``GPT2Tokenizer``.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(args['model_name'])
    speaker_tokens = ['<speaker1>', '<speaker2>']
    tokenizer.add_special_tokens({
        'bos_token': '<bos>',
        # Hand over a copy so this function never shares a mutable list with
        # the tokenizer's internal special-token bookkeeping.
        'additional_special_tokens': list(speaker_tokens),
    })

    # Look the ids up explicitly instead of encoding raw token strings.
    sp1_id, sp2_id = tokenizer.convert_tokens_to_ids(speaker_tokens)
    args['sp1_id'] = sp1_id
    args['sp2_id'] = sp2_id
    args['bos_id'] = tokenizer.bos_token_id
    # No '<eos>' token is ever registered, so use the tokenizer's own
    # end-of-sequence id rather than relying on the unknown-token fallback.
    # For GPT-2 both resolve to '<|endoftext|>'.
    args['eos_id'] = tokenizer.eos_token_id

    return tokenizer

#### 4.2 Load Model 

In [7]:
def load_model(args, tokenizer, device):
    """Build the GPT-2 language model sized for the given tokenizer.

    Args:
        args: Configuration mapping; ``args["model_name"]`` selects the
            pretrained checkpoint.
        tokenizer: Tokenizer whose length determines the embedding size.
        device: Device the model is moved to.

    Returns:
        The ``GPT2LMHeadModel`` on ``device`` with its token embeddings
        resized to ``len(tokenizer)``.
    """
    vocab_size = len(tokenizer)
    gpt2 = GPT2LMHeadModel.from_pretrained(args["model_name"])
    gpt2 = gpt2.to(device)
    # Make room for the special tokens added on top of the base vocabulary.
    gpt2.resize_token_embeddings(vocab_size)
    return gpt2

#### 4.3 Loading Model and Tokenizer

In [8]:
# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
args['device'] = device

# Report the selected device between two separator lines.
separator = "--" * 50
print(separator, f'Using device: {device}', separator, sep="\n")

# The tokenizer goes first: the model's embedding size depends on its length.
tokenizer = load_tokenizer(args)
model = load_model(args, tokenizer, device)

----------------------------------------------------------------------------------------------------
Using device: cpu
----------------------------------------------------------------------------------------------------


In [9]:
from chatbot_files.processing import Processing

In [10]:
# Build the project's Processing helper from the tokenizer and the configured
# training fraction (`train_frac` in config.yml).
processing = Processing(tokenizer, args['train_frac'])

In [11]:
# Load the dialogue data and unpack it into training and validation parts.
# NOTE(review): `_load_daily` is a private method of Processing; presumably it
# loads the DailyDialog corpus - confirm in chatbot_files/processing.py.
train,validation = processing._load_daily()

100%|██████████| 13118/13118 [00:10<00:00, 1297.94it/s]


In [12]:
# Preview the first ten training dialogues (each one is a list of utterances).
train[:10]

[['Say, Jim, How about going for a few beers after dinner?',
  'You know that is tempting but is really not good for our fitness.',
  'What do you mean? It will help us to relax.',
  "Do you really think so? I don't. It will just make us fat and act silly. Remember last time?",
  "I guess you are right. But what shall we do? I don't feel like sitting at home.",
  'I suggest a walk over to the gym where we can play singsong and meet some of our friends.',
  "That's a good idea. I hear Mary and Sally often go there to play pingpong. Perhaps we can make a foursome with them.",
  'Sounds great to me! If they are willing, We could ask them to go dancing with us. That is excellent exercise and fun, Too.',
  "Good. Let's go now.",
  'All right.'],
 ['Can you do push-ups?',
  "Of course I can. It's a piece of cake! Believe it or not, I can do 30 push-ups a minute.",
  "Really? I think that's impossible!",
  'You mean 30 push-ups?',
  'Yeah!',
  "It's easy. If you do exercise everyday, You can 

In [13]:
# Round-trip one sample sentence: tokenise it, run the project's token
# post-processing, then join the tokens back into text.
test_data = "How are you? d"
print(test_data)

# Trim surrounding whitespace and map the typographic apostrophe to ASCII
# before tokenising.
normalized_text = test_data.strip().replace('’', '\'')
token_list = tokenizer.tokenize(normalized_text)
print(token_list)

token_list = processing._process_token_list(token_list)
print(token_list)

new_data = tokenizer.convert_tokens_to_string(token_list)
print(new_data)

How are you? d
['How', 'Ġare', 'Ġyou', '?', 'Ġd']
['How', 'Ġare', 'Ġyou', '?', 'D', '.']
How are you?D.


In [14]:
from tqdm.auto import tqdm

# Inspect only the first utterance of the first dialogue: show the raw text,
# its tokens, the token ids, and the text decoded back from those ids.
for dialogue in tqdm(train):
    for utter in dialogue[:1]:
        tokens = tokenizer.tokenize(utter)
        token_ids = tokenizer.encode(tokens)
        print(utter, tokens, token_ids, tokenizer.decode(token_ids), sep="\n")
    # One dialogue is enough for a sanity check.
    break

  0%|          | 0/11150 [00:00<?, ?it/s]

Say, Jim, How about going for a few beers after dinner?
['Say', ',', 'ĠJim', ',', 'ĠHow', 'Ġabout', 'Ġgoing', 'Ġfor', 'Ġa', 'Ġfew', 'Ġbeers', 'Ġafter', 'Ġdinner', '?']
[25515, 11, 5395, 11, 1374, 546, 1016, 329, 257, 1178, 16800, 706, 8073, 30]
Say, Jim, How about going for a few beers after dinner?





In [15]:
from chatbot_files.data import DialoguesDataset
# Build the training split of the dialogue dataset from the shared config.
# NOTE(review): presumably reads pre-processed files under
# `structure_dataset_dir` - confirm in chatbot_files/data.py.
train_dataset = DialoguesDataset('train', args)

100%|██████████| 27/27 [00:00<00:00, 32929.98it/s]


In [29]:
# Take the first ten samples; element 0 of the sliced result holds the encoded
# input sequences, which are decoded back to text for a visual check.
test1 = train_dataset[0:10]
decoded_samples = [tokenizer.decode(sample_ids) for sample_ids in test1[0]]
for sample_text in decoded_samples:
    print(sample_text)

<bos> <speaker1> When was Rajkiya Engineering College (R. E. C.) Ambedkar Nagar established? <speaker2> Rajkiya Engineering College (R. E. C.) Ambedkar Nagar was established in 2010 by the Government of Uttar Pradesh under a special component plan.<|endoftext|>
<bos> <speaker2> Rajkiya Engineering College (R. E. C.) Ambedkar Nagar was established in 2010 by the Government of Uttar Pradesh under a special component plan. <speaker1> What were the initial branches offered by the college? <speaker2> The college initially offered B. Tech. Programs in three branches: Information Technology (IT), Electrical Engineering (EE), And Civil Engineering (CE), With an intake of 60 students in each branch.<|endoftext|>
<bos> <speaker1> How long did it take for the college to shift to its own campus? <speaker2> The college initially operated from the campus of Kamla Nehru Institute of Technology (K. N. I.T) in Sultanpur. It shifted to its own campus in Ambedkar Nagar in August 2012, About two years aft

In [33]:
from chatbot_files.data import CorpusDataSet
# Rebind `train_dataset` to the training split of the corpus dataset.
# NOTE(review): a later cell sets args['max_length'] before building a
# CorpusDataSet; if the class requires that key, this cell fails on a fresh
# top-to-bottom run - confirm and move the assignment earlier if so.
train_dataset = CorpusDataSet('train', args)

In [32]:
# Decode the encoded sequences of the first ten samples (element 0 of the
# sliced result) and print each one as text.
test1 = train_dataset[0:10]
encoded_sequences = test1[0]
for decoded_text in map(tokenizer.decode, encoded_sequences):
    print(decoded_text)

Rajkiya Engineering College (R. E. C.) Ambedkar Nagar was established by Government of Uttar Pradesh under special component plan in year 2010, The college has started offering B. Tech Programme in three disciplines – Information Technology (IT), Electrical Engineering (EE) and Civil Engineering (CE) with intake of 60 seats in each branches from the session 2010-11.!!!!!!!!!!!!!!!!!!!!!!!
Rajkiya Engineering College, Ambedkar Nagar is one of the best and most reputable government engineering colleges in the state of Uttar Pradesh. Rec Ambedkar Nagar has always been excelling both on the academic and the non-academic fronts. Rajkiya Engineering College, Ambedkar Nagar is an AICTE-approved government engineering college with a well-established library and labs. Every year a large number of students from the college clear the GATE exam and get
The work of college construction started in October 2010 with a budget of 6213. 81Lakh. The college was running in the campus of Kamla Nehru Instit

In [17]:
from chatbot_files.data import Corpus
# Build the project's Corpus helper and load its data as a (train, valid) pair.
# Note: this rebinds the notebook-level name `train`.
corpus = Corpus(tokenizer,args)
train,valid = corpus.load()

In [21]:
# Point `train_dataset` back at the dialogue training split; the cells above
# had rebound the name to a CorpusDataSet.
train_dataset = DialoguesDataset('train', args)

100%|██████████| 27/27 [00:00<00:00, 34327.43it/s]


In [22]:
# Fetch the first training sample with ordinary indexing rather than calling
# the dunder method directly. The middle element is bound to `type_ids` so the
# builtin `type` stays usable in later cells.
# Note: this rebinds the notebook-level name `train` to a single sample.
train, type_ids, labels = train_dataset[0]

In [23]:
from torch.utils.data import DataLoader
from chatbot_files.utils import PadCollate

# Collate callable for dialogue batches, configured from the shared args.
pad = PadCollate(args)

# Shuffled loader over the dialogue training set, batched per the config.
loader_options = {
    'collate_fn': pad,
    'shuffle': True,
    'batch_size': args['batch_size'],
    'num_workers': 1,
    'pin_memory': True,
}
dataloader = DataLoader(train_dataset, **loader_options)


In [24]:
# Pull a single batch from the dialogue loader and show its size and contents.
batch_stream = enumerate(tqdm(dataloader))
for i, batch in batch_stream:
    # Each batch unpacks into exactly three parts.
    (input_ids, token_type_ids, labels) = batch
    print(len(batch), batch, sep="\n")
    break

  0%|          | 0/48 [00:00<?, ?it/s]

3
(tensor([[50257, 50258,  8241,   318,   262,   367,  3727,   286,   262, 35262,
           286,  6188,  8987,    30, 50259,  6187,    13, 14818, 43573,   283,
         18383, 44202,   318,   262, 22669,  8129,   357, 40164,     8,  1222,
           367,  3727,   286,   262, 35262,   286,  6188,  8987,    13, 50256]]), tensor([[50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50259, 50259, 50259, 50259, 50259, 50259,
         50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259,
         50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259]]), tensor([[ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  6187,    13, 14818, 43573,   283,
         18383, 44202,   318,   262, 22669,  8129,   357, 40164,     8,  1222,
           367,  3727,   286,   262, 35262,   286,  6188,  8987,    13, 50256]]))


  0%|          | 0/48 [00:00<?, ?it/s]


In [25]:
from chatbot_files.data import CorpusDataSet

# Expose the configured `max_len` under the additional key `max_length`
# before building the corpus training split.
args.update(max_length=args['max_len'])
corapus_train = CorpusDataSet("train", args)

In [26]:
from chatbot_files.utils import PadCollateCorpus
from torch.utils.data import DataLoader

# Collate callable for corpus batches, configured from the shared args.
pad = PadCollateCorpus(args)

# Shuffled loader over the corpus training set, batched per the config.
dataloader1 = DataLoader(
    corapus_train,
    batch_size=args['batch_size'],
    shuffle=True,
    collate_fn=pad,
    num_workers=1,
    pin_memory=True,
)

In [27]:

# Print the first batch produced by the corpus loader, then stop.
for i, batch in enumerate(tqdm(dataloader1)):
    # NOTE(review): the batch presumably unpacks as `input_ids, labels = batch`
    # - confirm against PadCollateCorpus.
    print(batch)
    break




  0%|          | 0/44 [00:00<?, ?it/s]

(tensor([[36261,   286,  1321,  3037,   318,   262,  4387,  5011,   286,   262,
          5136,    13,   632,  4394,   347,    13,  9634,    13,   554,  6188,
          3037,    13,   383,  2732,   318,   880, 10911,   351,  1029,   886,
          9061,    11, 26603,  3788,  1222,  7283,  6884,    13,  1439, 14492,
          4133,   389, 40582,   351,  1029,  2866,  5230,    13,   383,  7611,
          3094,  7311,   278,  6841,   318,   635,  5257,   416,   262,  5011,
            13,   383,  2732,   468,   257,   880,    12, 22557, 12829,   290,
          1811,   880, 10911, 35650, 39211,   284,   262,  2476,   286,   407,
           691,   262,  7283,   475,   635,  2444,   422,   584, 13346,    13,
           383,  2732,   468,  5079, 10337,   286,  3126,   287,   347,    13]]), tensor([[  286,  1321,  3037,   318,   262,  4387,  5011,   286,   262,  5136,
            13,   632,  4394,   347,    13,  9634,    13,   554,  6188,  3037,
            13,   383,  2732,   318,   880, 1091




In [28]:
# Rebuild the Corpus helper and fetch the college corpus as two splits.
corpus = Corpus(tokenizer, args)
train, validate = corpus._college_corpus()

# Save each split under its label, in the order train then valid.
dataset_types = ['train', 'valid']
datasets = [train, validate]
for split_index, dataset_type in enumerate(dataset_types):
    dataset = datasets[split_index]
    corpus.save(dataset_type, tokenizer, dataset)

Saving train Corpus to file...
Saving train ids to file...


100%|██████████| 44/44 [00:00<00:00, 1283.30it/s]


Saving complete!
Saving valid Corpus to file...
Saving valid ids to file...


100%|██████████| 8/8 [00:00<00:00, 1065.66it/s]

Saving complete!



