In [3]:
from torch.utils.data import Dataset
import json

class ChatData(Dataset):
    def __init__(self, path:str, tokenizer):
        self.data = json.load(open(path, "r"))

        self.X = []
        for i in self.data:
            for j in i['dialog']:
                self.X.append(j['text'])

        for idx, i in enumerate(self.X):
            try:
                self.X[idx] = "<startofstring> "+i+" <bot>: "+self.X[idx+1]+" <endofstring>"
            except:
                break

        self.X = self.X[:5000]
        
        print(self.X[0])

        self.X_encoded = tokenizer(self.X,max_length=40, truncation=True, padding="max_length", return_tensors="pt")
        self.input_ids = self.X_encoded['input_ids']
        self.attention_mask = self.X_encoded['attention_mask']

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return (self.input_ids[idx], self.attention_mask[idx])

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from ChatData import ChatData
from torch.optim import Adam
from torch.utils.data import DataLoader
import tqdm
import torch

def train(chatData, model, optim):

    epochs = 12

    for i in tqdm.tqdm(range(epochs)):
        for X, a in chatData:
            X = X.to(device)
            a = a.to(device)
            optim.zero_grad()
            loss = model(X, attention_mask=a, labels=X).loss
            loss.backward()
            optim.step()
        torch.save(model.state_dict(), "model_state.pt")
        print(infer("hello how are you"))

def infer(inp):
    inp = "<startofstring> "+inp+" <bot>: "
    inp = tokenizer(inp, return_tensors="pt")
    X = inp["input_ids"].to(device)
    a = inp["attention_mask"].to(device)
    output = model.generate(X, attention_mask=a )
    output = tokenizer.decode(output[0])
    return output


device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({"pad_token": "<pad>", 
                                "bos_token": "<startofstring>",
                                "eos_token": "<endofstring>"})
tokenizer.add_tokens(["<bot>:"])

model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

model = model.to(device)

# print(tokenizer.decode(model.generate(**tokenizer("hey i was good at basketball but ",
#                          return_tensors="pt"))[0]))

chatData = ChatData("chat_data.json", tokenizer)
chatData =  DataLoader(chatData, batch_size=64)

model.train()

optim = Adam(model.parameters(), lr=1e-3)

print("training .... ")
train(chatData, model, optim)

print("infer from model : ")
while True:
  inp = input()
  print(infer(inp))

Downloading vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embeding dimension will be 50261. This might induce some performance reduction as *Tensor Cores* will not be available. For more details  about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


<startofstring> I love iphone! i just bought new iphone! <bot>: Thats good for you, i'm not very into new tech <endofstring>
training .... 


  0%|                                                                                           | 0/12 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  8%|██████▉                                                                            | 1/12 [00:28<05:15, 28.70s/it]

<startofstring> hello how are you <bot>: I am a good??? <pad> <pad> <pad> <pad> <pad> <pad> <pad>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 17%|█████████████▊                                                                     | 2/12 [00:52<04:19, 25.90s/it]

<startofstring> hello how are you <bot>: Hi? <bot>: Hi? <endofstring> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 25%|████████████████████▊                                                              | 3/12 [01:16<03:45, 25.04s/it]

<startofstring> hello how are you <bot>: i am a huge gamer. i am a huge gamer. <endofstring> <pad>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 33%|███████████████████████████▋                                                       | 4/12 [01:40<03:15, 24.41s/it]

<startofstring> hello how are you <bot>: Hi <endofstring> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 42%|██████████████████████████████████▌                                                | 5/12 [02:04<02:52, 24.58s/it]

<startofstring> hello how are you <bot>: hello <endofstring> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 50%|█████████████████████████████████████████▌                                         | 6/12 [02:31<02:30, 25.16s/it]

<startofstring> hello how are you <bot>: Hi, how are doing? <endofstring> <pad> <pad> <pad> <pad> <pad> <pad> <pad>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 58%|████████████████████████████████████████████████▍                                  | 7/12 [02:57<02:06, 25.37s/it]

<startofstring> hello how are you <bot>: i am a huge gamer <endofstring> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 67%|███████████████████████████████████████████████████████▎                           | 8/12 [03:23<01:42, 25.57s/it]

<startofstring> hello how are you <bot>: Hi <endofstring> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 75%|██████████████████████████████████████████████████████████████▎                    | 9/12 [03:47<01:15, 25.15s/it]

<startofstring> hello how are you <bot>: i am a huge gamer <endofstring> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 83%|████████████████████████████████████████████████████████████████████▎             | 10/12 [04:12<00:50, 25.07s/it]

<startofstring> hello how are you <bot>: hello how are you? <endofstring> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 92%|███████████████████████████████████████████████████████████████████████████▏      | 11/12 [04:36<00:24, 24.93s/it]

<startofstring> hello how are you <bot>: Hi <endofstring> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
100%|██████████████████████████████████████████████████████████████████████████████████| 12/12 [05:00<00:00, 25.05s/it]


<startofstring> hello how are you <bot>: Hi, how are doing? <endofstring> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
infer from model : 


 hey


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> hey <bot>: Hi, how are doing? <endofstring> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>


 hi


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> hi <bot>: Hi, how are doing? <endofstring> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>


 i am not feeling good


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> i am not feeling good <bot>: i am not feeling really. i am just sitting at home right


 how are you


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> how are you <bot>: i am a huge gamer, my mom is a very good person. <endofstring>


 what are you doing


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> what are you doing <bot>: i am not sure what that is. i am a very experienced person


 how are you


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> how are you <bot>: I am doing great, how are you? <endofstring> <pad> <pad> <pad> <pad> <pad>


 i am feeling sad


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> i am feeling sad <bot>: i am a huge gamer, my mom is a very good person.


 i am not feeling good


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> i am not feeling good <bot>: i am not feeling really. i am just sitting here. <endofstring>
