In [2]:
import pandas as pd
from pathlib import Path
import os
# CUDA reads these variables when it initialises, so they are set before any
# GPU work happens: order devices by PCI bus id, expose only device "7", and
# force synchronous kernel launches.
os.environ.update(
    {
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        "CUDA_VISIBLE_DEVICES": "7",
        "CUDA_LAUNCH_BLOCKING": "1",
    }
)

In [3]:
DATA_DIR = Path("data", "ijcnlp_dailydialog", "train")

# The file holds one dialogue per line. Recent pandas releases reject "\n" as a
# `read_csv` delimiter (it is the line terminator), so the lines are read
# directly. This also keeps quote characters inside a dialogue from being
# interpreted as CSV quoting. Blank lines are skipped, as `read_csv` did.
with open(Path(DATA_DIR, "dialogues_train.txt"), encoding="utf-8") as dialogue_file:
    data = pd.DataFrame(
        {"dialogues": [line.rstrip("\n") for line in dialogue_file if line.strip()]}
    )

In [4]:
def seputterances(row):
    """Split one raw dialogue line into its list of utterances.

    Utterances in the raw text are terminated by the ``__eou__`` marker, so
    splitting on it leaves an empty (or whitespace-only) trailing piece, which
    is discarded. A final piece that contains real text (a line that does not
    end with the marker) is kept instead of being silently dropped.

    Args:
        row: the raw dialogue string. Any non-string value (e.g. NaN, or a
            list produced by an earlier run of this cell) is returned as-is,
            which keeps the cell safe to re-run.

    Returns:
        list[str] of utterances for string input; ``row`` unchanged otherwise.
    """
    # Explicit type check instead of a bare `except:` that would also hide
    # unrelated errors (including KeyboardInterrupt).
    if not isinstance(row, str):
        return row

    pieces = row.split("__eou__")
    # `str.split` always returns at least one piece; drop the last one only
    # when it is the empty remainder after the final marker.
    if not pieces[-1].strip():
        pieces.pop()
    return pieces

# Replace every entry of the "dialogues" column with the value returned by
# `seputterances` for it (the column is overwritten in place).
data["dialogues"] = data["dialogues"].apply(seputterances)

In [5]:
from transformers import AutoTokenizer
# Load the tokenizer that belongs to the "microsoft/DialoGPT-medium" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")

  from .autonotebook import tqdm as notebook_tqdm


In [79]:
num_context = 3  # maximum number of preceding turns kept as history


def build_context_pairs(dialogues, num_context, eos_token):
    """Pair every non-initial utterance with the turns that precede it.

    Args:
        dialogues: iterable of dialogues, each a list of utterance strings.
        num_context: maximum number of immediately preceding utterances to
            include in each history.
        eos_token: separator string appended after every history utterance.

    Returns:
        ``(utterances, histories)``: two parallel lists. ``histories[j]`` is
        the concatenation of up to ``num_context`` turns that directly precede
        ``utterances[j]``, each followed by ``eos_token``.
    """
    utterances, histories = [], []
    for turns in dialogues:
        # The first turn has nothing before it, so pairs start at the second.
        for idx in range(1, len(turns)):
            # Most recent `num_context` turns; the window is shorter near the
            # start of the dialogue. The window size now follows `num_context`
            # instead of a hard-coded 3.
            context = turns[max(0, idx - num_context):idx]
            utterances.append(turns[idx])
            histories.append("".join(turn + eos_token for turn in context))
    return utterances, histories


utterance, history = build_context_pairs(
    data["dialogues"], num_context, tokenizer.eos_token
)

In [80]:
# Sanity check: show the utterance list stored at position 50 of the column.
data["dialogues"].iloc[50]

['On , darn . The grocery store is closed . ',
 ' Yeah , this one closes at ten . We could drive to the 24 - hour store on sixth . ',
 ' Alright . We are out of everything . ',
 ' I wish the store close to us was open 24 - house a day . ',
 ' I know , our schedules are so weird . Sometimes , the little corner store is the only thing within walking distance that ’ s open when we get home . ',
 ' Yeah , and the prices there are very high . ',
 ' I know . Three dollars for milk . ']

In [81]:
# Target utterance of training pair 51. `utterance` is one flat list built
# across all dialogues, so this index is a pair index, not a dialogue index.
utterance[51]

' That ’ s a nice area too . It ’ ll be a good investment for you . '

In [82]:
# History string paired with utterance[51]: the preceding turns, each followed
# by the tokenizer's EOS token.
history[51]

' And it ’ s a bargaining . A house like this in river side costs double the price . <|endoftext|> Great , is it a two bedroom house ? <|endoftext|> No , it has three bedrooms and three beds , and has a living room with a twelve-foot ceiling . There ’ s a two-car garage . <|endoftext|>'

In [83]:
# Pad with the EOS token (presumably because this tokenizer has no pad token
# of its own -- TODO confirm). Consequence: padding and real EOS separators
# share the same token id.
tokenizer.pad_token = tokenizer.eos_token
# Token length that the utterance and the history are each padded/truncated to
# when they are encoded.
max_len = 64

In [85]:
import torch

# Per-example tensors of shape (1, 2 * max_len); concatenated into batches later.
input_ids = []
attention_masks = []
labels = []

# Id used to recognise padding positions when building labels. Read from the
# tokenizer instead of hard-coding 50256.
# NOTE(review): pad_token was set to eos_token, so genuine EOS separators in the
# history share this id and are excluded from the loss as well -- confirm that
# is acceptable.
pad_token_id = tokenizer.pad_token_id

for i in range(len(utterance)):
    # Both segments are lower-cased and padded/truncated to exactly max_len tokens.
    encoded_utterance = tokenizer.encode_plus(
        utterance[i].lower() + tokenizer.eos_token,
        max_length=max_len,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    encoded_history = tokenizer.encode_plus(
        history[i].lower(),
        max_length=max_len,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )

    utterance_ids = encoded_utterance["input_ids"][0]
    history_ids = encoded_history["input_ids"][0]

    # Layout of every example: [utterance (max_len) | history (max_len)].
    # NOTE(review): the response comes first and the loss is computed on the
    # history half -- confirm this ordering is intended.
    ids = torch.cat([utterance_ids, history_ids], dim=0).reshape(1, max_len * 2)
    mask = torch.cat(
        [encoded_utterance["attention_mask"][0], encoded_history["attention_mask"][0]],
        dim=0,
    ).reshape(1, max_len * 2)

    # Labels must be the target token ids themselves; -100 marks positions the
    # loss ignores. (Previously every non-padding position was labelled with the
    # constant 1, i.e. the model was trained to predict token id 1 everywhere.)
    _label = history_ids.masked_fill(history_ids == pad_token_id, -100)

    # The utterance half never contributes to the loss.
    label = torch.cat([torch.full((max_len,), -100), _label], dim=0).reshape(1, max_len * 2)

    input_ids.append(ids)
    attention_masks.append(mask)
    labels.append(label)


In [86]:
# Join the per-example rows along dim 0 so each name now refers to a single
# tensor with one row per training example.
input_ids, attention_masks, labels = [
    torch.cat(rows, dim=0) for rows in (input_ids, attention_masks, labels)
]

In [87]:
from torch.utils.data import TensorDataset

# Bundle the three tensors so each dataset item is the triple
# (input_ids, attention_mask, labels) for one example.
dataset = TensorDataset(input_ids, attention_masks, labels)

In [88]:
# Batch size handed to the DataLoader (examples per batch, not number of batches).
num_batch = 8

In [89]:
from torch.utils.data import DataLoader, RandomSampler

# Serve the dataset in batches of `num_batch`, with example order drawn from a
# random sampler.
shuffling_sampler = RandomSampler(dataset)
dataloader = DataLoader(dataset, batch_size=num_batch, sampler=shuffling_sampler)

In [90]:
from transformers import AutoModelForCausalLM
# Load the causal language model from the same "microsoft/DialoGPT-medium"
# checkpoint name that the tokenizer was loaded from.
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")

In [91]:
import random

SEED = 1234

# Seed every random number generator this notebook relies on.
random.seed(SEED)
torch.manual_seed(SEED)
# The attribute is spelled `deterministic`; the earlier misspelling only created
# an unused attribute and left cuDNN in its default mode.
torch.backends.cudnn.deterministic = True

# Always define `device`, so later cells also work on a machine without a GPU.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
    device = torch.device("cuda")
    print(f"[INFO]: Working on GPU: {device}")
else:
    device = torch.device("cpu")
    print("[INFO]: No GPU is available, using CPU instead")

[INFO]: Working on GPU: cuda


In [96]:


from transformers import get_scheduler
from tqdm.auto import tqdm

num_epochs = 3

# When True, each epoch only prints the tensor shapes of its first batch and
# stops (the debugging behaviour this cell had). Set to False to actually train.
SHAPE_CHECK_ONLY = True

# Move the model to the selected device (the result was previously bound to the
# misspelled name `mode`).
model = model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# One scheduler step per batch across all epochs.
num_training_steps = int(num_epochs * len(dataloader))

# Linear decay of the learning rate to zero, without warm-up.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)


### TRAINING
print("Training the model ...\n")
model.train()
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_epochs):
    for i, batch in enumerate(dataloader):
        if SHAPE_CHECK_ONLY:
            print(batch[0].shape, batch[1].shape, batch[2].shape)
            break

        # Each batch is (input_ids, attention_mask, labels). The `b_` prefix
        # keeps the global `labels` tensor from being overwritten.
        b_input_ids = batch[0].to(device)
        b_attn_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        optimizer.zero_grad()

        outputs = model(
            input_ids=b_input_ids,
            attention_mask=b_attn_mask,
            labels=b_labels,
        )
        loss = outputs.loss

        if i % 100 == 0:
            print(f"Epoch: {epoch}, Batch: {i}, Loss: {loss.item()}")

        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        progress_bar.update(1)

Training the model ...



  0%|          | 0/28521 [00:05<?, ?it/s]

torch.Size([8, 128]) torch.Size([8, 128]) torch.Size([8, 128])
torch.Size([8, 128]) torch.Size([8, 128]) torch.Size([8, 128])
torch.Size([8, 128]) torch.Size([8, 128]) torch.Size([8, 128])





: 