In [18]:
import pandas as pd
from pathlib import Path
import os
# CUDA environment configuration for this process. These variables are set
# here, before any later cell imports torch or touches the GPU.
os.environ.update(
    {
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        "CUDA_VISIBLE_DEVICES": "7",
        "CUDA_LAUNCH_BLOCKING": "1",
    }
)

In [19]:
DATA_DIR = Path("data", "ijcnlp_dailydialog", "train")

# The corpus file stores one dialogue per line. Read the lines directly rather
# than through pd.read_csv(delimiter="\n"): recent pandas versions reject "\n"
# as a delimiter, and the CSV parser would treat double quotes inside a
# dialogue as field quoting. Blank lines are skipped, as read_csv did.
with open(Path(DATA_DIR, "dialogues_train.txt"), encoding="utf-8") as dialogue_file:
    data = pd.DataFrame(
        {"dialogues": [line.rstrip("\n") for line in dialogue_file if line.strip()]}
    )

In [20]:
def seputterances(row):
    """Split one raw dialogue line into a list of utterances.

    Utterances are separated by the literal marker ``__eou__``. A line that
    ends with the marker produces an empty (or whitespace-only) final piece,
    which is dropped. A line that does NOT end with the marker keeps its last
    utterance instead of silently losing it.

    Args:
        row: The raw dialogue text. Any non-string value (for example a list
            produced by a previous run of this cell, or a missing value) is
            returned unchanged, so the step stays safe to re-run.

    Returns:
        A list of utterance strings (surrounding whitespace preserved), or
        ``row`` itself when it is not a string.
    """
    # Explicit type check instead of a bare ``except`` that hid every error.
    if not isinstance(row, str):
        return row

    pieces = row.split("__eou__")
    # Only discard the final piece when it is the empty remainder after a
    # terminating marker; a real trailing utterance must be kept.
    if not pieces[-1].strip():
        pieces = pieces[:-1]
    return pieces

# Replace each raw dialogue string with its list of utterances.
data["dialogues"] = data["dialogues"].map(seputterances)

In [21]:
from transformers import AutoTokenizer
# Load the tokenizer published with the "microsoft/DialoGPT-medium" checkpoint;
# the same checkpoint name is used further down when the model is loaded.
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")

In [22]:
# Maximum number of preceding turns kept as context for each response.
num_context = 3

# Parallel lists: utterance[j] is a response, history[j] is its context string.
utterance = []
history = []

for dialogue_idx in data.index:

    turns = data["dialogues"][dialogue_idx]

    # The opening turn has no context, so responses start at position 1.
    for pos in range(1, len(turns)):
        utterance.append(turns[pos])

        # Up to `num_context` turns directly before the response, newest
        # first, each one followed by the tokenizer's EOS token.
        preceding = turns[max(0, pos - num_context):pos][::-1]
        history.append("".join(turn + tokenizer.eos_token for turn in preceding))

In [23]:
# Reuse the end-of-sequence token as the padding token, so that encodings
# padded to a fixed length have a token to pad with.
tokenizer.pad_token = tokenizer.eos_token
# Token budget per segment (context / response); the encoding cell builds
# training sequences of 2 * max_len tokens.
max_len = 32

In [24]:
import torch

# Build one causal-LM training example per (history, utterance) pair.
#
# Layout of every example (total length max_len * 2):
#     [context tokens][response tokens + EOS][padding]
# Labels are -100 (ignored by the loss) on context and padding, and equal to
# the token ids on the response, so the model is only trained to generate the
# reply given the preceding turns. The previous version put the response
# before its context, padded in the middle of the sequence and used the
# constant 1 as the target for every context position.
input_ids = []

attention_masks = []

labels = []

seq_len = max_len * 2
ignore_index = -100
eos = tokenizer.eos_token
# Fall back to EOS if no pad token was configured; padded positions are masked
# out and ignored by the loss either way.
pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id

for i in range(len(utterance)):

    # history[i] stores the previous turns newest-first, each followed by EOS.
    # Restore chronological order so the response directly follows the turn
    # it replies to.
    past_turns = [turn for turn in history[i].split(eos) if turn]
    context_text = "".join(turn.lower() + eos for turn in reversed(past_turns))
    # Keep the LAST max_len tokens: when truncating, the most recent context
    # is the part worth keeping.
    context_ids = tokenizer.encode(context_text)[-max_len:]

    response_ids = tokenizer.encode(utterance[i].lower() + eos)
    if len(response_ids) > max_len:
        # Truncate but keep the terminating EOS so the model still learns
        # where a reply ends.
        response_ids = response_ids[:max_len - 1] + [tokenizer.eos_token_id]

    num_real = len(context_ids) + len(response_ids)
    num_pad = seq_len - num_real

    ids = context_ids + response_ids + [pad_id] * num_pad
    mask = [1] * num_real + [0] * num_pad
    label = [ignore_index] * len(context_ids) + response_ids + [ignore_index] * num_pad

    # Shape (1, seq_len) so the examples can be concatenated along dim 0.
    input_ids.append(torch.tensor(ids, dtype=torch.long).reshape(1, seq_len))

    attention_masks.append(torch.tensor(mask, dtype=torch.long).reshape(1, seq_len))
    labels.append(torch.tensor(label, dtype=torch.long).reshape(1, seq_len))

In [25]:
# Concatenate the per-example tensors along dim 0, turning each list into a
# single tensor bound to the same name.
input_ids, attention_masks, labels = (
    torch.cat(per_example, dim=0)
    for per_example in (input_ids, attention_masks, labels)
)

In [26]:
from torch.utils.data import TensorDataset

# Bundle the three tensors so each dataset item is the tuple
# (input_ids, attention_mask, labels); the training loop indexes batches in
# this same order.
dataset = TensorDataset(input_ids, attention_masks, labels)

In [42]:
# Mini-batch size handed to the DataLoader. The DataLoader cell must be re-run
# after changing this value for the new size to take effect.
num_batch = 8

In [39]:
from torch.utils.data import DataLoader, RandomSampler

# Draw examples in a fresh random order on every pass over the dataset.
shuffling_sampler = RandomSampler(dataset)
dataloader = DataLoader(dataset, batch_size=num_batch, sampler=shuffling_sampler)

In [28]:
from transformers import AutoModelForCausalLM
# Load the pretrained causal language model from the same
# "microsoft/DialoGPT-medium" checkpoint the tokenizer was loaded from.
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")

In [43]:
import random

SEED = 1234

# Seed every random number generator used in this notebook.
random.seed(SEED)
torch.manual_seed(SEED)
# The attribute name was previously misspelled ("derterministic"), which only
# created an unused attribute and left cuDNN's behaviour unchanged.
torch.backends.cudnn.deterministic = True
# Auto-tuning may pick different kernels from run to run; disable it so the
# deterministic setting above is meaningful.
torch.backends.cudnn.benchmark = False

if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
    device = torch.device("cuda")
    print(f"[INFO]: Working on GPU: {device}")
else:
    # `device` must exist on this path too; later cells call `.to(device)`.
    device = torch.device("cpu")
    print("[INFO]: No GPU is available, using CPU instead")

[INFO]: Working on GPU: cuda


In [44]:


from transformers import get_scheduler
from tqdm.auto import tqdm

clip = 2.0      # maximum gradient norm used for clipping
num_epochs = 3

# Move the model's parameters to the selected device (the result was
# previously bound to a misspelled name, `mode`).
model = model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# One optimizer step per batch, so the schedule spans every batch of every epoch.
num_training_steps = int(num_epochs * len(dataloader))

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)


### TRAINING
print("Training the model ...\n")
model.train()
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_epochs):
    for i, batch in enumerate(dataloader):
        # Each batch is (input_ids, attention_mask, labels), the order used
        # when the TensorDataset was built.
        b_input_ids = batch[0].to(device)
        b_attn_mask = batch[1].to(device)
        # Use a batch-local name so the dataset-level `labels` tensor is not
        # overwritten by the last batch.
        b_labels = batch[2].to(device)

        optimizer.zero_grad()
        outputs = model(
            input_ids=b_input_ids,
            attention_mask=b_attn_mask,
            labels=b_labels,
        )
        loss = outputs.loss

        if i % 100 == 0:
            # .item() yields a plain Python float, so the log shows a number
            # rather than a tensor repr.
            print(f"Epoch: {epoch}, Batch: {i}, Loss: {loss.item():.4f}")

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        # Advance the linear schedule; without this call the learning rate
        # stays at its initial value for the whole run.
        lr_scheduler.step()

        progress_bar.update(1)

progress_bar.close()

Training the model ...



  0%|          | 0/7131 [00:10<?, ?it/s]


RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 15.90 GiB total capacity; 15.04 GiB already allocated; 3.75 MiB free; 15.13 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF