In [2]:
import pandas as pd
from pathlib import Path
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 
os.environ["CUDA_VISIBLE_DEVICES"] = "7"
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [3]:
DATA_DIR = Path("data", "ijcnlp_dailydialog", "train")
data = pd.read_csv(Path(DATA_DIR, "dialogues_train.txt"),  delimiter = "\n", names = ["dialogues"])

In [4]:
def seputterances(row):
    try:
        row = row.split("__eou__")
        row = row[:-1]
        return row
    except:
        return row

data["dialogues"] = data["dialogues"].apply(seputterances)

In [5]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
num_context = 3

utterance = []
history = []

for i in data.index:

    row = data["dialogues"][i]

    for idx  in range(len(row)):

        if idx != 0:
            
            utterance.append(row[idx])

            counter = 1
            _history = ""
            
            for k in range(idx-1, -1, -1):
                if counter <= num_context:
                    _history = _history + row[k]
                    counter +=1
                else:
                    break
                _history = _history + tokenizer.eos_token
            history.append(_history)
        else:
            continue

In [7]:
tokenizer.pad_token = tokenizer.eos_token
max_len = 32

In [10]:
import torch

input_ids = []

attention_masks = []

labels = []

for i in range(len(utterance)):
    
    encoded_utterance = tokenizer.encode_plus(utterance[i].lower() + tokenizer.eos_token, max_length = max_len, padding= "max_length", truncation = True, return_tensors = "pt")
    
    encoded_history = tokenizer.encode_plus(history[i].lower(), max_length = max_len, truncation = True, padding= "max_length", return_tensors = "pt")
    ids = torch.cat([encoded_utterance["input_ids"][0], encoded_history["input_ids"][0]], dim=0).reshape(1,max_len*2)
    mask = torch.cat([encoded_utterance["attention_mask"][0], encoded_history["attention_mask"][0]], dim=0).reshape(1,max_len*2)
    label = torch.cat([torch.full((max_len,), -100), torch.full((max_len,), 1)], dim = 0).reshape(1, max_len*2)

    input_ids.append(ids)

    attention_masks.append(mask)
    labels.append(label)

In [11]:
input_ids = torch.cat(input_ids, dim = 0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.cat(labels, dim=0)

In [12]:
from torch.utils.data import TensorDataset

dataset = TensorDataset(input_ids, attention_masks, labels)

In [13]:
from torch.utils.data import DataLoader, RandomSampler

dataloader = DataLoader(
            dataset,
            sampler = RandomSampler(dataset),
            batch_size = 4
        )

In [15]:
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")

In [16]:
import random

SEED  = 1234

random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.derterministic = True

if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
    device = torch.device("cuda")
    print(f"[INFO]: Working on GPU: {device}")
else:
    print("[INFO]: No GPU is available, using CPU instead")

In [17]:


from transformers import get_scheduler
from tqdm.auto import tqdm

clip = 2.0
num_epochs = 3


optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

num_training_steps = int(num_epochs * len(dataloader))

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)


### TRAINING
print(f"Training the model ...\n")
model.train()
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_epochs):
    print("Epoch {} --------------------------".format(epoch+1))

    for i, batch in enumerate(dataloader):
        b_input_ids = batch[0].to(device)
        b_attn_mask = batch[1].to(device)
        labels = batch[2].to(device)

        inputs = {"input_ids": b_input_ids, "attention_mask": b_attn_mask}

        optimizer.zero_grad()
        outputs = model(**inputs, labels = labels)
        loss = outputs.loss

        print(f"Loss: {loss}")
        

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        progress_bar.update(1)

CausalLMOutputWithCrossAttentions(loss=tensor(24.0285, grad_fn=<NllLossBackward0>), logits=tensor([[[ -8.5079, -13.8944, -15.6137,  ..., -13.1127, -13.1199,  -5.0062],
         [ -2.6934, -11.7862, -11.8419,  ...,  -4.7892,  -6.8098,   4.6434],
         [ -8.5184, -14.1671, -15.1450,  ...,  -9.9843, -10.6863,  -0.2564],
         ...,
         [  3.5732,  -8.8661,  -8.1795,  ...,  -6.3937,  -5.3465,   3.1450],
         [  3.5628,  -8.8638,  -8.1741,  ...,  -6.4014,  -5.3478,   3.1328],
         [  3.5748,  -8.8676,  -8.1800,  ...,  -6.3924,  -5.3482,   3.1382]],

        [[ -8.4468, -13.9228, -15.6301,  ..., -13.1007, -13.1457,  -5.0144],
         [ -1.5172,  -5.6393,  -6.5056,  ...,  -6.2017,  -3.8873,   4.0253],
         [ -1.4311, -10.8234, -11.2931,  ..., -10.1325,  -6.8797,   5.9903],
         ...,
         [  5.1737,  -9.0048,  -9.2289,  ...,  -6.6039,  -3.3768,   5.0398],
         [  5.1699,  -9.0141,  -9.2557,  ...,  -6.6256,  -3.3973,   5.0029],
         [  5.1767,  -9.0121,  -