In [35]:
import os, sys
import ipdb  # for debugging
from tqdm import tqdm
from datetime import datetime
#import platform, shutil  # detect platform type
import requests, zipfile, io
#import math
from typing import List, Dict, Union
import numpy as np

# Pytorch
import torch
import torch.nn as nn
from torch.nn import functional as F
#from torch.nn import DataParallel

import sentencepiece as spm  # For the tokenizer
import transformers 
from datasets import load_dataset, load_from_disk

# Opt in to TF32 for both matmul and cuDNN kernels. Per the original note this
# improves performance on Ampere-architecture GPUs (e.g. A100s).
for _tf32_backend in (torch.backends.cuda.matmul, torch.backends.cudnn):
    _tf32_backend.allow_tf32 = True

# Empty the GPU cache memory before doing any work.
torch.cuda.empty_cache()

### 1-) Parameters

In [36]:
# --- Device -----------------------------------------------------------------
# Prefer the GPU whenever CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("device: You will be using: ", device)

# --- Paths ------------------------------------------------------------------
dataset_name = "mlabonne/orpo-dpo-mix-40k"  # dataset identifier passed to load_dataset
dataset_path = "data/orpo_dataset"          # on-disk cache of the processed dataset
tokenizer_path = "tokenizers/tok16384"      # location of the pretrained tokenizer
checkpoint_dir = "models/"

# --- Sequence sizes ---------------------------------------------------------
context = 1024         # max_length used when tokenizing prompt / chosen / rejected
prompt_max_size = 512  # samples whose prompt has this many tokens or more are filtered out

# --- Training schedule ------------------------------------------------------
batch_size = 1
epochs = 3
# NOTE(review): 6e-1 is unusually large for a fine-tuning learning rate;
# confirm this is intended and not a typo (e.g. 6e-5).
lr = 6e-1
lr_warmup_steps = 100
log_iters = 50

# --- Objective / numerics ---------------------------------------------------
alpha = 0.5  # weight for the ORPO odds ratio
dtype = torch.bfloat16
# NOTE(review): name looks like a misspelling of "compile"; it is kept as-is
# because other cells may reference it. Presumably toggles torch.compile -- confirm.
complie = False

# --- Regularisation ---------------------------------------------------------
dropout = 0.0
grad_clip = 1.0
weight_decay = 0.0

device: You will be using:  cuda


### 2-) Logging 

In [37]:
# Weights & Biases configuration.
wandb_log = True
project_name = "align_test"
wandb_project = project_name

# The run name is a fixed prefix immediately followed by a timestamp taken
# when this cell runs, so each execution produces a distinct name.
wandb_run_name = f"align_test-run{datetime.now().strftime('%Y%m%d-%H%M%S')}"

# wandb is imported only when logging is switched on.
if wandb_log:
    import wandb

    wandb.init(name=wandb_run_name, project=wandb_project)
    

VBox(children=(Label(value='0.006 MB of 0.006 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

### 3-) Load the tokenizer, then load the processed dataset if it exists, otherwise build it from scratch

In [38]:
# Load the tokenizer stored in Hugging Face format at tokenizer_path.
tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_path)

# Interaction (chat) template. Each message is rendered as a role tag
# (<|user|>, <|system|> or <|assistant|>), a newline, the message content and
# the EOS token; when add_generation_prompt is set, a bare <|assistant|> tag is
# appended after the last message. The adjacent literals below concatenate
# into one single template string.
tokenizer.chat_template = (
    "{% for message in messages %}"
    "{% if message['role'] == 'user' %}\n"
    "{{ '<|user|>\n' + message['content'] + eos_token }}\n"
    "{% elif message['role'] == 'system' %}\n"
    "{{ '<|system|>\n' + message['content'] + eos_token }}\n"
    "{% elif message['role'] == 'assistant' %}\n"
    "{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n"
    "{% endif %}\n"
    "{% if loop.last and add_generation_prompt %}\n"
    "{{ '<|assistant|>' }}\n"
    "{% endif %}\n"
    "{% endfor %}"
)

# Reuse the end-of-sentence token as the padding token.
# (Original note: EOS has id 2 for this tokenizer -- not verified here.)
tokenizer.pad_token = tokenizer.eos_token

# Reuse the processed dataset cached at dataset_path when it exists; otherwise
# download it, filter it, tokenize it and cache the result.
# NOTE(review): the build branch indexes dataset.column_names by 'train',
# while the saved output of the following cell shows a single Dataset --
# confirm both branches yield the container type later cells expect.
if not os.path.exists(dataset_path):

    def filter_dataset(examples):
        """Return True when the sample's prompt has fewer than prompt_max_size tokens.

        The prompt is every message of the "chosen" conversation except the
        last one (per the original note: the prompt minus the answer), rendered
        with the chat template plus the generation prompt. The original
        rationale is that prompt + answer should fit the total context.
        """
        prompt_tokens = tokenizer.apply_chat_template(
            examples["chosen"][:-1],
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
        )
        # Strict comparison: a prompt of exactly prompt_max_size tokens is dropped.
        return prompt_tokens.size(-1) < prompt_max_size

    def process_dataset(examples):
        """Tokenize a batch into prompt, chosen ("positive") and rejected ("negative") fields.

        Returns the tokenizer output for the prompts, extended with the
        input ids and attention masks of the chosen and rejected conversations.
        """
        # Every sequence is padded / truncated to exactly `context` tokens.
        encode_kwargs = dict(
            max_length=context,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # Render conversations to text: the prompt drops the last message and
        # ends with the generation prompt; chosen / rejected are rendered whole.
        prompt_texts = [
            tokenizer.apply_chat_template(turns[:-1], tokenize=False, add_generation_prompt=True)
            for turns in examples["chosen"]
        ]
        chosen_texts = [
            tokenizer.apply_chat_template(turns, tokenize=False)
            for turns in examples["chosen"]
        ]
        rejected_texts = [
            tokenizer.apply_chat_template(turns, tokenize=False)
            for turns in examples["rejected"]
        ]

        batch = tokenizer(prompt_texts, **encode_kwargs)
        chosen_encoded = tokenizer(chosen_texts, **encode_kwargs)
        rejected_encoded = tokenizer(rejected_texts, **encode_kwargs)

        batch["positive_input_ids"] = chosen_encoded["input_ids"]
        batch["positive_attention_mask"] = chosen_encoded["attention_mask"]

        batch["negative_input_ids"] = rejected_encoded["input_ids"]
        batch["negative_attention_mask"] = rejected_encoded["attention_mask"]

        return batch

    # Download the raw dataset and drop the rows coming from "toxic-dpo-v0.2".
    print("Downloading and filtering dataset...")
    dataset = load_dataset(dataset_name)
    dataset = dataset.filter(lambda r: r["source"] != "toxic-dpo-v0.2")

    print(dataset.column_names)

    # Keep only short-enough prompts, then tokenize and drop the raw columns.
    print("Processing dataset...")
    dataset = dataset.filter(filter_dataset)
    dataset = dataset.map(
        process_dataset,
        batched=True,
        num_proc=1,
        remove_columns=dataset.column_names['train'],
    )

    # Cache the processed dataset so later runs take the load_from_disk branch.
    dataset.save_to_disk(dataset_path)
else:
    dataset = load_from_disk(dataset_path)

In [39]:
# Bare last expression: the notebook displays the dataset's repr as the cell output.
dataset

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'positive_input_ids', 'positive_attention_mask', 'negative_input_ids', 'negative_attention_mask'],
    num_rows: 38550
})