In [1]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/trl.git
!pip install flash-attn --no-build-isolation
!pip install quanto


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m38.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for peft (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for trl (pyproject.toml) ... [?25l[?25hdone
Collecting flash-attn
  Downloading flash_attn-2.8.3.tar.gz (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m68.6 MB/s

In [2]:
# Mount Google Drive; this prompts for interactive authentication in Colab.
from google.colab import drive
drive.mount('/content/drive')

# Read the Hugging Face access token from a file on Drive so that it is not
# hard-coded in the notebook; strip() removes surrounding whitespace such as
# a trailing newline.
with open("/content/drive/MyDrive/ColabNotebooks/ParentPalAI/data/hftoken.txt") as f:
    HF_TOKEN = f.read().strip()

# Hub id of the base model used for baseline inference and for DPO fine-tuning.
BASE_MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"

Mounted at /content/drive


In [23]:
# Authenticate this session with the Hugging Face Hub using the token read from Drive.
# NOTE(review): the recorded execution count (23) shows this cell was run late in
# the session; on a fresh "Run All" it presumably needs to run before anything
# touches the gated model repo -- confirm.
from huggingface_hub import login
login(HF_TOKEN)


# Load Training and Test Datasets

In [3]:
# Load the preference-labelled training file and the unseen test file as
# Hugging Face datasets, then carve a validation split out of the training file.
from datasets import load_dataset

DATA_FOLDER = '/content/drive/MyDrive/ColabNotebooks/ParentPalAI/data/'
TRAIN_VAL_DATA_FILE = 'dpo_dataset_dpo_labels'
TEST_DATA_FILE = 'dpo_dataset_test'

# Note: split='train' is requested for both files, including the test file.
train_val_data = load_dataset('json', data_files=DATA_FOLDER+TRAIN_VAL_DATA_FILE+".jsonl", split='train')
# Keep exactly 768 rows for training: 768 / 24 samples per optimizer step
# (batch size 6 x 4 accumulation steps, set in the DPO config) = 32 steps per epoch.
# The remaining rows become the validation set; seed=42 fixes the shuffle.
train_val_data = train_val_data.train_test_split(train_size=768, seed=42)
train_data = train_val_data['train']
val_data = train_val_data['test']
test_data = load_dataset('json', data_files=DATA_FOLDER+TEST_DATA_FILE+".jsonl", split='train')


Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [4]:
# Sanity-check the size of each split.
print (f"Number of train samples: {len(train_data)}")
print (f"Number of val samples: {len(val_data)}")
print (f"Number of test samples: {len(test_data)}")


Number of train samples: 768
Number of val samples: 68
Number of test samples: 394


In [5]:
# Peek at the first raw record of the training and test sets.
print(train_data[0])
print(test_data[0])

{'id': 1169, 'title': 'Navigating Guilt Over Academic Pressures', 'body': 'I pushed my 15-year-old to take advanced classes, thinking it would be good for their future, but now they seem stressed and unhappy. I feel so guilty for putting too much pressure on them. I’ve tried to reassure them that it’s okay to step back, but they seem stuck. How can I support them without adding to their stress?', 'topic': 'Managing parent guilt and burnout - Ages 15-16', 'chosen': "I understand your concern and the feelings of guilt you're experiencing. It's important to remember that while academic success is valuable, a child's emotional well-being is equally crucial. Here are some steps to help you navigate this situation:\n\n1. Acknowledge and Apologize: Acknowledge your role in the situation and apologize for any pressure you may have unintentionally put on your child. This will help them feel heard and understood.\n\n2. Encourage Open Communication: Create an environment where your child feels co

# Formatting to get a single simple prompt
We use a single standard prompt here, with no explicit instruction to express empathy.

In [6]:
# Word budget quoted to the model inside the prompt text.
MAX_WORDS = 250
# Upper bound on the number of newly generated tokens per answer.
MAX_OUTPUT_TOKENS = 600

# Mistral-instruct style prompt: a literal <s>, then the instruction wrapped in
# [INST] ... [/INST]. {MAX_WORDS}, {title} and {body} are filled in by build_prompt.
PROMPT_TEMPLATE = """<s>[INST] You are a parenting assistant. Your job is to help parents make informed decisions and solve day-to-day challenges with their children.
Keep your answers under {MAX_WORDS} words and focused on the user’s specific question.

Question Title: {title}
Question Body: {body}
[/INST]"""

def build_prompt(example):
  """Attach a formatted ``'prompt'`` entry to one dataset record.

  The record's ``'title'`` and ``'body'`` strings are whitespace-stripped and
  substituted into ``PROMPT_TEMPLATE`` together with ``MAX_WORDS``. The record
  is modified in place and also returned, which is the shape ``Dataset.map``
  expects from its callback.
  """
  stripped = {field: example[field].strip() for field in ('title', 'body')}
  example['prompt'] = PROMPT_TEMPLATE.format(MAX_WORDS=MAX_WORDS, **stripped)
  return example



In [7]:
# Add the 'prompt' column to every split by mapping build_prompt over each dataset.
train_data = train_data.map(build_prompt)
val_data = val_data.map(build_prompt)
test_data = test_data.map(build_prompt)


Map:   0%|          | 0/768 [00:00<?, ? examples/s]

Map:   0%|          | 0/68 [00:00<?, ? examples/s]

Map:   0%|          | 0/394 [00:00<?, ? examples/s]

In [8]:
# Confirm that the first record of each split now carries the 'prompt' field.
print(train_data[0])
print(val_data[0])
print(test_data[0])

{'id': 1169, 'title': 'Navigating Guilt Over Academic Pressures', 'body': 'I pushed my 15-year-old to take advanced classes, thinking it would be good for their future, but now they seem stressed and unhappy. I feel so guilty for putting too much pressure on them. I’ve tried to reassure them that it’s okay to step back, but they seem stuck. How can I support them without adding to their stress?', 'topic': 'Managing parent guilt and burnout - Ages 15-16', 'chosen': "I understand your concern and the feelings of guilt you're experiencing. It's important to remember that while academic success is valuable, a child's emotional well-being is equally crucial. Here are some steps to help you navigate this situation:\n\n1. Acknowledge and Apologize: Acknowledge your role in the situation and apologize for any pressure you may have unintentionally put on your child. This will help them feel heard and understood.\n\n2. Encourage Open Communication: Create an environment where your child feels co

# Tokenizing the test dataset

In [9]:
# Set up the tokenizer. Prompts are left-padded so that every prompt in a
# batch ends at the same position, which is where generation starts.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(
    BASE_MODEL_ID,
    use_fast=True,
    padding_side="left",
    add_bos_token=False,  # the prompt text already begins with a literal <s> (see PROMPT_TEMPLATE)
    add_eos_token=False,
    token = HF_TOKEN
)
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/141k [00:00<?, ?B/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [10]:
# Get max_length by tokenizing and count max tokens in training data without padding
import numpy as np
def get_max_tokens(dataset, prompt_lbl):
  """Return the token count of the longest entry in ``dataset[prompt_lbl]``.

  The column is tokenized with the notebook-level ``tokenizer`` without any
  padding or truncation, so each length is the true, un-padded prompt length.
  The result comes from ``np.max`` and is therefore a NumPy integer.
  """
  prompts = list(dataset[prompt_lbl])
  encoded = tokenizer(prompts, return_tensors=None, padding=False, truncation=False)
  lengths = [len(token_ids) for token_ids in encoded['input_ids']]
  return np.max(lengths)

# Longest un-padded prompt, in tokens, for each split.
n_tokens_train_max = get_max_tokens(train_data, 'prompt')
n_tokens_val_max = get_max_tokens(val_data, 'prompt')
n_tokens_test_max = get_max_tokens(test_data, 'prompt')

# Maximum across all three splits; reused later to size max_length for
# padded tokenization and for the DPO prompt/sequence limits.
n_tokens_max = np.max([n_tokens_train_max, n_tokens_val_max, n_tokens_test_max])

print (f"Train Prompt Max Tokens: {n_tokens_train_max}")
print (f"Val Prompt Max Tokens: {n_tokens_val_max}")
print (f"Test Prompt Max Tokens: {n_tokens_test_max}")


Train Prompt Max Tokens: 182
Val Prompt Max Tokens: 160
Test Prompt Max Tokens: 201


In [11]:
# Tokenize prompts with padding and truncation
def tokenize_with_padding(dataset, prompt_lbl):
  """Tokenize ``dataset[prompt_lbl]`` into one padded batch of PyTorch tensors.

  Uses the notebook-level ``tokenizer`` (configured for left padding) with
  padding and truncation enabled. ``max_length`` is ``n_tokens_max + 200``:
  a truncation ceiling with some headroom over the longest observed prompt.
  Returns the tokenizer's encoding object unchanged.
  """
  prompts = list(dataset[prompt_lbl])
  length_ceiling = n_tokens_max + 200  # headroom over the longest prompt seen
  return tokenizer(
      prompts,
      return_tensors="pt",
      padding=True,
      truncation=True,
      max_length=length_ceiling,
  )


In [12]:
# Only the test prompts are tokenized here; they are the input to batch inference.
test_tokenized = tokenize_with_padding(test_data, 'prompt')


In [13]:
# Sanity check: decode one padded example and inspect the padding by eye.
# The tokenizer was configured to left-pad and to use </s> (EOS) as the pad
# token. Author's rationale: Mistral/Llama ship without a dedicated padding
# token, and <s> carries semantic meaning, so padding with it would confuse
# the model.
decoded = tokenizer.batch_decode(test_tokenized["input_ids"], skip_special_tokens=False)
print(decoded[277])  # 277 is just an arbitrary example index


</s><s>[INST] You are a parenting assistant. Your job is to help parents make informed decisions and solve day-to-day challenges with their children.
Keep your answers under 250 words and focused on the user’s specific question.

Question Title: Navigating Independence Without Family Guidance
Question Body: I'm really proud of my 16-year-old son; he's growing into such a responsible young man. But with no grandparents or aunts and uncles around to offer different perspectives, I sometimes wonder if I'm giving him all the tools he needs for adulthood. He recently asked me about managing his own finances, and I realized how much I rely on my own experiences. I'm just not sure if that's enough. How can I ensure he's learning what he needs to succeed, especially when it's just the two of us figuring things out?
[/INST]


# Loading Mistral Instruct v0.3 without Quantization and Inferencing on Test Set

In [None]:
# Load the base model in bf16 WITHOUT quantization for baseline inference
# (author's note: should work and even be faster on an A100).
import torch
# from transformers import AutoModelForCausalLM, BitsAndBytesConfig # for quantization
from transformers import AutoModelForCausalLM

# Allow TF32 matmuls and relax float32 matmul precision.
torch.backends.cuda.matmul.allow_tf32 = True
torch.set_float32_matmul_precision("high")

# Author's note: this un-quantized version was running out of memory on large
# batches, which is why inference below is chunked.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    device_map="auto",
    dtype=torch.bfloat16,
    attn_implementation="flash_attention_2", # FA2 is fastest on A100
    token=HF_TOKEN # authenticates access to the gated model repo
)

# Keep the model config and generation config in agreement with the
# tokenizer's pad token.
model.config.pad_token_id = tokenizer.pad_token_id
model.generation_config.pad_token_id = tokenizer.pad_token_id


In [27]:
import time
import gc

## do inference on tokenized prompts
def free_cuda():
    """Run Python garbage collection and, if CUDA is available, release cached GPU memory."""
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

def get_cleaned_answer(text):
  """Extract the model's answer from a decoded prompt+completion string.

  Everything up to and including the FIRST ``[/INST]`` marker (the prompt) is
  discarded; ``</s>`` tokens (EOS, also used as padding) are removed from the
  remainder and surrounding whitespace is stripped.

  Only the first marker is treated as the prompt boundary, so an answer that
  itself contains the text ``[/INST]`` is kept whole instead of being cut off
  at that point. If the marker is absent, the whole text is cleaned and
  returned rather than raising ``IndexError``.

  Args:
    text: decoded sequence, e.g. ``"</s><s>[INST] ... [/INST] answer</s>"``.

  Returns:
    The cleaned answer string.
  """
  _prompt, marker, completion = text.partition("[/INST]")
  if not marker:
    # No prompt boundary found: fall back to cleaning the full text.
    completion = text
  return completion.replace('</s>', '').strip()

def get_inferences(tokenized_data):
  """Generate completions for one batch of tokenized prompts.

  ``tokenized_data`` is a mapping of tensors (as produced by the tokenizer);
  each tensor is moved to the device of the notebook-level ``model`` and
  passed to ``model.generate`` with greedy decoding. Returns the decoded
  sequences as a list of strings with special tokens kept in the text.
  """
  with torch.inference_mode():  # no autograd bookkeeping during generation
    device_inputs = {name: tensor.to(model.device) for name, tensor in tokenized_data.items()}
    generated = model.generate(
        **device_inputs,
        max_new_tokens=MAX_OUTPUT_TOKENS,
        do_sample=False,  # greedy decoding (always the most likely next token) for deterministic answers
        eos_token_id=tokenizer.eos_token_id,  # lets a sequence stop at EOS instead of always running to max_new_tokens
        pad_token_id=tokenizer.pad_token_id,  # required for batched generation
        use_cache=True,  # reuse the KV cache instead of recomputing past tokens at each step
        return_dict_in_generate=False,
    )
    decoded = tokenizer.batch_decode(generated, skip_special_tokens=False)
    # Drop the tensor references before asking CUDA to release memory.
    del generated, device_inputs
    free_cuda()
  return decoded

def batch_inferences(tokenized_data, chunk_size=50):
  """Run ``get_inferences`` over ``tokenized_data`` in fixed-size chunks.

  Args:
    tokenized_data: mapping of name -> batched tensor that contains an
      ``"input_ids"`` entry. Every value is sliced along its first axis with
      the same index range, so all values must share the same batch length.
    chunk_size: number of examples sent to ``get_inferences`` per call.
      Defaults to 50 (the value previously hard-coded); lower it to reduce
      peak memory use.

  Returns:
    A list with one decoded string per input example, in input order.

  Raises:
    ValueError: if ``chunk_size`` is smaller than 1.
  """
  if chunk_size < 1:
    raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
  all_outputs = []
  stt = time.perf_counter()  # monotonic clock, meant for measuring elapsed time
  n = len(tokenized_data["input_ids"])
  for i in range(0, n, chunk_size):
    print (f"Running for chunk starting at {i}")
    batch = {k: v[i:i+chunk_size] for k, v in tokenized_data.items()}
    out = get_inferences(batch)
    all_outputs.extend(out)
    # Drop this chunk's references before collecting and freeing memory.
    del batch, out
    free_cuda()
  ttt = time.perf_counter() - stt
  print(f"Time taken for {n} samples: {ttt/60:.2f} min ({ttt:.2f} secs)")
  return all_outputs


In [None]:
# Always switch the model to eval mode for inference so that dropout and
# other train-only behaviour is disabled.
model.eval()

# Generate baseline answers for every tokenized test prompt.
test_base_output = batch_inferences(test_tokenized)

In [None]:
# Strip the prompt and </s> tokens from each generation, attach the answers to
# test_data as the 'base_output' column, and write the result as JSON Lines.
# NOTE(review): test_data is rebound with the new column, so re-running this
# cell will presumably fail on the duplicate column name -- confirm.
test_base_output_clean = [get_cleaned_answer(x) for x in test_base_output]
test_data = test_data.add_column('base_output', test_base_output_clean)
test_data.to_json(DATA_FOLDER+TEST_DATA_FILE+'_output_base.jsonl', orient="records", lines=True)


# Re-Loading Mistral Instruct v0.3 with 4-bit Quantization and Preparing for DPO using QLoRA

In [None]:
# Delete the un-quantized model and release memory before loading the
# quantized copy. Check afterwards that GPU memory usage is back to ~0.
del model
free_cuda()

In [14]:
# Reload the base model, this time quantized to 4 bits, as the frozen
# backbone for QLoRA training.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Allow TF32 matmuls and relax float32 matmul precision.
torch.backends.cuda.matmul.allow_tf32 = True
torch.set_float32_matmul_precision("high")


bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, # loads base model in 4-bit precision
    bnb_4bit_use_double_quant=True, # double quantization saves VRAM
    bnb_4bit_quant_type="nf4", # NF4 quantization data type
    bnb_4bit_compute_dtype=torch.bfloat16 # dtype used for computation
)

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
    dtype=torch.bfloat16,
    attn_implementation="flash_attention_2", # FA2 is fastest on A100
    token=HF_TOKEN # authenticates access to the gated model repo
)
# Disable the KV cache for training
model.config.use_cache = False

# Enable gradient checkpointing: activations are recomputed during the
# backward pass instead of being stored, trading extra compute for memory.
model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})

# Keep the model config and generation config in agreement with the
# tokenizer's pad token.
model.config.pad_token_id = tokenizer.pad_token_id
model.generation_config.pad_token_id = tokenizer.pad_token_id


config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [15]:
## LoRA Set up - what params are we going to learn?
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# r is the rank (inner dimension) of the low-rank decomposition.
# In LoRA, instead of learning a full dW (say 4096x4096), we factorize it into
# two skinny matrices: B (4096 x r) and A (r x 4096), so that dW = B @ A.

# alpha: a scaling factor applied to the LoRA update: dW = (alpha / r) * B @ A.
# alpha acts like a multiplier for the LoRA updates - a way to control the
# effective learning rate of the adapter layers.

# Since the dataset is small, we use a narrow LoRA (small r and lora_alpha).
config = LoraConfig(
    r=8,
    lora_alpha=32,  # effective scale alpha / r = 32 / 8 = 4
    # Adapters are attached to the attention projections only.
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
    ],
    bias="none", # training bias rarely helps
    lora_dropout=0.15, # regularization to prevent overfitting
    task_type="CAUSAL_LM",
)

def print_train_params(model):
    """Print the trainable and total parameter counts of ``model``.

    Walks ``model.named_parameters()`` once, sums ``numel()`` over all
    parameters and over those with ``requires_grad`` set, and prints both
    counts together with the trainable percentage. Returns nothing.
    """
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)
    share = 100 * trainable_params / all_param
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {share}"
    )


# Prepare the quantized model for k-bit training, then wrap it with the LoRA
# adapters described by `config`.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, config)

# Put the model in train mode and make the inputs require grad.
model.train()
model.enable_input_require_grads()

# Print the trainable params
print_train_params(model) # the recorded run reports ~0.18% of counted params as trainable


trainable params: 6815744 || all params: 3765178368 || trainable%: 0.18102048120552677


In [16]:
# With more than one GPU visible, flag the model as parallelizable /
# model-parallel. This is a no-op on a single GPU.
if torch.cuda.device_count() > 1:
    model.is_parallelizable = True
    model.model_parallel = True

In [17]:
## DPO Set up
import transformers
from trl.trainer.utils import DPODataCollatorWithPadding
from trl import DPOConfig, DPOTrainer

run_name = "dpo1"
# Checkpoints, logs and the final adapter are written under this Drive folder.
output_dir = "/content/drive/MyDrive/ColabNotebooks/ParentPalAI/model/" + run_name

# Samples per optimizer step = per_device_train_batch_size * gradient_accumulation_steps * num_gpus (1) = 24
# 768 training samples / 24 = 32 optimizer steps per epoch.
# NOTE(review): the training output recorded in this notebook reports 64 steps
# and epoch 2.0, so it appears to come from an earlier 2-epoch run -- confirm.
args = DPOConfig(
    output_dir=output_dir,
    # ---- batches & schedule ----
    per_device_train_batch_size=6,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=2,
    num_train_epochs=1,
    warmup_ratio=0.03,                  # first 3% of training steps are a warmup phase where the learning rate ramps from 0 to 1e-4; prevents instability at the start
    learning_rate=1e-4,
    lr_scheduler_type="cosine",         # learning rate follows a cosine decay curve after warmup
    # ---- precision / optimizer ----
    bf16=True,
    fp16=False,
    optim="paged_adamw_8bit",           # paged 8-bit AdamW, as commonly used with QLoRA
    # ---- logging / eval / save ----
    logging_steps=4,
    logging_dir=output_dir+"/logs",
    eval_strategy="steps",
    eval_steps=16,                      # evaluate twice per 32-step epoch
    save_strategy="steps",
    save_steps=16,                      # checkpoint at the same steps as evaluation
    save_total_limit=5,
    load_best_model_at_end=False,            # keep the final weights in memory; the preferred checkpoint is reloaded explicitly later
    metric_for_best_model="eval_loss",       # checkpoints are ranked by validation loss
    greater_is_better=False,                 # lower eval_loss is better
    # ---- misc ----
    gradient_checkpointing=True,
    report_to=["tensorboard"],
    # ---- DPO ----
    # beta controls how "sharp" the preference signal is:
    #   low beta (e.g. 0.05): softer preference signal; model changes weights less aggressively
    #   high beta (e.g. 0.5 or 1.0): stronger penalty/reward; model may overfit or destabilize on small datasets
    beta=float(0.1),
    max_prompt_length = int(n_tokens_max + 200), # longest prompt plus buffer: 201 + 200 = 401 in the recorded run
    max_length=int(n_tokens_max + 200+ MAX_OUTPUT_TOKENS), # prompt + completion: 401 + 600 = 1001 in the recorded run
)

# Create the DPO trainer. The tokenizer is passed as processing_class;
# train_data / val_data provide the prompt, chosen and rejected fields.
dpo_trainer = DPOTrainer(
    model=model,
    args=args,
    train_dataset=train_data,
    eval_dataset=val_data,
    processing_class=tokenizer,
)


Extracting prompt in train dataset:   0%|          | 0/768 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/768 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/768 [00:00<?, ? examples/s]

Extracting prompt in eval dataset:   0%|          | 0/68 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/68 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/68 [00:00<?, ? examples/s]

In [18]:
# Double-check that the PEFT-wrapped model still exposes trainable parameters.
print_train_params(model)

# The count of parameters with requires_grad=True must be > 0.
print(sum(p.numel() for p in model.parameters() if p.requires_grad))

# There clearly are trainable params. Author's note: unsure why a warning was
# raised (the warning itself is not shown in the recorded output); likely benign.

trainable params: 6815744 || all params: 3765178368 || trainable%: 0.18102048120552677
6815744


In [19]:
# Fine-tune the LoRA adapters with DPO. Evaluation and checkpointing happen
# every 16 steps, as configured in `args`.
dpo_trainer.train()

Casting fp32 inputs back to torch.bfloat16 for flash-attn compatibility.


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
16,0.7102,0.659065,-0.68057,-0.938611,0.602941,0.258041,-127.845993,-125.314842,-3.299192,-3.323323
32,0.5801,0.650914,-1.09527,-1.477839,0.632353,0.382569,-131.992996,-130.707108,-3.315586,-3.341519
48,0.05,0.936807,-2.943828,-3.173482,0.485294,0.229654,-150.478592,-147.663544,-3.170234,-3.203618
64,0.1049,0.937257,-2.983942,-3.2398,0.5,0.255858,-150.879715,-148.326736,-3.150211,-3.18454



Cannot access gated repo for url https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3/resolve/main/config.json.
Access to model mistralai/Mistral-7B-Instruct-v0.3 is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in mistralai/Mistral-7B-Instruct-v0.3.

Cannot access gated repo for url https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3/resolve/main/config.json.
Access to model mistralai/Mistral-7B-Instruct-v0.3 is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in mistralai/Mistral-7B-Instruct-v0.3.

Cannot access gated repo for url https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3/resolve/main/config.json.
Access to model mistralai/Mistral-7B-Instruct-v0.3 is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the loo

TrainOutput(global_step=64, training_loss=0.3819902236573398, metrics={'train_runtime': 870.8882, 'train_samples_per_second': 1.764, 'train_steps_per_second': 0.073, 'total_flos': 0.0, 'train_loss': 0.3819902236573398, 'epoch': 2.0})

In [24]:
# Save the final model state from the trainer, plus the tokenizer, next to
# the step checkpoints.
dpo_trainer.model.save_pretrained(output_dir+"/final_checkpoint", token=HF_TOKEN)
tokenizer.save_pretrained(output_dir+"/final_checkpoint", token=HF_TOKEN)

('/content/drive/MyDrive/ColabNotebooks/ParentPalAI/model/dpo1/final_checkpoint/tokenizer_config.json',
 '/content/drive/MyDrive/ColabNotebooks/ParentPalAI/model/dpo1/final_checkpoint/special_tokens_map.json',
 '/content/drive/MyDrive/ColabNotebooks/ParentPalAI/model/dpo1/final_checkpoint/chat_template.jinja',
 '/content/drive/MyDrive/ColabNotebooks/ParentPalAI/model/dpo1/final_checkpoint/tokenizer.model',
 '/content/drive/MyDrive/ColabNotebooks/ParentPalAI/model/dpo1/final_checkpoint/added_tokens.json',
 '/content/drive/MyDrive/ColabNotebooks/ParentPalAI/model/dpo1/final_checkpoint/tokenizer.json')

In [25]:
# Print the checkpoint the trainer recorded as best (ranked by eval_loss).
print (f"BEST CHECKPOINT: {dpo_trainer.state.best_model_checkpoint}")

# One epoch is best: in the recorded run validation loss is lowest at step 32
# (0.651) and rises to ~0.937 at steps 48 and 64, i.e. the second epoch overfits.

BEST CHECKPOINT: /content/drive/MyDrive/ColabNotebooks/ParentPalAI/model/dpo1/checkpoint-32


# Save Model and Flush Memory

In [None]:
# Drop the trainer and the quantized training model, then release memory
# before reloading for inference.
del dpo_trainer, model
free_cuda()

# Reload Best Model and Perform Inference (without Quantization)

In [32]:
from peft import PeftModel

# Reload the base model in bf16 (no quantization) for inference.
# Relies on AutoModelForCausalLM, AutoTokenizer and torch imported in earlier cells.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    device_map="auto",
    dtype=torch.bfloat16,
    attn_implementation="flash_attention_2", # FA2 is fastest on A100
    token=HF_TOKEN # authenticates access to the gated model repo
)

# Recreate the tokenizer with the same settings used for the baseline run.
tokenizer = AutoTokenizer.from_pretrained(
    BASE_MODEL_ID,
    use_fast=True,
    padding_side="left",
    add_bos_token=False,
    add_eos_token=False,
    token = HF_TOKEN
)

# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Attach the adapter weights from the best checkpoint. The step number is
# hard-coded to match the BEST CHECKPOINT printed above (checkpoint-32),
# because dpo_trainer has already been deleted at this point.
model = PeftModel.from_pretrained(
    model,
    output_dir+"/checkpoint-32"
)



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [33]:
# Generate answers for the same tokenized test prompts with the DPO-tuned model.
test_dpo_output = batch_inferences(test_tokenized)


Running for chunk starting at 0
Running for chunk starting at 50
Running for chunk starting at 100
Running for chunk starting at 150
Running for chunk starting at 200
Running for chunk starting at 250
Running for chunk starting at 300
Running for chunk starting at 350
Time taken for 394 samples: 5.85 min (350.98 secs)


In [34]:
# Strip the prompt and </s> tokens from each generation, attach the answers to
# test_data as the 'dpo_output' column, and write the result as JSON Lines.
# NOTE(review): test_data is rebound with the new column, so re-running this
# cell will presumably fail on the duplicate column name -- confirm.
test_dpo_output_clean = [get_cleaned_answer(x) for x in test_dpo_output]
test_data = test_data.add_column('dpo_output', test_dpo_output_clean)
test_data.to_json(DATA_FOLDER+TEST_DATA_FILE+'_output_dpo.jsonl', orient="records", lines=True)


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

1223731

# Upload Model to HF

In [None]:
# Merge base model with the adapter
# model = model.merge_and_unload()

# # Save model and tokenizer
# model.save_pretrained(output_dir+"/parentpalai", token=HF_TOKEN)
# tokenizer.save_pretrained(output_dir+"/parentpalai", token=HF_TOKEN)

# # Push them to the HF Hub
# model.push_to_hub("parentpalai", use_temp_dir=False, token=HF_TOKEN)
# tokenizer.push_to_hub("parentpalai", use_temp_dir=False, token=HF_TOKEN)

In [35]:
# Release the Colab runtime once all work is done.
from google.colab import runtime
runtime.unassign()