In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '4'

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from config import Config
from peft import  LoraConfig, get_peft_model
from data_module import DualDataset
from collators import custom_gd_collator_forget
from utils import find_all_linear_names
from forget_trainer import GradDiffTrainer
from accelerate import Accelerator
import pandas as pd


In [3]:
cfg = Config()

accelerator = Accelerator()

In [4]:
cfg.save_dir = 'outputs/wpu_cyclic_grad_diff'

In [5]:
print('loading the paths to forget, retain and test set')
forget = pd.read_csv(cfg.forget_path) #cfg.forget_path
retain = pd.read_csv(cfg.retain_path) #cfg.retain_path

loading the paths to forget, retain and test set


In [6]:
print(f"\nLoading the Tokenizer {cfg.model_id}")
tokenizer = AutoTokenizer.from_pretrained(cfg.model_id, token = cfg.access_token)
tokenizer.pad_token = tokenizer.eos_token


Loading the Tokenizer praveensonu/llama_3_1_8b_finetuned


In [7]:
print(f"\nLoading the Model {cfg.model_id}")
model = AutoModelForCausalLM.from_pretrained(cfg.model_id, 
                                             torch_dtype = torch.bfloat16, 
                                             token=cfg.access_token,)


Loading the Model praveensonu/llama_3_1_8b_finetuned


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [8]:
config = LoraConfig(
        r = cfg.LoRA_r,
        lora_alpha = cfg.LoRA_alpha,
        lora_dropout= cfg.LoRA_dropout,
        target_modules = find_all_linear_names(model),
        bias = 'none',
        task_type = 'CAUSAL_LM',
    )


In [9]:
model = get_peft_model(model, config)
model.print_trainable_parameters()
#model.generation_config.do_sample = True
model.config.use_cache = False

trainable params: 20,971,520 || all params: 8,051,232,768 || trainable%: 0.2605


In [10]:
train_dataset =  DualDataset(
    forget_data = forget,
    retain_data = retain,
    tokenizer = tokenizer,
    max_length = 256)

In [11]:
print(len(train_dataset))

1801


In [12]:
training_args = TrainingArguments(
        output_dir = cfg.save_dir,
        overwrite_output_dir= True,
        learning_rate = cfg.lr,
        per_device_train_batch_size= 4, 
        num_train_epochs= 10,
        weight_decay = cfg.weight_decay,
        logging_dir = f'{cfg.save_dir}/logs',
        eval_strategy= 'no',
        label_names = ['labels'],
        bf16 = True,
        gradient_accumulation_steps= 2,
        #save_only_model=True,
        report_to = 'wandb',
    )

In [14]:
trainer = GradDiffTrainer(
        model = model,
        args = training_args,
        train_dataset = train_dataset,
        tokenizer = tokenizer,
        data_collator = custom_gd_collator_forget,
    )


  trainer = GradDiffTrainer(


In [15]:
trainer.train()



[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mpraveenbushipaka942[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss
500,-177.0745
1000,-292.8138
1500,-312.9245
2000,-344.9325


TrainOutput(global_step=2250, training_loss=-291.0765625, metrics={'train_runtime': 3059.4876, 'train_samples_per_second': 5.887, 'train_steps_per_second': 0.735, 'total_flos': 0.0, 'train_loss': -291.0765625, 'epoch': 9.977827050997783})

In [16]:
print(f'\nForget LoRA adapter saved at {cfg.save_dir}')
model.save_pretrained(cfg.save_dir)
tokenizer.save_pretrained(cfg.save_dir)


Forget LoRA adapter saved at outputs/wpu_cyclic_grad_diff


('outputs/wpu_cyclic_grad_diff/tokenizer_config.json',
 'outputs/wpu_cyclic_grad_diff/special_tokens_map.json',
 'outputs/wpu_cyclic_grad_diff/tokenizer.json')