In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from config import Config
from peft import  LoraConfig, get_peft_model
from data_module import convert_raw_data_to_model_qa
from collators import custom_gd_collator_forget
from forget_trainer import GradDiffTrainer
from utils import find_all_linear_names
from accelerate import Accelerator
import pandas as pd
import numpy as np
from torch.utils.data import Dataset

In [2]:
cfg = Config()

accelerator = Accelerator()

In [3]:
forget = pd.read_csv(cfg.forget_path) #cfg.forget_path
retain = pd.read_csv(cfg.retain_path) #cfg.retain_path

In [4]:
def cyclic_pair_and_concat(forget: pd.DataFrame,
                           retain: pd.DataFrame,
                           on: str = 'title',
                           suffixes=('_forget', '_retain')) -> pd.DataFrame:
    """
    For each unique value in `on`, take the two sub‐DataFrames:
      fg = forget[forget[on] == value]
      rt = retain[retain[on] == value]
    and then:
      - if len(fg) < len(rt): cycle fg to match len(rt), pair fg_cycle[i] with rt.iloc[i]
      - else:                cycle rt to match len(fg), pair fg.iloc[i] with rt_cycle[i]
    Finally, concat side‐by‐side (axis=1), using suffixes to keep columns distinct.
    Returns the concatenated DataFrame for all titles.
    """
    out_dfs = []
    titles = set(forget[on]).union(retain[on])

    for t in titles:
        fg = forget[forget[on] == t].reset_index(drop=True)
        rt = retain[retain[on] == t].reset_index(drop=True)
        if fg.empty or rt.empty:
            # if one side is empty, you can choose to skip or just take the non‐empty side
            continue

        n_fg, n_rt = len(fg), len(rt)
        if n_fg < n_rt:
            # cycle fg
            idx_fg = [i % n_fg for i in range(n_rt)]
            fg_cycle = fg.iloc[idx_fg].reset_index(drop=True)
            rt_cycle = rt
        else:
            # cycle rt
            idx_rt = [i % n_rt for i in range(n_fg)]
            fg_cycle = fg
            rt_cycle = rt.iloc[idx_rt].reset_index(drop=True)

        # now both have same length
        fg_cycle = fg_cycle.add_suffix(suffixes[0])
        rt_cycle = rt_cycle.add_suffix(suffixes[1])

        # make sure the key column isn't duplicated/SUFFIXed twice
        # so we’ll take title_forget and then rename it back to title:
        fg_cycle = fg_cycle.rename(columns={f"{on}{suffixes[0]}": on})
        
        # concat side by side
        paired = pd.concat([fg_cycle, rt_cycle.drop(columns=[f"{on}{suffixes[1]}"])], axis=1)
        out_dfs.append(paired)

    return pd.concat(out_dfs, ignore_index=True)

In [5]:
#forget.drop(columns = ['idk'], inplace= True) # dont do this for gd title
general_retain = retain.loc[retain['type'] == 'general']
other_retain = retain.loc[retain['type'] != 'general']

In [6]:
new_df = cyclic_pair_and_concat(forget, other_retain)

In [7]:
new_df.shape

(1508, 7)

In [8]:
rng = np.random.RandomState(42)

In [9]:
fg_pool = new_df[['question_forget', 'answer_forget']]

In [10]:
extras = []

for _, gr in general_retain.reset_index(drop=True).iterrows():

    src = fg_pool.sample(n=1, random_state=rng).iloc[0]

    extras.append({
        'question_forget': src['question_forget'],
        'answer_forget': src['answer_forget'],
        'question_retain': gr['question'],
        'answer_retain': gr['answer']
    })

In [11]:
extra_df = pd.DataFrame(extras)

final_df = pd.concat([new_df.reset_index(drop=True), extra_df], ignore_index=True)

In [12]:
final_df.head()

Unnamed: 0,title,question_forget,answer_forget,idk_forget,question_retain,answer_retain,type_retain
0,Jorge Semprún,When was Jorge Semprún Maura born?,10 December 1923,I haven't been briefed on that topic.,When was Las ratas published?,1962,domain
1,Jorge Semprún,In what country was Jorge Semprún Maura born?,Spain,I don't have that information.,How many members does the Académie Goncourt co...,Ten members.,entity
2,Jorge Semprún,What role did Jorge Semprún Maura hold in Spai...,Minister of Culture,I'm unaware of that detail.,Where and when is the awards ceremony for the ...,The awards ceremony takes place during the Day...,entity
3,Jorge Semprún,What is the name of the concentration camp whe...,Buchenwald,I lack the specifics on that matter.,When was Mario Vargas Llosa born?,28 March 1936,domain
4,Jorge Semprún,Where did Jorge Semprún live most of his life?,France,"That's a good question, but I don't have the a...",What was the name of Mario Vargas Llosa's firs...,The Time of the Hero,domain


In [None]:
final_df.to_csv('title_df.csv')

### tokenization

In [None]:
class DualTitleDataset(Dataset):
    """
    Dataset that returns pre-paired forget/retain rows.
    
    Expects a DataFrame with columns:
      question_forget, answer_forget, question_retain, answer_retain
    (plus any other columns you don’t care about).
    """
    def __init__(
        self,
        paired_df,
        tokenizer,
        max_length,
        question_key: str = "question",
        answer_key: str = "answer"
    ):
        # e.g. paired_df.columns = [
        #   'title', 'question_forget', 'answer_forget',
        #   'question_retain', 'answer_retain', … ]
        self.df = paired_df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.qk = question_key
        self.ak = answer_key

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        
        # pull out the two sides
        q_forget = row[f"{self.qk}_forget"]
        a_forget = row[f"{self.ak}_forget"]
        q_retain = row[f"{self.qk}_retain"]
        a_retain = row[f"{self.ak}_retain"]
        
        # convert as before
        forget_data = convert_raw_data_to_model_qa(
            self.tokenizer, self.max_length, q_forget, a_forget
        )
        retain_data = convert_raw_data_to_model_qa(
            self.tokenizer, self.max_length, q_retain, a_retain
        )
        
        return (forget_data, retain_data)


In [38]:
print(f"\nLoading the Tokenizer {cfg.model_id}")
tokenizer = AutoTokenizer.from_pretrained(cfg.model_id, token = cfg.access_token)
tokenizer.pad_token = tokenizer.eos_token


Loading the Tokenizer praveensonu/llama_3_1_8b_finetuned


In [39]:
dataset = DualTitleDataset(final_df, tokenizer, 256)

### model

In [41]:
print(f"\nLoading the Model {cfg.model_id}")
model = AutoModelForCausalLM.from_pretrained(cfg.model_id, 
                                             torch_dtype = torch.bfloat16, 
                                             token=cfg.access_token,)


Loading the Model praveensonu/llama_3_1_8b_finetuned


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [42]:
config = LoraConfig(
        r = cfg.LoRA_r,
        lora_alpha = cfg.LoRA_alpha,
        lora_dropout= cfg.LoRA_dropout,
        target_modules = find_all_linear_names(model),
        bias = 'none',
        task_type = 'CAUSAL_LM',
    )


In [43]:
model = get_peft_model(model, config)
model.print_trainable_parameters()
#model.generation_config.do_sample = True
model.config.use_cache = False

trainable params: 20,971,520 || all params: 8,051,232,768 || trainable%: 0.2605


### finetuning

In [50]:
training_args = TrainingArguments(
        output_dir = cfg.save_dir,
        overwrite_output_dir= True,
        learning_rate = cfg.lr,
        per_device_train_batch_size= 4, 
        num_train_epochs= 10,
        weight_decay = cfg.weight_decay,
        logging_dir = f'{cfg.save_dir}/logs',
        eval_strategy= 'no',
        label_names = ['labels'],
        bf16 = True,
        gradient_accumulation_steps= 2,
        #save_only_model=True,
        report_to = 'wandb',
    )

In [51]:
trainer = GradDiffTrainer(
        model = model,
        args = training_args,
        train_dataset = dataset,
        tokenizer = tokenizer,
        data_collator = custom_gd_collator_forget,
)

  trainer = GradDiffTrainer(


In [52]:
trainer.train()

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 39.38 GiB of which 19.38 MiB is free. Including non-PyTorch memory, this process has 39.35 GiB memory in use. Of the allocated memory 38.57 GiB is allocated by PyTorch, and 289.70 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [48]:
cfg.save_dir = 'outputs/title_gd_model'

In [None]:
print(f'\nForget LoRA adapter saved at {cfg.save_dir}')
model.save_pretrained(cfg.save_dir)
tokenizer.save_pretrained(cfg.save_dir)