In [1]:
!nvidia-smi

Mon Aug  5 19:01:38 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          On  |   00000000:4E:00.0 Off |                    0 |
| N/A   34C    P0             85W /  400W |       0MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
import os
import torch
from datasets import load_dataset, concatenate_datasets
from transformers import (
    TrainingArguments,
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
    logging,
)
from trl import SFTTrainer

In [3]:
batch_size = 40
num_workers = os.cpu_count()
# max_steps = 3000
bf16 = True
fp16 = False
# gradient_accumulation_steps = 2
context_length = 1024
logging_steps = 500
save_steps = 500
learning_rate = 2e-4
model_name = './custom_gpt2'
out_dir = 'outputs/gpt2_sft_instruction'

In [4]:
dataset_alpaca = load_dataset('tatsu-lab/alpaca')
print(dataset_alpaca)

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'text'],
        num_rows: 52002
    })
})


In [5]:
hh_rlhf = load_dataset('Anthropic/hh-rlhf')

In [6]:
print(hh_rlhf)

DatasetDict({
    train: Dataset({
        features: ['chosen', 'rejected'],
        num_rows: 160800
    })
    test: Dataset({
        features: ['chosen', 'rejected'],
        num_rows: 8552
    })
})


In [7]:
def preprocess_function(example):
    # alpaca
    if 'instruction' in example:
        text = f"Human:\n{example['instruction']}"
        if len(example['input'].strip())>0: 
            text+=f"\ninput -\n{example['input']}"
        text+=f"\n\nAssistant:\n{example['output']}"
    else:
        text = example['chosen'].strip().replace("Human:", "<|endoftext|>Human:").removeprefix('<|endoftext|>').removesuffix('<|endoftext|>')

        
    
    return {"input_text":text}

In [8]:
full_dataset_alpaca = dataset_alpaca.map(preprocess_function).remove_columns(['instruction', 'input', 'output', 'text'])['train'].train_test_split(test_size=0.05, shuffle=True, seed=42)

In [9]:
full_dataset_alpaca

DatasetDict({
    train: Dataset({
        features: ['input_text'],
        num_rows: 49401
    })
    test: Dataset({
        features: ['input_text'],
        num_rows: 2601
    })
})

In [10]:
dataset_train_alpaca = full_dataset_alpaca['train']
dataset_valid_alpaca = full_dataset_alpaca['test']

print(dataset_train_alpaca)
print(dataset_valid_alpaca)

Dataset({
    features: ['input_text'],
    num_rows: 49401
})
Dataset({
    features: ['input_text'],
    num_rows: 2601
})


In [11]:
half_dataset_hh_rlhf = hh_rlhf['train'].train_test_split(test_size=0.5, shuffle=True, seed=42)['train'].map(preprocess_function).remove_columns(['chosen', 'rejected']).train_test_split(test_size=0.05, shuffle=True, seed=42)
half_dataset_hh_rlhf

DatasetDict({
    train: Dataset({
        features: ['input_text'],
        num_rows: 76380
    })
    test: Dataset({
        features: ['input_text'],
        num_rows: 4020
    })
})

In [12]:
dataset_train_hh_rlhf = half_dataset_hh_rlhf['train']
dataset_valid_hh_rlhf = half_dataset_hh_rlhf['test']
 
print(dataset_train_hh_rlhf)
print(dataset_valid_hh_rlhf)

Dataset({
    features: ['input_text'],
    num_rows: 76380
})
Dataset({
    features: ['input_text'],
    num_rows: 4020
})


In [13]:
combined_dataset_train = concatenate_datasets([dataset_train_alpaca, dataset_train_hh_rlhf]).shuffle(seed=42)
combined_dataset_val = concatenate_datasets([dataset_valid_alpaca, dataset_valid_hh_rlhf]).shuffle(seed=42)

In [14]:
print(combined_dataset_train)
print(combined_dataset_val)

Dataset({
    features: ['input_text'],
    num_rows: 125781
})
Dataset({
    features: ['input_text'],
    num_rows: 6621
})


In [15]:
if bf16:
    model = AutoModelForCausalLM.from_pretrained(model_name).to(dtype=torch.bfloat16)
else:
    model = AutoModelForCausalLM.from_pretrained(model_name)

In [16]:
tokenizer = AutoTokenizer.from_pretrained(
    model_name, 
    trust_remote_code=True,
    use_fast=False
)
tokenizer.pad_token = tokenizer.eos_token

In [17]:
training_args = TrainingArguments(
    output_dir=f"{out_dir}/logs",
    evaluation_strategy='steps',
    weight_decay=0.01,
    load_best_model_at_end=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_strategy='steps',
    save_strategy='steps',
    logging_steps=logging_steps,
    save_steps=save_steps,
    save_total_limit=3,
    bf16=bf16,
    fp16=fp16,
    # report_to='tensorboard',
    num_train_epochs=3,
    dataloader_num_workers=num_workers,
    # gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=learning_rate,
    # lr_scheduler_type='constant',
)



In [18]:
trainer = SFTTrainer(
    model=model,
    train_dataset=combined_dataset_train,
    eval_dataset=combined_dataset_val,
    dataset_text_field="input_text",
    max_seq_length=context_length,
    tokenizer=tokenizer,
    args=training_args,
    # packing=True
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


In [19]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mpandraju-s[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
500,2.7946,2.503538
1000,2.6321,2.460795
1500,2.5992,2.450081
2000,2.596,2.445371
2500,2.5943,2.442328
3000,2.5827,2.441772
3500,2.5882,2.441024
4000,2.5848,2.440697
4500,2.5867,2.440515
5000,2.577,2.440104


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=9435, training_loss=2.6009724093923885, metrics={'train_runtime': 5198.6918, 'train_samples_per_second': 72.584, 'train_steps_per_second': 1.815, 'total_flos': 1.33019273348352e+17, 'train_loss': 2.6009724093923885, 'epoch': 3.0})

In [20]:
trainer.save_model(f"{out_dir}/final_model")
tokenizer.save_pretrained(f"{out_dir}/final_model")

('outputs/gpt2_sft_instruction/final_model/tokenizer_config.json',
 'outputs/gpt2_sft_instruction/final_model/special_tokens_map.json',
 'outputs/gpt2_sft_instruction/final_model/vocab.json',
 'outputs/gpt2_sft_instruction/final_model/merges.txt',
 'outputs/gpt2_sft_instruction/final_model/added_tokens.json')

In [21]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = AutoModelForCausalLM.from_pretrained('outputs/gpt2_sft_instruction/final_model/')
tokenizer = AutoTokenizer.from_pretrained('outputs/gpt2_sft_instruction/final_model/')
tokenizer.pad_token = tokenizer.eos_token

In [23]:
pipe = pipeline(
    task='text-generation', 
    model=model, 
    tokenizer=tokenizer, 
    max_length=1024, # Prompt + new tokens to generate.
    device=device
)

In [29]:
og_model = AutoModelForCausalLM.from_pretrained(model_name)

In [30]:
og_pipe = pipeline(
    task='text-generation', 
    model=og_model, 
    tokenizer=tokenizer, 
    max_length=1024, # Prompt + new tokens to generate.
    device=device
)

In [24]:
template = """Human:
{}

Assistant:
"""

In [26]:
prompt = template.format("Can you tell me what are the best places to visit in India?")

In [37]:
outputs = og_pipe(
    prompt, 
    do_sample=True, 
    temperature=0.7, 
    top_k=50, 
    top_p=0.95,
    repetition_penalty=1.1,
)
print(outputs[0]['generated_text'])

Human:
Can you tell me what are the best places to visit in India?

Assistant:
-If we go on a tour, it would be easy. It is not something that can make us feel like an ordinary person but there must have been people who were very interesting and knowledgeable about how things work here so I think this has always got some appeal over time as well."
'What was your most important decision for being away from home?' – Vadodara's question

 "I wanted to travel with my family because of our great job at Aam Aadmi Party (AAP) which started working out after 2000 when one member said 'It seems difficult doing jobs'. Even if they didn't want to do any sort Of course no such thing could happen… But everything changed then too!" —Vadoda Kaur


In [39]:
outputs = pipe(
    prompt, 
    do_sample=True, 
    temperature=0.7, 
    top_k=50, 
    top_p=0.95,
    repetition_penalty=1.1,
)
print(outputs[0]['generated_text'])

Human:
Can you tell me what are the best places to visit in India?

Assistant:
Indian cities offer a number of unique experiences. Many Indian cultural destinations, such as Gurgaon, Delhi, Mumbai and Lucknow, all boast stunning views of nature and magnificent architecture. They also provide opportunities for local businesses with abundant resources including shops, restaurants (both traditional and modern), public transport, electricity, health care facilities, medical services, schools, entertainment venues etc. Some popular tourist attractions include Bhutanese temples or Chola-Tibetan shrines; Sri Lanka's beautiful green fields; Rajasthan's great river valley ; Nepal's Himalayan mountains; Kenya's spectacular mountain ranges; Japan, Vietnam's famous coastal city Hanoi; Indonesia's legendary island nation. You can find many more options than just one destination!


