# Fine-tuning RWKV using the Hugging Face Trainer

In [None]:
# Upgrade the AWS SDK libraries used by this notebook (sagemaker, boto3 and
# botocore). -q keeps pip's output quiet; -U upgrades an existing install.
!pip install -qU sagemaker
!pip install -qU boto3
!pip install -qU botocore

In [None]:
# sagemaker environment setting
import sagemaker
import boto3
import os
import shutil
import sagemaker.huggingface
from sagemaker.djl_inference.model import DJLModel,DeepSpeedModel,HuggingFaceAccelerateModel,DJLPredictor

# Bucket used for uploading data, models and logs. Leave it as None to fall
# back to the session's default bucket (SageMaker creates that bucket
# automatically if it does not exist).
sagemaker_session_bucket = None

sagemaker_session = sagemaker.Session()
if sagemaker_session is not None and sagemaker_session_bucket is None:
    # No bucket name was given: use the session default.
    sagemaker_session_bucket = sagemaker_session.default_bucket()

# Resolve the execution role. When get_execution_role() raises ValueError,
# look the role up by name through IAM instead.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role_description = iam.get_role(RoleName='sagemaker_execution_role')
    role = role_description['Role']['Arn']

# Re-create the session bound to the chosen bucket, then read its settings.
sagemaker_session = sagemaker.Session(default_bucket=sagemaker_session_bucket)
bucket = sagemaker_session.default_bucket()
region = sagemaker_session.boto_region_name

print(
    f"sagemaker role arn: {role}",
    f"sagemaker bucket: {bucket}",
    f"sagemaker session region: {region}",
    sep="\n",
)

In [None]:
# Local directory that receives the training script and its requirements.txt
# and is later handed to the estimator as its source_dir.
source_dir = 'source_dir'
# exist_ok=True keeps the cell safe to re-run and avoids the
# check-then-create race of os.path.exists() followed by os.mkdir().
os.makedirs(source_dir, exist_ok=True)
#entry_point = 'entry_point.py'

In [None]:
%%writefile $source_dir/requirements.txt
transformers
torch
accelerate
datasets
numpy
boto3
sagemaker
sentencepiece
nvgpu==0.9.0
pynvml==11.4.1

In [None]:
%%writefile $source_dir/finetune.py
from transformers import (
    RwkvForCausalLM,
    RwkvConfig,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer
)

from datasets import load_dataset
import torch
import numpy as np
import re

import collections
from typing import Any, Dict
import math

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

def remove_url_from_text(text: str):
    """Delete every "[" and "]" and each "(_URL_<digits>_)" marker from text."""
    link_markup = re.compile(r"\[|\]|\(_URL_\d+_\)")
    return link_markup.sub("", text)

def tokenize_function(examples: Dict[str, Any]) -> Dict[str, Any]:
    """Tokenize the cleaned, space-joined answers of a flattened ELI5 batch.

    For every row of the "answers.text" column, the answer strings are joined
    with single spaces and passed through ``remove_url_from_text``; the
    resulting list of strings is handed to the module-level ``tokenizer``.
    """
    cleaned_texts = []
    for answer_texts in examples["answers.text"]:
        joined = " ".join(answer_texts)
        cleaned_texts.append(remove_url_from_text(joined))
    return tokenizer(cleaned_texts)


def chunk(examples: Dict[str, Any], chunk_size: int = 256) -> Dict[str, Any]:
    """Concatenate each column of a batch and cut it into fixed-size chunks.

    Args:
        examples: Mapping from column name to a list of sequences (one
            sequence per row of the batch).
        chunk_size: Number of items in every emitted chunk.

    Returns:
        A dict with the same keys as ``examples``. Each value is a list of
        ``chunk_size``-long lists taken from the concatenated column; the
        trailing items that do not fill a whole chunk are dropped. An empty
        ``examples`` mapping yields ``{}``.
    """
    from itertools import chain

    columns = list(examples.keys())
    if not columns:
        # Nothing to chunk; previously this raised IndexError below.
        return {}

    # chain.from_iterable flattens in linear time, whereas sum(lists, [])
    # copies the growing accumulator on every addition (quadratic).
    concatenated = {k: list(chain.from_iterable(examples[k])) for k in columns}

    # The usable length is taken from the first column and rounded down to a
    # whole number of chunks; all columns are sliced with the same offsets.
    total_length = len(concatenated[columns[0]])
    total_length = (total_length // chunk_size) * chunk_size
    return {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated.items()
    }


def set_labels(examples: Dict[str, Any]) -> Dict[str, Any]:
    """Store a copy of the "input_ids" column under "labels" and return the batch.

    The batch is modified in place; the copy is made with the column's own
    ``.copy()`` method.
    """
    label_ids = examples["input_ids"].copy()
    examples["labels"] = label_ids
    return examples


# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
MODEL_NAME = "sgugger/rwkv-430M-pile"
DATASET = "eli5"
CHUNK_SIZE = 128        # items per example produced by chunk()
TEST_SPLIT_SIZE = 0.2   # fraction of rows held out for evaluation
BATCH_SIZE = 32         # per-device batch size for training and evaluation
DATASET_SPLIT = "train_asks[:500]"
# SageMaker training containers expose the model output directory through
# SM_MODEL_DIR; fall back to the path this script previously hard-coded.
MODEL_DIR = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")

# ---------------------------------------------------------------------------
# Model and tokenizer
# ---------------------------------------------------------------------------
model = RwkvForCausalLM.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token

# ---------------------------------------------------------------------------
# Data preparation
# ---------------------------------------------------------------------------
dataset = load_dataset(DATASET, split=DATASET_SPLIT)
dataset = dataset.train_test_split(test_size=TEST_SPLIT_SIZE)
# Flatten nested columns so that tokenize_function can read "answers.text".
dataset = dataset.flatten()

# mlm=False selects causal (next-token) language modelling collation.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Encode
encoded_dataset = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=dataset["train"].column_names
)

# Chunk
chunked_dataset = encoded_dataset.map(
    chunk,
    fn_kwargs={"chunk_size": CHUNK_SIZE},
    batched=True,
)

# Label
lm_dataset = chunked_dataset.map(
    set_labels,
    batched=True
)

# ---------------------------------------------------------------------------
# Training setup
# ---------------------------------------------------------------------------
# BATCH_SIZE is now actually applied to training/evaluation; previously it was
# only used to derive logging_steps, so the logging cadence did not match the
# batch size the Trainer really used. max(1, ...) guards against a zero step
# count when the training set is smaller than one batch.
training_args = TrainingArguments(
    output_dir=MODEL_NAME + "-" + DATASET,
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    push_to_hub=False,
    logging_steps=max(1, len(lm_dataset["train"]) // BATCH_SIZE),
    save_strategy="no"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
)

# ---------------------------------------------------------------------------
# Evaluate, train, save, evaluate again
# ---------------------------------------------------------------------------
# Evaluate before train
eval_0 = trainer.evaluate()
perplexity_0 = math.exp(eval_0["eval_loss"])
print(f"Perplexity before fine-tuning: {perplexity_0:.2f}")

# Train
trainer.train()
trainer.save_model(MODEL_DIR)
# The Trainer was not given the tokenizer, so write it out explicitly to make
# the saved artifact loadable on its own.
tokenizer.save_pretrained(MODEL_DIR)

# Evaluate after train
eval_f = trainer.evaluate()
perplexity_f = math.exp(eval_f["eval_loss"])
print(f"Perplexity after fine-tuning: {perplexity_f:.2f}")

In [None]:
from sagemaker.pytorch import PyTorch, TrainingCompilerConfig

# Largest batch size that fits into GPU memory without the training compiler,
# and the learning rate paired with it.
batch_size_native = 12
learning_rate_native = 5e-5

# Larger batch size that fits into GPU memory with the compiler enabled.
batch_size = 64

# Scale the learning rate linearly with the batch size.
learning_rate = learning_rate_native / batch_size_native * batch_size

# NOTE: this dict is only defined here; the estimator below does not receive
# it because the corresponding keyword argument is commented out.
hyperparameters = dict(
    n_gpus=1,
    batch_size=batch_size,
    learning_rate=learning_rate,
)

pytorch_estimator = PyTorch(
    role=role,
    source_dir=source_dir,
    entry_point='finetune.py',
    framework_version='1.13.1',
    py_version='py39',
    instance_type='ml.g5.2xlarge',
    instance_count=1,
    # hyperparameters=hyperparameters,
    # compiler_config=TrainingCompilerConfig(),
    debugger_hook_config=False,
    disable_profiler=True,
)

# Launch the training job.
pytorch_estimator.fit()