In [None]:
# !git clone https://github.com/pranavvm26/transformers.git /tmp/transformers
# !python3 -m pip install -e /tmp/transformers/

In [None]:
# !python3 -m pip install -U torch==2.0.1 --index-url https://download.pytorch.org/whl/cu118

In [None]:
# !python3 -m pip install scipy==1.12.0

In [None]:
# !python3 -m pip install -U bitsandbytes==0.43.0
# !python3 -m pip install -U peft==0.8.1
# !python3 -m pip install -U datasets==2.18.0 
# !python3 -m pip install -U tensorboardX==2.6.2.2
# !python3 -m pip install -U py7zr==0.21.0
# !python3 -m pip install -U einops==0.7.0

# Load Model

In [1]:
import os
import shutil
import torch
from accelerate import Accelerator
import transformers
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    BitsAndBytesConfig
)
from peft import (
    prepare_model_for_kbit_training, 
    LoraConfig, 
    get_peft_model, 
    AutoPeftModelForCausalLM
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Record which transformers build is installed; the recorded output shows a
# custom ".greedy" development build.
print(f"HF transformer version: {transformers.__version__}")

HF transformer version: 4.39.0.dev0.greedy


### TODO: remove quantization (need raw, unquantized results)

In [3]:
# Hugging Face Hub id of the base checkpoint that gets fine-tuned.
model_id = "tiiuae/falcon-7b"

# Quantization is currently disabled: the config below and the matching
# `quantization_config` argument are both commented out.
# NOTE(review): the disabled config combines load_in_8bit with bnb_4bit_*
# options — reconcile them before re-enabling.
# bnb_config = BitsAndBytesConfig(
#     load_in_8bit=True,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16
# )

# The code is provided by the model authors in the repo.
# NOTE(review): the remark above does not match any argument passed below —
# confirm whether it is still relevant.
# Load the causal-LM weights with torch_dtype=float16 and automatic device mapping.
model = AutoModelForCausalLM.from_pretrained(
    model_id, 
    # quantization_config=bnb_config, 
    device_map="auto",
    torch_dtype=torch.float16
)

Downloading shards: 100%|██████████| 2/2 [01:12<00:00, 36.02s/it]
  return self.fget.__get__(instance, owner)()
Loading checkpoint shards: 100%|██████████| 2/2 [00:13<00:00,  6.51s/it]


In [4]:
# Load the tokenizer that belongs to the base checkpoint (`model_id`).
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the EOS token for padding (presumably the tokenizer ships without a
# dedicated pad token — confirm).
tokenizer.pad_token = tokenizer.eos_token

In [5]:
# Enable gradient checkpointing on the base model before wrapping it for training.
model.gradient_checkpointing_enable()
# NOTE(review): quantization is commented out in the model-loading cell, so
# this model is not loaded in k-bit — confirm that
# prepare_model_for_kbit_training is still wanted here.
model = prepare_model_for_kbit_training(model)

In [6]:
def print_trainable_parameters(model):
    """
    Print how many of the model's parameters are trainable.

    Args:
        model: any object whose ``named_parameters()`` yields ``(name, param)``
            pairs, where each ``param`` provides ``numel()`` and a
            ``requires_grad`` flag.

    Prints one line with the trainable count, the total count and the
    trainable percentage. A model without any parameters reports ``0.0``
    percent instead of raising ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # Guard the ratio: with zero parameters the division would fail.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

In [7]:
# LoRA adapter settings. With rank 1024 on these four module names the
# recorded output reports roughly 23% of all parameters as trainable.
config = LoraConfig(
    r=1024,
    lora_alpha=3072,
    target_modules=[
        "query_key_value",
        "dense",
        "dense_h_to_4h",
        "dense_4h_to_h",
    ],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

# Wrap the base model with the adapters. `model` is rebound to the wrapped
# model, so re-running this cell would wrap the already wrapped model again.
model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 2088763392 || all params: 9010484096 || trainable%: 23.18147803986757


# Dataset

In [8]:
from random import randrange
from random import randint
from datasets import load_dataset

In [10]:
# Hub id of the dataset used for fine-tuning; the trailing comment keeps an
# alternative dataset id for reference.
dataset_name = "w601sxs/simpleCoT" # "databricks/databricks-dolly-15k"

In [11]:
# Load dataset from the hub.
# All three subsets are disjoint slices of the hub "train" split:
# rows [0, 3000) for training, [3000, 3100) for validation, [3100, 3200) for test.
train_dataset = load_dataset(dataset_name, split="train[:3000]")
validation_dataset = load_dataset(dataset_name, split="train[3000:3100]")
test_dataset = load_dataset(dataset_name, split="train[3100:3200]")

Downloading readme: 100%|██████████| 427/427 [00:00<00:00, 3.39MB/s]
Downloading data: 100%|██████████| 219M/219M [00:02<00:00, 77.1MB/s] 
Downloading data: 100%|██████████| 218M/218M [00:07<00:00, 30.2MB/s] 
Downloading data: 100%|██████████| 219M/219M [00:04<00:00, 49.4MB/s] 
Downloading data: 100%|██████████| 219M/219M [00:07<00:00, 31.0MB/s] 
Downloading data: 100%|██████████| 219M/219M [00:03<00:00, 61.4MB/s] 
Downloading data: 100%|██████████| 379M/379M [00:07<00:00, 51.5MB/s] 
Downloading data: 100%|██████████| 477M/477M [00:05<00:00, 80.2MB/s] 
Generating train split: 100%|██████████| 2214941/2214941 [00:08<00:00, 263996.58 examples/s]


In [12]:
# Sanity check: report the number of rows in each slice loaded above.
print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(validation_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

Train dataset size: 3000
Validation dataset size: 100
Test dataset size: 100


In [13]:
def format_simple_cot(sample):
    """
    Build the training prompt for one dataset record.

    Args:
        sample: mapping with the keys ``source``, ``target`` and
            ``rationale``; they fill the ``### Instruction``, ``### Answer``
            and ``### Rationale`` sections respectively.

    Returns:
        The sections, in that order, joined by a blank line. A section whose
        field is ``None`` is left out.

    Raises:
        KeyError: if one of the three keys is missing from ``sample``.
    """
    sections = (
        ("### Instruction", sample["source"]),
        ("### Answer", sample["target"]),
        ("### Rationale", sample["rationale"]),
    )
    # Filter on the raw field value: a formatted f-string is never None, so
    # filtering the formatted parts could not drop anything and a None field
    # ended up as the literal text "None" in the prompt.
    return "\n\n".join(f"{header}\n{body}" for header, body in sections if body is not None)

In [14]:
from random import randint
from itertools import chain
from functools import partial


# template dataset to add prompt to each sample
def template_dataset(sample):
    """Store the EOS-terminated training prompt under ``sample["text"]`` and return the sample."""
    prompt = format_simple_cot(sample)
    sample["text"] = f"{prompt}{tokenizer.eos_token}"
    return sample


# apply prompt template per sample
# train
train_dataset = train_dataset.map(template_dataset, remove_columns=list(train_dataset.features))
# validation
validation_dataset = validation_dataset.map(template_dataset, remove_columns=list(validation_dataset.features))
# test: keep the raw columns next to "text". The evaluation cells call
# format_simple_cot_test(test_dataset[2]), which reads sample["source"];
# dropping the columns here made that call fail with KeyError. The
# tokenization step removes every column of the split anyway.
test_dataset = test_dataset.map(template_dataset)

# print random sample
# randint is inclusive on both ends, so the upper bound has to be len - 1;
# using len(...) itself occasionally indexed one past the end (IndexError).
print(validation_dataset[randint(0, len(validation_dataset) - 1)]["text"])

# leftover tokens (less than one full chunk) carried from one batch to the next
remainder = {"input_ids": [], "attention_mask": [], "token_type_ids": []}

def chunk(sample, chunk_length=2048):
    """
    Pack a batch of tokenized examples into fixed-length chunks.

    Every column of the batch is flattened into one long list, prefixed with
    the leftover tokens of the previous call, and cut into pieces of exactly
    ``chunk_length`` items. Tokens that do not fill a whole chunk are stored
    in the module-level ``remainder`` and prepended on the next call.

    Args:
        sample: mapping from column name to a list of per-example token
            lists; must contain ``input_ids``.
        chunk_length: number of tokens per emitted chunk.

    Returns:
        A dict with the same columns, each a list of ``chunk_length``-sized
        lists (possibly empty), plus ``labels`` as a copy of ``input_ids``.

    Note:
        The carry-over is global state shared by every caller. Reset
        ``remainder`` before processing an unrelated dataset, otherwise its
        first chunk starts with tokens of the previous one.
    """
    # define global remainder variable to save remainder from batches to use in next batch
    global remainder
    # Concatenate all texts and add remainder from previous batch. Columns the
    # carry-over has not seen yet start from an empty list instead of raising
    # KeyError.
    concatenated_examples = {
        k: remainder.get(k, []) + list(chain(*sample[k])) for k in sample.keys()
    }
    # get total number of tokens for batch
    batch_total_length = len(concatenated_examples[list(sample.keys())[0]])

    # Largest multiple of chunk_length that fits. This is 0 when the batch is
    # shorter than one chunk; previously the variable was left unbound in that
    # case and the function crashed.
    batch_chunk_length = (batch_total_length // chunk_length) * chunk_length

    # Split by chunks of max_len.
    result = {
        k: [t[i : i + chunk_length] for i in range(0, batch_chunk_length, chunk_length)]
        for k, t in concatenated_examples.items()
    }
    # add remainder to global variable for next batch
    remainder = {k: t[batch_chunk_length:] for k, t in concatenated_examples.items()}
    # prepare labels
    result["labels"] = result["input_ids"].copy()
    return result

Map: 100%|██████████| 3000/3000 [00:00<00:00, 14313.73 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 10267.57 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 10458.57 examples/s]

### Instruction
Given an abstract, generate a keyword (a noun phrase) that best describes the focus or contribution of the paper. Such keywords can be directly from the given abstract or outside it.

Abstract: Small amounts (0.1-0.5 mM) of deoxycholate enhanced amylase secretion, which had been induced by submaximal doses of carbachol or cholecystokinin octapeptide, without affecting the maximal levels of these reactions from isolated rat pancreatic acini. Deoxycholate alone did not induce these reactions. The other bile acids such as cholate, chenodeoxycholate, ursodeoxycholate, and taurocholate were also active. Under the similar conditions, deoxycholate enhanced the secretagogue-induced diacylglycerol formation that was derived mainly from the phospholipase C-mediated hydrolysis of phosphatidylinositol and phosphatidylinositol-4-monophosphate. Deoxycholate did not enhance the secretagogue-induced hydrolysis of phosphatidylinositol-4,5-bisphosphate or Ca2+ mobilization. Deoxycholate 




In [15]:
def _tokenize_and_chunk(dataset, chunk_length=2048):
    """
    Tokenize the ``text`` column of ``dataset`` and pack it into fixed-length chunks.

    Args:
        dataset: a dataset with a ``text`` column.
        chunk_length: number of tokens per chunk, forwarded to ``chunk``.

    Returns:
        The dataset produced by tokenizing and then chunking ``dataset``.
    """
    global remainder
    # `chunk` carries leftover tokens in the module-level `remainder`. Start
    # every split from an empty carry-over; otherwise the tail of the training
    # split is prepended to validation and the tail of validation to test.
    remainder = {"input_ids": [], "attention_mask": [], "token_type_ids": []}
    tokenized = dataset.map(
        lambda sample: tokenizer(sample["text"]), batched=True, remove_columns=list(dataset.features)
    )
    return tokenized.map(partial(chunk, chunk_length=chunk_length), batched=True)


# training
lm_train_dataset = _tokenize_and_chunk(train_dataset)

# validation
lm_valid_dataset = _tokenize_and_chunk(validation_dataset)

# test
lm_test_dataset = _tokenize_and_chunk(test_dataset)

# Print total number of samples
print(f"Train Length : {len(lm_train_dataset)} || Val Length : {len(lm_valid_dataset)} || Test Length : {len(lm_test_dataset)}")

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (5297 > 2048). Running this sequence through the model will result in indexing errors
Map: 100%|██████████| 3000/3000 [00:00<00:00, 4870.87 examples/s]
Map: 100%|██████████| 3000/3000 [00:00<00:00, 3335.38 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 4401.57 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 3281.95 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 3981.61 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 2976.18 examples/s]

Train Length : 398 || Val Length : 12 || Test Length : 13





# Train Model

In [16]:
# Scheduler names that can be selected for this run.
lr_schedulers = ['cosine', 'constant', 'greedy', 'linear']
# Index into lr_schedulers; -2 counts from the end and selects 'greedy'.
chosen_scheduler = -2

In [17]:
# Per-scheduler directories so runs with different schedulers do not overwrite
# each other: one for TensorBoard logs, one for the Trainer output.
logging_dir = f"./model-outputs/notebook/{lr_schedulers[chosen_scheduler]}/tensorboard"
output_dir = f"./model-outputs/notebook/{lr_schedulers[chosen_scheduler]}/output"

In [18]:
# DataLoaderConfiguration is used at the end of this cell but was never
# imported anywhere in the notebook (NameError on a fresh kernel).
from accelerate.utils import DataLoaderConfiguration

# Causal-LM fine-tuning on the chunked datasets; checkpoints are not saved
# during training (save_strategy="no") and metrics go to TensorBoard.
trainer = transformers.Trainer(
    model=model,
    train_dataset=lm_train_dataset,
    eval_dataset=lm_valid_dataset,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=3,
        per_device_eval_batch_size=3,
        logging_dir=logging_dir,
        logging_steps=2,
        num_train_epochs=3,
        learning_rate=1e-5,
        bf16=False,
        save_strategy="no",
        output_dir=output_dir,
        report_to="tensorboard",
        lr_scheduler_type=lr_schedulers[chosen_scheduler],
        # presumably consumed by the 'greedy' scheduler of the custom
        # transformers build (the training log echoes factor=0.9) — confirm
        factor=0.9,
    ),
    # mlm=False selects causal (next-token) language-model collation
    data_collator=transformers.DataCollatorForLanguageModeling(
        tokenizer, 
        mlm=False
    )
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

# NOTE(review): this configuration object is not passed to the Trainer or used
# anywhere else in the visible notebook — wire it in or remove it.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
# Run the fine-tuning loop configured above; the training loss is logged every
# 2 steps (logging_steps=2).
trainer.train()

GreedyLR settings: patience=10 smooth=False min_lr=0.001 factor=0.9


Step,Training Loss
2,1.966
4,2.1091


# Save Fine-Tuned Model

In [None]:
# temp_dir holds the intermediate save of the trained model and is deleted
# again after merging; model_dir receives the final merged model and tokenizer.
temp_dir = "./temp-model-dir"
model_dir = "./fine-tuned-model-dir"

In [None]:
# Save the trained (PEFT-wrapped) model to the temporary directory; it is
# reloaded from there with AutoPeftModelForCausalLM for merging.
# safe_serialization=False is passed for this intermediate save.
trainer.model.save_pretrained(temp_dir, safe_serialization=False)

In [None]:
# clear memory
import gc

del model
del trainer
# Run the garbage collector first so objects that are only kept alive by
# reference cycles are destroyed; empty_cache() can only hand back cached
# memory that no live tensor occupies any more.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Reload the saved PEFT model from the temporary directory in float16.
model = AutoPeftModelForCausalLM.from_pretrained(
    temp_dir,
    # low_cpu_mem_usage=True,
    device_map="auto",
    torch_dtype=torch.float16,
)
# Merge the adapters into the base weights; `model` is rebound to the result
# of merge_and_unload().
model = model.merge_and_unload()

In [None]:
# Write the merged model to model_dir using safe serialization, in shards of
# at most 9GB.
model.save_pretrained(
    model_dir, 
    safe_serialization=True, 
    max_shard_size="9GB"
)
# Store the tokenizer next to the model so model_dir is self-contained.
# NOTE(review): `from_pt` looks like a model-loading option — confirm it has
# any effect when saving a tokenizer.
tokenizer.save_pretrained(
    save_directory=model_dir, 
    from_pt=True
)

In [None]:
# Delete the intermediate save; the merged model in model_dir is kept.
# Raises if temp_dir does not exist (e.g. when this cell is run twice).
shutil.rmtree(temp_dir)

# Run Model Evaluation

In [None]:
# Load the merged fine-tuned model for evaluation. Use model_dir (the
# directory the merged model was saved to) instead of repeating the path
# literal, so the save and load locations cannot drift apart.
model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    torch_dtype=torch.float16,
    device_map="auto"
)

In [None]:
# Load the tokenizer that was saved alongside the merged model. Use model_dir
# instead of repeating the path literal, so the save and load locations
# cannot drift apart.
tokenizer = AutoTokenizer.from_pretrained(
    model_dir
)
# Reuse the EOS token for padding, as during training.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Text-generation pipeline around the fine-tuned model; the EOS token id is
# used both as the end-of-sequence id and as the padding id.
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
)

In [None]:
def format_simple_cot_test(sample):
    """
    Build the inference prompt for one dataset record.

    The prompt reproduces the training layout of ``format_simple_cot`` up to
    and including the ``### Answer`` header, leaving the answer for the model
    to generate. The instruction is emitted exactly as stored: the training
    template adds no trailing ".", so none is added here either.

    Args:
        sample: mapping with the key ``source`` (the instruction text).

    Returns:
        ``"### Instruction\\n<source>\\n\\n### Answer\\n"``.

    Raises:
        KeyError: if ``sample`` has no ``source`` key.
    """
    instruction = f"### Instruction\n{sample['source']}"
    answer = "### Answer\n"
    # response = f"### Rationale\n"
    # join all the parts together
    return "\n\n".join([instruction, answer])

In [None]:
# Build the inference prompt from the test record at index 2. The record must
# still expose the raw "source" column, which format_simple_cot_test reads.
sample = format_simple_cot_test(test_dataset[2])

In [None]:
# Show the exact prompt text that is sent to the model.
print(sample)

In [None]:
# max_new_tokens bounds only the generated continuation. The previous
# max_length=200 also counted the prompt tokens, so a prompt of 200 or more
# tokens (instructions in this dataset can be long) left no room to generate.
sequences = pipeline(
    sample,
    max_new_tokens=200,
    do_sample=True,
    top_k=50,
    temperature=0.1,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
)

# Each returned item carries the text under "generated_text".
for seq in sequences:
    print(f"Result: \n{seq['generated_text']}")

In [None]:
# Display the test record at index 2, the same record the prompt was built
# from, for comparison with the generated text.
test_dataset[2]

In [1]:
import sagemaker

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [2]:
# Create a SageMaker session with default settings; it is passed to
# Experiment.load below.
sess = sagemaker.Session()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [7]:
from sagemaker.experiments.experiment import Experiment

# Look up the existing SageMaker experiment by name.
exp = Experiment.load(experiment_name="greedylr-experimentation", sagemaker_session=sess)
# NOTE(review): `_delete_all` is a private (underscore-prefixed) method called
# with a force flag; presumably it irreversibly removes the experiment and
# everything in it — confirm before re-running this cell.
exp._delete_all(action="--force")