In [None]:
# Download the databricks-dolly-15k dataset repository from the Hugging Face Hub
# into /project/data (Git LFS is used so the large data file is fetched too).
!cd /project/data && git lfs clone https://huggingface.co/datasets/databricks/databricks-dolly-15k

In [None]:
# Run the NeMo-Megatron-Launcher Dolly preprocessing script on the raw JSONL file.
# NOTE(review): the following cells read databricks-dolly-15k-output.jsonl from the
# same directory, so this script presumably writes that file next to its input — confirm.
!python3 /opt/NeMo-Megatron-Launcher/launcher_scripts/nemo_launcher/collections/dataprep_scripts/dolly_dataprep/preprocess.py --input /project/data/databricks-dolly-15k/databricks-dolly-15k.jsonl

In [None]:
# Print the first record of the preprocessed file as a quick sanity check of its format.
!head -n 1 /project/data/databricks-dolly-15k/databricks-dolly-15k-output.jsonl

In [None]:
import json
import random

input_file = "/project/data/databricks-dolly-15k/databricks-dolly-15k-output.jsonl"
training_output_file = "/project/data/databricks-dolly-15k/training.jsonl"
validation_output_file = "/project/data/databricks-dolly-15k/validation.jsonl"
test_output_file = "/project/data/databricks-dolly-15k/test.jsonl"

# Fraction of records assigned to each split. The training and validation slice
# sizes are truncated to whole records; the test split receives everything that
# remains, so it can be slightly larger than `test_proportion`.
train_proportion = 0.80
validation_proportion = 0.15
test_proportion = 0.05
assert abs(train_proportion + validation_proportion + test_proportion - 1.0) < 1e-9, \
    "train/validation/test proportions must sum to 1"

# Fixed seed so that Restart Kernel -> Run All reproduces the same split.
split_seed = 42

# Read the JSONL records, dropping blank lines so they are not written out as
# empty records, then shuffle with a private RNG (the global `random` state is
# left untouched).
with open(input_file, "r", encoding="utf-8") as f:
    lines = [line for line in f if line.strip()]
random.Random(split_seed).shuffle(lines)

# Calculate split sizes
total_lines = len(lines)
train_index = int(total_lines * train_proportion)
val_index = int(total_lines * validation_proportion)

# Distribute records into training, validation and test sets
train_data = lines[:train_index]
validation_data = lines[train_index:train_index + val_index]
test_data = lines[train_index + val_index:]
assert len(train_data) + len(validation_data) + len(test_data) == total_lines

# Write each split to its own file, one record per line with normalized
# surrounding whitespace.
for output_file, split_lines in (
    (training_output_file, train_data),
    (validation_output_file, validation_data),
    (test_output_file, test_data),
):
    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(line.strip() + "\n" for line in split_lines)

In [None]:
# Paths and parallelism settings for the fine-tuning run.
MODEL = "/project/models/llama2-7b.nemo"

# Dataset splits. These must match the file names written by the split cell,
# which produces training.jsonl / validation.jsonl / test.jsonl.
TRAIN = "/project/data/databricks-dolly-15k/training.jsonl"
VALID = "/project/data/databricks-dolly-15k/validation.jsonl"
TEST = "/project/data/databricks-dolly-15k/test.jsonl"
VALID_NAMES = "databricks-dolly-15k"

# Sampling weights for the training files, written as a list literal
# (one entry because there is a single training file).
CONCAT_SAMPLING_PROBS = "[1]"

# Tensor- and pipeline-parallel sizes.
TP_SIZE = 1
PP_SIZE = 1

In [None]:
# OmegaConf is not imported anywhere earlier in the notebook, so import it here
# to keep this cell runnable on a fresh kernel.
from omegaconf import OmegaConf

# Start from NeMo's stock PEFT tuning config; individual fields are overridden
# afterwards before the result is saved under a new name.
config = OmegaConf.load("/opt/NeMo/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_tuning_config.yaml")

In [None]:
# Apply the run-specific overrides on top of the loaded `config`.
# Every key lives under one of the config's top-level sections (trainer, model,
# exp_manager), so each assignment goes through `config`; bare `trainer`,
# `model` and `exp_manager` names are not defined in this notebook.

# --- Trainer ---
config.trainer.precision = "bf16"
config.trainer.devices = 8
config.trainer.num_nodes = 1
config.trainer.val_check_interval = 0.1
config.trainer.max_steps = 50

# --- Model ---
config.model.restore_from_path = MODEL
config.model.micro_batch_size = 1
config.model.global_batch_size = 128
config.model.tensor_model_parallel_size = TP_SIZE
config.model.pipeline_model_parallel_size = PP_SIZE
config.model.megatron_amp_O2 = True
config.model.sequence_parallel = True
config.model.activations_checkpoint_granularity = "selective"
config.model.activations_checkpoint_method = "uniform"
config.model.optim.name = "distributed_fused_adam"
config.model.optim.lr = 5e-6
config.model.answer_only_loss = True

# --- Data ---
# `file_names` is a list of JSONL paths, so wrap each single path in a list.
config.model.data.train_ds.file_names = [TRAIN]
config.model.data.validation_ds.file_names = [VALID]
config.model.data.test_ds.file_names = [TEST]
# The sampling probabilities must be a real list in the saved YAML, not the
# string "[1]"; parse the list literal if it was given as a string.
# (`json` is imported in the dataset-split cell.)
config.model.data.train_ds.concat_sampling_probabilities = (
    json.loads(CONCAT_SAMPLING_PROBS)
    if isinstance(CONCAT_SAMPLING_PROBS, str)
    else CONCAT_SAMPLING_PROBS
)
config.model.data.train_ds.max_seq_length = 2048
config.model.data.validation_ds.max_seq_length = 2048
config.model.data.train_ds.micro_batch_size = 1
config.model.data.train_ds.global_batch_size = 128
config.model.data.validation_ds.micro_batch_size = 1
config.model.data.validation_ds.global_batch_size = 128
config.model.data.test_ds.micro_batch_size = 1
# NOTE(review): test uses 256 while train/validation use 128 — confirm intentional.
config.model.data.test_ds.global_batch_size = 256
config.model.data.train_ds.num_workers = 0
config.model.data.validation_ds.num_workers = 0
config.model.data.test_ds.num_workers = 0
config.model.data.validation_ds.metric.name = "loss"
config.model.data.test_ds.metric.name = "loss"

# --- Experiment manager (logging, resume, checkpointing) ---
config.exp_manager.create_wandb_logger = False
config.exp_manager.explicit_log_dir = "/project/code/llama-2-7b/results"
config.exp_manager.resume_if_exists = True
config.exp_manager.resume_ignore_no_checkpoint = True
config.exp_manager.create_checkpoint_callback = True
config.exp_manager.checkpoint_callback_params.monitor = "validation_loss"
config.exp_manager.checkpoint_callback_params.save_best_model = False
config.exp_manager.checkpoint_callback_params.save_nemo_on_train_end = True

In [None]:
# Serialize the customized config to llama-config.yaml in the current working directory.
OmegaConf.save(config, "llama-config.yaml")

In [None]:
# Move the saved config into NeMo's tuning conf directory (the directory the base
# config was loaded from); presumably this lets the training script resolve it by
# name via --config-name.
!mv llama-config.yaml /opt/NeMo/examples/nlp/language_modeling/tuning/conf/

In [None]:
# Launch fine-tuning with the customized config.
# NOTE(review): the config was derived from megatron_gpt_peft_tuning_config.yaml but is
# run with megatron_gpt_sft.py — confirm this script accepts that config schema.
!python3 /opt/NeMo/examples/nlp/language_modeling/tuning/megatron_gpt_sft.py --config-name=llama-config.yaml