In [1]:
# ! pip install -r requirements.txt

Collecting accelerate==0.28.0 (from -r requirements.txt (line 1))
  Downloading accelerate-0.28.0-py3-none-any.whl.metadata (18 kB)
Collecting appdirs==1.4.4 (from -r requirements.txt (line 2))
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting bitsandbytes==0.43.0 (from -r requirements.txt (line 3))
  Downloading bitsandbytes-0.43.0-py3-none-manylinux_2_24_x86_64.whl.metadata (1.8 kB)
Collecting datasets==2.18.0 (from -r requirements.txt (line 4))
  Downloading datasets-2.18.0-py3-none-any.whl.metadata (20 kB)
Collecting fire==0.6.0 (from -r requirements.txt (line 5))
  Downloading fire-0.6.0.tar.gz (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.4/88.4 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting gradio==4.23.0 (from -r requirements.txt (line 6))
  Downloading gradio-4.23.0-py3-none-any.whl.metadata (15 kB)
Collecting loralib==0.1.2 (from -r requirements.txt (

### Data processing from CSV to JSONL

In [1]:
import json
import os

import pandas as pd

# Load the refined dataset from CSV; the rows are split into train/test and
# exported as JSONL further down in this cell.
df = pd.read_csv("Dataset_refined.csv")

def df_to_jsonl_shuffle(df, filename, random_state=None):
    """Write the rows of ``df`` to ``filename`` as JSON Lines, in shuffled order.

    Each output line is one JSON object mapping column names to that row's
    values. The input frame is not modified.

    Args:
        df: DataFrame to export.
        filename: Destination path. Missing parent directories are created.
        random_state: Optional seed forwarded to ``DataFrame.sample``. Pass an
            int for a reproducible row order; ``None`` (the default) gives a
            different order on every call.
    """
    # frac=1 samples every row exactly once, i.e. a random permutation.
    shuffled_df = df.sample(frac=1, random_state=random_state)

    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Serialise the whole frame in one call instead of row by row: iterrows()
    # turns every row into a single-dtype Series, which silently rewrites
    # integer values as floats whenever the frame also has a float column.
    if len(shuffled_df):
        payload = shuffled_df.to_json(orient='records', lines=True)
        if not payload.endswith('\n'):
            payload += '\n'
    else:
        payload = ''

    with open(filename, 'w') as file:
        file.write(payload)

def split_train_test(df, train_frac=0.9, shuffle=False, random_state=None):
    """Split ``df`` row-wise into a training part and a test part.

    Args:
        df: DataFrame to split.
        train_frac: Fraction of rows, between 0 and 1 inclusive, placed in the
            training part. The row count is truncated towards zero.
        shuffle: When True, rows are randomly permuted before the split so the
            test part is a random sample. The default (False) keeps the
            existing behaviour of taking the leading rows as train and the
            trailing rows as test, in their current order.
        random_state: Optional seed for the shuffle; ignored unless
            ``shuffle`` is True.

    Returns:
        A ``(train_df, test_df)`` tuple.

    Raises:
        ValueError: If ``train_frac`` is outside ``[0, 1]``. A negative value
            would otherwise become a negative slice bound and produce a split
            unrelated to the requested fraction.
    """
    if not 0.0 <= train_frac <= 1.0:
        raise ValueError(f"train_frac must be between 0 and 1, got {train_frac!r}")

    if shuffle:
        df = df.sample(frac=1, random_state=random_state)

    train_size = int(len(df) * train_frac)
    return df.iloc[:train_size], df.iloc[train_size:]

# Shuffle once, with a fixed seed, BEFORE splitting. split_train_test() takes the
# leading rows as train and the trailing rows as test, so without this the
# validation set would always be the last 10% of the CSV in file order.
# The seed matches the `--seed 10` passed to train.py below, so the split is
# reproducible across kernel restarts.
train_df, test_df = split_train_test(
    df.sample(frac=1, random_state=10),
    train_frac=0.9,
)

# Convert training DataFrame to JSONL
df_to_jsonl_shuffle(train_df, 'data/train/train.jsonl')

# Convert test DataFrame to JSONL
df_to_jsonl_shuffle(test_df, 'data/validation/test.jsonl')


### Prompt template creation and prompt completion testing

In [7]:
# Sample record used to try out the prompt template: one Java/Python code pair
# (a Spark MLlib bisecting k-means example) plus a prose description of each.
# The four keys are the placeholder names that appear in the prompt template.
x = {
    "java_context": "Below code demonstrates how to perform bisecting k-means clustering and evaluate the clustering performance using Spark MLlib in Java. Initially, a SparkSession is created. Then, the code loads a dataset in LIBSVM format using Spark's read method. Subsequently, a BisectingKMeans model is trained on the dataset with a specified number of clusters (K) and a seed value for reproducibility. Predictions are made on the dataset using the trained model, and the clustering quality is evaluated using the Silhouette score. Finally, the cluster centers are displayed, providing insights into the characteristics of each cluster. ",
    "java": "```java\npackage org.apache.spark.examples.ml;\n\n// $example on$\nimport org.apache.spark.ml.clustering.BisectingKMeans;\nimport org.apache.spark.ml.clustering.BisectingKMeansModel;\nimport org.apache.spark.ml.evaluation.ClusteringEvaluator;\nimport org.apache.spark.ml.linalg.Vector;\nimport org.apache.spark.sql.Dataset;\nimport org.apache.spark.sql.Row;\n// $example off$\nimport org.apache.spark.sql.SparkSession;\n\n\n/**\n * An example demonstrating bisecting k-means clustering.\n * Run with\n * <pre>\n * bin/run-example ml.JavaBisectingKMeansExample\n * </pre>\n */\npublic class JavaBisectingKMeansExample {\n\n  public static void main(String[] args) {\n    SparkSession spark = SparkSession\n      .builder()\n      .appName(\"JavaBisectingKMeansExample\")\n      .getOrCreate();\n\n    // $example on$\n    // Loads data.\n    Dataset<Row> dataset = spark.read().format(\"libsvm\").load(\"data/mllib/sample_kmeans_data.txt\");\n\n    // Trains a bisecting k-means model.\n    BisectingKMeans bkm = new BisectingKMeans().setK(2).setSeed(1);\n    BisectingKMeansModel model = bkm.fit(dataset);\n\n    // Make predictions\n    Dataset<Row> predictions = model.transform(dataset);\n\n    // Evaluate clustering by computing Silhouette score\n    ClusteringEvaluator evaluator = new ClusteringEvaluator();\n\n    double silhouette = evaluator.evaluate(predictions);\n    System.out.println(\"Silhouette with squared euclidean distance = \" + silhouette);\n\n    // Shows the result.\n    System.out.println(\"Cluster Centers: \");\n    Vector[] centers = model.clusterCenters();\n    for (Vector center : centers) {\n      System.out.println(center);\n    }\n    // $example off$\n\n    spark.stop();\n  }\n}```",
    "python_context": "Python equivalent code which demonstrates how to perform bisecting k-means clustering and evaluate the clustering performance utilizes PySpark's MLlib library. Firstly, a SparkSession is initialized. Then, the code loads a dataset in LIBSVM format using Spark's read method. Subsequently, a BisectingKMeans model is trained on the dataset with a specified number of clusters (K) and a seed value for reproducibility. Predictions are made on the dataset using the trained model, and the clustering quality is evaluated using the Silhouette score. Finally, the cluster centers are displayed, offering insights into the characteristics of each cluster. ",
    "python": "```python\nfrom pyspark.ml.clustering import BisectingKMeans\nfrom pyspark.ml.evaluation import ClusteringEvaluator\n# $example off$\nfrom pyspark.sql import SparkSession\n\nif __name__ == \"__main__\":\n    spark = SparkSession\\\n        .builder\\\n        .appName(\"BisectingKMeansExample\")\\\n        .getOrCreate()\n\n    # $example on$\n    # Loads data.\n    dataset = spark.read.format(\"libsvm\").load(\"data/mllib/sample_kmeans_data.txt\")\n\n    # Trains a bisecting k-means model.\n    bkm = BisectingKMeans().setK(2).setSeed(1)\n    model = bkm.fit(dataset)\n\n    # Make predictions\n    predictions = model.transform(dataset)\n\n    # Evaluate clustering by computing Silhouette score\n    evaluator = ClusteringEvaluator()\n\n    silhouette = evaluator.evaluate(predictions)\n    print(\"Silhouette with squared euclidean distance = \" + str(silhouette))\n\n    # Shows the result.\n    print(\"Cluster Centers: \")\n    centers = model.clusterCenters()\n    for center in centers:\n        print(center)\n    # $example off$\n\n    spark.stop()```"
}

In [2]:
import json

# Prompt/completion template saved to ./data/template.json, the path given to
# train.py via --prompt_template. The {placeholders} are filled from one
# dataset record: java_context, java, python_context and python.
# The text is spelled out with explicit "\n" so that significant whitespace
# (notably the single leading space of "completion") is visible in the source.
template = {
    "prompt": (
        "Below is an instruction that describes a problem and its code implementation in Java. Write a response which converts the Java implementation to an implementation in Python.\n"
        "Problem Context:\n"
        "{java_context}\n"
        "Code in Java:\n"
        "{java}\n"
    ),
    "completion": (
        " \n"  # the completion starts with a space followed by a newline
        "Solution Context:\n"
        "{python_context}\n"
        "\n"
        "Code in Python:\n"
        "{python}\n"
    ),
}

# open(..., "w") does not create missing directories, so make sure ./data exists
# before writing; otherwise this cell fails on a fresh checkout.
os.makedirs("./data", exist_ok=True)
with open("./data/template.json", "w") as f:
    json.dump(template, f)

In [7]:
import json

# Read the prompt template back from disk to check what was actually saved
with open('./data/template.json', 'r') as file:
    # Parse the JSON file into a Python dict
    data = json.load(file)

# `name` is just a second reference to the whole loaded dict (it is not a
# lookup of a 'name' key); ending the cell with it displays the dict's contents.
name = data
name

{'prompt': 'Below is an instruction that describes a problem and its code implementation in Java. Write a response which converts the Java implementation to an implementation in Python.\n\nProblem Context:\n{java_context}\n\nCode in Java:\n{java}\n',
 'completion': ' \nSolution Context:\n{python_context}\n                        \nCode in Python:\n{python}\n'}

In [9]:
# Assemble the full training prompt: an "### Instruction:" section holding the
# prompt text, then a "### Response:" section holding the completion text.
# The {java_context}/{java}/{python_context}/{python} placeholders inside the
# two template strings are carried through untouched, so the result can be
# filled in afterwards with str.format.
prompt_template = "".join([
    "### Instruction:\n\n",
    template['prompt'],
    "\n\n### Response:\n\n",
    template['completion'],
    "\n\n",
])

prompt_template

'### Instruction:\n\nBelow is an instruction that describes a problem and its code implementation in Java. Write a response which converts the Java implementation to an implementation in Python.\n\nProblem Context:\n{java_context}\n\nCode in Java:\n{java}\n\n\n### Response:\n\n \nSolution Context:\n{python_context}\n                        \nCode in Python:\n{python}\n\n\n'

In [10]:
# Fill the placeholders of the prompt template with the sample record's fields
# and display the rendered prompt.
y = prompt_template.format_map(x)
y

'### Instruction:\n\nBelow is an instruction that describes a problem and its code implementation in Java. Write a response which converts the Java implementation to an implementation in Python.\n\nProblem Context:\nBelow code demonstrates how to perform bisecting k-means clustering and evaluate the clustering performance using Spark MLlib in Java. Initially, a SparkSession is created. Then, the code loads a dataset in LIBSVM format using Spark\'s read method. Subsequently, a BisectingKMeans model is trained on the dataset with a specified number of clusters (K) and a seed value for reproducibility. Predictions are made on the dataset using the trained model, and the clustering quality is evaluated using the Silhouette score. Finally, the cluster centers are displayed, providing insights into the characteristics of each cluster. \n\nCode in Java:\n```java\npackage org.apache.spark.examples.ml;\n\n// $example on$\nimport org.apache.spark.ml.clustering.BisectingKMeans;\nimport org.apache

Note to self:
- Loading a 13B Code Llama model requires an instance bigger than g5.12xlarge or g4dn.12xlarge.
- Both have 4 GPUs, 96 GB of GPU memory, 48 vCPUs and 192 GB of CPU memory (RAM).
- When loading the 13B pretrained model, the RAM is not sufficient; a bigger instance is needed.
- If I force the model to load on the GPU by setting device_map="cuda" in AutoModelForCausalLM.from_pretrained, I hit GPU OOM errors:
    - CUDA out of memory. Tried to allocate 100.00 MiB. GPU 0 has a total capacity of 21.99 GiB of which 47.06 MiB is free.
- Using a 13B model for testing is too ambitious for these instances. Switching to 7B models.

In [None]:
%%sh

# Fine-tune CodeLlama-7b-Python for one epoch with PEFT and FSDP enabled.
# Every option line except the last must end with a backslash: without it the
# shell ends the command at that line, train.py never receives the remaining
# options, and the leftover line is run as a separate (failing) command.
python train.py \
    --model_dir ./models/CodeLlama-7b-Python-HF \
    --enable_fsdp True \
    --fsdp_checkpoint_root_dir ./checkpoints/CodeLlama-7b-Python-HF \
    --num_epochs 1 \
    --int8_quantization False \
    --learning_rate 0.001 \
    --seed 10 \
    --use_peft True \
    --peft_output_dir ./output/CodeLlama-7b-Python-HF \
    --train_dir ./data/train \
    --validation_dir ./data/validation \
    --file_extension jsonl \
    --prompt_template ./data/template.json \
    --model_output_dir ./finetuned_model/CodeLlama-7b-Python-HF/run1

INFO:root:Finetuning Args: Namespace(model_dir='./models/CodeLlama-7b-Python-HF', per_device_train_batch_size=4, batching_strategy='packing', context_length=4096, gradient_accumulation_steps=1, gradient_clipping=False, gradient_clipping_threshold=1.0, num_epochs=1, num_workers_dataloader=1, learning_rate=0.001, weight_decay=0.0, gamma=0.85, seed=10, int8_quantization=False, freeze_layers=False, num_freeze_layers=1, use_fast_kernels=False, save_metrics=False, run_validation=True, val_batch_size=1, enable_fsdp=True, fsdp_checkpoint_root_dir='./checkpoints/CodeLlama-7b-Python-HF', low_cpu_fsdp=False, mixed_precision=True, use_fp16=False, pure_bf16=False, optimizer='AdamW', save_optimizer=False, use_peft=True, peft_method='lora', peft_output_dir='./output/CodeLlama-7b-Python-HF', lora_r=8, lora_alpha=32, lora_dropout=0.05, target_modules='q_proj,v_proj', train_dir='./data/vivek/train', validation_dir='./data/vivek/validation', file_extension='jsonl', prompt_template='./data/vivek/template.

--> Running with torch dist debug set to detail


INFO:root:Local rank is 2. Rank is 2. World Size is 4
INFO:root:Setting torch device = 2
INFO:root:Local rank is 3. Rank is 3. World Size is 4
INFO:root:Setting torch device = 3
INFO:root:Local rank is 1. Rank is 1. World Size is 4
INFO:root:Setting torch device = 1
You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
INFO:root:Loading the tokenizer.
INFO:root:Loading the tokenizer.
INFO:root:Loading the tokenizer.
You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
Generating train split: 90 examples [00:00, 469.17 examples/s]
Generating validation split: 10 examples [00:00, 7456.54 examples/s]
Map: 100%|██████████| 90/90 [00:00<00:00, 9177.02 examples/s]
Map: 100%|██████████| 90/90 [00:00<00:00, 8199.30 examples/s]
Map: 100%|██████████| 

--> Model ./models/CodeLlama-7b-Python-HF

--> ./models/CodeLlama-7b-Python-HF has 6738.415616 Million params



INFO:root:Using PEFT


trainable params: 4,194,304 || all params: 6,742,609,920 || trainable%: 0.06220594176090199


INFO:root:Setting up FSDP if enable_fsdp is enabled


bFloat16 enabled for mixed precision - using bfSixteen policy


Loading checkpoint shards: 100%|██████████| 3/3 [50:22<00:00, 1007.52s/it]
Loading checkpoint shards: 100%|██████████| 3/3 [50:22<00:00, 1007.50s/it]
Loading checkpoint shards: 100%|██████████| 3/3 [50:22<00:00, 1007.50s/it]
INFO:root:Printing Model Size
INFO:root:Using PEFT
INFO:root:Printing Model Size
INFO:root:Using PEFT
INFO:root:Printing Model Size
INFO:root:Using PEFT


trainable params: 4,194,304 || all params: 6,742,609,920 || trainable%: 0.06220594176090199


INFO:root:Setting up FSDP if enable_fsdp is enabled


trainable params: 4,194,304 || all params: 6,742,609,920 || trainable%: 0.06220594176090199


INFO:root:Setting up FSDP if enable_fsdp is enabled


trainable params: 4,194,304 || all params: 6,742,609,920 || trainable%: 0.06220594176090199


INFO:root:Setting up FSDP if enable_fsdp is enabled


--> applying fsdp activation checkpointing...


INFO:root:Initializing the optimizer and learning rate scheduler
INFO:root:Starting the training process
Training Epoch: 1:   0%|[34m          [0m| 0/4 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


--> applying fsdp activation checkpointing...


INFO:root:Initializing the optimizer and learning rate scheduler
INFO:root:Starting the training process


--> applying fsdp activation checkpointing...
--> applying fsdp activation checkpointing...


Training Epoch: 1:   0%|[34m          [0m| 0/4 [00:00<?, ?it/s]INFO:root:Initializing the optimizer and learning rate scheduler
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
INFO:root:Starting the training process
INFO:root:Initializing the optimizer and learning rate scheduler
INFO:root:Starting the training process
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Training Epoch: 1:   0%|[34m          [0m| 0/4 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already bee

Max CUDA memory allocated was 18 GB
Max CUDA memory reserved was 20 GB
Peak active CUDA memory was 18 GB
CUDA Malloc retries : 1
CPU Total Peak Memory consumed during the train (max): 2 GB


evaluating Epoch:   0%|[32m          [0m| 0/1 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has alre

 eval_ppl=tensor(1.9563, device='cuda:0') eval_epoch_loss=tensor(0.6711, device='cuda:0')
we are about to save the PEFT modules




PEFT modules are saved in ./output/CodeLlama-7b-Python-HF directory
best eval loss on epoch 1 is 0.671068012714386
Epoch 1: train_perplexity=2.0012, train_epoch_loss=0.6938, epoch time 56.48963721600012s


INFO:root:Training process complete
INFO:root:Training process complete
INFO:root:Training process complete
INFO:root:Training process complete
INFO:root:Key: avg_train_prep, Value: 2.001217842102051
INFO:root:Key: avg_train_loss, Value: 0.6937559843063354
INFO:root:Key: avg_eval_prep, Value: 1.9563255310058594
INFO:root:Key: avg_eval_loss, Value: 0.671068012714386
INFO:root:Key: avg_epoch_time, Value: 56.48963721600012
INFO:root:Key: avg_checkpoint_time, Value: 9.73087351400045
INFO:root:Combining pre-trained base model with the PEFT adapter module.
Loading checkpoint shards: 100%|██████████| 3/3 [00:01<00:00,  2.93it/s]
INFO:root:Saving the combined model in safetensors format.
INFO:root:Saving complete.
INFO:root:Saving the tokenizer.
You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
INFO:root:Saving complete.


In [5]:
# Retrying the command above to save the PEFT model: the previous attempt passed an incorrect argument for the base model.

### Training CodeLlama Python-7b Run 1

In [3]:
%%sh

# Run 1: fine-tune CodeLlama-7b-Python for one epoch with PEFT and FSDP enabled.
# The model name appears in four paths, so it is kept in a single variable.
MODEL_NAME=CodeLlama-7b-Python-HF

python train.py \
    --model_dir "./models/${MODEL_NAME}" \
    --enable_fsdp True \
    --fsdp_checkpoint_root_dir "./checkpoints/${MODEL_NAME}" \
    --num_epochs 1 \
    --int8_quantization False \
    --learning_rate 0.001 \
    --seed 10 \
    --use_peft True \
    --peft_output_dir "./output/${MODEL_NAME}" \
    --train_dir ./data/train \
    --validation_dir ./data/validation \
    --file_extension jsonl \
    --prompt_template ./data/template.json \
    --model_output_dir "./finetuned_model/${MODEL_NAME}/run1"

INFO:root:Finetuning Args: Namespace(model_dir='./models/CodeLlama-7b-Instruct-HF', per_device_train_batch_size=4, batching_strategy='packing', context_length=4096, gradient_accumulation_steps=1, gradient_clipping=False, gradient_clipping_threshold=1.0, num_epochs=5, num_workers_dataloader=1, learning_rate=0.001, weight_decay=0.0, gamma=0.85, seed=10, int8_quantization=False, freeze_layers=False, num_freeze_layers=1, use_fast_kernels=False, save_metrics=False, run_validation=True, val_batch_size=1, enable_fsdp=True, fsdp_checkpoint_root_dir='./checkpoints/CodeLlama-7b-Instruct-HF', low_cpu_fsdp=False, mixed_precision=True, use_fp16=False, pure_bf16=False, optimizer='AdamW', save_optimizer=False, use_peft=True, peft_method='lora', peft_output_dir='./output/CodeLlama-7b-Instruct-HF', lora_r=8, lora_alpha=32, lora_dropout=0.05, target_modules='q_proj,v_proj', train_dir='./data/vivek/train', validation_dir='./data/vivek/validation', file_extension='jsonl', prompt_template='./data/vivek/tem

--> Running with torch dist debug set to detail


You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
INFO:root:Loading the tokenizer.
You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
INFO:root:Loading the tokenizer.
You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
INFO:root:Loading the tokenizer.
You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
INFO:root:Using the default value of max_input_length=2048.
INFO:root:--> Training Set Length = 107
INFO:root:--> Validation Set Length = 7
INFO:root:Loading the pre-trained model and setup its configuration
INFO:root:Model Name: ./models/CodeLlama-7b-Instruct-HF
INFO:root:enable_fsdp is set to True and low_cpu_fsdp is set to False
INFO:root:Using the default value of max_input_length=2048.
INFO:root:Using the default value of max_input_length=2048.
INFO:root:Loading the pre-trained model and setup its configuration
INFO:root:Model Name: .

--> Model ./models/CodeLlama-7b-Instruct-HF

--> ./models/CodeLlama-7b-Instruct-HF has 6738.546688 Million params



INFO:root:Using PEFT


trainable params: 4,194,304 || all params: 6,742,740,992 || trainable%: 0.06220473254091146


INFO:root:Setting up FSDP if enable_fsdp is enabled


bFloat16 enabled for mixed precision - using bfSixteen policy


Loading checkpoint shards: 100%|██████████| 3/3 [00:08<00:00,  2.81s/it]
INFO:root:Printing Model Size
INFO:root:Using PEFT
Loading checkpoint shards: 100%|██████████| 3/3 [00:08<00:00,  2.81s/it]
INFO:root:Printing Model Size
INFO:root:Using PEFT
Loading checkpoint shards: 100%|██████████| 3/3 [00:08<00:00,  2.82s/it]
INFO:root:Printing Model Size
INFO:root:Using PEFT


trainable params: 4,194,304 || all params: 6,742,740,992 || trainable%: 0.06220473254091146


INFO:root:Setting up FSDP if enable_fsdp is enabled


trainable params: 4,194,304 || all params: 6,742,740,992 || trainable%: 0.06220473254091146


INFO:root:Setting up FSDP if enable_fsdp is enabled


trainable params: 4,194,304 || all params: 6,742,740,992 || trainable%: 0.06220473254091146


INFO:root:Setting up FSDP if enable_fsdp is enabled


--> applying fsdp activation checkpointing...


INFO:root:Initializing the optimizer and learning rate scheduler
INFO:root:Starting the training process


--> applying fsdp activation checkpointing...


Training Epoch: 1:   0%|[34m          [0m| 0/6 [00:00<?, ?it/s]INFO:root:Initializing the optimizer and learning rate scheduler
INFO:root:Starting the training process


--> applying fsdp activation checkpointing...
--> applying fsdp activation checkpointing...


Training Epoch: 1:   0%|[34m          [0m| 0/6 [00:00<?, ?it/s]INFO:root:Initializing the optimizer and learning rate scheduler
INFO:root:Starting the training process
INFO:root:Initializing the optimizer and learning rate scheduler
INFO:root:Starting the training process
Training Epoch: 1/5, step 5/6 completed (loss: 0.47047489881515503): 100%|[34m██████████[0m| 6/6 [01:20<00:00, 13.40s/it]
Training Epoch: 1/5, step 5/6 completed (loss: 0.534217894077301): 100%|[34m██████████[0m| 6/6 [01:20<00:00, 13.47s/it]
Training Epoch: 1/5, step 5/6 completed (loss: 0.5123025178909302): 100%|[34m██████████[0m| 6/6 [01:20<00:00, 13.43s/it]
Training Epoch: 1/5, step 5/6 completed (loss: 0.4984665513038635): 100%|[34m██████████[0m| 6/6 [01:20<00:00, 13.39s/it]


Max CUDA memory allocated was 18 GB
Max CUDA memory reserved was 20 GB
Peak active CUDA memory was 18 GB
CUDA Malloc retries : 1
CPU Total Peak Memory consumed during the train (max): 2 GB


evaluating Epoch: 100%|[32m██████████[0m| 2/2 [00:05<00:00,  2.88s/it]
evaluating Epoch: 100%|[32m██████████[0m| 2/2 [00:05<00:00,  2.88s/it]
evaluating Epoch: 100%|[32m██████████[0m| 2/2 [00:05<00:00,  2.88s/it]
evaluating Epoch: 100%|[32m██████████[0m| 2/2 [00:05<00:00,  2.88s/it]


 eval_ppl=tensor(1.7931, device='cuda:0') eval_epoch_loss=tensor(0.5839, device='cuda:0')
we are about to save the PEFT modules




PEFT modules are saved in ./output/CodeLlama-7b-Instruct-HF directory
best eval loss on epoch 1 is 0.5839325785636902
Epoch 1: train_perplexity=1.9117, train_epoch_loss=0.6480, epoch time 81.15726579400143s


Training Epoch: 2/5, step 5/6 completed (loss: 0.40087318420410156): 100%|[34m██████████[0m| 6/6 [01:18<00:00, 13.07s/it]
Training Epoch: 2/5, step 5/6 completed (loss: 0.3831484913825989): 100%|[34m██████████[0m| 6/6 [01:18<00:00, 13.07s/it]
Training Epoch: 2/5, step 5/6 completed (loss: 0.4410686790943146): 100%|[34m██████████[0m| 6/6 [01:18<00:00, 13.07s/it]
Training Epoch: 2/5, step 5/6 completed (loss: 0.41780155897140503): 100%|[34m██████████[0m| 6/6 [01:18<00:00, 13.07s/it]


Max CUDA memory allocated was 18 GB
Max CUDA memory reserved was 20 GB
Peak active CUDA memory was 18 GB
CUDA Malloc retries : 101
CPU Total Peak Memory consumed during the train (max): 2 GB


evaluating Epoch: 100%|[32m██████████[0m| 2/2 [00:05<00:00,  2.87s/it]
evaluating Epoch: 100%|[32m██████████[0m| 2/2 [00:05<00:00,  2.87s/it]
evaluating Epoch: 100%|[32m██████████[0m| 2/2 [00:05<00:00,  2.87s/it]
evaluating Epoch: 100%|[32m██████████[0m| 2/2 [00:05<00:00,  2.87s/it]


 eval_ppl=tensor(1.7218, device='cuda:0') eval_epoch_loss=tensor(0.5434, device='cuda:0')
we are about to save the PEFT modules
PEFT modules are saved in ./output/CodeLlama-7b-Instruct-HF directory
best eval loss on epoch 2 is 0.5433874130249023
Epoch 2: train_perplexity=1.6324, train_epoch_loss=0.4900, epoch time 78.89981068899942s


Training Epoch: 3/5, step 5/6 completed (loss: 0.39495712518692017): 100%|[34m██████████[0m| 6/6 [01:18<00:00, 13.07s/it]
Training Epoch: 3/5, step 5/6 completed (loss: 0.33214330673217773): 100%|[34m██████████[0m| 6/6 [01:18<00:00, 13.07s/it]
Training Epoch: 3/5, step 5/6 completed (loss: 0.36514174938201904): 100%|[34m██████████[0m| 6/6 [01:18<00:00, 13.07s/it]
Training Epoch: 3/5, step 5/6 completed (loss: 0.3530711531639099): 100%|[34m██████████[0m| 6/6 [01:18<00:00, 13.07s/it]


Max CUDA memory allocated was 18 GB
Max CUDA memory reserved was 20 GB
Peak active CUDA memory was 18 GB
CUDA Malloc retries : 201
CPU Total Peak Memory consumed during the train (max): 2 GB


evaluating Epoch: 100%|[32m██████████[0m| 2/2 [00:05<00:00,  2.85s/it]
evaluating Epoch: 100%|[32m██████████[0m| 2/2 [00:05<00:00,  2.85s/it]
evaluating Epoch: 100%|[32m██████████[0m| 2/2 [00:05<00:00,  2.85s/it]
evaluating Epoch: 100%|[32m██████████[0m| 2/2 [00:05<00:00,  2.85s/it]


 eval_ppl=tensor(1.7175, device='cuda:0') eval_epoch_loss=tensor(0.5409, device='cuda:0')
we are about to save the PEFT modules
PEFT modules are saved in ./output/CodeLlama-7b-Instruct-HF directory
best eval loss on epoch 3 is 0.5408512949943542
Epoch 3: train_perplexity=1.5396, train_epoch_loss=0.4315, epoch time 78.87803173200155s


Training Epoch: 4/5, step 5/6 completed (loss: 0.3564712107181549): 100%|[34m██████████[0m| 6/6 [01:18<00:00, 13.07s/it] 
Training Epoch: 4/5, step 5/6 completed (loss: 0.2951991558074951): 100%|[34m██████████[0m| 6/6 [01:18<00:00, 13.07s/it]
Training Epoch: 4/5, step 5/6 completed (loss: 0.32806456089019775): 100%|[34m██████████[0m| 6/6 [01:18<00:00, 13.07s/it]
Training Epoch: 4/5, step 5/6 completed (loss: 0.3126378357410431): 100%|[34m██████████[0m| 6/6 [01:18<00:00, 13.07s/it]


Max CUDA memory allocated was 18 GB
Max CUDA memory reserved was 20 GB
Peak active CUDA memory was 18 GB
CUDA Malloc retries : 301
CPU Total Peak Memory consumed during the train (max): 2 GB


evaluating Epoch: 100%|[32m██████████[0m| 2/2 [00:05<00:00,  2.87s/it]
evaluating Epoch: 100%|[32m██████████[0m| 2/2 [00:05<00:00,  2.87s/it]
evaluating Epoch: 100%|[32m██████████[0m| 2/2 [00:05<00:00,  2.87s/it]
evaluating Epoch: 100%|[32m██████████[0m| 2/2 [00:05<00:00,  2.87s/it]


 eval_ppl=tensor(1.7169, device='cuda:0') eval_epoch_loss=tensor(0.5405, device='cuda:0')
we are about to save the PEFT modules
PEFT modules are saved in ./output/CodeLlama-7b-Instruct-HF directory
best eval loss on epoch 4 is 0.5405406951904297
Epoch 4: train_perplexity=1.4750, train_epoch_loss=0.3886, epoch time 78.8627174860012s


Training Epoch: 5/5, step 5/6 completed (loss: 0.2932030260562897): 100%|[34m██████████[0m| 6/6 [01:18<00:00, 13.06s/it]]
Training Epoch: 5/5, step 5/6 completed (loss: 0.2609100043773651): 100%|[34m██████████[0m| 6/6 [01:18<00:00, 13.06s/it]
Training Epoch: 5/5, step 5/6 completed (loss: 0.3223790228366852): 100%|[34m██████████[0m| 6/6 [01:18<00:00, 13.06s/it]
Training Epoch: 5/5, step 5/6 completed (loss: 0.27691683173179626): 100%|[34m██████████[0m| 6/6 [01:18<00:00, 13.06s/it]


Max CUDA memory allocated was 18 GB
Max CUDA memory reserved was 20 GB
Peak active CUDA memory was 18 GB
CUDA Malloc retries : 401
CPU Total Peak Memory consumed during the train (max): 2 GB


evaluating Epoch: 100%|[32m██████████[0m| 2/2 [00:05<00:00,  2.87s/it]
evaluating Epoch: 100%|[32m██████████[0m| 2/2 [00:05<00:00,  2.87s/it]

evaluating Epoch: 100%|[32m██████████[0m| 2/2 [00:05<00:00,  2.87s/it]


 eval_ppl=tensor(1.7366, device='cuda:0') eval_epoch_loss=tensor(0.5520, device='cuda:0')
Epoch 5: train_perplexity=1.4203, train_epoch_loss=0.3508, epoch time 78.84463565199985s


INFO:root:Training process complete
INFO:root:Training process complete
INFO:root:Training process complete
INFO:root:Training process complete
INFO:root:Key: avg_train_prep, Value: 1.5957965135574341
INFO:root:Key: avg_train_loss, Value: 0.46181475520133974
INFO:root:Key: avg_eval_prep, Value: 1.7371895790100098
INFO:root:Key: avg_eval_loss, Value: 0.5498505353927612
INFO:root:Key: avg_epoch_time, Value: 79.32849227060069
INFO:root:Key: avg_checkpoint_time, Value: 5.268354036599339
INFO:root:Combining pre-trained base model with the PEFT adapter module.
Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00,  3.00it/s]
INFO:root:Saving the combined model in safetensors format.
INFO:root:Saving complete.
INFO:root:Saving the tokenizer.
You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
INFO:root:Saving complete.
INFO:root:Model Tarring process has begun
INFO:root:Uploading to S3
INFO:botocore.credentials:Found credentials from IAM Role: BaseN

In [None]:
%%sh
# Five-epoch fine-tuning run, detached with nohup so it keeps going in the
# background. This is a shell command, so the cell needs the %%sh magic to be
# runnable from the notebook. Output is redirected explicitly: nohup only
# switches to nohup.out by itself when stdout is a terminal, which is not the
# case inside a notebook cell.
# NOTE(review): --model_output_dir is the same run1 directory used by the
# one-epoch run above -- confirm that overwriting it is intended.
nohup python train.py \
    --model_dir ./models/CodeLlama-7b-Python-HF \
    --enable_fsdp True \
    --fsdp_checkpoint_root_dir ./checkpoints/CodeLlama-7b-Python-HF \
    --num_epochs 5 \
    --int8_quantization False \
    --learning_rate 0.001 \
    --seed 10 \
    --use_peft True \
    --peft_output_dir ./output/CodeLlama-7b-Python-HF \
    --train_dir ./data/train \
    --validation_dir ./data/validation \
    --file_extension jsonl \
    --prompt_template ./data/template.json \
    --model_output_dir ./finetuned_model/CodeLlama-7b-Python-HF/run1 \
    > nohup.out 2>&1 &