In [2]:
pip install datasets

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[K     |████████████████████████████████| 480 kB 3.5 MB/s eta 0:00:01
Collecting pyarrow>=15.0.0
  Downloading pyarrow-17.0.0-cp38-cp38-manylinux_2_28_x86_64.whl (40.0 MB)
[K     |████████████████████████████████| 40.0 MB 28.2 MB/s eta 0:00:01
[?25hCollecting xxhash
  Downloading xxhash-3.5.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[K     |████████████████████████████████| 194 kB 16.9 MB/s eta 0:00:01
[?25hCollecting multiprocess<0.70.17
  Downloading multiprocess-0.70.16-py38-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 19.5 MB/s eta 0:00:01
Collecting requests>=2.32.2
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[K     |████████████████████████████████| 64 kB 20.7 MB/s eta 0:00:01
[?25hCollecting fsspec[http]<=2024.9.0,>=2023.1.0
  Downloading fsspec-2024.9.0-py3-none-a

In [4]:
pip install --upgrade datasets pyarrow


Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [1]:
import json
import pandas as pd
from datasets import Dataset, DatasetDict

def format_messages_to_text(messages):
    """
    Render a conversation as one Llama 3 chat-template string for training.

    The result starts with exactly one ``<|begin_of_text|>`` token, followed by
    one segment per message, in order. Each segment is a role header
    (``<|start_header_id|>ROLE<|end_header_id|>``), a blank line, the message
    content, and a closing ``<|eot_id|>``.

    Args:
        messages: Iterable of dicts with a ``'role'`` and a ``'content'`` key.
            Messages whose role is not ``system``, ``user`` or ``assistant``
            are skipped.

    Returns:
        str: The formatted conversation. An empty ``messages`` yields just
        ``<|begin_of_text|>``.
    """
    supported_roles = ("system", "user", "assistant")

    # The begin-of-text marker belongs once at the very start of the sequence,
    # not in front of every user turn and never at the end.
    # NOTE(review): if the tokenizer used downstream also prepends a BOS token,
    # disable that there to avoid a doubled marker -- confirm in the training code.
    segments = ["<|begin_of_text|>"]
    for message in messages:
        role = message['role']
        if role not in supported_roles:
            continue
        segments.append(
            f"<|start_header_id|>{role}<|end_header_id|>\n\n{message['content']}<|eot_id|>"
        )
    return "".join(segments)

def prepare_dataset(json_path, output_dir="processed_dataset", test_size=0.05, seed=42):
    """
    Build a train/test ``DatasetDict`` of chat-formatted text from a JSON file.

    The JSON file must contain a list of objects. Each object contributes one
    example built from its ``instruction`` and ``output`` fields; objects where
    either field is missing or empty are skipped. The resulting split is saved
    to ``output_dir`` and also returned.

    Args:
        json_path: Path to the JSON file holding the list of records.
        output_dir: Directory the split dataset is written to with
            ``save_to_disk``.
        test_size: Fraction (or absolute count) of examples held out for the
            ``test`` split; forwarded to ``Dataset.train_test_split``.
        seed: Seed for the shuffle performed by the split, so repeated runs
            produce the same partition. Pass ``None`` for a non-deterministic
            split.

    Returns:
        DatasetDict: Mapping with ``train`` and ``test`` splits, each holding a
        single ``text`` column.

    Raises:
        ValueError: If no record has both an instruction and an output.
    """
    # "utf-8-sig" reads plain UTF-8 and also tolerates a byte-order mark at the
    # very start of the file, which json.load would otherwise reject.
    with open(json_path, "r", encoding="utf-8-sig") as f:
        data = json.load(f)

    training_data = []
    for item in data:
        # Some exports carry a byte-order mark glued onto the first key name;
        # accept both spellings. The escape makes the invisible character explicit.
        instruction = item.get("\ufeffinstruction") or item.get("instruction")
        # .get() so that a record without "output" is skipped instead of raising.
        output = item.get("output")

        # Skip if any field is missing or empty
        if not instruction or not output:
            continue

        # One user turn followed by one assistant turn
        messages = [
            {"role": "user", "content": instruction},
            {"role": "assistant", "content": output},
        ]
        training_data.append({"text": format_messages_to_text(messages)})

    if not training_data:
        raise ValueError(
            f"No usable records (instruction + output) found in {json_path!r}"
        )

    # Convert to a Hugging Face dataset and split reproducibly
    dataset = Dataset.from_list(training_data)
    split_dataset = dataset.train_test_split(test_size=test_size, seed=seed)

    # Persist the split so later steps can reload it from disk
    split_dataset.save_to_disk(output_dir)

    return split_dataset

# Build the train/test splits from the raw JSON file, then report how many
# examples ended up in each split.
dataset = prepare_dataset("/workspace/FINETUNING/dataset/ml_dataset.json")
train_rows = len(dataset["train"])
test_rows = len(dataset["test"])
print(f"Train dataset size: {train_rows}")
print(f"Test dataset size: {test_rows}")

  from .autonotebook import tqdm as notebook_tqdm
Saving the dataset (1/1 shards): 100%|██████████| 8981/8981 [00:00<00:00, 100568.25 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 473/473 [00:00<00:00, 57358.21 examples/s]

Train dataset size: 8981
Test dataset size: 473



