Feature/raft fine tuning #874

Open

wants to merge 58 commits into base: main

Changes from all commits
58 commits
85b7227  config file to train with deep lake and raft (efenocchi, Apr 16, 2024)
3574e99  introduce RAFT settings (efenocchi, Apr 16, 2024)
d256304  raft dataset setting (efenocchi, Apr 16, 2024)
09c54cd  deeplake dataloader setting (efenocchi, Apr 16, 2024)
a4c85be  temporal deeplake info (efenocchi, Apr 16, 2024)
e3f0de9  prompt fixing (efenocchi, Apr 17, 2024)
6e13a0d  Deep Lake Dataloader (efenocchi, Apr 17, 2024)
c4b0b4b  Deep Lake Dataloader (efenocchi, Apr 21, 2024)
a726188  Deep Lake Dataloader test (efenocchi, Apr 21, 2024)
ded8f53  Deep Lake Dataloader test (efenocchi, Apr 21, 2024)
dd165d9  config file to train with deep lake and raft (efenocchi, Apr 16, 2024)
d63c1df  introduce RAFT settings (efenocchi, Apr 16, 2024)
67061e5  raft dataset setting (efenocchi, Apr 16, 2024)
4140002  deeplake dataloader setting (efenocchi, Apr 16, 2024)
69e02d9  temporal deeplake info (efenocchi, Apr 16, 2024)
d0cff7c  prompt fixing (efenocchi, Apr 17, 2024)
a581df3  Deep Lake Dataloader (efenocchi, Apr 17, 2024)
cd866fc  Deep Lake Dataloader (efenocchi, Apr 21, 2024)
9281c0f  Deep Lake Dataloader test (efenocchi, Apr 21, 2024)
61fcb93  Deep Lake Dataloader test (efenocchi, Apr 21, 2024)
00b026b  Merge remote-tracking branch 'origin/main' (efenocchi, Apr 21, 2024)
93f35d5  Deep Lake Dataloader test (efenocchi, Apr 22, 2024)
9d2dd39  Deep Lake Dataloader test (efenocchi, Apr 23, 2024)
108d146  setting fix (efenocchi, Apr 23, 2024)
6055bf2  raft file configuration (efenocchi, Apr 23, 2024)
ac54a33  instruction file for raft format training (efenocchi, Apr 23, 2024)
947aefc  Deep Lake Dataloader (efenocchi, Apr 23, 2024)
67de756  initial code restored (efenocchi, Apr 23, 2024)
20c7459  initial code restored (efenocchi, Apr 23, 2024)
00cbd7b  instruct_dataset function renamed to instruct_dataset_raft (efenocchi, Apr 24, 2024)
42b7602  yaml update (efenocchi, Apr 24, 2024)
5309bd3  yaml update (efenocchi, Apr 24, 2024)
9034b73  no load env (efenocchi, Apr 24, 2024)
140d950  no load env (efenocchi, Apr 24, 2024)
ec88d9b  Merge remote-tracking branch 'upstream/main' into feature/raft-fine-t… (efenocchi, Apr 25, 2024)
0033035  Update pyproject.toml (efenocchi, Apr 25, 2024)
8f5ae10  Update pyproject.toml (efenocchi, Apr 25, 2024)
b83bcf0  added fixed dataset (efenocchi, Apr 25, 2024)
c7a635a  deeplake dependency (efenocchi, Apr 25, 2024)
1aa8576  test deep lake dataloader (efenocchi, Apr 25, 2024)
767531e  test raft dataset (efenocchi, Apr 25, 2024)
6dff9f9  test instruct raft (efenocchi, Apr 25, 2024)
687b98a  default train parameter (efenocchi, Apr 25, 2024)
c6583da  Merge remote-tracking branch 'origin/feature/raft-fine-tuning' into f… (efenocchi, Apr 25, 2024)
43d74af  Merge remote-tracking branch 'upstream/main' into feature/raft-fine-t… (efenocchi, Apr 25, 2024)
23b3492  import Dataloader init (efenocchi, Apr 25, 2024)
63d36c4  Fix formatting (efenocchi, Apr 26, 2024)
54a9b14  Fix formatting (efenocchi, Apr 26, 2024)
2a50585  Fix formatting (efenocchi, Apr 26, 2024)
2fa0b4a  fix end of files (efenocchi, May 4, 2024)
197b34a  Refined project prompt to closely align with the structure outlined i… (efenocchi, May 6, 2024)
d98395e  Merge remote-tracking branch 'origin/main' into feature/raft-fine-tuning (efenocchi, May 6, 2024)
f016b06  Merge remote-tracking branch 'origin/main' into feature/raft-fine-tuning (efenocchi, May 6, 2024)
04d9f6a  just train on input (efenocchi, May 7, 2024)
6a89fff  max_seq_len update (efenocchi, May 8, 2024)
6b80353  max_seq_len update, remove train_on_input (efenocchi, May 8, 2024)
50f6540  repice update (efenocchi, May 9, 2024)
93c864e  Merged updates from upstream/main (efenocchi, May 19, 2024)
1 change: 1 addition & 0 deletions pyproject.toml
@@ -47,6 +47,7 @@ dev = [
"pytest-integration",
"tensorboard",
"wandb",
"deeplake"
Contributor:
Does deeplake have any upstream dependencies? Wanna make sure we're aware of what we're pulling in here

Author:
If I'm not mistaken, deeplake requires: aioboto3, boto3, click, humbug, libdeeplake, lz4, nest-asyncio, numpy, pathos, pillow, pydantic, pyjwt, tqdm (tested with pip show deeplake; tell me if something is wrong).

]

[tool.setuptools.dynamic]
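As a quick cross-check of the dependency list discussed in the thread above, the small standard-library sketch below prints what deeplake declares as install requirements; it assumes only that deeplake is already installed in the current environment (roughly what pip show deeplake reports under "Requires").

# Sketch: list deeplake's declared dependencies from its installed package metadata.
from importlib.metadata import requires

for requirement in requires("deeplake") or []:
    print(requirement)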
94 changes: 94 additions & 0 deletions recipes/configs/llama2/7B_lora_single_device_deep_lake_raft.yaml
@@ -0,0 +1,94 @@
# Config for single device LoRA finetuning in lora_finetune_single_device.py
Contributor:
I don't think you need to create separate configs for a new dataset. In general we would recommend either CLI overrides or tune cp + modifying the config locally rather than adding an entirely new file (we already have a bunch of configs as it is, and we're trying not to proliferate them any more than strictly necessary).

Author:
I thought it would be helpful to have an example because it's not just another dataset but another technique to fine-tune with. I thought this example would make it easier to use the technique.

# using a Llama2 7B model
#
# This config assumes that you've run the following command before launching
# this run:
# tune download meta-llama/Llama-2-7b-hf --output-dir /tmp/Llama-2-7b-hf --hf-token <HF_TOKEN>
#
# To launch on a single device, run the following command from root:
# tune run lora_finetune_single_device --config llama2/7B_lora_single_device
#
# You can add specific overrides through the command line. For example
# to override the checkpointer directory while launching training
# you can run:
# tune run lora_finetune_single_device --config 7B_lora_single_device checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
#
# This config works only for training on single device.


# Model Arguments
model:
  _component_: torchtune.models.llama2.lora_llama2_7b
  lora_attn_modules: ['q_proj', 'v_proj']
  apply_lora_to_mlp: False
  apply_lora_to_output: False
  lora_rank: 8
  lora_alpha: 16

tokenizer:
  _component_: torchtune.models.llama2.llama2_tokenizer
  path: /tmp/Llama-2-7b-hf/tokenizer.model

checkpointer:
  _component_: torchtune.utils.FullModelHFCheckpointer
  checkpoint_dir: /tmp/Llama-2-7b-hf
  checkpoint_files: [
    pytorch_model-00001-of-00002.bin,
    pytorch_model-00002-of-00002.bin
  ]
  adapter_checkpoint: null
  recipe_checkpoint: null
  output_dir: /tmp/Llama-2-7b-hf
  model_type: LLAMA2
resume_from_checkpoint: False

# Dataset and Sampler
dataset:
  _component_: torchtune.datasets.raft_dataset
  train_on_input: True
seed: null
shuffle: True
batch_size: 2

# Optimizer and Scheduler
optimizer:
  _component_: torch.optim.AdamW
  weight_decay: 0.01
  lr: 3e-4
lr_scheduler:
  _component_: torchtune.modules.get_cosine_schedule_with_warmup
  num_warmup_steps: 100

loss:
  _component_: torch.nn.CrossEntropyLoss

# Training
epochs: 1
max_steps_per_epoch: null
gradient_accumulation_steps: 64
compile: False

# Logging
output_dir: /tmp/lora_finetune_output
metric_logger:
  _component_: torchtune.utils.metric_logging.DiskLogger
  log_dir: ${output_dir}
log_every_n_steps: 1
log_peak_memory_stats: False

# Logging with W&B
#metric_logger:
#  _component_: torchtune.utils.metric_logging.WandBLogger
#  # the W&B project to log to
#  project: torchtune

# Environment
device: cuda
dtype: bf16
enable_activation_checkpointing: True

# Showcase the usage of the PyTorch profiler
# Set enabled to False as it's only needed for debugging training
profiler:
  _component_: torchtune.utils.profiler
  enabled: False
  output_dir: ${output_dir}/torchtune_perf_tracing.json
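To make the new config concrete, here is a minimal sketch of how its tokenizer and dataset entries could be resolved; the use of OmegaConf plus torchtune.config.instantiate is an assumption modeled on how the stock lora_finetune_single_device recipe consumes _component_ fields, not code taken from this PR.

# Hedged sketch: resolve the tokenizer and RAFT dataset from the new YAML the
# way existing single-device recipes resolve _component_ entries (assumption).
from omegaconf import OmegaConf

from torchtune import config

cfg = OmegaConf.load("recipes/configs/llama2/7B_lora_single_device_deep_lake_raft.yaml")

# The recipe builds the tokenizer first, then hands it to the dataset builder.
tokenizer = config.instantiate(cfg.tokenizer)
dataset = config.instantiate(cfg.dataset, tokenizer=tokenizer)
print(f"Loaded {len(dataset)} RAFT samples")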
91 changes: 91 additions & 0 deletions recipes/configs/llama3/8B_lora_single_device_deep_lake_raft.yaml
@@ -0,0 +1,91 @@
# Config for single device LoRA finetuning in lora_finetune_single_device.py
# using a Llama3 8B model
#
# This config assumes that you've run the following command before launching
# this run:
# tune download meta-llama/Meta-Llama-3-8B --output-dir /tmp/Meta-Llama-3-8B --hf-token <HF_TOKEN>
#
# To launch on a single device, run the following command from root:
# tune run lora_finetune_single_device --config llama3/8B_lora_single_device
#
# You can add specific overrides through the command line. For example
# to override the checkpointer directory while launching training
# you can run:
# tune run lora_finetune_single_device --config llama3/8B_lora_single_device checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
#
# This config works only for training on single device.


# Model Arguments
model:
  _component_: torchtune.models.llama3.lora_llama3_8b
  lora_attn_modules: ['q_proj', 'v_proj']
  apply_lora_to_mlp: False
  apply_lora_to_output: False
  lora_rank: 8
  lora_alpha: 16

# Tokenizer
tokenizer:
  _component_: torchtune.models.llama3.llama3_tokenizer
  path: llama3/original/tokenizer.model

checkpointer:
  _component_: torchtune.utils.FullModelMetaCheckpointer
  checkpoint_dir: llama3/original/
  checkpoint_files: [
    consolidated.00.pth
  ]
  recipe_checkpoint: null
  output_dir: /tmp/Meta-Llama-3-8B/
  model_type: LLAMA3
resume_from_checkpoint: False

# Dataset and Sampler
dataset:
  _component_: torchtune.datasets.raft_dataset
seed: null
shuffle: True
batch_size: 2

# Optimizer and Scheduler
optimizer:
  _component_: torch.optim.AdamW
  weight_decay: 0.01
  lr: 3e-4
lr_scheduler:
  _component_: torchtune.modules.get_cosine_schedule_with_warmup
  num_warmup_steps: 100

loss:
  _component_: torch.nn.CrossEntropyLoss

# Training
epochs: 1
max_steps_per_epoch: null
gradient_accumulation_steps: 64
compile: False

# Logging
output_dir: /tmp/lora_finetune_output
metric_logger:
  _component_: torchtune.utils.metric_logging.DiskLogger
  log_dir: ${output_dir}
log_every_n_steps: 1

# Logging with WandBLogger
#metric_logger:
#  _component_: torchtune.utils.metric_logging.WandBLogger
#  # the W&B project to log to
#  project: torchtune
#  log_every_n_steps: null

# Environment
device: cuda
dtype: bf16
enable_activation_checkpointing: True

# Profiler (disabled)
profiler:
  _component_: torchtune.utils.profiler
  enabled: False
57 changes: 57 additions & 0 deletions tests/torchtune/datasets/test_instruct_raft_dataset.py
@@ -0,0 +1,57 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from unittest import mock

from tests.test_utils import DummyTokenizer

from torchtune.datasets import InstructDatasetDeepLakeRAFT


def dummy_transform(sample):
    sample["instruction"] = sample["instruction"] + " asdfghjkl; "
    return sample


class DummyTemplate:
    def __init__(self, template):
        self.template = template

    def format(self, sample, column_map):
        return self.template.format(**sample)


class TestInstructDatasetDeepLakeRAFT:
    template = DummyTemplate("Instruction:\n{instruction}\n\nResponse: ")
    expected_tokenized_prompts = [
        [0, 12, 4, 2, 3, 2, 12, 10, 9, 1, 5, 4, 4, 3, 6, 2, 4, -1]
    ]

    def get_samples(self):
        return [
            {
                "instruction": "This is not an instruction.",
                "cot_answer": "I never know what I'm doing, do you?",
            },
        ]

    @mock.patch("torchtune.datasets._instruct_raft.load_deep_lake_dataset")
    def test_get_item_train(self, mock_load_deep_lake_dataset):
        mock_load_deep_lake_dataset.return_value = self.get_samples()
        expected_labels = self.expected_tokenized_prompts

        dataset = InstructDatasetDeepLakeRAFT(
            tokenizer=DummyTokenizer(),
            source="iam/agoofy/goober",
            template=self.template,
            transform=dummy_transform,
        )
        assert len(dataset) == 1
        mock_load_deep_lake_dataset.assert_called_once()

        prompt, label = dataset[0]
        assert prompt == self.expected_tokenized_prompts[0]
        assert label == expected_labels[0]
64 changes: 64 additions & 0 deletions tests/torchtune/datasets/test_raft_dataset.py
@@ -0,0 +1,64 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from unittest.mock import patch

import pytest

from tests.test_utils import get_assets_path
from torchtune.data._common import CROSS_ENTROPY_IGNORE_IDX
from torchtune.datasets import raft_dataset
from torchtune.modules.tokenizers import SentencePieceTokenizer


class TestRAFTDataset:
    @pytest.fixture
    def tokenizer(self):
        # m.model is a pretrained Sentencepiece model using the following command:
        # spm.SentencePieceTrainer.train('--input=<TRAIN_FILE> --model_prefix=m --vocab_size=2000')
        return SentencePieceTokenizer(str(get_assets_path() / "m.model"))

    @patch("torchtune.datasets._instruct_raft.load_deep_lake_dataset")
    def test_label_no_masking(self, load_deep_lake_dataset, tokenizer):
        """
        Test whether the input and the labels are correctly created when the input is not masked.
        """

        # mock the call to Deep Lake Datasets
        load_deep_lake_dataset.return_value = [
            {
                "instruction": """<DOCUMENT> Artificial Intelligence (AI) is revolutionizing industries worldwide,
                from healthcare to finance.By analyzing vast amounts of data, AI algorithms can detect patterns and
                make predictions with unprecedented accuracy.</DOCUMENT><DOCUMENT>In the realm of autonomous vehicles,
                AI plays a pivotal role in enabling safe and efficient navigation.
                Through real-time data processing and machine learning, self-driving cars can adapt to diverse road
                conditions and make split-second decisions.</DOCUMENT>
                <DOCUMENT>AI-powered virtual assistants, like Siri and Alexa, have become ubiquitous in our daily lives.
                These intelligent systems utilize natural language processing and machine learning algorithms
                to understand and respond to user queries,simplifying tasks and enhancing user experience.</DOCUMENT>
                What are some key applications of artificial intelligence across different industries?""",
                "cot_answer": """##Reason: To answer the question about the applications of artificial intelligence
                across different industries, we need to consider the provided context.
                Here is the step-by-step reasoning:
                Identify key examples of AI applications mentioned in the provided texts:
                Healthcare: AI is used for data analysis, pattern recognition, and predictive modeling.
                Autonomous vehicles: AI enables safe navigation and decision-making in self-driving cars.
                Virtual assistants: AI powers intelligent systems like Siri and Alexa for natural language
                processing and task automation.
                Summarize the main industries benefiting from AI: Healthcare, transportation, and consumer electronics.
                ##Answer: Some key applications of artificial intelligence across different industries
                include healthcare analytics, autonomous vehicles for transportation,and virtual assistants
                in consumer electronics.""",
            }
        ]

        raft_ds = raft_dataset(tokenizer=tokenizer)
        input, labels = raft_ds[0]

        assert len(input) == len(labels)
        assert labels[-1] == tokenizer.eos_id
        assert input[0] == tokenizer.bos_id
        assert CROSS_ENTROPY_IGNORE_IDX not in labels
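For readers unfamiliar with the RAFT layout mocked above, the short sketch below shows how such a sample maps onto a prompt/target pair; the template string is an assumption modeled on the DummyTemplate used in the instruct test, not the actual RAFTInstructTemplate shipped by the PR.

# Illustrative only: "instruction" carries the retrieved <DOCUMENT> context plus
# the question, and "cot_answer" carries chain-of-thought reasoning ending in
# "##Answer:". The template below is an assumption.
RAFT_TEMPLATE = "Instruction:\n{instruction}\n\nResponse: "


def format_raft_sample(sample: dict) -> tuple:
    """Return (prompt, target) strings for one RAFT-style sample."""
    prompt = RAFT_TEMPLATE.format(instruction=sample["instruction"])
    target = sample["cot_answer"]
    return prompt, target


prompt, target = format_raft_sample(
    {
        "instruction": "<DOCUMENT> AI helps doctors read scans. </DOCUMENT> Where is AI used?",
        "cot_answer": "##Reason: The document mentions medical imaging. ##Answer: Healthcare.",
    }
)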
65 changes: 65 additions & 0 deletions tests/torchtune/utils/test_deep_lake_dataloader.py
@@ -0,0 +1,65 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from unittest.mock import patch

import deeplake
from torchtune.datasets import DeepLakeDataloader, load_deep_lake_dataset


class TestDeepLakeDataloader:
    @patch("deeplake.dataset")
    def test_init(self, mock_dataset):
        dl = DeepLakeDataloader(mock_dataset)
        assert dl.ds == mock_dataset

    def test_len(self):
        ds = deeplake.dataset("test1", overwrite=True)
        ds.create_tensor("id", htype="text", exist_ok=True)
        ds.create_tensor("type", htype="text", exist_ok=True)
        ds.append(
            {
                "id": "id1",
                "type": "type1",
            }
        )
        ds.append(
            {
                "id": "id2",
                "type": "type2",
            }
        )
        dl = DeepLakeDataloader(ds)
        assert len(dl) == 2

    def test_get_item(self):
        ds = deeplake.dataset("test", overwrite=True)
        ds.create_tensor("id", htype="text", exist_ok=True)
        ds.create_tensor("type", htype="text", exist_ok=True)
        ds.append(
            {
                "id": "id1",
                "type": "type1",
            }
        )
        ds.append(
            {
                "id": "id2",
                "type": "type2",
            }
        )
        dl = DeepLakeDataloader(ds)
        assert dl[0] == {"id": "id1", "type": "type1"}
        assert dl[1] == {"id": "id2", "type": "type2"}


def test_load_deep_lake_dataset():
    with patch("deeplake.dataset") as mock_dataset:
        fake_ds = deeplake.dataset()
        mock_dataset.return_value = fake_ds
        dl = load_deep_lake_dataset("test", overwrite=True)
        assert isinstance(dl, DeepLakeDataloader)
        assert dl.ds == fake_ds
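The dataloader implementation itself is not part of this test hunk; based only on the behavior asserted above (a ds attribute, __len__, and __getitem__ returning a dict of text values), a minimal wrapper might look roughly like the sketch below. The .tensors mapping and .text() accessor are assumptions about the deeplake 3.x API rather than code from the PR.

# Rough reconstruction inferred from the tests above; not the PR's actual code.
import deeplake


class DeepLakeDataloaderSketch:
    def __init__(self, ds):
        self.ds = ds

    def __len__(self):
        return len(self.ds)

    def __getitem__(self, index):
        # Return one row as {tensor_name: text_value}, matching the assertion
        # dl[0] == {"id": "id1", "type": "type1"} above.
        return {name: self.ds[name][index].text() for name in self.ds.tensors}


def load_deep_lake_dataset_sketch(path, **kwargs):
    # The test patches deeplake.dataset, so the real helper presumably forwards
    # its arguments to deeplake.dataset and wraps the result.
    return DeepLakeDataloaderSketch(deeplake.dataset(path, **kwargs))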
2 changes: 2 additions & 0 deletions torchtune/data/__init__.py
@@ -19,6 +19,7 @@
    AlpacaInstructTemplate,
    GrammarErrorCorrectionTemplate,
    InstructTemplate,
    RAFTInstructTemplate,
    StackExchangedPairedTemplate,
    SummarizeTemplate,
)
@@ -27,6 +28,7 @@

__all__ = [
    "AlpacaInstructTemplate",
    "RAFTInstructTemplate",
    "ChatFormat",
    "CROSS_ENTROPY_IGNORE_IDX",
    "GrammarErrorCorrectionTemplate",