In [1]:
import ray
ray.shutdown()
import os
os.environ["RAY_ML_DEV"] = "1"

In [None]:
SMOKE_TEST = 1

In [2]:
import ray
import torch
import pytorch_lightning as pl
import torch.nn.functional as F
from torch.utils.data import DataLoader, random_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
import numpy as np

  from pandas import MultiIndex, Int64Index


In [3]:
dataset = load_dataset('imdb')
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
num_classes = 2 # positive or negative sentiment

Reusing dataset imdb (/home/ray/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
from torch.data.utils import Subset
dataset["train"], _ = random_split(dataset["train"], [55000, 5000])
dataset["test"], _ = random_split(dataset["test"], [55000, 5000])

In [4]:
ray_datasets = {
    "train": ray.data.from_torch(dataset["train"]),
    "val": ray.data.from_torch(dataset["test"]),
    "test": ray.data.from_torch(dataset["test"])
}

find: ‘.git’: No such file or directory
2023-03-27 13:54:09,152	INFO worker.py:1413 -- Connecting to existing Ray cluster at address: 10.0.18.215:6379...
2023-03-27 13:54:09,161	INFO worker.py:1601 -- Connected to Ray cluster. View the dashboard at https://console.anyscale-staging.com/api/v2/sessions/ses_lakhijrn6mdv9hqpld5trwzlxz/services?redirect_to=dashboard 
2023-03-27 13:54:09,740	INFO packaging.py:519 -- Creating a file package for local directory '/tmp/ray_tmp_module/ray'.
2023-03-27 13:54:10,649	INFO packaging.py:346 -- Pushing file package 'gcs://_ray_pkg_c2c7e5e84d16379e.zip' (146.77MiB) to Ray cluster...
2023-03-27 13:54:13,159	INFO packaging.py:359 -- Successfully pushed file package 'gcs://_ray_pkg_c2c7e5e84d16379e.zip'.
2023-03-27 13:54:13,188	INFO packaging.py:346 -- Pushing file package 'gcs://_ray_pkg_9090977ce244131b05e81db05f6fe15d.zip' (0.18MiB) to Ray cluster...
2023-03-27 13:54:13,191	INFO packaging.py:359 -- Successfully pushed file package 'gcs://_ray_pkg_909097

In [5]:
ray_datasets["train"].take(1)


Output 1:   0%|          | 0/200 [00:00<?, ?it/s]
output: 199 queued 1:   0%|          | 1/200 [00:00<00:00, 411.41it/s]
                                                                      

[{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far b

In [6]:
from ray.data.preprocessors import BatchMapper


def map_fn(batch):
    batch["text"] = [
        np.array(tokenizer.encode(text, padding="max_length", truncation=True, max_length=512))
        for text in batch["text"]
    ]
    return batch


preprocessor = BatchMapper(fn=map_fn, batch_format="pandas")


In [7]:
transfored_ds = preprocessor.fit_transform(ray_datasets["train"])

In [8]:
def convert_numpy_to_tensor(batch):
    # token_ids = torch.Tensor(batch["text"]).cuda()
    # labels = torch.Tensor(batch["label"]).cuda()
    token_ids = torch.LongTensor(batch["text"])
    labels = torch.LongTensor(batch["label"])
    return (token_ids, labels)



# for batch in transfored_ds.iter_torch_batches(batch_size=3, collate_fn=collate):
#     text, label = batch
#     print(text, type(text), text.shape)
#     print(label, type(label), label.shape)
#     break

In [9]:
# for batch in transfored_ds.iter_torch_batches(batch_size=3, collate_fn=convert_numpy_to_tensor):
#     print(batch)
#     input()

In [10]:
class SentimentModel(pl.LightningModule):
    def __init__(self, lr=2e-5, eps=1e-8):
        super().__init__()
        self.lr = lr
        self.eps = eps
        self.num_classes = 2
        self.model = AutoModelForSequenceClassification.from_pretrained('bert-base-cased', num_labels=self.num_classes)

    def forward(self, x):
        outputs = self.model(x)
        logits = outputs.logits
        return logits

    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self.forward(x)
        loss = F.cross_entropy(logits.view(-1, self.num_classes), y)
        self.log('train_loss', loss, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self.forward(x)
        loss = F.cross_entropy(logits.view(-1, self.num_classes), y)
        preds = torch.argmax(logits, dim=1)
        acc = torch.sum(preds == y).float() / len(preds)
        self.log('val_loss', loss, on_epoch=True)
        self.log('val_accuracy', acc, on_epoch=True)

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self.lr, eps=self.eps)


In [11]:
from ray.train.lightning import LightningTrainer, LightningConfigBuilder
from ray.air.config import RunConfig, ScalingConfig, CheckpointConfig
import ray.tune as tune


lightning_config = (
    LightningConfigBuilder()
    .module(cls=SentimentModel, lr=tune.loguniform(1e-3, 1e-5), eps=1e-8)
    .trainer(max_epochs=3, accelerator="gpu")
    .build()
)

datasets_iter_config = {"batch_size": tune.choice([16, 32, 64]), "collate_fn": convert_numpy_to_tensor}

run_config = RunConfig(
    name="ptl-advanced-example",
    checkpoint_config=CheckpointConfig(
        num_to_keep=2,
        checkpoint_score_attribute="val_accuracy",
        checkpoint_score_order="max",
    ),
)

scaling_config = ScalingConfig(
    num_workers=2, use_gpu=True, resources_per_worker={"CPU": 1, "GPU": 1}
)

trainer = LightningTrainer(
    datasets=ray_datasets,
    scaling_config=scaling_config
)


2023-03-27 13:54:16,609	INFO instantiator.py:21 -- Created a temporary directory at /tmp/tmpue4psw2f
2023-03-27 13:54:16,610	INFO instantiator.py:76 -- Writing /tmp/tmpue4psw2f/_remote_module_non_scriptable.py


In [12]:
from ray.tune.schedulers import PopulationBasedTraining


mutation_lightning_config = (
    LightningConfigBuilder()
    .module(
        lr=tune.loguniform(1e-3, 1e-5),
    )
    .build()
)

tuner = tune.Tuner(
    trainer,
    param_space={"lightning_config": lightning_config, "datasets_iter_config": datasets_iter_config},
    run_config=ray.air.RunConfig(
        name="release-tuner-test",
        verbose=2,
        checkpoint_config=CheckpointConfig(
            num_to_keep=2,
            checkpoint_score_attribute="val_accuracy",
            checkpoint_score_order="max",
        ),
    ),
    tune_config=tune.TuneConfig(
        metric="val_accuracy",
        mode="max",
        num_samples=4,
        scheduler=PopulationBasedTraining(
            time_attr="training_iteration",
            hyperparam_mutations={"lightning_config": mutation_lightning_config},
            perturbation_interval=1,
        ),
    ),
)

In [13]:
# tuner.fit()

In [16]:
lightning_config = (
    LightningConfigBuilder()
    .module(cls=SentimentModel, lr=1e-5, eps=1e-8)
    .trainer(max_epochs=3, accelerator="gpu")
    .build()
)

datasets_iter_config = {"batch_size": 16, "collate_fn": convert_numpy_to_tensor}

single_trainer = LightningTrainer(
    lightning_config=lightning_config,
    datasets=ray_datasets,
    scaling_config=scaling_config,
    datasets_iter_config=datasets_iter_config,
    preprocessor=preprocessor,
)

In [None]:
result = single_trainer.fit()

In [18]:
result

Result(
  metrics={'_report_on': 'train_epoch_end', 'train_loss': 0.7121134400367737, 'train_loss_step': 0.6841170787811279, 'val_loss': 0.6932439208030701, 'val_accuracy': 0.5, 'train_loss_epoch': 0.7121134400367737, 'epoch': 2, 'step': 2346, 'should_checkpoint': True, 'done': True, 'trial_id': 'c0189_00000', 'experiment_tag': '0'},
  path='/home/ray/ray_results/LightningTrainer_2023-03-27_13-55-48/LightningTrainer_c0189_00000_0_2023-03-27_13-55-50',
  checkpoint=LightningCheckpoint(local_path=/home/ray/ray_results/LightningTrainer_2023-03-27_13-55-48/LightningTrainer_c0189_00000_0_2023-03-27_13-55-50/checkpoint_000002)
)