Adapted from the Ray AIR getting-started guide: https://docs.ray.io/en/latest/ray-air/getting-started.html

In [1]:
import ray
from ray.data.preprocessors import Concatenator, Chain, StandardScaler
import numpy as np
import os

In [2]:
from ray_cluster_control import start_ray_cluster, stop_ray_cluster

Python-dotenv could not parse statement starting at line 4


In [None]:
# Helper imported from the project-local `ray_cluster_control` module.
# Presumably provisions the cluster that `ray.init` connects to in the next
# cell — confirm against that module.
start_ray_cluster()

In [None]:
# Connect through the Ray Client port (10001) of the head service, whose host
# name is derived from the RAY_CLUSTER_NAME environment variable.
ray.init(f"ray://{os.environ['RAY_CLUSTER_NAME']}-ray-head:10001")

In [5]:
# Read the breast-cancer CSV from S3 into a Ray Dataset.
# NOTE(review): the "anonymous@" prefix in the URI presumably requests
# unauthenticated access to the bucket — confirm against the Ray Data docs.
dataset = ray.data.read_csv("s3://anonymous@air-example-data/breast_cancer.csv")

In [6]:
# Split the data into a training part and a held-out part (test_size=0.3).
train_dataset, valid_dataset = dataset.train_test_split(test_size=0.3)

In [7]:
# Copy of the held-out split with the "target" label column removed.
test_dataset = valid_dataset.drop_columns(cols=["target"])
# Backward-compatible alias: the variable was originally misspelled.
tes_dataset = test_dataset

Map_Batches: 100%|██████████| 1/1 [00:01<00:00,  1.37s/it]


In [8]:
# Two-stage preprocessing pipeline:
#   1. standard-scale the "mean radius" and "mean texture" columns;
#   2. concatenate every column except "target" as float32 (the training loop
#      reads the result back as the "concat_out" batch entry).
preprocessor = Chain(
    StandardScaler(columns=["mean radius", "mean texture"]),
    Concatenator(exclude=["target"], dtype=np.float32),
)

In [9]:
import torch
import torch.nn as nn
from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present

from ray import train
from ray.air import session
from ray.air.config import ScalingConfig
from ray.train.torch import TorchCheckpoint, TorchTrainer


In [10]:
def linear_model(input_features):
    """Build the feed-forward binary classifier used by the training loop.

    The network is ``Linear -> ReLU -> Linear -> ReLU -> Linear -> Sigmoid``
    with two hidden layers of width 16 and a single sigmoid output.

    Args:
        input_features: Number of input features per example.

    Returns:
        An ``nn.Sequential`` holding the six layers in the order above.
    """
    hidden_width = 16
    # Layers are created in forward order so parameter initialisation draws
    # from the RNG in the same sequence as a literal nn.Sequential(...) call.
    layers = [
        nn.Linear(in_features=input_features, out_features=hidden_width),
        nn.ReLU(),
        nn.Linear(hidden_width, hidden_width),
        nn.ReLU(),
        nn.Linear(hidden_width, 1),
        nn.Sigmoid(),
    ]
    return nn.Sequential(*layers)

In [11]:
def train_loop_per_worker(config):
    """Train the classifier on this worker's shard of the "train" dataset.

    Passed to ``TorchTrainer`` as the per-worker training function. After every
    epoch it reports the loss of the last processed batch together with a
    checkpoint of the model.

    Args:
        config: Mapping with the keys ``"batch_size"``, ``"lr"``,
            ``"num_epochs"`` and ``"num_features"``.

    Raises:
        RuntimeError: If the dataset shard yields no batches in an epoch, so
            there is no loss to report.
    """
    batch_size = config["batch_size"]
    lr = config["lr"]
    epochs = config["num_epochs"]
    num_features = config["num_features"]

    train_data = train.get_dataset_shard("train")

    # Use the device Ray Train assigned to this worker instead of hard-coding
    # "cuda": the loop then also runs when the trainer is configured with
    # use_gpu=False, and picks the right GPU index on multi-GPU nodes.
    device = train.torch.get_device()

    # prepare_model already moves the model to the worker's device (and wraps
    # it for distributed training when needed), so no extra .to(...) follows.
    model = train.torch.prepare_model(linear_model(num_features))

    loss_fn = nn.BCELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    for cur_epoch in range(epochs):
        train_loss = None
        for batch in train_data.iter_torch_batches(
            batch_size=batch_size, dtypes=torch.float32
        ):
            inputs = batch["concat_out"].to(device)
            # BCELoss needs targets shaped like the (batch, 1) predictions.
            labels = batch["target"].unsqueeze(1).to(device)

            optimizer.zero_grad()
            predictions = model(inputs)
            train_loss = loss_fn(predictions, labels)
            train_loss.backward()
            optimizer.step()

        if train_loss is None:
            # Fail with a clear message rather than an UnboundLocalError.
            raise RuntimeError(
                f"Dataset shard 'train' produced no batches in epoch {cur_epoch}"
            )

        # The reported metric is the loss of the final batch of the epoch.
        loss = train_loss.item()
        session.report({"loss": loss}, checkpoint=TorchCheckpoint.from_model(model))

In [12]:
# Model input width: number of columns in the training schema minus one.
# NOTE(review): the -1 assumes "target" is the only non-feature column — confirm.
num_features = len(train_dataset.schema().names) - 1

In [13]:
# Trainer that runs `train_loop_per_worker` on a single GPU worker, feeding it
# the preprocessed training split under the "train" key.
trainer = TorchTrainer(
    train_loop_per_worker=train_loop_per_worker,
    train_loop_config={
        "batch_size": 128,
        "num_epochs": 20,
        "num_features": num_features,
        "lr": 0.001,
    },
    datasets={"train": train_dataset},
    preprocessor=preprocessor,
    scaling_config=ScalingConfig(
        num_workers=1,
        use_gpu=True,
        trainer_resources={"CPU": 1},
    ),
)

In [None]:
# Run the training job and keep the returned result for inspection below.
result = trainer.fit()

In [None]:
# Show the metrics attached to the training result.
print(f"Last result: {result.metrics}")

In [16]:
from ray import tune

In [17]:
# Search space for tuning: only the learning rate is varied, sampled
# log-uniformly between 1e-4 and 2e-2; it overrides "lr" in train_loop_config.
param_space = {
    "train_loop_config": {
        "lr": tune.loguniform(0.0001, 0.02),
    },
}
# Name of the reported metric that the tuner optimises.
metric = "loss"

In [18]:
from ray.tune.tuner import Tuner, TuneConfig 
from ray.air.config import RunConfig

In [19]:
# Hyperparameter search over the trainer: 5 samples drawn from `param_space`,
# minimising the metric named in `metric`.
tuner = Tuner(
    trainer,
    tune_config=TuneConfig(
        metric=metric,
        mode="min",
        num_samples=5,
    ),
    param_space=param_space,
)


In [None]:
# Run the hyperparameter search and keep the per-trial results.
result_grid = tuner.fit()

In [21]:
# get_best_result() is called without arguments.
# NOTE(review): it presumably falls back to the metric/mode given to
# TuneConfig when the tuner was built — confirm.
best_result = result_grid.get_best_result()
print("Best Result:", best_result)

Best Result: Result(metrics={'loss': 0.36218541860580444, '_timestamp': 1666013245, '_time_this_iter_s': 0.07092833518981934, '_training_iteration': 20, 'should_checkpoint': True, 'done': True, 'trial_id': '57c8f_00000', 'experiment_tag': '0_lr=0.0002'}, error=None, log_dir=PosixPath('/home/ray_results/TorchTrainer_2022-10-17_13-26-44/TorchTrainer_57c8f_00000_0_lr=0.0002_2022-10-17_13-26-44'))


In [None]:
# Helper imported from the project-local `ray_cluster_control` module.
# Presumably tears down the cluster started earlier in the notebook — confirm.
# It only runs if this cell is executed, so a failure in an earlier cell
# leaves the cluster up.
stop_ray_cluster()