In [1]:
import datasource
import torch
import sklearn.preprocessing
import numpy as np
from ray.air import session
from ray.air.checkpoint import Checkpoint
from ray.train.torch import TorchCheckpoint


def train_loop_per_worker(config):
    """Per-worker training function handed to Ray Train's ``TorchTrainer``.

    Instantiates the model, loss and optimizer from dotted class paths found in
    ``config``, then repeats "one pass over the train shard, one pass over the
    test shard" indefinitely, reporting metrics and a checkpoint after each pass.
    The loop has no exit condition of its own; it relies on the caller (for
    example the stoppers configured in ``RunConfig``) to end the run.

    Args:
        config: dict with the keys
            ``'model'``, ``'criterion'``, ``'optimizer'`` -- dotted import paths
            such as ``'torch.nn.MSELoss'``; a name without a dot is looked up in
            ``__main__``.
            ``'model_args'``, ``'optimizer_args'`` -- keyword arguments forwarded
            to the model and optimizer constructors.

    Reports (via ``session.report``) after every epoch:
        ``rmse``, ``mae``, ``mape`` -- per-batch sklearn metrics averaged with
        batch sizes as weights, plus a dict checkpoint holding the model and
        optimizer ``state_dict``.
    """
    import importlib

    # Only `sklearn.preprocessing` is imported at file level; importing the
    # submodule explicitly guarantees `sklearn.metrics` is actually loaded.
    import sklearn.metrics

    def _import_class(name: str):
        """Resolve ``pkg.module.Class``; bare names are resolved in ``__main__``."""
        module_name, _, class_name = name.rpartition('.')
        return getattr(importlib.import_module(module_name or '__main__'), class_name)

    # Shard names must match the keys of the `datasets` dict given to the trainer.
    train_data = session.get_dataset_shard('train')
    test_data = session.get_dataset_shard('test')

    index_X = 'FSR_for_force'
    index_y = 'force'

    # NOTE(review): `data` is not defined in this function; it is the
    # notebook-level DataFrame created in a later cell and is only used here to
    # count feature/target columns. The function fails with NameError if that
    # cell has not run first -- consider passing the sizes through `config`.
    input_size = len(data.loc[:, index_X].columns)
    output_size = len(data.loc[:, index_y].columns)

    model = _import_class(config['model'])(
        input_size=input_size,
        output_size=output_size,
        **config['model_args'],
    )
    criterion = _import_class(config['criterion'])()
    optimizer = _import_class(config['optimizer'])(model.parameters(), **config['optimizer_args'])
    # NOTE(review): the model is not wrapped with Ray Train's `prepare_model`,
    # so with more than one worker each worker appears to train its own
    # unsynchronised copy -- confirm whether that is intended.

    while True:
        model.train()
        # Batches arrive as dicts keyed by column name; 'X' and 'y' are the
        # columns assembled by `combine_column` in the preprocessing chain.
        for batch in train_data.iter_torch_batches(dtypes=torch.float32):
            X, y = batch['X'], batch['y']
            pred = model(X)
            loss = criterion(pred, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        model.eval()
        mae, mse, mape, num = [], [], [], []
        with torch.inference_mode():
            for batch in test_data.iter_torch_batches(dtypes=torch.float32):
                X, y = batch['X'], batch['y']
                pred = model(X)
                mae.append(sklearn.metrics.mean_absolute_error(y, pred))
                mse.append(sklearn.metrics.mean_squared_error(y, pred))
                mape.append(sklearn.metrics.mean_absolute_percentage_error(y, pred))
                num.append(len(y))
        # Weight each batch's metric by its row count so a short final batch
        # does not count as much as a full one.
        mae = np.average(mae, weights=num)
        mse = np.average(mse, weights=num)
        mape = np.average(mape, weights=num)
        rmse = mse ** 0.5

        session.report(
            dict(rmse=rmse, mae=mae, mape=mape),
            checkpoint=Checkpoint.from_dict(
                dict(model=model.state_dict(), optimizer=optimizer.state_dict()),
            ),
        )


In [2]:
import ray.data
import datasource
import numpy as np
# Load the full table and the time-based train/test index splits.
data = datasource.get_data()
train_indexes, test_indexes = datasource.get_index_splited_by_time(data)

# Columns exported to Ray for every split.
selected_columns = ['FSR_for_force', 'force', 'group']

# Tag every training split with its position in the split list, then flatten
# all splits into one array whose rows become the items of the Ray dataset.
for group_id, split_index in enumerate(train_indexes):
    data.loc[split_index, 'group'] = group_id
train_blocks = [data.loc[split_index, selected_columns].to_numpy() for split_index in train_indexes]
train_ds = ray.data.from_items(np.concatenate(train_blocks))

# Same procedure for the test splits. This writes to the same 'group' column,
# so it must run after `train_ds` has been materialised above.
for group_id, split_index in enumerate(test_indexes):
    data.loc[split_index, 'group'] = group_id
test_blocks = [data.loc[split_index, selected_columns].to_numpy() for split_index in test_indexes]
test_ds = ray.data.from_items(np.concatenate(test_blocks))
def split_column(batch):
    """Explode the packed ``'item'`` array into 13 string-named columns.

    Args:
        batch: mapping with an ``'item'`` entry that supports ``[..., i]``
            indexing for ``i`` in ``0..12`` (e.g. a NumPy array whose last axis
            has length 13).

    Returns:
        The same ``batch`` object, mutated in place: keys ``'0'`` .. ``'12'``
        hold the slices along the last axis and ``'item'`` is removed.
    """
    for i in range(13):
        # Each slice was previously assigned twice; once is sufficient.
        batch[str(i)] = batch['item'][..., i]
    batch.pop('item')
    return batch
def combine_column(batch):
    """Fold the 13 string-named columns back into ``'X'``, ``'y'`` and ``'group'``.

    Columns ``'0'``..``'5'`` are placed side by side as ``'X'``, columns
    ``'6'``..``'11'`` as ``'y'``, and column ``'12'`` is kept as ``'group'``.
    The numbered columns are removed. ``batch`` is mutated in place and returned.
    """
    feature_keys = [str(col) for col in range(6)]
    target_keys = [str(col) for col in range(6, 12)]
    group_key = str(12)

    # Stacking along a new axis 1 turns six length-n columns into an (n, 6) block.
    features = np.stack([batch[key] for key in feature_keys], axis=1)
    targets = np.stack([batch[key] for key in target_keys], axis=1)
    groups = batch[group_key]

    for key in feature_keys + target_keys + [group_key]:
        batch.pop(key)

    batch['X'] = features
    batch['y'] = targets
    batch['group'] = groups
    return batch
def group_row(group):
    """Collapse a group's ``'X'`` and ``'y'`` rows into one stacked array each.

    Returns a dict with keys ``'X'`` and ``'y'``; each value is a one-element
    list holding the row-wise ``np.vstack`` of the corresponding input entries.
    """
    stacked = {}
    for field in ('X', 'y'):
        stacked[field] = [np.vstack(group[field])]
    return stacked
from ray.data.preprocessors import SimpleImputer, StandardScaler, BatchMapper, Chain
# Names of the 12 feature/target columns produced by `split_column`
# ('0'..'11'); column '12' is the group label and is neither imputed nor scaled.
# These must be real lists: a `map` object is a one-shot iterator, so a
# preprocessor that walks its column list once while fitting would find it
# empty when transforming.
numeric_columns = [str(i) for i in range(12)]

split_mapper = BatchMapper(split_column, batch_format='numpy')
imputer = SimpleImputer(list(numeric_columns))
scaler = StandardScaler(list(numeric_columns))
combine_mapper = BatchMapper(combine_column, batch_format='numpy')
# Order matters: unpack -> impute -> scale -> repack into X / y / group.
preprocessor = Chain(split_mapper, imputer, scaler, combine_mapper)

2023-07-04 16:07:41,617	INFO worker.py:1627 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m

Learn more here: https://docs.ray.io/en/master/data/faq.html#migrating-to-strict-mode[0m


In [3]:
from ray.air.config import ScalingConfig, RunConfig, CheckpointConfig
from ray.air.integrations.wandb import WandbLoggerCallback
from ray.train.torch import TorchTrainer
from ray.tune.stopper import TrialPlateauStopper, ExperimentPlateauStopper, CombinedStopper
from ray.data.preprocessors import Chain, SimpleImputer, MaxAbsScaler, MinMaxScaler, PowerTransformer, RobustScaler, StandardScaler

# Settings passed to `train_loop_per_worker` as its `config` argument.
# Class paths are dotted strings that the loop resolves at run time.
loop_config = dict(
    model='fsr_model.LSTM',
    model_args=dict(
        hidden_size=8,
        num_layer=1,
    ),
    criterion='torch.nn.MSELoss',
    optimizer='torch.optim.Adam',
    optimizer_args=dict(
        lr=1e-3,
    ),
    # NOTE(review): no code shown in this notebook reads the 'scaler' entry.
    scaler='sklearn.preprocessing.StandardScaler',
)

# Two CPU-only workers.
worker_scaling = ScalingConfig(
    num_workers=2,
    use_gpu=False,
)

# Stop criteria: both stoppers watch the reported 'rmse' metric.
plateau_stopper = CombinedStopper(
    TrialPlateauStopper(metric='rmse'),
    ExperimentPlateauStopper(metric='rmse'),
)

# Retain the three checkpoints with the lowest 'rmse'.
checkpoint_policy = CheckpointConfig(
    num_to_keep=3,
    checkpoint_score_attribute='rmse',
    checkpoint_score_order='min',
)

experiment_run = RunConfig(
    # Weights & Biases logging is currently switched off:
    # callbacks=[
    #     WandbLoggerCallback(project='FSR-prediction'),
    # ],
    stop=plateau_stopper,
    checkpoint_config=checkpoint_policy,
)

trainer = TorchTrainer(
    train_loop_per_worker=train_loop_per_worker,
    train_loop_config=loop_config,
    scaling_config=worker_scaling,
    run_config=experiment_run,
    datasets={
        'train': train_ds,
        'test': test_ds,
    },
    preprocessor=preprocessor,
)

result = trainer.fit()
print(f"Last result: {result.metrics}")

0,1
Current time:,2023-07-04 16:08:13
Running for:,00:00:25.31
Memory:,3.9/7.7 GiB

Trial name,# failures,error file
TorchTrainer_7b2f6_00000,1,/home/seokj/ray_results/TorchTrainer_2023-07-04_16-07-47/TorchTrainer_7b2f6_00000_0_2023-07-04_16-07-47/error.txt

Trial name,status,loc
TorchTrainer_7b2f6_00000,ERROR,172.26.215.93:1063285


(pid=1063285) - RandomizeBlockOrder 1:   0%|          | 0/200 [00:00<?, ?it/s]

(pid=1063285) - Aggregate 2:   0%|          | 0/200 [00:00<?, ?it/s]

(pid=1063285) SortSample 3:   0%|          | 0/200 [00:00<?, ?it/s]

(pid=1063285) ShuffleMap 4:   0%|          | 0/200 [00:00<?, ?it/s]

(pid=1063285) ShuffleReduce 5:   0%|          | 0/200 [00:00<?, ?it/s]

(pid=1063285) Running 0:   0%|          | 0/200 [00:00<?, ?it/s]

[2m[36m(TorchTrainer pid=1063285)[0m 
[2m[36m(TorchTrainer pid=1063285)[0m Learn more here: https://docs.ray.io/en/master/data/faq.html#migrating-to-strict-mode[0m
[2m[36m(TorchTrainer pid=1063285)[0m 2023-07-04 16:07:52,165	INFO dataset.py:2087 -- Tip: Use `take_batch()` instead of `take() / show()` to return records in pandas or numpy batch format.
[2m[36m(TorchTrainer pid=1063285)[0m 2023-07-04 16:07:52,169	INFO streaming_executor.py:91 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[BatchMapper] -> AllToAllOperator[RandomizeBlockOrder] -> AllToAllOperator[Aggregate]
[2m[36m(TorchTrainer pid=1063285)[0m 2023-07-04 16:07:52,169	INFO streaming_executor.py:92 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[2m[36m(TorchTrainer pid=1063285)[0m 2023-07-04 16:07:52,169	INFO streaming

(pid=1063285) - RandomizeBlockOrder 1:   0%|          | 0/200 [00:00<?, ?it/s]

(pid=1063285) Running 0:   0%|          | 0/200 [00:00<?, ?it/s]

[2m[36m(TorchTrainer pid=1063285)[0m 2023-07-04 16:07:57,492	INFO streaming_executor.py:91 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[BatchMapper] -> AllToAllOperator[RandomizeBlockOrder]
[2m[36m(TorchTrainer pid=1063285)[0m 2023-07-04 16:07:57,492	INFO streaming_executor.py:92 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[2m[36m(TorchTrainer pid=1063285)[0m 2023-07-04 16:07:57,492	INFO streaming_executor.py:94 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`
[2m[36m(TorchTrainer pid=1063285)[0m 2023-07-04 16:07:58,433	INFO streaming_executor.py:149 -- Shutting down <StreamingExecutor(Thread-5, stopped daemon 139818340173376)>.
[2m[36m(TorchTrainer pid=1063285)[0m 2023-07-04 16:07:59,878	INFO streaming_

(pid=1063285) - Aggregate 1:   0%|          | 0/200 [00:00<?, ?it/s]

(pid=1063285) SortSample 2:   0%|          | 0/200 [00:00<?, ?it/s]

(pid=1063285) ShuffleMap 3:   0%|          | 0/200 [00:00<?, ?it/s]

(pid=1063285) ShuffleReduce 4:   0%|          | 0/200 [00:00<?, ?it/s]

(pid=1063285) Running 0:   0%|          | 0/200 [00:00<?, ?it/s]

[2m[36m(TorchTrainer pid=1063285)[0m 2023-07-04 16:07:59,888	INFO streaming_executor.py:91 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[SimpleImputer] -> AllToAllOperator[Aggregate]
[2m[36m(TorchTrainer pid=1063285)[0m 2023-07-04 16:07:59,888	INFO streaming_executor.py:92 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[2m[36m(TorchTrainer pid=1063285)[0m 2023-07-04 16:07:59,888	INFO streaming_executor.py:94 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


(pid=1063285) Running 0:   0%|          | 0/200 [00:00<?, ?it/s]

[2m[36m(TorchTrainer pid=1063285)[0m 2023-07-04 16:08:01,111	INFO streaming_executor.py:91 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[SimpleImputer]
[2m[36m(TorchTrainer pid=1063285)[0m 2023-07-04 16:08:01,111	INFO streaming_executor.py:92 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[2m[36m(TorchTrainer pid=1063285)[0m 2023-07-04 16:08:01,112	INFO streaming_executor.py:94 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`
[2m[36m(TorchTrainer pid=1063285)[0m 2023-07-04 16:08:02,243	INFO streaming_executor.py:149 -- Shutting down <StreamingExecutor(Thread-8, stopped daemon 139818348566080)>.
[2m[36m(TorchTrainer pid=1063285)[0m 2023-07-04 16:08:03,877	INFO streaming_executor.py:149 -- Shutting down <Strea

(pid=1063285) - RandomizeBlockOrder 1:   0%|          | 0/200 [00:00<?, ?it/s]

(pid=1063285) Running 0:   0%|          | 0/200 [00:00<?, ?it/s]

[2m[36m(TorchTrainer pid=1063285)[0m 2023-07-04 16:08:08,855	INFO streaming_executor.py:91 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[BatchMapper->SimpleImputer->StandardScaler->BatchMapper] -> AllToAllOperator[RandomizeBlockOrder]
[2m[36m(TorchTrainer pid=1063285)[0m 2023-07-04 16:08:08,855	INFO streaming_executor.py:92 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[2m[36m(TorchTrainer pid=1063285)[0m 2023-07-04 16:08:08,856	INFO streaming_executor.py:94 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`
[2m[36m(TorchTrainer pid=1063285)[0m 2023-07-04 16:08:11,591	INFO streaming_executor.py:149 -- Shutting down <StreamingExecutor(Thread-10, stopped daemon 139818340173376)>.
2023-07-04 16:08:13,052	ERROR tune

Trial name,date,hostname,node_ip,pid,timestamp,trial_id
TorchTrainer_7b2f6_00000,2023-07-04_16-07-52,DESKTOP-0P789CI,172.26.215.93,1063285,1688454472,7b2f6_00000


2023-07-04 16:08:13,076	ERROR tune.py:1107 -- Trials did not complete: [TorchTrainer_7b2f6_00000]
2023-07-04 16:08:13,078	INFO tune.py:1111 -- Total run time: 25.37 seconds (25.30 seconds for the tuning loop).
- /home/seokj/ray_results/TorchTrainer_2023-07-04_16-07-47/TorchTrainer_7b2f6_00000_0_2023-07-04_16-07-47


TrainingFailedError: The Ray Train run failed. Please inspect the previous error messages for a cause. After fixing the issue (assuming that the error is not caused by your own application logic, but rather an error such as OOM), you can restart the run from scratch or continue this run.
To continue this run, you can use: `trainer = TorchTrainer.restore("/home/seokj/ray_results/TorchTrainer_2023-07-04_16-07-47")`.
To start a new run that will retry on training failures, set `air.RunConfig(failure_config=air.FailureConfig(max_failures))` in the Trainer's `run_config` with `max_failures > 0`, or `max_failures = -1` for unlimited retries.