# Custom Dataset

In this example, we will load a dataset from `scikit-learn` and use it to create a custom `Dataset` object in _Olympus_.

In [28]:
%pip install olymp
%pip install silence_tensorflow tensorflow tensorflow_probability

Note: you may need to restart the kernel to use updated packages.
Collecting silence_tensorflow
  Downloading silence_tensorflow-1.2.1.tar.gz (3.8 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting tensorflow
  Using cached tensorflow-2.11.0-cp39-cp39-win_amd64.whl (1.9 kB)
Collecting tensorflow_probability
  Downloading tensorflow_probability-0.19.0-py2.py3-none-any.whl (6.7 MB)
     ---------------------------------------- 6.7/6.7 MB 14.3 MB/s eta 0:00:00
Collecting support_developer
  Downloading support_developer-1.0.5.tar.gz (4.9 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting tensorflow-intel==2.11.0
  Using cached tensorflow_intel-2.11.0-cp39-cp39-win_amd64.whl (266.3 MB)
Collecting keras<2.12,>=2.11.0
  Using cached keras-2.11.0-py2.py3-none-any.whl (1.7 MB)
Collecting astunparse>=1.6.0
  Using cached astunparse-1.6.3-py2.py3-none-any.whl (12 kB

In [2]:
import pandas as pd
import numpy as np
from olympus import Dataset

[0;37m[INFO]  ... proceeding with pickle database
    [This message will be shown only once]
[0m

Traceback (most recent call last):
  File "c:\Users\sterg\Miniconda3\envs\sdl-demo\lib\site-packages\olympus\plotter\__init__.py", line 47, in <module>
    import seaborn
ModuleNotFoundError: No module named 'seaborn'
        Please install seaborn to use the plotter
    [This message will be shown only once]
[0m

In [3]:
import json
with open("secrets.json", "r") as f:
    secrets = json.load(f)

In [4]:
from uuid import uuid4
from self_driving_lab_demo import SelfDrivingLabDemoLight, mqtt_observe_sensor_data
from self_driving_lab_demo.utils.observe import get_paho_client

pico_id = secrets["SPARKS_LAB"]
sensor_topic = f"sdl-demo/picow/{pico_id}/as7341/"

paho_client = get_paho_client(sensor_topic)

session_id = f"benchmark-dev-{str(uuid4())[0:4]}"
print(f"Session ID: {session_id}")
sdl = SelfDrivingLabDemoLight(
    autoload=False,
    observe_sensor_data_fn=mqtt_observe_sensor_data,
    observe_sensor_data_kwargs=dict(
        pico_id=pico_id, session_id=session_id, client=paho_client,
    ),
)


Session ID: benchmark-dev-fde4


In [5]:
parameters = [
    {"name": "R", "type": "range", "bounds": [0, 89]},
    {"name": "G", "type": "range", "bounds": [0, 89]},
    {"name": "B", "type": "range", "bounds": [0, 89]},
]

In [6]:
from ax.service.ax_client import AxClient
from ax.modelbridge.factory import get_sobol
from random import shuffle

num_sobol = 2 ** 7
num_repeats = 5

client = AxClient()
client.create_experiment(parameters=parameters)
m = get_sobol(
    client.experiment.search_space, seed=10, fallback_to_sample_polytope=True
)
gr = m.gen(n=num_sobol)
sobol_points = [arm.parameters for arm in gr.arms]
sobol_points = sobol_points * num_repeats # stays flat
shuffle(sobol_points)  # operates inplace
sobol_points[0:3]

[INFO 01-09 19:13:45] ax.service.ax_client: Starting optimization with verbose logging. To disable logging, set the `verbose_logging` argument to `False`. Note that float values in the logs are rounded to 6 decimal points.
[INFO 01-09 19:13:45] ax.service.utils.instantiation: Inferred value type of ParameterType.INT for parameter R. If that is not the expected value type, you can explicity specify 'value_type' ('int', 'float', 'bool' or 'str') in parameter dict.
[INFO 01-09 19:13:45] ax.service.utils.instantiation: Inferred value type of ParameterType.INT for parameter G. If that is not the expected value type, you can explicity specify 'value_type' ('int', 'float', 'bool' or 'str') in parameter dict.
[INFO 01-09 19:13:45] ax.service.utils.instantiation: Inferred value type of ParameterType.INT for parameter B. If that is not the expected value type, you can explicity specify 'value_type' ('int', 'float', 'bool' or 'str') in parameter dict.
[INFO 01-09 19:13:45] ax.service.utils.instan

[{'R': 89, 'G': 22, 'B': 36},
 {'R': 27, 'G': 87, 'B': 84},
 {'R': 57, 'G': 16, 'B': 1}]

In [7]:
from tqdm.notebook import tqdm
results = [sdl.observe_sensor_data(point) for point in tqdm(sobol_points)]
    

  0%|          | 0/640 [00:00<?, ?it/s]

In [35]:
from os import path

savepath = path.join(
    "..", "data", "processed", "olympus-clslab-light-basic-dataset.csv"
)


In [36]:
param_df = pd.DataFrame(sobol_points)
result_df = pd.DataFrame(results)
cat_df = pd.concat([param_df, result_df], axis=1)
cat_df.to_csv(savepath, index=False)
cat_df.head(5)


Unnamed: 0,R,G,B,utc_timestamp,background,ch470,ch410,ch440,sd_card_ready,ch510,ch550,ch670,utc_time_str,onboard_temperature_K,encrypted_device_id_truncated,logged_to_mongodb,ch620,device_nickname,ch583
0,89,22,36,1673316830,"{'ch583': 32, 'ch670': 44, 'ch510': 75, 'ch410...",3500,345,4826,True,1380,754,419,2023-1-10 02:13:50,293.6404,6307014457,False,11975,clslab-light-mixing-sparks-lab,7056
1,27,87,84,1673316835,"{'ch583': 32, 'ch670': 44, 'ch510': 75, 'ch410...",11106,565,13474,True,8672,1456,822,2023-1-10 02:13:55,294.1085,6307014457,True,2754,clslab-light-mixing-sparks-lab,1710
2,57,16,1,1673316844,"{'ch583': 32, 'ch670': 44, 'ch510': 75, 'ch410...",1201,140,641,True,773,504,161,2023-1-10 02:14:04,294.1085,6307014457,True,6908,clslab-light-mixing-sparks-lab,4104
3,26,66,5,1673316854,"{'ch583': 32, 'ch670': 44, 'ch510': 75, 'ch410...",3822,159,781,True,5996,954,230,2023-1-10 02:14:14,294.1085,6307014457,True,2191,clslab-light-mixing-sparks-lab,1358
4,60,46,72,1673316864,"{'ch583': 32, 'ch670': 44, 'ch510': 75, 'ch410...",7841,500,11260,True,4111,1043,687,2023-1-10 02:14:24,294.1085,6307014457,True,7568,clslab-light-mixing-sparks-lab,4504


In [37]:
cat_df = pd.read_csv(savepath)
cat_df.head(5)

Unnamed: 0,R,G,B,utc_timestamp,background,ch470,ch410,ch440,sd_card_ready,ch510,ch550,ch670,utc_time_str,onboard_temperature_K,encrypted_device_id_truncated,logged_to_mongodb,ch620,device_nickname,ch583
0,89,22,36,1673316830,"{'ch583': 32, 'ch670': 44, 'ch510': 75, 'ch410...",3500,345,4826,True,1380,754,419,2023-1-10 02:13:50,293.6404,6307014457,False,11975,clslab-light-mixing-sparks-lab,7056
1,27,87,84,1673316835,"{'ch583': 32, 'ch670': 44, 'ch510': 75, 'ch410...",11106,565,13474,True,8672,1456,822,2023-1-10 02:13:55,294.1085,6307014457,True,2754,clslab-light-mixing-sparks-lab,1710
2,57,16,1,1673316844,"{'ch583': 32, 'ch670': 44, 'ch510': 75, 'ch410...",1201,140,641,True,773,504,161,2023-1-10 02:14:04,294.1085,6307014457,True,6908,clslab-light-mixing-sparks-lab,4104
3,26,66,5,1673316854,"{'ch583': 32, 'ch670': 44, 'ch510': 75, 'ch410...",3822,159,781,True,5996,954,230,2023-1-10 02:14:14,294.1085,6307014457,True,2191,clslab-light-mixing-sparks-lab,1358
4,60,46,72,1673316864,"{'ch583': 32, 'ch670': 44, 'ch510': 75, 'ch410...",7841,500,11260,True,4111,1043,687,2023-1-10 02:14:24,294.1085,6307014457,True,7568,clslab-light-mixing-sparks-lab,4504


In [38]:
df = cat_df.loc[:, ["R", "G", "B", "onboard_temperature_K"] + sdl.channel_names]
df

Unnamed: 0,R,G,B,onboard_temperature_K,ch410,ch440,ch470,ch510,ch550,ch583,ch620,ch670
0,89,22,36,293.6404,345,4826,3500,1380,754,7056,11975,419
1,27,87,84,294.1085,565,13474,11106,8672,1456,1710,2754,822
2,57,16,1,294.1085,140,641,1201,773,504,4104,6908,161
3,26,66,5,294.1085,159,781,3822,5996,954,1358,2191,230
4,60,46,72,294.1085,500,11260,7841,4111,1043,4504,7568,687
...,...,...,...,...,...,...,...,...,...,...,...,...
635,3,70,83,294.5767,495,13333,10100,6785,1196,448,637,738
636,16,50,78,294.5767,440,12403,8556,4538,956,824,1303,655
637,16,50,78,294.5767,439,12407,8556,4538,956,824,1303,655
638,24,67,54,294.5767,364,8069,7384,6295,1098,1296,2086,533


In [39]:
sdl.channel_names

['ch410', 'ch440', 'ch470', 'ch510', 'ch550', 'ch583', 'ch620', 'ch670']

In [40]:
# pass the Dataframe as the data argument for Dataset and specify which one is the target variable
dataset = Dataset(data=df, target_ids=sdl.channel_names)

  arr = asarray(arr)


Now `dataset` is an instance of the _Olympus_ class `Dataset`. However, before we can use it to train a custom `Emulator`, we need to specicify the parameter space for this dataset/problem.

In [41]:
from olympus import ParameterSpace, Parameter

# initialise a parameter space object
param_space = ParameterSpace()

# add all features in the dataset as a variable in the parameter space
for feature in dataset.features:
    low = np.min(dataset.data[feature])   # take the min in the data
    high = np.max(dataset.data[feature])  # take the max in the data
    param = Parameter(kind='continuous', name=feature, low=low, high=high)
    param_space.add(param)
    
dataset.set_param_space(param_space)

Note that in the above code we set the bounds of the parameters based on the min/max samples in the dataset. This can also be achieved by using the `infer_param_space` method of `Dataset`, as follows:

In [42]:
dataset.infer_param_space()

However, most often you will want these bounds to depend on the details your problem, in which case you can explicitly specify the bounds for all parameters.

Now we define a small Bayesian Neural Network and we will test its performance in emulating this dataset. Note that, by default, `Dataset` creates 5 random folds for cross validation and reserves 20% of the data for testing.

In [43]:
from olympus import Emulator
from olympus.models import BayesNeuralNet

mymodel = BayesNeuralNet(hidden_depth=2, hidden_nodes=12, hidden_act='leaky_relu', out_act="relu", 
                         batch_size=50, reg=0.005, max_epochs=10000)
emulator = Emulator(dataset=dataset, model=mymodel, feature_transform='normalize', target_transform='normalize')

In [44]:
emulator.train()

[0;37m[INFO] >>> Training model on 80% of the dataset, testing on 20%...
[0m

  loc = add_variable_fn(
  untransformed_scale = add_variable_fn(


[0m[0;37m[INFO]           Epoch       Train R2     Train RMSD        Test R2      Test RMSD
[0m

ValueError: could not broadcast input array from shape (50,8) into shape (50,1)

Let's now say you would like to share this dataset with the community by uploading it to the _Olympus Datasets_. You can do this with the `upload` command line tool in _Olympus_ as described in the documentation. However, you first need to prepare the dataset in the expected format. One way to easily do this is to use the `to_disk` method available to `Dataset` objects.

In [9]:
# save dataset to disk
dataset.to_disk('custom_dataset')

In [10]:
!ls custom_dataset/

config.json     data.csv        description.txt
