# Dataset Examples

This Jupyter Notebook demonstrates various use cases for the Dataset class, including:

1. Initializing an Empty Dataset and Adding Samples
2. Retrieving and Manipulating Samples from a Dataset
3. Performing Operations on the Dataset
4. Saving and Loading Datasets from directories or files

This notebook provides detailed examples of using the Dataset class to manage data, Samples, and information within a PLAID Dataset. It is intended for documentation purposes and familiarization with the PLAID library.

**Each section is documented and explained.**

In [None]:
# Import required libraries
from pathlib import Path
import numpy as np

In [None]:
# Import necessary libraries and functions
import Muscat.Containers.ElementsDescription as ElementsDescription
from Muscat.Bridges.CGNSBridge import MeshToCGNS
from Muscat.Containers import MeshCreationTools as MCT

import plaid
from plaid.containers.dataset import Dataset
from plaid.containers.sample import Sample

In [None]:
# Print dict util
def dprint(name: str, dictio: dict, end: str = "\n") -> None:
    """Print a mapping between braces, one ``key : value`` pair per line.

    Args:
        name: Label printed in front of the opening brace.
        dictio: Mapping to display; only its ``.items()`` method is used.
        end: Passed as ``end`` to the final ``print`` that emits the closing
            brace (defaults to a newline).
    """
    print(name, '{')
    for key, value in dictio.items():
        print("    ", key, ':', value)

    print('}', end=end)

## Section 1: Initializing an Empty Dataset and Samples construction

This section demonstrates how to initialize an empty Dataset and handle Samples.

### Initialize an empty Dataset

In [None]:
# Build a Dataset with no constructor arguments and display it.
print("#---# Empty Dataset")
dataset = Dataset()
# The `=` f-string specifier prints the expression text followed by its repr.
print(f"{dataset=}")

### Create Sample

In [None]:
# Create Sample
# Five 2D points, passed below to CreateMeshOfTriangles together with `triangles`.
points = np.array([
        [0.0, 0.0],
        [1.0, 0.0],
        [1.0, 1.0],
        [0.0, 1.0],
        [0.5, 1.5],
    ])

# Three rows of three integers; presumably one triangle per row, each entry
# being a row index into `points` (values stay within 0..4).
triangles = np.array([
        [0, 1, 2],
        [0, 2, 3],
        [2, 4, 3],
    ])

# Two rows of two integers, added below as Bar_2 elements.
bars = np.array([
        [0, 1],
        [0, 2]
    ])

# Build the triangle mesh, append the bar elements to it, then convert the
# mesh with MeshToCGNS (the result is added to Samples as a tree further down).
Mesh = MCT.CreateMeshOfTriangles(points, triangles)
elbars = Mesh.GetElementsOfType(ElementsDescription.Bar_2)
# NOTE(review): the second argument ([1, 2]) is presumably the ids assigned to
# the two new bars — confirm against the Muscat documentation.
elbars.AddNewElements(bars, [1, 2])
cgns_mesh = MeshToCGNS(Mesh)

# Initialize an empty Sample
print("#---# Empty Sample")
sample_01 = Sample()
print(f"{sample_01 = }")

In [None]:
# Add a CGNS tree structure to the Sample
# (`cgns_mesh` is the object returned by MeshToCGNS in the mesh-creation cell).
sample_01.add_tree(cgns_mesh)
print(f"{sample_01 = }")

In [None]:
# Add a scalar to the Sample
# The value stored under the name 'rotation' is a single random draw from
# np.random.randn().
sample_01.add_scalar('rotation', np.random.randn())
print(f"{sample_01 = }")

### Print Sample general data

In [None]:
# Initialize another empty Sample
# In the cells shown in this notebook, sample_02 only ever receives a scalar
# (no tree is added to it).
print("#---# Empty Sample")
sample_02 = Sample()
print(f"{sample_02 = }")

In [None]:
# Add a scalar to the second Sample
# Same scalar name as in sample_01 ('rotation'), but an independent random draw.
sample_02.add_scalar('rotation', np.random.randn())
print(f"{sample_02 = }")

### Display Sample CGNS tree

In [None]:
# Initialize a third empty Sample, then fill it with two scalars and a tree
print("#---# Empty Sample")
sample_03 = Sample()
sample_03.add_scalar('speed', np.random.randn())
# Reuse the 'rotation' value read back from sample_01.
sample_03.add_scalar('rotation', sample_01.get_scalar('rotation'))
# Attach the same `cgns_mesh` object that was added to sample_01.
sample_03.add_tree(cgns_mesh)

# Show Sample CGNS content
sample_03.show_tree()

In [None]:
# Add a field to the third Sample (it already holds two scalars and a tree).
# The field named 'temperature' has 5 random values, matching the 5 points of
# the mesh built earlier.
# NOTE(review): "Zone" and "Base_2_2" are presumably the zone and base names of
# the tree produced by MeshToCGNS — confirm against the add_field signature.
sample_03.add_field('temperature', np.random.rand(5), "Zone", "Base_2_2")
# Display the tree again after the field has been added.
sample_03.show_tree()

### Get Sample data

In [None]:
# Print sample general data
print(f"{sample_03 = }", end="\n\n")

# Print sample scalar data: the scalar names, then each scalar read by name
print(f"{sample_03.get_scalar_names() = }")
print(f"{sample_03.get_scalar('speed') = }")
print(f"{sample_03.get_scalar('rotation') = }", end="\n\n")

# Print sample field data: the field names, then the 'temperature' field
print(f"{sample_03.get_field_names() = }")
print(f"{sample_03.get_field('temperature') = }")

## Section 2: Performing Operations on the Dataset

This section demonstrates how to add Samples to the Dataset, add information, and access data.

### Add Samples in the Dataset

In [None]:
# Add Samples by id in the Dataset
# (the same call is shown with keyword arguments, then with positional ones)
dataset.set_sample(id=0, sample=sample_01)
dataset.set_sample(1, sample_02)

# Add a single Sample without giving an id; the value returned by add_sample
# is kept as the id under which the Sample was stored.
added_sample_id = dataset.add_sample(sample_03)
print(f"{added_sample_id = }")

### Add information to the Dataset and display it

In [None]:
# json is only used in this cell, to pretty-print the infos returned by get_infos().
import json

# Add node information to the Dataset
# (three arguments: "legal", "owner" and the value "Safran")
dataset.add_info("legal", "owner", "Safran")

# Retrieve dataset information
dataset_info = dataset.get_infos()
print("dataset info =", json.dumps(dataset_info, sort_keys=False, indent=4), end="\n\n")

# Overwrite information (logger will display warnings)
infos = {"legal": {"owner": "Safran", "license": "CC0"}}
dataset.set_infos(infos)

# Retrieve dataset information again to show the effect of set_infos
dataset_info = dataset.get_infos()
print("dataset info =", json.dumps(dataset_info, sort_keys=False, indent=4), end="\n\n")

# Add tree information to the Dataset (logger will display warnings)
dataset.add_infos("data_description", {"number_of_samples" : 0, "number_of_splits": 0})

# Pretty print dataset information
dataset.print_infos()

### Get a list of specific Samples in a Dataset

In [None]:
# Fetch the Samples stored under ids 0 and 1.
get_samples_from_ids = dataset.get_samples(ids=[0, 1])
# dprint iterates over `.items()`, so the returned object is used as a mapping here.
dprint("get samples from ids =", get_samples_from_ids)

### Get the list of Sample ids in a Dataset

In [None]:
# Print the ids of the Samples currently stored in the Dataset
print("get_sample_ids =", dataset.get_sample_ids())

### Print Dataset general data

In [None]:
# Print the Dataset (expression text followed by its repr)
print(f"{dataset = }")
# The Dataset is passed to the built-in len().
print("length of dataset =", len(dataset))

### Add a list of Samples to a Dataset

In [None]:
# Create a new Dataset and add multiple samples
# NOTE: this rebinds the name `dataset`; the Dataset filled in the previous
# cells is no longer reachable through that name.
dataset = Dataset()
samples = [sample_01, sample_02, sample_03]
# The value returned by add_samples is kept and printed as the added ids.
added_ids = dataset.add_samples(samples)
print(f"{added_ids = }")
print(f"{dataset = }")

### Access Sample data through the Dataset

In [None]:
# Access Sample data with indexes through the Dataset
# Two access syntaxes are shown: calling the Dataset and subscripting it.
print(f"{dataset(0) = }") # call strategy
print(f"{dataset[1] = }") # getitem strategy
print(f"{dataset[2] = }", end="\n\n")

# List the scalar names held by each of the three Samples.
print("scalar of the first sample = ", dataset[0].get_scalar_names())
print("scalar of the second sample = ", dataset[1].get_scalar_names())
print("scalar of the third sample = ", dataset[2].get_scalar_names())

In [None]:
# Read the 'rotation' scalar of each Sample through the Dataset
print(f"{dataset[0].get_scalar('rotation') = }")
print(f"{dataset[1].get_scalar('rotation') = }")
print(f"{dataset[2].get_scalar('rotation') = }")

### Get Dataset scalars in tabular format

In [None]:
# Print the scalar names known to the Dataset
print(f"{dataset.get_scalar_names() = }", end="\n\n")

# Request one scalar at a time; the results are printed with dprint, which
# iterates over `.items()`.
dprint("get rotation scalar = ", dataset.get_scalars_to_tabular(['rotation']))
dprint("get speed scalar = ", dataset.get_scalars_to_tabular(['speed']), end="\n\n")

# Get specific scalars in tabular format (a list of names is given)
dprint("get specific scalars =", dataset.get_scalars_to_tabular(['speed', 'rotation']))
# Called without names: labelled "all scalars" by this notebook.
dprint("get all scalars =", dataset.get_scalars_to_tabular())

In [None]:
# Get all scalars (no names given) with as_nparray=True instead of the
# mapping form used in the previous cell
print("get all scalar arrays = ", dataset.get_scalars_to_tabular(as_nparray=True))

### Get Dataset fields

In [None]:
# Print the field names reported by the Dataset
print("fields in the dataset = ", dataset.get_field_names())

## Section 3: Various operations on the Dataset

This section demonstrates operations like merging datasets, adding tabular scalars, and setting information.

### Initialize a Dataset with a list of Samples

In [None]:
# Create another Dataset
other_dataset = Dataset()
nb_samples = 3
samples = []
# Build nb_samples Samples, each holding two random scalars.
for _ in range(nb_samples):
    sample = Sample()
    # np.random.rand() is in [0, 1), so 'rotation' lies in [1, 2) ...
    sample.add_scalar('rotation', np.random.rand() + 1.0)
    # ... and 'random_name' lies in [-1, 0).
    sample.add_scalar('random_name', np.random.rand() - 1.0)
    samples.append(sample)

# Add a list of Samples
other_dataset.add_samples(samples)
print(f"{other_dataset = }")

### Merge two Datasets

In [None]:
# Merge the other dataset with the main dataset
# `dataset` is printed before and after the call to show the effect of the merge.
print(f"before merge: {dataset = }")
dataset.merge_dataset(other_dataset)
print(f"after merge: {dataset = }", end="\n\n")

# Show all scalars of the merged dataset.
dprint("dataset scalars = ", dataset.get_scalars_to_tabular())

### Add tabular scalars to a Dataset

In [None]:
# Adding tabular scalars to the dataset
# A (3, 2) array of random values; two names are given, presumably one per column.
# NOTE(review): how the 3 rows map onto Samples (new Samples vs. existing ones)
# is decided by add_tabular_scalars — check its documentation.
new_scalars = np.random.rand(3, 2)
dataset.add_tabular_scalars(new_scalars, names=['Tu', 'random_name'])

print(f"{dataset = }")
dprint("dataset scalars =", dataset.get_scalars_to_tabular())

### Set additional information to a dataset

In [None]:
# Two-level dictionary of information: top-level keys ("legal",
# "data_production") each map to a dictionary of key/value entries.
infos = {
    "legal": {
        "owner": "Safran",
        "license": "CC0"},
    "data_production": {
        "type": "simulation",
        "simulator": "dummy"}
}
# Store the dictionary on the dataset, then pretty print it.
dataset.set_infos(infos)
dataset.print_infos()

## Section 4: Saving and Loading Dataset

This section demonstrates how to save and load a Dataset from a directory or file.

### Save a Dataset as a file tree

In [None]:
import tempfile

# Build a randomly named directory path under the platform's temporary
# directory rather than a hard-coded '/tmp', which only exists on POSIX systems.
# Integer bounds with an explicit 64-bit dtype are used because the range does
# not fit in a 32-bit integer. The path is kept as a str, as before.
tmpdir = str(Path(tempfile.gettempdir()) / f'test_safe_to_delete_{np.random.randint(10**10, 10**12, dtype=np.int64)}')
print(f"Save dataset in: {tmpdir}")

# Save the dataset as a file tree under tmpdir.
dataset._save_to_dir_(tmpdir)

### Get the number of Samples that can be loaded from a directory

In [None]:
# Ask plaid how many Samples are found in the directory written by the save cell.
# NOTE: this rebinds `nb_samples`, which was set to 3 in an earlier cell.
nb_samples = plaid.get_number_of_samples(tmpdir)
print(f"{nb_samples = }")

### Load a Dataset from a directory via initialization

In [None]:
# Load by passing the directory path directly to the Dataset constructor.
loaded_dataset_from_init = Dataset(tmpdir)
print(f"{loaded_dataset_from_init = }")

# Same load with processes_number=3.
# NOTE(review): presumably the number of worker processes used for loading — confirm.
multi_process_loaded_dataset = Dataset(tmpdir, processes_number=3)
print(f"{multi_process_loaded_dataset = }")

### Load a Dataset from a directory via the Dataset class

In [None]:
# Load through Dataset.load_from_dir, called on the class; the returned object
# is kept as the loaded dataset.
loaded_dataset_from_class = Dataset.load_from_dir(tmpdir)
print(f"{loaded_dataset_from_class = }")

# Same load with processes_number=3.
# NOTE(review): presumably the number of worker processes used for loading — confirm.
multi_process_loaded_dataset = Dataset.load_from_dir(tmpdir, processes_number=3)
print(f"{multi_process_loaded_dataset = }")

### Load the dataset from a directory via a Dataset instance

In [None]:
# Create an empty Dataset first, then load the directory into that instance.
# `_load_from_dir_` has a leading underscore (private-style name); it is called
# directly here for demonstration.
loaded_dataset_from_instance = Dataset()
loaded_dataset_from_instance._load_from_dir_(tmpdir)

print(f"{loaded_dataset_from_instance = }")

# Same two-step load with processes_number=3.
# NOTE(review): presumably the number of worker processes used for loading — confirm.
multi_process_loaded_dataset = Dataset()
multi_process_loaded_dataset._load_from_dir_(tmpdir, processes_number=3)
print(f"{multi_process_loaded_dataset = }")

### Save the dataset to a TAR (Tape Archive) file

In [None]:
import tempfile

# Build a randomly named directory path under the platform's temporary
# directory rather than a hard-coded '/tmp', which only exists on POSIX systems.
# Integer bounds with an explicit 64-bit dtype are used because the range does
# not fit in a 32-bit integer.
tmpdir = Path(tempfile.gettempdir()) / f'test_safe_to_delete_{np.random.randint(10**10, 10**12, dtype=np.int64)}'
tmpfile = tmpdir / 'test_file.plaid'

# NOTE(review): tmpdir is not created in this cell; this relies on
# Dataset.save creating missing parent directories — confirm.
print(f"Save dataset in: {tmpfile}")
dataset.save(tmpfile)

### Load the dataset from a TAR (Tape Archive) file via a Dataset instance

In [None]:
# Create an empty Dataset, then load the file written by the save cell into it.
new_dataset = Dataset()
new_dataset.load(tmpfile)

# Print the saved dataset and the loaded one so they can be compared by eye.
print(f"{dataset = }")
print(f"{new_dataset = }")

### Load the dataset from a TAR (Tape Archive) file via initialization

In [None]:
# Load the same file by passing its path (a pathlib.Path) to the Dataset constructor.
new_dataset = Dataset(tmpfile)

# Print the saved dataset and the loaded one so they can be compared by eye.
print(f"{dataset = }")
print(f"{new_dataset = }")