In [1]:
%load_ext autoreload
%autoreload 2

from datetime import datetime

import numpy as np

import ray

# Three sample records that together cover a spread of column types:
# integers, floats, strings, lists, NumPy arrays, nested structs,
# timestamps and raw bytes. The third record is mostly None so that
# missing-value handling shows up in the summary statistics below.
data = [
    dict(
        id=1,
        age=25,
        salary=50000.0,
        name="Alice",
        scores=[85, 90, 88],  # list column
        embeddings=np.array([0.1, 0.2, 0.3, 0.4]),  # tensor/array column
        address=dict(street="123 Main St", city="NYC", zip=10001),  # struct column
        hire_date=datetime(2020, 1, 15),  # timestamp column
        metadata=b"binary_data_1",  # binary column
    ),
    dict(
        id=2,
        age=30,
        salary=60000.0,
        name="Bob",
        scores=[92, 88, 95],
        embeddings=np.array([0.5, 0.6, 0.7, 0.8]),
        address=dict(street="456 Oak Ave", city="LA", zip=90001),
        hire_date=datetime(2019, 3, 20),
        metadata=b"binary_data_2",
    ),
    dict(
        id=3,
        age=None,  # scalar fields are deliberately missing in this record
        salary=None,
        name=None,
        scores=[78, 82, 80],
        embeddings=np.array([0.9, 1.0, 1.1, 1.2]),
        address=dict(street="789 Pine Rd", city=None, zip=None),
        hire_date=None,
        metadata=None,
    ),
]




In [2]:
from ray.data.aggregate import Count, Max, Min, MissingValuePercentage
from ray.data.datatype import DataType

# Turn the in-memory records into a Ray Dataset.
ds = ray.data.from_items(data)

# Aggregations to run on temporal columns; passed to ds.summary() as a
# per-dtype override mapping.
temporal_aggregations = [
    Count(ignore_nulls=False),  # count every row, nulls included
    Min(ignore_nulls=True),  # smallest non-null value
    Max(ignore_nulls=True),  # largest non-null value
    MissingValuePercentage(),
]
temporal_type_mapping = {DataType.temporal_(): temporal_aggregations}

summary = ds.summary(override_dtype_agg_mapping=temporal_type_mapping)
summary

Usage stats collection is enabled by default for nightly wheels. To disable this, run the following command: `ray disable-usage-stats` before starting Ray. See https://docs.ray.io/en/master/cluster/usage-stats.html for more details.


2025-10-10 17:09:20,535	INFO worker.py:2004 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8266 [39m[22m
2025-10-10 17:09:21,314	INFO dataset.py:3366 -- Tip: Use `take_batch()` instead of `take() / show()` to return records in pandas or numpy batch format.
2025-10-10 17:09:21,319	INFO logging.py:293 -- Registered dataset logger for dataset dataset_2_0
2025-10-10 17:09:21,332	INFO hash_aggregate.py:180 -- Estimated memory requirement for aggregating aggregator (partitions=1, aggregators=1, dataset (estimate)=0.0GiB): shuffle=0.0MiB, output=0.0MiB, total=0.0MiB, 
2025-10-10 17:09:21,334	INFO streaming_executor.py:159 -- Starting execution of Dataset dataset_2_0. Full logs are in /tmp/ray/session_2025-10-10_17-09-19_252245_75259/logs/ray-data
2025-10-10 17:09:21,334	INFO streaming_executor.py:160 -- Execution plan of Dataset dataset_2_0: InputDataBuffer[Input] -> HashAggregateOperator[HashAggregate(key_columns=(), num_partitions=1)] -> LimitOperator[lim

Running 0: 0.00 row [00:00, ? row/s]

- HashAggregate(key_columns=(), num_partitions=1) 1: 0.00 row [00:00, ? row/s]

Shuffle 2:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

Aggregation 3:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

- limit=1 4: 0.00 row [00:00, ? row/s]

[36m(_shuffle_block pid=77397)[0m Converting a 'D' precision datetime NumPy array to 's' precision Arrow timestamp. This conversion occurs because Arrow supports fewer precisions than Arrow and might result in a loss of precision or unrepresentable values.
[36m(_shuffle_block pid=77397)[0m Converting a 'D' precision datetime NumPy array to 's' precision Arrow timestamp. This conversion occurs because Arrow supports fewer precisions than Arrow and might result in a loss of precision or unrepresentable values.
2025-10-10 17:09:21,702	INFO streaming_executor.py:279 -- ✔️  Dataset dataset_2_0 execution finished in 0.37 seconds
2025-10-10 17:09:21,720	INFO logging.py:293 -- Registered dataset logger for dataset dataset_3_0
2025-10-10 17:09:21,721	INFO hash_aggregate.py:180 -- Estimated memory requirement for aggregating aggregator (partitions=1, aggregators=1, dataset (estimate)=0.0GiB): shuffle=0.0MiB, output=0.0MiB, total=0.0MiB, 
2025-10-10 17:09:21,723	INFO streaming_executor.py:159

Running 0: 0.00 row [00:00, ? row/s]

- HashAggregate(key_columns=(), num_partitions=1) 1: 0.00 row [00:00, ? row/s]

Shuffle 2:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

Aggregation 3:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

- limit=1 4: 0.00 row [00:00, ? row/s]

2025-10-10 17:09:22,082	INFO streaming_executor.py:279 -- ✔️  Dataset dataset_3_0 execution finished in 0.36 seconds


DatasetSummary(
  schema_matching_stats: 7 rows × 10 columns
  schema_changing_stats: 5 rows × 10 columns
)

[36m(HashShuffleAggregator pid=77396)[0m Converting a 'D' precision datetime NumPy array to 's' precision Arrow timestamp. This conversion occurs because Arrow supports fewer precisions than Arrow and might result in a loss of precision or unrepresentable values.
[36m(HashShuffleAggregator pid=77396)[0m Converting a 'D' precision datetime NumPy array to 's' precision Arrow timestamp. This conversion occurs because Arrow supports fewer precisions than Arrow and might result in a loss of precision or unrepresentable values.
[36m(HashShuffleAggregator pid=77399)[0m Converting a 'D' precision datetime NumPy array to 's' precision Arrow timestamp. This conversion occurs because Arrow supports fewer precisions than Arrow and might result in a loss of precision or unrepresentable values.
[36m(HashShuffleAggregator pid=77399)[0m Converting a 'D' precision datetime NumPy array to 's' precision Arrow timestamp. This conversion occurs because Arrow supports fewer precisions than Arrow and

# COMBINED SUMMARY TABLE

In [3]:
# Show the combined summary as a pandas DataFrame: one row per statistic,
# one column per dataset column.
summary.to_pandas()



Unnamed: 0,statistic,address,age,embeddings,hire_date,id,metadata,name,salary,scores
0,count,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
1,max,,30.0,,2020-01-15 00:00:00,3.0,,,60000.0,
2,mean,,27.5,,,2.0,,,55000.0,
3,min,,25.0,,2019-03-20 00:00:00,1.0,,,50000.0,
4,missing_pct,0.0,33.333333,0.0,33.0,0.0,33.0,33.0,33.333333,0.0
5,std,,2.5,,,0.816497,,,5000.0,
6,zero_pct,,0.0,,,0.0,,,0.0,


# HELPER FUNCTIONS

In [5]:
from enum import Enum

import pandas as pd

from ray.data.datatype import DataType
from ray.data.stats import DatasetSummary


class FeatureType(str, Enum):
    """Coarse feature categories used to group the columns of a dataset summary.

    Members are also ``str`` instances, so each one compares equal to its
    string value (e.g. ``FeatureType.VECTOR == "vector"``).
    """

    # Columns whose dtype reports is_numerical_type().
    NUMERICAL = "numerical"
    # Columns whose dtype reports is_list_type().
    VECTOR = "vector"
    # Columns whose dtype is a string or temporal type.
    CATEGORICAL = "categorical"


def to_feature_type_dataset(summary: DatasetSummary) -> dict[FeatureType, "pd.DataFrame"]:
    """Split a dataset summary into one pandas DataFrame per feature type.

    Every column except ``statistic`` is classified from its Arrow type in
    ``summary.schema_matching_stats`` (the table that keeps the original
    column types). The checks run in this order and the first match wins:

    * ``is_list_type()``                            -> ``FeatureType.VECTOR``
    * ``is_string_type()`` or ``is_temporal_type()`` -> ``FeatureType.CATEGORICAL``
    * ``is_numerical_type()``                       -> ``FeatureType.NUMERICAL``

    Columns matching none of the checks are left out of the result.

    Args:
        summary: The DatasetSummary object from ds.summary()

    Returns:
        Dictionary mapping FeatureType to DataFrame. Each DataFrame is a copy
        holding the ``statistic`` column followed by the columns of that
        feature type, in schema order. Feature types with no columns are
        omitted; keys follow the declaration order of ``FeatureType``.
    """
    # Fetch the schema once instead of re-resolving it for every column.
    schema = summary.schema_matching_stats.schema

    # Pre-seeding the buckets in enum order fixes the key order of the result
    # regardless of the order in which columns appear in the schema.
    columns_by_type: dict[FeatureType, list[str]] = {ft: [] for ft in FeatureType}

    for column_name in schema.names:
        if column_name == "statistic":
            # Label column of the summary table, not a dataset column.
            continue

        dtype = DataType.from_arrow(schema.field(column_name).type)

        if dtype.is_list_type():
            feature_type = FeatureType.VECTOR
        elif dtype.is_string_type() or dtype.is_temporal_type():
            feature_type = FeatureType.CATEGORICAL
        elif dtype.is_numerical_type():
            feature_type = FeatureType.NUMERICAL
        else:
            # Unsupported type for this grouping: skip the column.
            continue

        columns_by_type[feature_type].append(column_name)

    # Combined pandas view of the summary; sliced per feature type below.
    df = summary.to_pandas()

    return {
        feature_type: df[["statistic", *column_names]].copy()
        for feature_type, column_names in columns_by_type.items()
        if column_names
    }

# CATEGORICAL FEATURES

In [6]:
# Recompute the summary, this time without passing an override mapping.
summary_ds = ds.summary()
# Split the combined summary into one DataFrame per feature type.
feature_types = to_feature_type_dataset(summary_ds)
# Display the string and temporal columns.
feature_types[FeatureType.CATEGORICAL]

2025-10-10 17:09:45,048	INFO logging.py:293 -- Registered dataset logger for dataset dataset_5_0
2025-10-10 17:09:45,052	INFO hash_aggregate.py:180 -- Estimated memory requirement for aggregating aggregator (partitions=1, aggregators=1, dataset (estimate)=0.0GiB): shuffle=0.0MiB, output=0.0MiB, total=0.0MiB, 
2025-10-10 17:09:45,057	INFO streaming_executor.py:159 -- Starting execution of Dataset dataset_5_0. Full logs are in /tmp/ray/session_2025-10-10_17-09-19_252245_75259/logs/ray-data
2025-10-10 17:09:45,059	INFO streaming_executor.py:160 -- Execution plan of Dataset dataset_5_0: InputDataBuffer[Input] -> HashAggregateOperator[HashAggregate(key_columns=(), num_partitions=1)] -> LimitOperator[limit=1]


Running 0: 0.00 row [00:00, ? row/s]

- HashAggregate(key_columns=(), num_partitions=1) 1: 0.00 row [00:00, ? row/s]

Shuffle 2:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

Aggregation 3:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

- limit=1 4: 0.00 row [00:00, ? row/s]

2025-10-10 17:09:45,363	INFO streaming_executor.py:279 -- ✔️  Dataset dataset_5_0 execution finished in 0.31 seconds
2025-10-10 17:09:45,373	INFO logging.py:293 -- Registered dataset logger for dataset dataset_6_0
2025-10-10 17:09:45,375	INFO hash_aggregate.py:180 -- Estimated memory requirement for aggregating aggregator (partitions=1, aggregators=1, dataset (estimate)=0.0GiB): shuffle=0.0MiB, output=0.0MiB, total=0.0MiB, 
2025-10-10 17:09:45,377	INFO streaming_executor.py:159 -- Starting execution of Dataset dataset_6_0. Full logs are in /tmp/ray/session_2025-10-10_17-09-19_252245_75259/logs/ray-data
2025-10-10 17:09:45,378	INFO streaming_executor.py:160 -- Execution plan of Dataset dataset_6_0: InputDataBuffer[Input] -> HashAggregateOperator[HashAggregate(key_columns=(), num_partitions=1)] -> LimitOperator[limit=1]


Running 0: 0.00 row [00:00, ? row/s]

- HashAggregate(key_columns=(), num_partitions=1) 1: 0.00 row [00:00, ? row/s]

Shuffle 2:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

Aggregation 3:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

- limit=1 4: 0.00 row [00:00, ? row/s]

2025-10-10 17:09:45,667	INFO streaming_executor.py:279 -- ✔️  Dataset dataset_6_0 execution finished in 0.29 seconds


Unnamed: 0,statistic,hire_date,name
0,count,3.0,3.0
1,max,2020-01-15 00:00:00,
2,mean,,
3,min,2019-03-20 00:00:00,
4,missing_pct,33.0,33.0
5,std,,
6,zero_pct,,


# VECTOR FEATURES

In [7]:
# Display the list-like columns (scores, embeddings).
feature_types[FeatureType.VECTOR]

Unnamed: 0,statistic,embeddings,scores
0,count,3.0,3.0
1,max,,
2,mean,,
3,min,,
4,missing_pct,0.0,0.0
5,std,,
6,zero_pct,,


# NUMERICAL FEATURES

In [8]:
# Display the numeric columns (age, id, salary).
feature_types[FeatureType.NUMERICAL]

Unnamed: 0,statistic,age,id,salary
0,count,3.0,3.0,3.0
1,max,30.0,3.0,60000.0
2,mean,27.5,2.0,55000.0
3,min,25.0,1.0,50000.0
4,missing_pct,33.333333,0.0,33.333333
5,std,2.5,0.816497,5000.0
6,zero_pct,0.0,0.0,0.0
