In [1]:
%load_ext autoreload
%autoreload 2

from datetime import datetime

import numpy as np

import ray

# Sample records covering several column kinds: integer, float, string, list,
# NumPy array, nested dict, datetime, and bytes. Every record uses the same keys
# in the same order.
data = [
    {
        "id": 1,
        "age": 25,
        "salary": 50000.0,
        "name": "Alice",
        "scores": [85, 90, 88],  # Python list of ints
        "embeddings": np.array([0.1, 0.2, 0.3, 0.4]),  # Fixed-length (4,) NumPy float array
        "address": {"street": "123 Main St", "city": "NYC", "zip": 10001},  # Nested dict (struct-like)
        "hire_date": datetime(2020, 1, 15),  # datetime value
        "metadata": b"binary_data_1",  # Raw bytes
    },
    {
        "id": 2,
        "age": 30,
        "salary": 60000.0,
        "name": "Bob",
        "scores": [92, 88, 95],
        "embeddings": np.array([0.5, 0.6, 0.7, 0.8]),
        "address": {"street": "456 Oak Ave", "city": "LA", "zip": 90001},
        "hire_date": datetime(2019, 3, 20),
        "metadata": b"binary_data_2",
    },
    # Third record: the scalar fields age, salary, name, hire_date and metadata
    # are None, and two nested address fields are None while the address dict
    # itself is still present.
    {
        "id": 3,
        "age": None,  # Missing integer
        "salary": None,
        "name": None,
        "scores": [78, 82, 80],
        "embeddings": np.array([0.9, 1.0, 1.1, 1.2]),
        "address": {"street": "789 Pine Rd", "city": None, "zip": None},
        "hire_date": None,
        "metadata": None,
    },
]




In [2]:
# Build a Ray Dataset from the in-memory sample records.
ds = ray.data.from_items(data)
# Compute the dataset summary and display it as a pandas DataFrame.
ds.summary().to_pandas()

Usage stats collection is enabled by default for nightly wheels. To disable this, run the following command: `ray disable-usage-stats` before starting Ray. See https://docs.ray.io/en/master/cluster/usage-stats.html for more details.


2025-10-06 23:51:46,668	INFO worker.py:2004 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8266 [39m[22m
2025-10-06 23:51:47,354	INFO dataset.py:3363 -- Tip: Use `take_batch()` instead of `take() / show()` to return records in pandas or numpy batch format.
2025-10-06 23:51:47,359	INFO logging.py:293 -- Registered dataset logger for dataset dataset_2_0
2025-10-06 23:51:47,370	INFO hash_aggregate.py:180 -- Estimated memory requirement for aggregating aggregator (partitions=1, aggregators=1, dataset (estimate)=0.0GiB): shuffle=0.0MiB, output=0.0MiB, total=0.0MiB, 
2025-10-06 23:51:47,372	INFO streaming_executor.py:159 -- Starting execution of Dataset dataset_2_0. Full logs are in /tmp/ray/session_2025-10-06_23-51-45_590193_51173/logs/ray-data
2025-10-06 23:51:47,372	INFO streaming_executor.py:160 -- Execution plan of Dataset dataset_2_0: InputDataBuffer[Input] -> HashAggregateOperator[HashAggregate(key_columns=(), num_partitions=1)] -> LimitOperator[lim

Running 0: 0.00 row [00:00, ? row/s]

- HashAggregate(key_columns=(), num_partitions=1) 1: 0.00 row [00:00, ? row/s]

Shuffle 2:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

Aggregation 3:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

- limit=1 4: 0.00 row [00:00, ? row/s]

2025-10-06 23:51:47,723	INFO streaming_executor.py:279 -- ✔️  Dataset dataset_2_0 execution finished in 0.35 seconds
2025-10-06 23:51:47,742	INFO logging.py:293 -- Registered dataset logger for dataset dataset_3_0
[36m(HashShuffleAggregator pid=51249)[0m Converting a 'D' precision datetime NumPy array to 's' precision Arrow timestamp. This conversion occurs because Arrow supports fewer precisions than Arrow and might result in a loss of precision or unrepresentable values.
[36m(HashShuffleAggregator pid=51249)[0m Converting a 'D' precision datetime NumPy array to 's' precision Arrow timestamp. This conversion occurs because Arrow supports fewer precisions than Arrow and might result in a loss of precision or unrepresentable values.


Unnamed: 0,column,count,mean,min,max,std,missing_pct,zero_pct
0,id,3,2.0,1,3,0.816497,0.0,0.0
1,age,3,27.5,25,30,2.5,33.333333,0.0
2,salary,3,55000.0,50000.0,60000.0,5000.0,33.333333,0.0
3,name,3,,,,,33.333333,
4,scores,3,,,,,0.0,
5,embeddings,3,,,,,0.0,
6,address,3,,,,,0.0,
7,hire_date,3,,2019-03-20 00:00:00,2020-01-15 00:00:00,,33.333333,
8,metadata,3,,,,,33.333333,


[36m(HashShuffleAggregator pid=51246)[0m Converting a 'D' precision datetime NumPy array to 's' precision Arrow timestamp. This conversion occurs because Arrow supports fewer precisions than Arrow and might result in a loss of precision or unrepresentable values.
[36m(HashShuffleAggregator pid=51246)[0m Converting a 'D' precision datetime NumPy array to 's' precision Arrow timestamp. This conversion occurs because Arrow supports fewer precisions than Arrow and might result in a loss of precision or unrepresentable values.


# HELPER FUNCTIONS

In [None]:
from enum import Enum

import pandas as pd

from ray.data import Dataset, Schema

# from ray.data.extensions import (
#     ArrowTensorType,
#     ArrowTensorTypeV2,
#     ArrowVariableShapedTensorType,
# )
from ray.data.stats import _is_list_dtype, _is_numerical_dtype, _is_string_dtype


class FeatureType(str, Enum):
    """Feature categories that summary rows are grouped into.

    The ``str`` mix-in makes each member compare equal to its string value.
    Member order is the iteration order of the enum.
    """
    NUMERICAL = "numerical"
    VECTOR = "vector"
    CATEGORICAL = "categorical"


def to_feature_type_dataset(
    summary_ds: "Dataset | pd.DataFrame", dataset_schema: "Schema"
) -> "dict[FeatureType, pd.DataFrame]":
    """Split a dataset summary into one DataFrame per feature type.

    Each summary row describes one column of the original dataset, named in the
    row's ``column`` field. That column's type is looked up in
    ``dataset_schema`` and mapped to a ``FeatureType``; the rows are then
    grouped by the resulting type.

    Args:
        summary_ds: The summary, either as an object exposing ``to_pandas()``
            (e.g. the result of ``ds.summary()``) or as an already-materialized
            pandas DataFrame. A DataFrame passed in is left unmodified.
        dataset_schema: The original dataset schema (from ``ds.schema()``).

    Returns:
        Dictionary mapping each FeatureType that occurs in the summary to a
        DataFrame containing the summary rows of that type, with an added
        ``feature_type`` column. Feature types that match no column are omitted,
        and columns whose dtype matches no feature type appear in no DataFrame.
    """

    def classify_dtype(column_name: str) -> "FeatureType | None":
        """Return the feature type for ``column_name``, or None if unclassified."""
        dtype = dataset_schema.base_schema.field(column_name).type
        if dtype is None:
            return None

        # The list check runs first so list/tensor-like columns are treated as
        # vectors before the string and numerical checks are consulted.
        if _is_list_dtype(dtype):
            return FeatureType.VECTOR
        if _is_string_dtype(dtype):
            return FeatureType.CATEGORICAL
        if _is_numerical_dtype(dtype):
            return FeatureType.NUMERICAL
        return None

    summary_df = summary_ds if isinstance(summary_ds, pd.DataFrame) else summary_ds.to_pandas()

    # assign() returns a new DataFrame, so a DataFrame supplied by the caller is
    # not mutated by adding the feature_type column.
    tagged = summary_df.assign(feature_type=summary_df["column"].apply(classify_dtype))

    # Build one entry per feature type that actually occurs, in enum order.
    by_feature_type = {}
    for ft in FeatureType:
        matches = tagged["feature_type"] == ft.value
        if matches.any():
            by_feature_type[ft] = tagged[matches]
    return by_feature_type

In [4]:
# Display the dataset schema; to_feature_type_dataset reads column types from it.
ds.schema()

Column      Type
------      ----
id          int64
age         int64
salary      double
name        string
scores      list<item: int64>
embeddings  ArrowTensorTypeV2(shape=(4,), dtype=double)
address     struct<city: string, street: string, zip: int64>
hire_date   timestamp[s]
metadata    binary

# CATEGORICAL FEATURES

In [5]:
# Compute the summary once and split it by feature type; the resulting
# feature_types mapping is reused by the cells below.
summary_ds = ds.summary()
feature_types = to_feature_type_dataset(summary_ds, ds.schema())
# Summary rows for columns classified as categorical.
feature_types[FeatureType.CATEGORICAL]

2025-10-06 23:51:47,859	INFO logging.py:293 -- Registered dataset logger for dataset dataset_5_0
2025-10-06 23:51:47,862	INFO hash_aggregate.py:180 -- Estimated memory requirement for aggregating aggregator (partitions=1, aggregators=1, dataset (estimate)=0.0GiB): shuffle=0.0MiB, output=0.0MiB, total=0.0MiB, 
2025-10-06 23:51:47,866	INFO streaming_executor.py:159 -- Starting execution of Dataset dataset_5_0. Full logs are in /tmp/ray/session_2025-10-06_23-51-45_590193_51173/logs/ray-data
2025-10-06 23:51:47,867	INFO streaming_executor.py:160 -- Execution plan of Dataset dataset_5_0: InputDataBuffer[Input] -> HashAggregateOperator[HashAggregate(key_columns=(), num_partitions=1)] -> LimitOperator[limit=1]


Running 0: 0.00 row [00:00, ? row/s]

- HashAggregate(key_columns=(), num_partitions=1) 1: 0.00 row [00:00, ? row/s]

Shuffle 2:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

Aggregation 3:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

- limit=1 4: 0.00 row [00:00, ? row/s]

2025-10-06 23:51:48,158	INFO streaming_executor.py:279 -- ✔️  Dataset dataset_5_0 execution finished in 0.29 seconds
2025-10-06 23:51:48,189	INFO logging.py:293 -- Registered dataset logger for dataset dataset_6_0


Unnamed: 0,column,count,mean,min,max,std,missing_pct,zero_pct,feature_type
3,name,3,,,,,33.333333,,FeatureType.CATEGORICAL


# VECTOR FEATURES

In [6]:
# Summary rows for columns classified as vectors.
feature_types[FeatureType.VECTOR]

Unnamed: 0,column,count,mean,min,max,std,missing_pct,zero_pct,feature_type
4,scores,3,,,,,0.0,,FeatureType.VECTOR
5,embeddings,3,,,,,0.0,,FeatureType.VECTOR


# NUMERICAL FEATURES

In [7]:
# Summary rows for columns classified as numerical.
feature_types[FeatureType.NUMERICAL]

Unnamed: 0,column,count,mean,min,max,std,missing_pct,zero_pct,feature_type
0,id,3,2.0,1.0,3.0,0.816497,0.0,0.0,FeatureType.NUMERICAL
1,age,3,27.5,25.0,30.0,2.5,33.333333,0.0,FeatureType.NUMERICAL
2,salary,3,55000.0,50000.0,60000.0,5000.0,33.333333,0.0,FeatureType.NUMERICAL
