In [6]:
%load_ext autoreload
%autoreload 2

from datetime import datetime

import numpy as np

import ray

# Sample records covering a range of column kinds, including missing values.
# The order of names here is the key order of every generated row dict.
_SAMPLE_FIELDS = (
    "id",
    "age",
    "salary",
    "name",
    "scores",  # List type
    "embeddings",  # Tensor/array type
    "address",  # Struct type
    "hire_date",  # Timestamp type
    "metadata",  # Binary type
)

# One tuple per record, positionally aligned with _SAMPLE_FIELDS.
_SAMPLE_ROWS = [
    (
        1,
        25,
        50000.0,
        "Alice",
        [85, 90, 88],
        np.array([0.1, 0.2, 0.3, 0.4]),
        {"street": "123 Main St", "city": "NYC", "zip": 10001},
        datetime(2020, 1, 15),
        b"binary_data_1",
    ),
    (
        2,
        30,
        60000.0,
        "Bob",
        [92, 88, 95],
        np.array([0.5, 0.6, 0.7, 0.8]),
        {"street": "456 Oak Ave", "city": "LA", "zip": 90001},
        datetime(2019, 3, 20),
        b"binary_data_2",
    ),
    (
        3,
        None,  # Missing value
        None,
        None,
        [78, 82, 80],
        np.array([0.9, 1.0, 1.1, 1.2]),
        {"street": "789 Pine Rd", "city": None, "zip": None},
        None,
        None,
    ),
]

data = [dict(zip(_SAMPLE_FIELDS, row)) for row in _SAMPLE_ROWS]




The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
from ray.data.aggregate import Count, Max, Min
from ray.data.datatype import DataType

# Turn the in-memory sample rows into a Ray Dataset.
ds = ray.data.from_items(data)

# Aggregators requested for temporal columns: a count that does not skip
# nulls, plus min/max that do skip nulls.
temporal_aggregators = [
    Count(ignore_nulls=False),
    Min(ignore_nulls=True),
    Max(ignore_nulls=True),
]

# Keyed by data type; handed to ds.summary() as its override mapping.
temporal_type_mapping = {DataType.temporal_(): temporal_aggregators}

summary = ds.summary(override_dtype_agg_mapping=temporal_type_mapping)
summary

2025-10-09 15:55:03,511	INFO logging.py:293 -- Registered dataset logger for dataset dataset_14_0


2025-10-09 15:55:03,513	INFO hash_aggregate.py:180 -- Estimated memory requirement for aggregating aggregator (partitions=1, aggregators=1, dataset (estimate)=0.0GiB): shuffle=0.0MiB, output=0.0MiB, total=0.0MiB, 
2025-10-09 15:55:03,518	INFO streaming_executor.py:159 -- Starting execution of Dataset dataset_14_0. Full logs are in /tmp/ray/session_2025-10-09_15-54-25_142878_51917/logs/ray-data
2025-10-09 15:55:03,519	INFO streaming_executor.py:160 -- Execution plan of Dataset dataset_14_0: InputDataBuffer[Input] -> HashAggregateOperator[HashAggregate(key_columns=(), num_partitions=1)] -> LimitOperator[limit=1]


Running 0: 0.00 row [00:00, ? row/s]

- HashAggregate(key_columns=(), num_partitions=1) 1: 0.00 row [00:00, ? row/s]

Shuffle 2:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

Aggregation 3:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

- limit=1 4: 0.00 row [00:00, ? row/s]

2025-10-09 15:55:03,930	INFO streaming_executor.py:279 -- ✔️  Dataset dataset_14_0 execution finished in 0.41 seconds


MaterializedDataset(
   num_blocks=1,
   num_rows=9,
   schema={
      column: object,
      count: int64,
      max: object,
      mean: float64,
      min: object,
      missing_pct: float64,
      std: float64,
      zero_pct: float64
   }
)

# HELPER FUNCTIONS

In [8]:
from enum import Enum

import pandas as pd

from ray.data import Dataset, Schema
from ray.data.datatype import DataType


class FeatureType(str, Enum):
    """Coarse feature categories used to group dataset-summary rows.

    The ``str`` mix-in makes every member compare equal to its plain string
    value (e.g. ``FeatureType.VECTOR == "vector"``).
    """

    # Columns whose data type is numerical.
    NUMERICAL = "numerical"
    # Columns whose data type is list-like.
    VECTOR = "vector"
    # Columns whose data type is string or temporal.
    CATEGORICAL = "categorical"


def to_feature_type_dataset(
    summary_ds: "Dataset | pd.DataFrame", dataset_schema: "Schema"
) -> dict[FeatureType, "pd.DataFrame"]:
    """Split a dataset summary into one DataFrame per feature type.

    Every summary row (one per original column, named in the ``column`` field)
    is tagged with the feature type of that column; the rows are then grouped
    by that tag.

    Args:
        summary_ds: The summary, either a Ray Dataset (converted with
            ``to_pandas()``) or an already-materialized pandas DataFrame.
            The input object is never modified.
        dataset_schema: Schema of the original dataset (from ``ds.schema()``).
            Its ``base_schema`` is used to look up each column's Arrow type.

    Returns:
        Dictionary mapping each FeatureType that occurs in the summary to a
        DataFrame with the summary rows of the columns of that type, plus an
        added ``feature_type`` column. Feature types without any matching
        column are omitted from the dictionary, and columns that match no
        feature type do not appear in any of the DataFrames.
    """

    def classify_dtype(column_name: str) -> "FeatureType | None":
        """Return the feature type of ``column_name``, or None if it has none."""
        # Look up the Arrow type in the schema and wrap it in Ray's DataType.
        pa_type = dataset_schema.base_schema.field(column_name).type
        if pa_type is None:
            return None

        dtype = DataType.from_arrow(pa_type)

        # Order matters: list-like types are checked before anything else.
        if dtype.is_list_type():
            return FeatureType.VECTOR
        if dtype.is_string_type() or dtype.is_temporal_type():
            return FeatureType.CATEGORICAL
        if dtype.is_numerical_type():
            return FeatureType.NUMERICAL
        return None

    summary_df = summary_ds if isinstance(summary_ds, pd.DataFrame) else summary_ds.to_pandas()

    # assign() builds a new frame, so a DataFrame passed in by the caller does
    # not silently gain a "feature_type" column.
    tagged = summary_df.assign(feature_type=summary_df["column"].apply(classify_dtype))

    per_type: dict = {}
    for ft in FeatureType:
        # Members are str-based, so comparing against the raw value matches them.
        mask = tagged["feature_type"] == ft.value
        if mask.any():
            per_type[ft] = tagged[mask]
    return per_type

In [9]:
# Show the inferred column types; this same schema is what
# to_feature_type_dataset uses to classify each column.
ds.schema()

Column      Type
------      ----
id          int64
age         int64
salary      double
name        string
scores      list<item: int64>
embeddings  ArrowTensorTypeV2(shape=(4,), dtype=double)
address     struct<city: string, street: string, zip: int64>
hire_date   timestamp[s]
metadata    binary

# CATEGORICAL FEATURES

In [10]:
# Compute a summary without any aggregation override and split it by feature type.
summary_ds = ds.summary()
feature_types = to_feature_type_dataset(summary_ds, ds.schema())
# The dict only has keys for feature types that actually occur in the summary,
# so this lookup raises KeyError if no column was classified as categorical.
feature_types[FeatureType.CATEGORICAL]

2025-10-09 15:55:04,062	INFO logging.py:293 -- Registered dataset logger for dataset dataset_17_0
2025-10-09 15:55:04,064	INFO hash_aggregate.py:180 -- Estimated memory requirement for aggregating aggregator (partitions=1, aggregators=1, dataset (estimate)=0.0GiB): shuffle=0.0MiB, output=0.0MiB, total=0.0MiB, 
2025-10-09 15:55:04,066	INFO streaming_executor.py:159 -- Starting execution of Dataset dataset_17_0. Full logs are in /tmp/ray/session_2025-10-09_15-54-25_142878_51917/logs/ray-data
2025-10-09 15:55:04,066	INFO streaming_executor.py:160 -- Execution plan of Dataset dataset_17_0: InputDataBuffer[Input] -> HashAggregateOperator[HashAggregate(key_columns=(), num_partitions=1)] -> LimitOperator[limit=1]


Running 0: 0.00 row [00:00, ? row/s]

- HashAggregate(key_columns=(), num_partitions=1) 1: 0.00 row [00:00, ? row/s]

Shuffle 2:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

Aggregation 3:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

- limit=1 4: 0.00 row [00:00, ? row/s]

2025-10-09 15:55:04,368	INFO streaming_executor.py:279 -- ✔️  Dataset dataset_17_0 execution finished in 0.30 seconds
2025-10-09 15:55:04,384	INFO logging.py:293 -- Registered dataset logger for dataset dataset_18_0


Unnamed: 0,column,count,max,mean,min,missing_pct,std,zero_pct,feature_type
3,name,3,,,,33.333333,,,FeatureType.CATEGORICAL
7,hire_date,3,2020-01-15 00:00:00,,2019-03-20 00:00:00,33.333333,,,FeatureType.CATEGORICAL


# VECTOR FEATURES

In [11]:
# Summary rows for the columns classified as VECTOR (list-like types).
feature_types[FeatureType.VECTOR]

Unnamed: 0,column,count,max,mean,min,missing_pct,std,zero_pct,feature_type
4,scores,3,,,,0.0,,,FeatureType.VECTOR
5,embeddings,3,,,,0.0,,,FeatureType.VECTOR


# NUMERICAL FEATURES

In [12]:
# Summary rows for the columns classified as NUMERICAL.
feature_types[FeatureType.NUMERICAL]

Unnamed: 0,column,count,max,mean,min,missing_pct,std,zero_pct,feature_type
0,id,3,3.0,2.0,1.0,0.0,0.816497,0.0,FeatureType.NUMERICAL
1,age,3,30.0,27.5,25.0,33.333333,2.5,0.0,FeatureType.NUMERICAL
2,salary,3,60000.0,55000.0,50000.0,33.333333,5000.0,0.0,FeatureType.NUMERICAL
