In [1]:
%load_ext autoreload
%autoreload 2

from datetime import datetime

import numpy as np

import ray

# Sample rows for the summary demo. Each key holds a different kind of Python
# value (int, float, str, list, NumPy array, nested dict, datetime, bytes) so
# the resulting dataset has a mix of column types. The third row sets several
# fields to None to exercise missing-value handling.
data = [
    {
        "id": 1,
        "age": 25,
        "salary": 50000.0,
        "name": "Alice",
        "scores": [85, 90, 88],  # Python list of ints -> list column
        "embeddings": np.array([0.1, 0.2, 0.3, 0.4]),  # fixed-length NumPy array -> tensor column
        "address": {"street": "123 Main St", "city": "NYC", "zip": 10001},  # nested dict -> struct column
        "hire_date": datetime(2020, 1, 15),  # datetime -> timestamp column
        "metadata": b"binary_data_1",  # bytes -> binary column
    },
    {
        "id": 2,
        "age": 30,
        "salary": 60000.0,
        "name": "Bob",
        "scores": [92, 88, 95],
        "embeddings": np.array([0.5, 0.6, 0.7, 0.8]),
        "address": {"street": "456 Oak Ave", "city": "LA", "zip": 90001},
        "hire_date": datetime(2019, 3, 20),
        "metadata": b"binary_data_2",
    },
    {
        "id": 3,
        "age": None,  # missing scalar values start here
        "salary": None,
        "name": None,
        "scores": [78, 82, 80],
        "embeddings": np.array([0.9, 1.0, 1.1, 1.2]),
        "address": {"street": "789 Pine Rd", "city": None, "zip": None},  # missing values nested inside the dict
        "hire_date": None,
        "metadata": None,
    },
]




In [2]:
# Build a Ray Dataset from the in-memory list of row dicts.
ds = ray.data.from_items(data)
# Compute the per-column summary and display it as a pandas DataFrame.
ds.summary().to_pandas()

Usage stats collection is enabled by default for nightly wheels. To disable this, run the following command: `ray disable-usage-stats` before starting Ray. See https://docs.ray.io/en/master/cluster/usage-stats.html for more details.


2025-10-06 18:08:56,079	INFO worker.py:2004 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8266 [39m[22m
2025-10-06 18:08:56,765	INFO stats.py:259 -- Dtype ArrowTensorTypeV2(shape=(4,), dtype=double) for column embeddings is not a standard Arrow type, computing basic statistics only
2025-10-06 18:08:56,769	INFO dataset.py:3363 -- Tip: Use `take_batch()` instead of `take() / show()` to return records in pandas or numpy batch format.
2025-10-06 18:08:56,773	INFO logging.py:293 -- Registered dataset logger for dataset dataset_2_0
2025-10-06 18:08:56,785	INFO hash_aggregate.py:180 -- Estimated memory requirement for aggregating aggregator (partitions=1, aggregators=1, dataset (estimate)=0.0GiB): shuffle=0.0MiB, output=0.0MiB, total=0.0MiB, 
2025-10-06 18:08:56,787	INFO streaming_executor.py:159 -- Starting execution of Dataset dataset_2_0. Full logs are in /tmp/ray/session_2025-10-06_18-08-54_856737_11796/logs/ray-data
2025-10-06 18:08:56,787	INFO stream

Running 0: 0.00 row [00:00, ? row/s]

- HashAggregate(key_columns=(), num_partitions=1) 1: 0.00 row [00:00, ? row/s]

Shuffle 2:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

Aggregation 3:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

- limit=1 4: 0.00 row [00:00, ? row/s]

2025-10-06 18:08:57,149	INFO streaming_executor.py:279 -- ✔️  Dataset dataset_2_0 execution finished in 0.36 seconds
2025-10-06 18:08:57,169	INFO logging.py:293 -- Registered dataset logger for dataset dataset_3_0
[36m(_shuffle_block pid=12728)[0m Converting a 'D' precision datetime NumPy array to 's' precision Arrow timestamp. This conversion occurs because Arrow supports fewer precisions than Arrow and might result in a loss of precision or unrepresentable values.
[36m(_shuffle_block pid=12728)[0m Converting a 'D' precision datetime NumPy array to 's' precision Arrow timestamp. This conversion occurs because Arrow supports fewer precisions than Arrow and might result in a loss of precision or unrepresentable values.


Unnamed: 0,column,dtype,count,mean,min,max,std,missing_pct,zero_pct
0,id,int64,3,2.0,1,3,0.816497,0.0,0.0
1,age,int64,3,27.5,25,30,2.5,33.333333,0.0
2,salary,double,3,55000.0,50000.0,60000.0,5000.0,33.333333,0.0
3,name,string,3,,,,,33.333333,
4,scores,list<item: int64>,3,,,,,0.0,
5,embeddings,"ArrowTensorTypeV2(shape=(4,), dtype=double)",3,,,,,0.0,
6,address,"struct<city: string, street: string, zip: int64>",3,,,,,0.0,
7,hire_date,timestamp[s],3,,2019-03-20 00:00:00,2020-01-15 00:00:00,,33.333333,
8,metadata,binary,3,,,,,33.333333,


[36m(HashShuffleAggregator pid=12727)[0m Converting a 'D' precision datetime NumPy array to 's' precision Arrow timestamp. This conversion occurs because Arrow supports fewer precisions than Arrow and might result in a loss of precision or unrepresentable values.
[36m(HashShuffleAggregator pid=12727)[0m Converting a 'D' precision datetime NumPy array to 's' precision Arrow timestamp. This conversion occurs because Arrow supports fewer precisions than Arrow and might result in a loss of precision or unrepresentable values.


# HELPER FUNCTIONS

In [3]:
from enum import Enum

import pandas as pd


class FeatureType(str, Enum):
    """Coarse feature categories used to group the rows of a dataset summary.

    Mixing in ``str`` makes every member compare equal to its string value
    (``FeatureType.VECTOR == "vector"``).
    """

    NUMERICAL = "numerical"
    VECTOR = "vector"
    CATEGORICAL = "categorical"


def to_feature_type_dataset(summary_ds: "pd.DataFrame") -> dict[FeatureType, "pd.DataFrame"]:
    """Split a dataset summary into one pandas DataFrame per feature type.

    Args:
        summary_ds: Either a pandas DataFrame with a ``dtype`` column (one row
            per summarized column), or any object whose ``to_pandas()`` method
            returns such a frame (for example the result of ``ds.summary()``).

    Returns:
        A dict with one entry for every ``FeatureType`` member. Each value is
        the subset of summary rows classified as that type, with an extra
        ``feature_type`` column. Rows whose dtype matches no rule (for example
        ``timestamp[s]``) do not appear in any of the frames.

    Raises:
        KeyError: If the summary frame has no ``dtype`` column.
    """
    # Substrings looked up in the textual dtype, checked in this order.
    # "fixed_size_list"/"large_list" are covered by "list", and "large_string"
    # by "string", so they need no entries of their own.
    vector_markers = ("list", "ArrowTensorType")
    categorical_markers = ("string", "utf8", "binary")
    numerical_markers = ("int", "float", "double", "decimal", "bool")

    def classify_dtype(dtype) -> "FeatureType | None":
        """Map a dtype (or its string form) to a FeatureType, or None if unmatched."""
        # Coerce so dtype objects work as well as their string representation.
        dtype_str = str(dtype)
        # Order matters: "list<item: int64>" must be VECTOR even though it
        # also contains "int".
        # NOTE(review): matching is substring-based, so nested types are
        # classified by whatever they contain, e.g. a struct with a string
        # field is labelled CATEGORICAL. Confirm this is the intended grouping.
        if any(marker in dtype_str for marker in vector_markers):
            return FeatureType.VECTOR
        if any(marker in dtype_str for marker in categorical_markers):
            return FeatureType.CATEGORICAL
        if any(marker in dtype_str for marker in numerical_markers):
            return FeatureType.NUMERICAL
        return None

    # Accept a ready-made DataFrame as the signature advertises, as well as a
    # summary object that has to be converted first.
    if isinstance(summary_ds, pd.DataFrame):
        summary_df = summary_ds
    else:
        summary_df = summary_ds.to_pandas()

    # assign() returns a new frame, so the caller's summary is never mutated.
    labelled = summary_df.assign(feature_type=summary_df["dtype"].apply(classify_dtype))

    # Members are str subclasses, so comparing against the plain value matches
    # the enum objects stored in the column; None entries match nothing.
    return {
        ft: labelled[labelled["feature_type"] == ft.value]
        for ft in FeatureType
    }

# CATEGORICAL FEATURES

In [4]:
# Compute the dataset summary again (the earlier cell displayed it without
# keeping a reference) and split its rows by feature type.
feature_types = to_feature_type_dataset(ds.summary())
# Display the summary rows classified as categorical.
feature_types[FeatureType.CATEGORICAL]

2025-10-06 18:08:57,249	INFO stats.py:259 -- Dtype ArrowTensorTypeV2(shape=(4,), dtype=double) for column embeddings is not a standard Arrow type, computing basic statistics only
2025-10-06 18:08:57,255	INFO logging.py:293 -- Registered dataset logger for dataset dataset_5_0
2025-10-06 18:08:57,260	INFO hash_aggregate.py:180 -- Estimated memory requirement for aggregating aggregator (partitions=1, aggregators=1, dataset (estimate)=0.0GiB): shuffle=0.0MiB, output=0.0MiB, total=0.0MiB, 
2025-10-06 18:08:57,263	INFO streaming_executor.py:159 -- Starting execution of Dataset dataset_5_0. Full logs are in /tmp/ray/session_2025-10-06_18-08-54_856737_11796/logs/ray-data
2025-10-06 18:08:57,263	INFO streaming_executor.py:160 -- Execution plan of Dataset dataset_5_0: InputDataBuffer[Input] -> HashAggregateOperator[HashAggregate(key_columns=(), num_partitions=1)] -> LimitOperator[limit=1]


Running 0: 0.00 row [00:00, ? row/s]

- HashAggregate(key_columns=(), num_partitions=1) 1: 0.00 row [00:00, ? row/s]

Shuffle 2:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

Aggregation 3:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

- limit=1 4: 0.00 row [00:00, ? row/s]

2025-10-06 18:08:57,571	INFO streaming_executor.py:279 -- ✔️  Dataset dataset_5_0 execution finished in 0.31 seconds
2025-10-06 18:08:57,587	INFO logging.py:293 -- Registered dataset logger for dataset dataset_6_0


Unnamed: 0,column,dtype,count,mean,min,max,std,missing_pct,zero_pct,feature_type
3,name,string,3,,,,,33.333333,,FeatureType.CATEGORICAL
6,address,"struct<city: string, street: string, zip: int64>",3,,,,,0.0,,FeatureType.CATEGORICAL
8,metadata,binary,3,,,,,33.333333,,FeatureType.CATEGORICAL


# VECTOR FEATURES

In [5]:
# Display the summary rows classified as vector features
# (requires `feature_types` from the categorical-features cell).
feature_types[FeatureType.VECTOR]

Unnamed: 0,column,dtype,count,mean,min,max,std,missing_pct,zero_pct,feature_type
4,scores,list<item: int64>,3,,,,,0.0,,FeatureType.VECTOR
5,embeddings,"ArrowTensorTypeV2(shape=(4,), dtype=double)",3,,,,,0.0,,FeatureType.VECTOR


# NUMERICAL FEATURES

In [6]:
# Display the summary rows classified as numerical features
# (requires `feature_types` from the categorical-features cell).
feature_types[FeatureType.NUMERICAL]

Unnamed: 0,column,dtype,count,mean,min,max,std,missing_pct,zero_pct,feature_type
0,id,int64,3,2.0,1.0,3.0,0.816497,0.0,0.0,FeatureType.NUMERICAL
1,age,int64,3,27.5,25.0,30.0,2.5,33.333333,0.0,FeatureType.NUMERICAL
2,salary,double,3,55000.0,50000.0,60000.0,5000.0,33.333333,0.0,FeatureType.NUMERICAL
