In [3]:
# --- Cell 1: Imports and Load Signal Profiles ---
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Catalog of per-signal summary statistics (null fraction, min/max, unique count, dtype).
# The path is relative, so it resolves against the kernel's working directory.
PROFILES_PATH = Path("../data/catalog/parquet_signal_profiles.csv")
profiles = pd.read_csv(filepath_or_buffer=PROFILES_PATH)


In [4]:
# --- Cell 2: Quick Overview ---
# Report the number of catalog rows, then preview the first ten of them.
# NOTE(review): the catalog carries both `file` and `signal` columns, so this
# figure counts rows; whether that equals the number of distinct signals
# depends on how many files the catalog covers — confirm.
row_total = profiles.shape[0]
print("Total signals:", row_total)
display(profiles.iloc[:10])


Total signals: 1445


Unnamed: 0,file,signal,null_fraction,min,max,unique_count,enum_values,dtype
0,00000001_can1_can.parquet,BC_headLightLStatus_can1_can,0.948667,1.0,1.0,1,,float64
1,00000001_can1_can.parquet,BC_headLightRStatus_can1_can,0.948667,0.0,3.0,4,,float64
2,00000001_can1_can.parquet,BC_indicatorLStatus_can1_can,0.948667,0.0,3.0,4,,float64
3,00000001_can1_can.parquet,BC_indicatorRStatus_can1_can,0.948667,0.0,3.0,4,,float64
4,00000001_can1_can.parquet,BOOT_STATE_can1_can,0.948667,0.0,0.0,1,,float64
5,00000001_can1_can.parquet,CERRD_can1_can,0.948667,0.0,0.0,1,,float64
6,00000001_can1_can.parquet,DAS_bodyControlsChecksum_can1_can,0.988965,1.0,243.0,38,,float64
7,00000001_can1_can.parquet,DAS_bodyControlsCounter_can1_can,0.988965,0.0,15.0,16,,float64
8,00000001_can1_can.parquet,DAS_hazardLightRequest_can1_can,0.988965,0.0,0.0,1,,float64
9,00000001_can1_can.parquet,DAS_headlightRequest_can1_can,0.988965,1.0,1.0,1,,float64


In [5]:

# --- Cell 3: Set Filtering Thresholds (STANDARD!) ---
# Shared cut-offs for the catalog filters in the cells below.
#
# NOTE(review): the ten catalog rows displayed above all have null_fraction
# between ~0.95 and ~0.99, and both filter cells below print 0 rows, so the
# 0.10 null-fraction cut-off looks too strict for this catalog — confirm the
# intended value before relying on the (empty) selections.
MAX_NULL_FRAC = 0.10      # keep rows with null_fraction <= 0.10 (at most 10% missing)
# NOTE(review): MIN_UNIQUE is not referenced by any filter visible in this
# notebook. The original note said it should drop signals with 0 or 1 distinct
# values; a `>= 1` test would still admit single-valued signals, so the intended
# value/operator (e.g. 2, or a strict `>`) needs confirming.
MIN_UNIQUE = 1           # intended floor on unique_count for "live" signals (currently unused)
MAX_ENUM_UNIQUE = 15      # rows with unique_count <= 15 are treated as enum candidates


In [13]:
# --- Cell 4: Filter for Numeric Signals (loose) ---
# Keep float64 catalog rows whose null fraction is within MAX_NULL_FRAC.
# The two predicates are named so they can be reported on individually.
numeric_dtype_ok = profiles["dtype"] == "float64"
numeric_null_ok = profiles["null_fraction"] <= MAX_NULL_FRAC
numeric_signals = profiles[numeric_dtype_ok & numeric_null_ok]
print("Numeric signals (loose):", len(numeric_signals))
if numeric_signals.empty:
    # An empty selection gives no hint about which condition removed the rows;
    # print how many rows each predicate keeps so the limiting one is obvious.
    print(
        f"  no rows passed: dtype == 'float64' keeps {int(numeric_dtype_ok.sum())}, "
        f"null_fraction <= {MAX_NULL_FRAC} keeps {int(numeric_null_ok.sum())} "
        f"(lowest null_fraction in catalog: {profiles['null_fraction'].min():.3f})"
    )
display(numeric_signals.head())


Numeric signals (loose): 0


Unnamed: 0,file,signal,null_fraction,min,max,unique_count,enum_values,dtype


In [11]:
# Sanity check: list the distinct dtype labels recorded in the catalog.
dtype_labels = profiles["dtype"].unique()
print(dtype_labels)


['float64']


In [14]:
# --- Cell 5: Filter for Enum Candidates ---
enum_signals = profiles[
    (profiles["unique_count"] <= MAX_ENUM_UNIQUE) &
    (profiles["null_fraction"] <= MAX_NULL_FRAC)
]
print("Enum signals (clean):", len(enum_signals))
display(enum_signals[["signal", "enum_values"]].head())

Enum signals (clean): 0


Unnamed: 0,signal,enum_values


In [15]:
# --- Cell 6: Plot Histograms for a Few Numeric Signals ---
# Take the first filtered row of each of the first three distinct signals, so
# every histogram is read from a file in which that signal actually passed the
# numeric filter. Looking the file up in the unfiltered `profiles` could select
# a file whose copy of the signal failed the null-fraction cut-off.
hist_targets = numeric_signals.drop_duplicates(subset="signal").head(3)
if hist_targets.empty:
    # Without this the cell finishes silently and looks as if plotting failed.
    print("No numeric signals to plot; numeric_signals is empty.")
for sample_file, sig in zip(hist_targets["file"], hist_targets["signal"]):
    pq_path = Path("../data/interim") / sample_file
    # NOTE(review): "time" is requested but not used by the histogram — confirm
    # whether it is still needed before dropping it from the read.
    df = pd.read_parquet(pq_path, columns=["time", sig])
    fig, ax = plt.subplots()
    df[sig].hist(bins=50, ax=ax)
    ax.set_title(sig)
    ax.set_xlabel(sig)
    ax.set_ylabel("count")
    plt.show()
    # Release the figure explicitly so repeated runs do not accumulate open figures.
    plt.close(fig)
