In [1]:
!pip install --quiet openml pandas scikit-learn matplotlib pyarrow

In [2]:
import openml, pandas as pd, numpy as np
from pathlib import Path

# -------- Paths (run this notebook from project/notebooks/) --------
# Relative to the notebook's working directory so the notebook does not depend
# on one user's home-directory layout.
OUT = Path("../data/processed/riboflavin.parquet")
OUT.parent.mkdir(parents=True, exist_ok=True)

# -------- Select dataset --------
# Option A: list the OpenML datasets and filter by name (case-insensitive).
df_list = openml.datasets.list_datasets(output_format="dataframe")
ribos = df_list[df_list['name'].str.contains('riboflavin', case=False, na=False)]
display(ribos[['did','name','NumberOfInstances','NumberOfFeatures']].head())

# Option B: set explicitly.
# The first name match wins; 42092 is only used when the search finds nothing.
did = int(ribos.iloc[0]['did']) if len(ribos) else 42092  # <- put your chosen did here

# -------- Download as DataFrame --------
dset = openml.datasets.get_dataset(did)
target_attr = dset.default_target_attribute  # may be None on some datasets

# Fail early with a readable message when the dataset declares no (or a blank) target.
if not target_attr or (isinstance(target_attr, str) and target_attr.strip() == ""):
    raise ValueError("This dataset doesn’t declare a default target. Set target_attr manually.")

X, y, _, _ = dset.get_data(target=target_attr, dataset_format="dataframe")
df = X.copy()

# -------- Light cleaning --------
# Coerce the target to numeric BEFORE filtering on dtype. If the target arrives
# as object/category, filtering first would silently drop the "target" column
# and every later reference to it would raise KeyError. Unparseable values
# become NaN and are removed by the dropna below.
df["target"] = pd.to_numeric(y, errors="coerce")

# Keep numeric columns only, then drop every row with a missing value in any
# remaining column (this includes rows whose target could not be coerced).
df = df.select_dtypes(include=["number"]).dropna(axis=0)

# Ensure target is the last column
cols = [c for c in df.columns if c != "target"] + ["target"]
df = df[cols]

# Memory: make features float32 (the target keeps its own dtype).
# A single astype with a column->dtype mapping converts all feature columns in
# one pass instead of assigning them back column by column.
num_cols = df.columns.drop("target")
df = df.astype({c: "float32" for c in num_cols})

print("Final shape:", df.shape)
display(df.head(3))

# -------- Save processed file (Parquet preferred) --------
df.to_parquet(OUT)
# df.to_csv("../data/processed/dataset.csv", index=False)  # optional
print("Saved to:", OUT.resolve())

Unnamed: 0,did,name,NumberOfInstances,NumberOfFeatures
46983,46983,riboflavin,71.0,4089.0


Final shape: (71, 4089)


Unnamed: 0,AADK_at,AAPA_at,ABFA_at,ABH_at,ABNA_at,ABRB_at,ACCA_at,ACCB_at,ACCC_at,ACDA_at,...,YYDG_at,YYDH_at,YYDI_at,YYDJ_at,YYDK_at,YYXA_at,YYZB_at,YYZE_at,zur_at,target
0,8.492404,8.111451,8.320842,10.287112,8.261278,10.208273,9.745474,9.818821,9.676228,8.372442,...,5.440163,4.981684,6.481998,5.363919,7.773494,7.668711,3.762119,7.172823,7.427104,-6.643856
1,7.63938,7.239965,7.289051,9.862288,7.303497,9.500024,9.216008,9.854946,9.650078,7.732604,...,5.383739,4.597437,6.390127,4.763876,7.336261,7.600104,3.761616,7.187113,7.545287,-6.947862
2,8.088341,7.85551,7.793395,9.67672,7.098273,9.473917,9.580384,9.926077,9.787129,7.925034,...,5.718795,4.468228,6.123497,5.400686,7.573388,7.873352,3.741953,7.251285,7.718321,-7.93016


Saved to: /Users/cw/MATH 533 - Final Project/data/processed/riboflavin.parquet
