# 🏷️ ML Benchmark Hubs: Quick Data Loading & First-Look EDA
* Sklearn
* Hugging Face
* Kaggle
* UCI
* OpenML

In [None]:
# Core
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## 1) Sklearn datasets 🌿
* Load: Iris
* Fetch: California Housing
* Synthetic: “Moons”

In [None]:
from sklearn.datasets import load_iris, fetch_california_housing, make_moons

# --- Iris (toy, tabular) ---
iris = load_iris()
X_iris, y_iris = iris.data, iris.target

# One frame: named feature columns followed by the "target" column.
df_iris = pd.DataFrame(X_iris, columns=iris.feature_names).assign(target=y_iris)

print("Iris shape:", df_iris.shape)
display(df_iris.head())
# Missing-value count per column, printed as a plain dict.
print(df_iris.isna().sum().to_dict())



In [None]:
# --- California Housing (realistic, tabular) ---
cal = fetch_california_housing()

# Features under their published names, with the regression target
# appended as the "MedHouseVal" column.
df_cal = pd.DataFrame(data=cal.data, columns=cal.feature_names).assign(
    MedHouseVal=cal.target
)

print("\nCalifornia shape:", df_cal.shape)
display(df_cal.head())
# Missing-value count per column, printed as a plain dict.
print(df_cal.isna().sum().to_dict())


In [None]:
# --- Synthetic Moons (controlled patterns) ---
# Seed the generator so the sampled points (and therefore the table and the
# scatter plot below) are identical on every run; without random_state each
# execution of this cell draws a different noisy sample.
X_moon, y_moon = make_moons(n_samples=400, noise=0.15, random_state=42)
df_moon = pd.DataFrame(X_moon, columns=["x1","x2"])
df_moon["label"] = y_moon
print("\nMoons shape:", df_moon.shape)
display(df_moon.head())

# Quick visuals (optional): points coloured by their class label.
plt.figure(figsize=(4,3))
plt.scatter(df_moon["x1"], df_moon["x2"], c=df_moon["label"])
plt.title("make_moons() scatter")
plt.tight_layout()
plt.show()


## 2) Hugging Face Datasets 🤗
IMDB sentiment: label balance + sample peek

In [None]:
# If needed:
# !pip install datasets

from datasets import load_dataset, load_dataset_builder
from collections import Counter

ds = load_dataset("imdb")

# Basic structure: split overview, row counts, and the feature schema
# as reported by the dataset builder.
print(ds)
print("Train rows:", ds["train"].num_rows, "Test rows:", ds["test"].num_rows)
print("Features:", load_dataset_builder("imdb").info.features)

# Label balance (train): tally how often each label id occurs.
label_counts = Counter(ds["train"]["label"])
print("Label counts:", label_counts)

# Bar chart: label id 0 is plotted as "Negative", label id 1 as "Positive".
plt.bar(["Negative","Positive"], [label_counts[label_id] for label_id in (0, 1)])
plt.title("IMDB Sentiment Distribution (Train)")
plt.ylabel("Count")
plt.tight_layout()
plt.show()

# Sample row (left as the final expression so the notebook renders it)
ds["train"][0]


## 3) Kaggle 📦
Delhi Air Quality via KaggleHub → first look

In [None]:
# If needed:
# !pip install "kagglehub[pandas-datasets]"

import kagglehub
from kagglehub import KaggleDatasetAdapter

# kagglehub.login()  # uncomment if required on your environment

# Dataset handle and the file inside that dataset to read.
handle = "kunshbhatia/delhi-air-quality-dataset"
file_in_dataset = "delhi_air_quality.csv"  # adjust if the filename differs

# Load the chosen file through the pandas adapter.
df_kaggle = kagglehub.dataset_load(KaggleDatasetAdapter.PANDAS, handle, file_in_dataset)

print("Kaggle (Delhi Air) shape:", df_kaggle.shape)
display(df_kaggle.head())
# Ten columns with the most missing values, worst first.
print(df_kaggle.isna().sum().sort_values(ascending=False).iloc[:10])


## 4) UCI Repository 🏛️
Heart Disease via ucimlrepo

In [None]:
# If needed:
# !pip install ucimlrepo

from ucimlrepo import fetch_ucirepo

heart = fetch_ucirepo(id=45)  # Heart Disease
X_uci, y_uci = heart.data.features, heart.data.targets

# Start from a copy of the features and attach every target column to it.
df_uci = X_uci.copy()
for c in y_uci:
    df_uci[c] = y_uci[c]

print("UCI Heart shape:", df_uci.shape)
display(df_uci.head())
print("Targets:", y_uci.columns.tolist())
# Ten columns with the most missing values, worst first.
print(df_uci.isna().sum().sort_values(ascending=False).iloc[:10])


## 5) OpenML 🌐
“irish” dataset, ID 451 → robust fetch & first look

In [None]:
# If needed:
# !pip install openml

import openml

d_irish = openml.datasets.get_dataset(451)  # "irish"
target_col = d_irish.default_target_attribute

# Ask for the data as DataFrame/Series, split on the dataset's default target.
X_irish, y_irish, cat_ind, names = d_irish.get_data(
    target=target_col,
    dataset_format="dataframe",
)

# Features plus the target column in a single (copied) frame.
df_irish = X_irish.assign(**{target_col: y_irish})

print("OpenML 'irish' shape:", df_irish.shape)
display(df_irish.head())
print("Target:", target_col)
print("Categorical flags per feature:", dict(zip(names, cat_ind)))
# Missing-value count per column, worst first.
print(df_irish.isna().sum().sort_values(ascending=False))

# Simple target distribution (final expression, so the notebook renders it)
df_irish[target_col].value_counts(dropna=False)


## (Mini) EDA add-ons 📋

In [None]:
# Correlation (numeric) — quick peek
# num_cols holds every numeric column of df_iris, including the integer
# "target" column, so the class index appears in the correlation matrix too.
num_cols = df_iris.select_dtypes(include=[np.number]).columns
corr = df_iris[num_cols].corr()
plt.figure(figsize=(6,5))
plt.imshow(corr, interpolation="nearest")
plt.title("Iris: Correlation (numeric)")
plt.colorbar()
plt.xticks(range(len(num_cols)), num_cols, rotation=90)
plt.yticks(range(len(num_cols)), num_cols)
plt.tight_layout()
plt.show()

# Simple group stats (if categorical exists)
if "target" in df_iris:
    # Leave the grouping key out of the aggregated columns: averaging
    # "target" within each target group only repeats the key as a
    # redundant extra column in the printed table.
    feature_cols = [col for col in num_cols if col != "target"]
    print(df_iris.groupby("target")[feature_cols].mean())
