# データの分布の確認

## 統計的な確認

In [None]:
import pandas as pd

# Load the penguins dataset that every later cell works on.
df = pd.read_parquet(path="data/penguins.parquet")
df.shape  # (number of rows, number of columns)

In [None]:
# Frequency of each value in "Sex"; missing values get their own row.
df["Sex"].value_counts(dropna=False)

In [None]:
# Same counts, ordered from the least frequent value to the most frequent.
df["Sex"].value_counts(dropna=False, ascending=True)

In [None]:
# Same counts, without sorting the result by frequency.
df["Sex"].value_counts(dropna=False, sort=False)

In [None]:
# Relative frequencies (proportions) instead of raw counts.
df["Sex"].value_counts(dropna=False, normalize=True)

## ヒストグラムによる分布の確認

In [None]:
import plotly.express as px

# Histogram of flipper length using plotly's default binning.
px.histogram(data_frame=df, x="Flipper_Length")

In [None]:
# Split flipper length into two intervals, (0, 205] and (205, 300], and count
# the species that fall into each one.
flipper_ranges = pd.cut(
    df.loc[:, "Flipper_Length"],
    bins=[0, 205, 300],
)
# `flipper_ranges` is categorical, so pass `observed` explicitly: omitting it
# emits a FutureWarning on pandas 2.1+ and the default changes to True in
# pandas 3.0. `observed=False` keeps the long-standing default behaviour.
df.groupby(flipper_ranges, observed=False)["Species_short"].value_counts()

In [None]:
# One semi-transparent histogram per species, drawn on top of each other so
# the flipper-length distributions can be compared on a shared axis.
# Species are distinguished by both colour and fill pattern.
px.histogram(
    data_frame=df,
    x="Flipper_Length",
    barmode="overlay",
    opacity=0.7,
    color="Species_short",
    pattern_shape="Species_short",
    color_discrete_sequence=px.colors.qualitative.Dark2,
)

In [None]:
# Same histogram with a much coarser binning requested via `nbins`.
px.histogram(data_frame=df, x="Flipper_Length", nbins=4)

In [None]:
import numpy as np

# Let NumPy pick the bin edges ("auto" strategy) for the non-missing
# flipper lengths.
bin_edges = np.histogram_bin_edges(df["Flipper_Length"].dropna(), bins="auto")
print(bin_edges.size)  # number of bin edges, both ends included

## 散布図による分布の確認

In [None]:
# Culmen length against flipper length, with each species given its own
# colour and marker symbol.
px.scatter(
    data_frame=df,
    x="Culmen_Length",
    y="Flipper_Length",
    symbol="Species_short",
    color="Species_short",
    color_discrete_sequence=px.colors.qualitative.Dark2,
)

In [None]:
# Correlation matrix of culmen length and flipper length, computed
# separately for each species.
(
    df.groupby(by="Species_short")[["Culmen_Length", "Flipper_Length"]]
    .corr()
)