# 外れ値、異常値

## 外れ値の確認

### 箱ひげ図による外れ値の確認

In [None]:
import pandas as pd
import plotly.express as px

# Load the penguin measurements, then draw one body-mass box plot per species
# so that unusually light or heavy individuals can be spotted visually.
df = pd.read_parquet(path="data/penguins.parquet")
px.box(data_frame=df, x="Species_short", y="Body_Mass")

### 分位点による外れ値の確認

In [None]:
# Per-species body-mass quantiles at both tails (min, 1%, 5%, 95%, 99%, max).
tail_levels = [0, 0.01, 0.05, 0.95, 0.99, 1]
df.groupby("Species_short")["Body_Mass"].quantile(q=tail_levels)

### 正規分布と$2\sigma$範囲による外れ値の確認

In [None]:
# Keep only the Chinstrap rows (all columns) and look at the shape of their
# body-mass distribution.
is_chinstrap = df["Species_short"] == "Chinstrap"
df_chinstrap = df.loc[is_chinstrap, :]

px.histogram(data_frame=df_chinstrap, x="Body_Mass")

In [None]:
from scipy import stats

# Shapiro-Wilk normality test on the Chinstrap body mass.
# Missing values are dropped first: a NaN in the input would otherwise
# propagate into the test result, whereas the pandas statistics (mean, std,
# quantile) skip NaN by default. Dropping them keeps all checks on the same
# set of observations.
stats.shapiro(df_chinstrap.loc[:, "Body_Mass"].dropna())

In [None]:
# Mean and standard deviation of the Chinstrap body mass; the printed range
# is mean - 2*sigma (lower) to mean + 2*sigma (upper).
chinstrap_mass = df_chinstrap["Body_Mass"]
avg = chinstrap_mass.mean()
sigma = chinstrap_mass.std()  # standard deviation
print(f"下限値：{avg - 2 * sigma} 上限値：{avg + 2 * sigma}")

In [None]:
# Empirical 2.5% and 97.5% quantiles, for comparison with the
# mean +/- 2*sigma limits.
central_95_levels = [0.025, 0.975]
df_chinstrap["Body_Mass"].quantile(q=central_95_levels)

## 外れ値への対処方法

### 上限値と下限値の設定

In [None]:
# Cap body mass at mean +/- 2*sigma and store the result in a new column,
# leaving the original "Body_Mass" column untouched.
lower_limit = avg - 2 * sigma
upper_limit = avg + 2 * sigma
clipped_mass = df_chinstrap["Body_Mass"].clip(lower=lower_limit, upper=upper_limit)
df_chinstrap = df_chinstrap.assign(Body_Mass_clipped=clipped_mass)

In [None]:
# Reshape to long format (one row per individual and measurement column) so
# the raw and clipped body mass can be drawn as side-by-side histogram facets.
mass_long = df_chinstrap.melt(
    id_vars="Individual_ID",
    value_vars=["Body_Mass", "Body_Mass_clipped"],
)
px.histogram(data_frame=mass_long, x="value", facet_col="variable")