# 欠損値

## 欠損値の基礎知識

### データ型

In [None]:
import numpy as np
import pandas as pd

# Seeded generator so the sample values are reproducible across runs.
rng = np.random.default_rng(1)

# Four random floats placed on the even labels 0, 2, 4, 6.
float_ser = pd.Series(data=rng.random(4), index=range(0, 8, 2))
# Re-label to 0..3: labels 1 and 3 were absent, so they come back as NaN.
float_ser = float_ser.reindex(range(4))
float_ser

In [None]:
# Four consecutive days starting 2023-01-01, placed on the even labels 0, 2, 4, 6.
dt_ser = pd.Series(
    pd.date_range(start="2023-01-01", periods=4), index=range(0, 8, 2)
)
# Re-label to 0..3: labels 1 and 3 were absent, so they become NaT.
dt_ser = dt_ser.reindex(range(4))
dt_ser

In [None]:
# Nullable-integer Series: four random integers in [0, 10) on labels 0, 2, 4, 6.
# "Int64" (capital I) is the string alias of pd.Int64Dtype().
int_ser = pd.Series(rng.integers(0, 10, 4), index=range(0, 8, 2), dtype="Int64")
# Re-label to 0..3: labels 1 and 3 were absent, so they become missing (pd.NA).
int_ser = int_ser.reindex(range(4))
int_ser

In [None]:
# Inspect the dtype after reindexing. int_ser was built with pd.Int64Dtype(),
# the nullable integer dtype, which can hold missing values without falling
# back to float.
int_ser.dtype

### データの型変換

In [None]:
# int input: a NumPy integer array cannot hold a missing value, so pandas
# upcasts the result to float64 and stores the gap as NaN.
pd.Series([1, None, 3])

In [None]:
# bool input: with None present the values are kept as Python objects,
# so the resulting dtype is object rather than bool.
pd.Series([True, None, False])

In [None]:
# float input: None is converted to NaN and the dtype stays float64.
pd.Series([1.0, None, 3.0])

In [None]:
# str input (object dtype): the strings and the missing entry are stored as
# Python objects.
# NOTE(review): on pandas versions where the dedicated string dtype is the
# default, the inferred dtype may be `str` instead of object — confirm
# against the pandas version in use.
pd.Series(["a", None, "c"])

### 欠損値を含むデータの評価

In [None]:
# NaN never compares equal to anything, including itself, so this is False.
# This is why `==` cannot be used to detect missing values.
np.nan == np.nan

In [None]:
# Ordering comparisons involving NaN are also always False.
np.nan > np.nan

In [None]:
# NaT (missing datetime) behaves like NaN: comparing it to itself gives False.
pd.NaT == pd.NaT

In [None]:
# pd.NA propagates through comparisons: the result is pd.NA itself,
# neither True nor False.
pd.NA == pd.NA

In [None]:
# Four equivalent ways to build a boolean mask of the missing entries
# (isnull is an alias of isna). In a notebook cell only the value of the
# last expression is displayed.
pd.isna(float_ser)
# or
pd.isnull(float_ser)
# or
float_ser.isna()
# or
float_ser.isnull()

In [None]:
# isna() also detects the missing entries (pd.NA) of the nullable-integer Series.
int_ser.isna()

In [None]:
# isna() also detects the missing entries (NaT) of the datetime Series.
dt_ser.isna()

### 欠損値を含むデータの演算

In [None]:
# Reductions skip missing values by default (skipna=True), so this is the
# sum of the non-missing entries only.
int_ser.sum()

In [None]:
# Cumulative sum: missing positions stay missing in the output, and the
# running total continues from the last non-missing value.
int_ser.cumsum()

In [None]:
# With skipna=False a single missing value makes the whole result missing.
int_ser.sum(skipna=False)

## 欠損値の発生パターン（メカニズム）と対処方法

### 欠損値の確認

In [None]:
# Load the penguins dataset. The path is relative to the current working
# directory, and pd.read_parquet needs a Parquet engine (pyarrow or
# fastparquet) to be installed.
df = pd.read_parquet("data/penguins.parquet")

In [None]:
# Number of missing values per column: isna() gives a boolean frame and
# sum() counts the True entries column by column.
print(df.isna().sum())

### 欠損値の発生が完全にランダム(MCAR)な場合

In [None]:
# (rows, columns) of the untouched frame, as a baseline for the dropna calls.
print(df.shape)

In [None]:
# Drop only the rows whose Culmen_Length is missing; other columns are ignored.
print(df.dropna(subset=["Culmen_Length"]).shape)

In [None]:
# Drop rows where Culmen_Length OR Sex is missing (the default is how="any").
print(df.dropna(subset=["Culmen_Length", "Sex"]).shape)

In [None]:
# Listwise deletion: drop every row that has a missing value in any column.
print(df.dropna().shape)

In [None]:
# axis=1 drops columns instead of rows: every column containing at least
# one missing value is removed, and the row count is unchanged.
print(df.dropna(axis=1).shape)

In [None]:
# how="all" drops a row only when ALL of the listed columns are missing,
# i.e. when both Culmen_Length and Sex are missing in that row.
print(
    df.dropna(
        subset=["Culmen_Length", "Sex"],
        how="all",
    ).shape
)

In [None]:
# dropna() on a single column (a Series) removes just its missing entries;
# the surviving values keep their original index labels.
df.loc[:, "Culmen_Length"].dropna()

### 欠損値の発生が何らかの原因による場合

#### 単変量補完と多変量補完

In [None]:
# Mean body mass per species (mean() skips missing values by default).
df.groupby("Species")["Body_Mass"].mean()

In [None]:
# Number of missing Body_Mass values per species: for each group, count
# the True entries of its isna() mask.
df.groupby("Species")["Body_Mass"].agg(lambda x: x.isna().sum())

In [None]:
# Impute missing Body_Mass with the mean of the same species.
# transform() returns a Series aligned with the original index, so the
# result has one value per row of df; df itself is not modified.
df.groupby("Species")["Body_Mass"].transform(lambda x: x.fillna(x.mean()))

### 行どうしに順序がある場合の補間

In [None]:
# First five Body_Mass values, shown for reference before the fill and
# interpolation cells that follow.
df.loc[:, "Body_Mass"].head()

In [None]:
# Forward fill: each missing value is replaced by the last non-missing value
# above it; a leading gap stays missing because nothing precedes it.
# Series.ffill() is used instead of fillna(method="ffill"), which has been
# deprecated since pandas 2.1.
df.loc[:, "Body_Mass"].ffill()

In [None]:
# Backward fill: each missing value is replaced by the next non-missing value
# below it; a trailing gap stays missing because nothing follows it.
# Series.bfill() is used instead of fillna(method="bfill"), which has been
# deprecated since pandas 2.1.
df.loc[:, "Body_Mass"].bfill()

In [None]:
# Linear interpolation between the neighbouring non-missing values.
# method="linear" treats rows as equally spaced and ignores the index.
df.loc[:, "Body_Mass"].interpolate(method="linear")

In [None]:
# method="linear": the index is ignored and the values are treated as
# equally spaced, so the gap becomes the midpoint of 1.0 and 10 (5.5).
pd.Series([1.0, None, 10], index=[1, 10, 100]).interpolate(method="linear")

In [None]:
# method="index": the numeric index values are used as the x-coordinates.
# Label 10 lies 9/99 of the way from 1 to 100, so the gap is filled with
# 1 + 9 * 9/99 (about 1.82).
pd.Series([1.0, None, 10], index=[1, 10, 100]).interpolate(method="index")

In [None]:
# method="time": interpolate in proportion to the elapsed time between the
# timestamps of a datetime index. 2023-01-10 is 9 days into the 99-day span
# from 2023-01-01 to 2023-04-10, so the gap is filled with 1 + 9 * 9/99
# (about 1.82).
pd.Series(
    [1.0, None, 10.0],
    index=[
        pd.Timestamp("2023-01-01"),
        pd.Timestamp("2023-01-10"),
        pd.Timestamp("2023-04-10"),
    ],
).interpolate(method="time")