# 使用するデータの紹介と読み込み

## データ読み込み

In [None]:
# https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-pal.219.5 にあるコードを改変
import pandas as pd

# One CSV location per penguin species; each entry is read separately by
# read_data and the resulting frames are concatenated into a single table.
uris = [
    # Adelie
    "https://pasta.lternet.edu/package/data/eml/knb-lter-pal/219/5/002f3893385f710df69eeebe893144ff",
    # Gentoo
    "https://pasta.lternet.edu/package/data/eml/knb-lter-pal/220/7/e03b43c924f226486f2f0ab6709d2381",
    # Chinstrap
    "https://pasta.lternet.edu/package/data/eml/knb-lter-pal/221/8/fe853aa8f7a59aa84cdd3197619ef462",
]


def read_data(uri):
    """Load one penguin CSV and return it with identifier-friendly column names.

    Args:
        uri: Location of the CSV; handed to ``pd.read_csv`` unchanged.

    Returns:
        A DataFrame restricted to the ten columns selected below, in which
        the column names containing spaces or units have been renamed.
    """
    # Columns to keep from the source file. (1)
    wanted = [
        "Species",
        "Island",
        "Individual ID",
        "Date Egg",
        "Culmen Length (mm)",
        "Culmen Depth (mm)",
        "Flipper Length (mm)",
        "Body Mass (g)",
        "Sex",
        "Comments",
    ]
    # Replace spaces with underscores and drop the unit suffixes. (2)
    renamed = {
        "Individual ID": "Individual_ID",
        "Date Egg": "Date_Egg",
        "Culmen Length (mm)": "Culmen_Length",
        "Culmen Depth (mm)": "Culmen_Depth",
        "Flipper Length (mm)": "Flipper_Length",
        "Body Mass (g)": "Body_Mass",
    }
    raw = pd.read_csv(
        uri,
        sep=",",
        quotechar='"',
        usecols=wanted,
        na_values=".",  # additionally treat "." as a missing value
    )
    return raw.rename(columns=renamed)


# Read each species file in turn and stack the frames, renumbering the rows
# with a fresh consecutive index. (3)
df = pd.concat(map(read_data, uris), ignore_index=True)

## 読み込み結果の簡単なチェック

In [None]:
# Row and column counts of the combined frame.
df.shape

In [None]:
# Show the first rows
df.head()

In [None]:
# Show the last rows
df.tail()

In [None]:
# Column dtypes as loaded, before the conversions applied in the cells below.
df.dtypes

In [None]:
# Parse Date_Egg into datetimes using the explicit YYYY-MM-DD format.
df["Date_Egg"] = pd.to_datetime(df["Date_Egg"], format="%Y-%m-%d")

In [None]:
# Store the three label columns as categoricals.
df = df.astype(
    {
        "Species": "category",
        "Island": "category",
        "Sex": "category",
    }
)

In [None]:
import re

# Matches the run of Unicode word characters at the very start of a string,
# i.e. the leading word of each Species label.
RE_SHORT = re.compile(r"^\w+")
df["Species_short"] = df["Species"].apply(
    lambda label: RE_SHORT.search(label).group(0)
)

In [None]:
# Column dtypes again, now including the converted columns and Species_short.
df.dtypes

In [None]:
# Category labels of the derived Species_short column.
df.loc[:, "Species_short"].cat.categories

## 読み込んだデータの保存

In [None]:
from pathlib import Path

# Save the cleaned frame as Parquet. The output directory is created first so
# the write also succeeds on a fresh checkout where "data/" does not exist yet.
Path("data").mkdir(parents=True, exist_ok=True)
df.to_parquet("data/penguins.parquet")