# データ操作とpandas

## pandasとテーブルデータ


In [None]:
import pandas as pd

# Sample company table used throughout the notebook: one record per company
# (code, industry, employee count). The None entry is a deliberately missing
# employee count.
company_rows = [
    ["C001", "製造業", 30],
    ["C002", "サービス業", 100],
    ["C003", "サービス業", None],
    ["C004", "小売業", 50],
    ["C005", "製造業", 20],
]
company_columns = ["会社コード", "業種", "従業員数"]
df = pd.DataFrame(data=company_rows, columns=company_columns)
df

## データ型


In [None]:
# Show the Python type of the object bound to df.
type(df)

In [None]:
# Select a single column by label with bracket indexing (returns a Series).
df["会社コード"]

In [None]:
# Select a column via attribute access. This only works when the label is a
# valid Python identifier and does not clash with a DataFrame attribute.
df.会社コード

In [None]:
# Show the type of a standard-library date object.
from datetime import date
type(date(2020, 7, 1))

In [None]:
# Convert the standard-library date with pd.to_datetime and show the type of
# the resulting pandas object.
ts = pd.to_datetime(date(2020, 7, 1))
type(ts)

## データの確認


In [None]:
# Print a concise summary of df: index range, column names, non-null counts,
# dtypes and memory usage.
df.info()

## データ抽出・置換・代入


In [None]:
# Select several columns by passing a list of labels (returns a DataFrame).
df[["会社コード", "業種"]]

In [None]:
# Label-based selection with .loc: all rows (:), the listed columns.
df.loc[:, ["会社コード", "業種"]]

In [None]:
# Label-based column slice with .loc; unlike positional slices, both
# endpoints are included.
df.loc[:, "会社コード":"業種"]

In [None]:
# Position-based selection with .iloc: all rows, columns at positions 0 and 1.
df.iloc[:, [0, 1]]

In [None]:
# Positional slice 0:2 selects columns 0 and 1 (the end position is excluded).
df.iloc[:, 0:2]

In [None]:
# Positional slice :-1 selects every column except the last one.
df.iloc[:, :-1]

In [None]:
# Return a new DataFrame without the "従業員数" column (axis=1 means columns);
# df itself is not modified.
df.drop("従業員数", axis=1)

In [None]:
# Keep the columns whose label matches the regular expression, i.e. contains
# "会社" or "業種".
df.filter(regex="会社|業種")

In [None]:
# First two rows, all columns.
df.iloc[:2, :]

In [None]:
# First two rows and first two columns.
df.iloc[:2, :2]

In [None]:
# Boolean-mask row filter: industry is not "製造業" AND employee count >= 30.
# Each comparison is parenthesised because & binds more tightly than != and >=.
# A missing employee count compares as False, so that row is excluded.
df[(df["業種"]!="製造業")&(df["従業員数"]>=30)]

In [None]:
# Row filter via .loc, using a boolean mask as the row selector:
# industry is not "製造業" AND employee count >= 30.
df.loc[(df["業種"]!="製造業")&(df["従業員数"]>=30)]

In [None]:
# Row filter written as a query string; column names are referenced directly
# inside the expression.
df.query("業種 != '製造業' and 従業員数 >= 30")

In [None]:
# Filter rows with query, then select two columns from the filtered result.
df.query("業種 != '製造業' and 従業員数 >= 30")[["会社コード", "業種"]]

In [None]:
# Filter rows and select columns in a single .loc call:
# first argument is the row mask, second the list of column labels.
df.loc[(df["業種"]!="製造業")&(df["従業員数"]>=30),  ["会社コード", "業種"]]

In [None]:
# assign returns a new DataFrame with the extra column "フラグ" set to 0;
# df is left unchanged. The column name is supplied through dict unpacking.
df.assign(**{"フラグ": 0})

In [None]:
df_original = df.copy()  # keep a copy of the original DataFrame
# Add the column by direct assignment; this modifies df itself.
df["フラグ"] = 0
df

In [None]:
# Compare object identity (id) before and after three ways of adding the
# "フラグ" column, each starting from a fresh copy of df_original.
print("元の df_original の id:", id(df_original))
print("元の df_original:\n", df_original)
# (1) Direct assignment (in-place update)
df_inplace = df_original.copy()
print("\n(1)直接代入前の df_inplace の id:", id(df_inplace))
df_inplace["フラグ"] = 0
print("(1)直接代入後の df_inplace の id:", id(df_inplace))
print("(1)直接代入後の df_inplace:\n", df_inplace)
# (2) assign, with the result bound to a new variable
df_assign_before = df_original.copy()
print("\n(2)assign 前の df_assign_before の id:", id(df_assign_before))
df_assign_after = df_assign_before.assign(**{"フラグ": 0})
print("(2)assign の結果 df_assign_after の id:", id(df_assign_after))
print("(2)assign 後の df_assign_before (変更なし):\n", df_assign_before)
print("(2)assign の結果 df_assign_after:\n", df_assign_after)
# (3) assign, with the result re-bound to the same variable
df_assign = df_original.copy()
print("\n(3)再代入前の df_assign の id:", id(df_assign))
df_assign = df_assign.assign(**{"フラグ": 0})
print("(3)再代入後の df_assign の id:", id(df_assign))
print("(3)再代入後の df_assign:\n", df_assign)

In [None]:
# mask replaces values where the condition is True: returns a new Series with
# 1 wherever 従業員数 >= 30. df is not modified.
df["フラグ"].mask(df["従業員数"]>=30, 1)

In [None]:
# Set "フラグ" to 1 in place for the rows where 従業員数 >= 30.
df.loc[df["従業員数"]>=30, "フラグ"] = 1

In [None]:
# where keeps values where the condition is True and replaces the rest with
# -1; it returns a new Series and does not modify df.
df["フラグ"].where(df["従業員数"]>=30, -1)

In [None]:
# Return a new DataFrame in which every value equal to "サービス業" is
# replaced by "情報通信業".
df.replace("サービス業", "情報通信業")

In [None]:
# Return a new DataFrame with the column "フラグ" renamed; the mapping is
# {old label: new label}.
df.rename(columns={"フラグ": "従業員30人以上フラグ"})

In [None]:
# Replace all column labels at once (axis=1). The list must contain exactly
# one label per existing column.
df.set_axis(
    ["会社コード", "業種", "従業員数", "従業員30人以上フラグ"],
    axis=1
)

## 欠測値とNull


In [None]:
# Boolean DataFrame of the same shape as df, True where a value is missing.
df.isna()

In [None]:
# Keep only the rows whose 従業員数 is not missing.
df[df["従業員数"].notna()]

In [None]:
# Fill missing 従業員数 values with 100 and re-bind df to the resulting
# DataFrame.
df = df.assign(**{"従業員数": df["従業員数"].fillna(100)})
df

## 重複とユニーク


In [None]:
# Distinct values of the column, in order of first appearance.
df["業種"].unique()

In [None]:
# Flag rows that are duplicated on (業種, 従業員数). keep=False marks every
# member of a duplicate group as True, not just the later occurrences.
df[["業種", "従業員数"]].duplicated(keep=False)

In [None]:
# Drop rows that repeat an earlier (業種, 従業員数) combination; the first
# occurrence is kept (default keep="first").
df.drop_duplicates(subset=["業種", "従業員数"])

## 分析しやすいデータ

### ワイドテーブルとロングテーブル

### 整然データ

### 機械判読可能なデータ


## データ整形


In [None]:
import numpy as np

# Small table for the reshaping examples: two key columns (A, X) and one
# value column (V) that contains a missing value.
reshape_rows = [
    ["a", "x", 2],
    ["a", "y", 0],
    ["b", "x", np.nan],
    ["b", "y", -5],
]
reshape_columns = ["A", "X", "V"]
df11 = pd.DataFrame(data=reshape_rows, columns=reshape_columns)

In [None]:
# Display df11.
df11

In [None]:
# Move columns A and X into the row index, giving a two-level MultiIndex.
df13 = df11.set_index(["A", "X"])
df13

In [None]:
# Turn all index levels back into ordinary columns (returns a new DataFrame).
df13.reset_index()

In [None]:
# Use column A as a single-level row index.
df12 = df11.set_index("A")
df12

In [None]:
# Move the index back into an ordinary column (returns a new DataFrame).
df12.reset_index()

In [None]:
# append=True adds X as an additional index level instead of replacing the
# existing index.
(df12
    .set_index(
        "X", 
        append=True
    )
)

In [None]:
# Reset only index level 1 (X) to a column; level 0 (A) stays as the index.
df13.reset_index(level=1)

In [None]:
# Pivot the innermost index level (X) into columns. The result has two-level
# columns: the original column name on top and the X values below.
df23 = df13.unstack()
df23

In [None]:
# Move the innermost column level back into the row index (the reverse of
# unstack). future_stack=True opts into the newer stack implementation
# (parameter available from pandas 2.1), which keeps missing values rather
# than dropping them.
df23.stack(future_stack=True)

In [None]:
# Flatten the two-level column index of df23 down to its inner level.
# get_level_values(1) yields exactly one label per column, in column order.
# columns.levels[1] only holds the level's unique values, so it matches the
# columns merely by coincidence and breaks as soon as there is more than one
# value column or the level contains unused/reordered entries.
df22 = (df23
    .set_axis(
        df23.columns.get_level_values(1),
        axis=1
    )
)
df22

In [None]:
# Swap rows and columns.
df32 = df22.transpose()
df32

In [None]:
# .T is the attribute shorthand for transpose().
df22.T

In [None]:
# Move the row index of df22 into an ordinary column.
df21 = df22.reset_index()
df21

In [None]:
# Reshape long to wide in one call: row labels from A, column labels from X,
# cell values from V.
(df11
    .pivot(
        columns="X",
        index="A",
        values="V"
    )
)

In [None]:
# Unpivot wide to long. value_name names the value column "V";
# ignore_index=False keeps the original row index instead of resetting it.
pd.melt(df22,
    value_name="V",
    ignore_index=False
)

In [None]:
# Sort by V in descending order; rows with a missing V are placed last.
df11.sort_values("V", ascending=False, na_position="last")

In [None]:
# The two rows with the largest values of V.
df11.nlargest(2, columns="V")

## データの結合


In [None]:
# Left table for the join examples: key A, value X, index labels 1..4.
left_rows = [
    ["a", 2],
    ["a", 0],
    ["b", 3],
    ["c", 2],
]
df1 = pd.DataFrame(data=left_rows, columns=["A", "X"], index=range(1, 5))

# Right table for the join examples: key A, value Y, default index 0..2.
right_rows = [
    ["a", 1],
    ["b", 3],
    ["d", 1],
]
df2 = pd.DataFrame(data=right_rows, columns=["A", "Y"])

In [None]:
# Cross join: every row of df1 is paired with every row of df2. The suffixes
# are appended to column names that occur in both frames (here "A").
df1.merge(df2, how="cross", suffixes=("左", "右"))

In [None]:
# Full outer join on A: keys from either frame are kept, with missing values
# where one side has no match.
df1.merge(df2, on="A", how="outer")

In [None]:
# Left join on A: every row of df1 is kept; unmatched rows get a missing Y.
df1.merge(df2, on="A", how="left")

In [None]:
# The default join type is inner: only keys present in both frames are kept.
df1.merge(df2, on="A")

In [None]:
# Function form of merge: inner join of df1 and df2 on A.
pd.merge(df1, df2, on="A")

In [None]:
# Join df1's column X against df2's row index (inner join by default).
df1.merge(df2, left_on="X", right_index=True)

In [None]:
# Stack the frames vertically. Columns are the union of both frames
# (default join="outer") and the original index labels are kept.
pd.concat([df1, df2])

In [None]:
# Stack vertically, keeping only the columns common to both frames.
pd.concat([df1, df2], join="inner")

In [None]:
# Place the frames side by side, aligning rows on index labels; the result
# uses the union of both indexes.
pd.concat([df1, df2], axis=1)

In [None]:
# Side by side, keeping only the index labels present in both frames.
pd.concat([df1, df2], axis=1, join="inner")

In [None]:
# Inner join that matches rows on the index of both frames.
df1.merge(df2, left_index=True, right_index=True)

In [None]:
# Outer join that matches rows on the index of both frames.
df1.merge(df2, left_index=True, right_index=True, how="outer")

In [None]:
# Stack vertically and renumber the resulting index from 0.
pd.concat([df1, df2], ignore_index=True)

## データの読み込み

### pandasでのデータの読み書き


In [None]:
from pathlib import Path
# Build the absolute path of the data/ch02 directory under the current
# working directory.
current_dir = Path.cwd()
data_path = (current_dir / "data" / "ch02").resolve()
# Read the CSV file, decoding it as UTF-8.
csv_data = pd.read_csv(data_path / "法人データ.csv", encoding="utf-8")
# Show the first 5 rows and the columns at positions 1 through 6.
csv_data.iloc[:5, 1:7]

In [None]:
# Write the first 5 rows and first 5 columns to a CSV file. The row index is
# written as the first column because index=False is not passed.
csv_data.iloc[:5, :5].to_csv(data_path / "サンプルデータ.csv")

### ファイル形式

### 文字コード
