# データの連結／結合

## データの連結／結合方法と関数／メソッド

### DataFrameの連結

### DataFrameの結合

## concat() 関数によるDataFrameの連結

In [3]:
from IPython.display import display
import pandas as pd

# Two small integer frames that overlap only partly: they share the row
# labels "a" and "c" and the column labels "A" and "C".
df1 = pd.DataFrame(
    {
        "A": [1, 2, 3],
        "B": [10, 20, 30],
        "C": [100, 200, 300],
    },
    index=["a", "b", "c"],
)
df2 = pd.DataFrame(
    {
        "A": [4, 2, 3],
        "C": [10, 20, 30],
        "D": [100, 200, 400],
    },
    index=["a", "c", "d"],
)
# Render both input frames, one after the other, in the cell output.
display(df1, df2)

Unnamed: 0,A,B,C
a,1,10,100
b,2,20,200
c,3,30,300


Unnamed: 0,A,C,D
a,4,10,100
c,2,20,200
d,3,30,400


In [4]:
# Stack df2's rows under df1's. The columns are unioned (A, B, C, D) with
# missing cells left empty, and ignore_index=True renumbers the rows 0-5.
pd.concat([df1, df2], ignore_index=True)

Unnamed: 0,A,B,C,D
0,1,10.0,100,
1,2,20.0,200,
2,3,30.0,300,
3,4,,10,100.0
4,2,,20,200.0
5,3,,30,400.0


In [None]:
# keys= tags the rows of each input with a label, which becomes the outer
# level of a hierarchical row index ("data1" / "data2").
labeled_df = pd.concat([df1, df2], keys=["data1", "data2"])
labeled_df

## concat() 関数によるDataFrameの結合

In [None]:
# axis=1 places the frames side by side, aligning rows on their index
# labels; the default join keeps every label found in either frame.
pd.concat([df1, df2], axis=1)

In [None]:
# join="inner" keeps only the index labels present in both frames (a, c).
pd.concat([df1, df2], axis=1, join="inner")

## join() メソッドによるDataFrameの結合

In [None]:
# A float-valued frame with rows "a" and "b" and columns D and E.
df3 = pd.DataFrame(
    {
        "D": [1.3, 2.3],
        "E": [10.3, 20.3],
    },
    index=["a", "b"],
)
# Render all three frames so their index and column labels can be compared.
display(df1, df2, df3)

In [None]:
# DataFrame.join aligns on the index and defaults to a left join, so every
# df1 row is kept; df3 has no row "c", which leaves D and E missing there.
df1.join(df3)

In [None]:
# df1 and df2 both have columns A and C, so suffixes are needed to tell the
# two copies apart. how="inner" keeps only the shared index labels (a, c).
df1.join(
    df2,
    how="inner",
    lsuffix="_df1",
    rsuffix="_df2",
)

In [None]:
# A frame on the default integer index whose "key" column holds labels
# ("a", "b"), alongside two float columns F and G.
df4 = pd.DataFrame(
    {
        "key": ["a", "b"],
        "F": [1.4, 2.4],
        "G": [10.4, 20.4],
    }
)
# Render df4 next to df1: df4's "key" values correspond to df1's row labels.
display(df4, df1)

In [None]:
# on="key" matches the values in df4's "key" column against df1's index
# instead of using df4's own (integer) index.
df4.join(df1, on="key")

## merge() 関数によるDataFrameの結合

In [None]:
# Join on column A. pd.merge defaults to an inner join, so only the A
# values present in both frames (2 and 3) remain.
pd.merge(df1, df2, on="A")

In [None]:
# how="left" keeps every df1 row; df2's columns are missing wherever A has
# no match in df2 (here A == 1).
pd.merge(df1, df2, on="A", how="left")

In [None]:
# Use differently named key columns: df1's C is matched against df2's D.
pd.merge(df1, df2, left_on="C", right_on="D")

In [None]:
# Both frames also carry a non-key column C; suffixes= replaces the default
# "_x" / "_y" markers appended to such overlapping names.
pd.merge(
    df1,
    df2,
    on="A",
    suffixes=("_df1", "_df2"),
)

In [None]:
# An outer join keeps the keys from either side. indicator=True adds a
# "_merge" column recording whether each row came from the left frame only,
# the right frame only, or both.
pd.merge(
    df1,
    df2,
    on="A",
    how="outer",
    indicator=True,
)

In [None]:
# how="cross" pairs every df1 row with every df2 row (3 x 3 = 9 rows), so
# no key is given. Overlapping column names receive the suffixes.
pd.merge(
    df1,
    df2,
    how="cross",
    suffixes=("_df1", "_df2"),
)

In [None]:
# Join on the row labels rather than on a column. With how="outer" every
# label from either index (a, b, c, d) appears in the result.
pd.merge(
    df1,
    df2,
    how="outer",
    left_index=True,
    right_index=True,
)

In [None]:
import numpy as np

# With no on= argument the shared column "a" is used as the key. NaN keys
# are matched with each other here, unlike NULLs in a SQL join.
pd.merge(
    pd.DataFrame({"a": [np.nan, 1, 2]}),
    pd.DataFrame({"a": [1, 3, np.nan, np.nan]}),
)

### 重複したキーの検証

In [None]:
# Same layout as df1, except that rows "b" and "c" are identical, so the
# value 2 appears twice in column A.
df5 = pd.DataFrame(
    {
        "A": [1, 2, 2],
        "B": [10, 20, 20],
        "C": [100, 200, 200],
    },
    index=["a", "b", "c"],
)
df5

In [None]:
# validate="one_to_one" requires the merge keys to be unique on both sides.
# df5 contains A == 2 twice, so this call is expected to raise
# pandas.errors.MergeError -- the failure is the point of the example.
pd.merge(df1, df5, on="A", validate="one_to_one")

In [None]:
# "one_to_many" only requires unique keys in the left frame (df1), so the
# repeated A == 2 in df5 is accepted.
pd.merge(
    df1,
    df5,
    on="A",
    validate="one_to_many",
)

### 近い値での結合

In [None]:
rng = np.random.default_rng(1)


def generate_date_range(size):
    """Return ``size`` timestamps spaced roughly one second apart.

    Starts from a regular one-second grid beginning at 2022-01-01 and adds
    a random offset of 0-999 ms to every point. Each offset is below one
    second, so the result stays in increasing order.

    The offsets are drawn from the module-level ``rng``, so every call
    advances that generator.

    Parameters
    ----------
    size : int
        Number of timestamps to generate; ``0`` yields an empty Series.

    Returns
    -------
    pandas.Series
        Datetime values on a default integer index.
    """
    grid = pd.date_range("2022-01-01", freq="1s", periods=size)
    # Convert all millisecond offsets in one vectorised call instead of
    # building a Timedelta per element. The result is always timedelta-typed,
    # including when ``size`` is 0.
    jitter = pd.to_timedelta(rng.integers(0, 1000, size=size), unit="ms")
    return pd.Series(grid + jitter)


# Two 10-row frames of jittered timestamps plus random values.
# All four columns draw from the same shared `rng`, in this order:
# ts_df1 timestamps, ts_df1 data, ts_df2 timestamps, ts_df2 data.
# Reordering these statements (or the dict entries) would change the values.
ts_df1 = pd.DataFrame(
    {
        "timestamp": generate_date_range(10),
        "data": rng.random(size=10),
    }
)
ts_df2 = pd.DataFrame(
    {
        "timestamp": generate_date_range(10),
        "data": rng.random(size=10),
    }
)
# Render both frames to compare their slightly different timestamps.
display(ts_df1, ts_df2)

In [None]:
# Nearest-key join: each ts_df1 row is paired with the latest ts_df2 row
# whose timestamp is not after its own (the default backward direction),
# provided the gap is within the 2-second tolerance. merge_asof requires
# both frames to be sorted by the key, which holds for these timestamps.
# The "data" column exists on both sides, so it gets "_x" / "_y" suffixes.
pd.merge_asof(
    ts_df1,
    ts_df2,
    on="timestamp",
    tolerance=pd.Timedelta("2s"),
)