# データの連結／結合

## データの連結／結合方法と関数／メソッド

### DataFrameの連結

### DataFrameの結合

## concat() 関数によるDataFrameの連結

In [1]:
from IPython.display import display
import pandas as pd

# Two small sample frames that overlap only partly: they share the row labels
# "a" and "c" and the column names "A" and "C".
# Each dict key is a row label and each list is that row's values.
df1 = pd.DataFrame.from_dict(
    {"a": [1, 10, 100], "b": [2, 20, 200], "c": [3, 30, 300]},
    orient="index",
    columns=["A", "B", "C"],
)
df2 = pd.DataFrame.from_dict(
    {"a": [4, 10, 100], "c": [2, 20, 200], "d": [3, 30, 400]},
    orient="index",
    columns=["A", "C", "D"],
)
display(df1, df2)

Unnamed: 0,A,B,C
a,1,10,100
b,2,20,200
c,3,30,300


Unnamed: 0,A,C,D
a,4,10,100
c,2,20,200
d,3,30,400


In [2]:
# Stack df2's rows under df1's. Columns are matched by name, so B (df1 only)
# and D (df2 only) are NaN where absent; ignore_index=True renumbers the rows
# 0..5 instead of keeping the original labels.
pd.concat([df1, df2], ignore_index=True)

Unnamed: 0,A,B,C,D
0,1,10.0,100,
1,2,20.0,200,
2,3,30.0,300,
3,4,,10,100.0
4,2,,20,200.0
5,3,,30,400.0


In [3]:
# keys= tags each input frame: the result gets a two-level row index whose
# outer level is "data1"/"data2" and whose inner level is the original label.
labeled_df = pd.concat([df1, df2], keys=["data1", "data2"])
labeled_df

Unnamed: 0,Unnamed: 1,A,B,C,D
data1,a,1,10.0,100,
data1,b,2,20.0,200,
data1,c,3,30.0,300,
data2,a,4,,10,100.0
data2,c,2,,20,200.0
data2,d,3,,30,400.0


## concat() 関数によるDataFrameの結合

In [4]:
# axis=1 places the frames side by side, aligning rows by index label. Every
# label found in either frame (a-d) is kept, with NaN where a frame lacks it.
pd.concat([df1, df2], axis=1)

Unnamed: 0,A,B,C,A.1,C.1,D
a,1.0,10.0,100.0,4.0,10.0,100.0
b,2.0,20.0,200.0,,,
c,3.0,30.0,300.0,2.0,20.0,200.0
d,,,,3.0,30.0,400.0


In [5]:
# join="inner" keeps only the row labels present in both frames (a and c).
pd.concat([df1, df2], axis=1, join="inner")

Unnamed: 0,A,B,C,A.1,C.1,D
a,1,10,100,4,10,100
c,3,30,300,2,20,200


## join() メソッドによるDataFrameの結合

In [6]:
# Third sample frame: float columns D and E for row labels "a" and "b" only
# (it has no row "c").
df3 = pd.DataFrame(
    {
        "D": [1.3, 2.3],
        "E": [10.3, 20.3],
    },
    index=["a", "b"],
)
display(df1, df2, df3)

Unnamed: 0,A,B,C
a,1,10,100
b,2,20,200
c,3,30,300


Unnamed: 0,A,C,D
a,4,10,100
c,2,20,200
d,3,30,400


Unnamed: 0,D,E
a,1.3,10.3
b,2.3,20.3


In [7]:
# join() aligns on the row index and keeps every row of the caller (df1);
# row "c" has no counterpart in df3, so its D and E are NaN.
df1.join(df3)

Unnamed: 0,A,B,C,D,E
a,1,10,100,1.3,10.3
b,2,20,200,2.3,20.3
c,3,30,300,,


In [8]:
# df1 and df2 both have columns A and C, so lsuffix/rsuffix are supplied to
# tell the two copies apart in the result. how="inner" keeps only the row
# labels present in both frames (a and c).
df1.join(
    df2,
    how="inner",
    lsuffix="_df1",
    rsuffix="_df2",
)

Unnamed: 0,A_df1,B,C_df1,A_df2,C_df2,D
a,1,10,100,4,10,100
c,3,30,300,2,20,200


In [9]:
# Frame with a default integer index whose "key" column holds values
# ("a", "b") that also appear as row labels of df1.
df4 = pd.DataFrame(
    {
        "key": ["a", "b"],
        "F": [1.4, 2.4],
        "G": [10.4, 20.4],
    }
)
display(df4, df1)

Unnamed: 0,key,F,G
0,a,1.4,10.4
1,b,2.4,20.4


Unnamed: 0,A,B,C
a,1,10,100
b,2,20,200
c,3,30,300


In [10]:
# on="key" matches the values in df4's "key" column against df1's row index,
# so each df4 row picks up the A/B/C values of the df1 row with that label.
df4.join(df1, on="key")

Unnamed: 0,key,F,G,A,B,C
0,a,1.4,10.4,1,10,100
1,b,2.4,20.4,2,20,200


## merge() 関数によるDataFrameの結合

In [11]:
# Join on equal values of column A. Only A values found in both frames (2 and
# 3) produce rows; the other shared column, C, appears twice as C_x (from df1)
# and C_y (from df2). The result gets a fresh 0..n-1 index.
pd.merge(df1, df2, on="A")

Unnamed: 0,A,B,C_x,C_y,D
0,2,20,200,20,200
1,3,30,300,30,400


In [12]:
# how="left" keeps every row of df1; A=1 has no partner in df2, so its C_y and
# D are NaN.
pd.merge(df1, df2, on="A", how="left")

Unnamed: 0,A,B,C_x,C_y,D
0,1,10,100,,
1,2,20,200,20.0,200.0
2,3,30,300,30.0,400.0


In [13]:
# The key columns have different names: df1's C is matched against df2's D.
# The values 100 and 200 occur in both, giving two rows.
pd.merge(df1, df2, left_on="C", right_on="D")

Unnamed: 0,A_x,B,C_x,A_y,C_y,D
0,1,10,100,4,10,100
1,2,20,200,2,20,200


In [14]:
# suffixes= replaces the _x/_y markers on the overlapping non-key column C
# with names that say which frame each copy came from.
pd.merge(
    df1,
    df2,
    on="A",
    suffixes=("_df1", "_df2"),
)

Unnamed: 0,A,B,C_df1,C_df2,D
0,2,20,200,20,200
1,3,30,300,30,400


In [15]:
# how="outer" keeps the A values from both frames. indicator=True adds a
# _merge column recording whether each row was found in the left frame only,
# the right frame only, or both.
pd.merge(
    df1,
    df2,
    on="A",
    how="outer",
    indicator=True,
)

Unnamed: 0,A,B,C_x,C_y,D,_merge
0,1,10.0,100.0,,,left_only
1,2,20.0,200.0,20.0,200.0,both
2,3,30.0,300.0,30.0,400.0,both
3,4,,,10.0,100.0,right_only


In [16]:
# how="cross" pairs every row of df1 with every row of df2 (3 x 3 = 9 rows);
# no key column is given.
pd.merge(
    df1,
    df2,
    how="cross",
    suffixes=("_df1", "_df2"),
)

Unnamed: 0,A_df1,B,C_df1,A_df2,C_df2,D
0,1,10,100,4,10,100
1,1,10,100,2,20,200
2,1,10,100,3,30,400
3,2,20,200,4,10,100
4,2,20,200,2,20,200
5,2,20,200,3,30,400
6,3,30,300,4,10,100
7,3,30,300,2,20,200
8,3,30,300,3,30,400


In [17]:
# left_index/right_index=True join on the row labels instead of a column.
# With how="outer" every label from either frame (a-d) is kept.
pd.merge(
    df1,
    df2,
    how="outer",
    left_index=True,
    right_index=True,
)

Unnamed: 0,A_x,B,C_x,A_y,C_y,D
a,1.0,10.0,100.0,4.0,10.0,100.0
b,2.0,20.0,200.0,,,
c,3.0,30.0,300.0,2.0,20.0,200.0
d,,,,3.0,30.0,400.0


In [18]:
import numpy as np

# No key is named, so the merge uses the one column both frames share ("a").
# NaN keys are matched with each other: the single NaN on the left pairs with
# both NaNs on the right, giving two NaN rows besides the match on 1.
pd.merge(
    pd.DataFrame({"a": [np.nan, 1, 2]}),
    pd.DataFrame({"a": [1, 3, np.nan, np.nan]}),
)

Unnamed: 0,a
0,
1,
2,1.0


### 重複したキーの検証

In [19]:
# Same shape and labels as df1, but rows "b" and "c" are identical, so the
# value A=2 occurs twice. Each dict key is a row label.
df5 = pd.DataFrame.from_dict(
    {"a": [1, 10, 100], "b": [2, 20, 200], "c": [2, 20, 200]},
    orient="index",
    columns=["A", "B", "C"],
)
df5

Unnamed: 0,A,B,C
a,1,10,100
b,2,20,200
c,2,20,200


In [20]:
# validate="one_to_one" checks that the merge key is unique on both sides.
# df5 contains A=2 twice, so this call raises MergeError instead of merging.
pd.merge(df1, df5, on="A", validate="one_to_one")

MergeError: Merge keys are not unique in right dataset; not a one-to-one merge

In [21]:
# validate="one_to_many" only requires the key to be unique in the left frame
# (df1), so the duplicated A=2 in df5 is accepted and yields two result rows.
pd.merge(
    df1,
    df5,
    on="A",
    validate="one_to_many",
)

Unnamed: 0,A,B_x,C_x,B_y,C_y
0,1,10,100,10,100
1,2,20,200,20,200
2,2,20,200,20,200


### 近い値での結合

In [22]:
# Seeded generator so the sample data below is reproducible.
rng = np.random.default_rng(1)


def generate_date_range(size):
    """Return ``size`` jittered timestamps, one per second from 2022-01-01.

    Element ``i`` is ``2022-01-01 00:00:00`` plus ``i`` seconds plus a random
    whole number of milliseconds in ``[0, 1000)`` drawn from the module-level
    ``rng``. Because the jitter is always below one second, the values are
    strictly increasing.

    Args:
        size: Number of timestamps to generate.

    Returns:
        A ``pandas.Series`` of timestamps with a default 0..size-1 index.
    """
    seconds = pd.date_range("2022-01-01", freq="1s", periods=size)
    # Convert all offsets in one vectorized call rather than building a
    # Timedelta object per element.
    jitter = pd.to_timedelta(rng.integers(0, 1000, size=size), unit="ms")
    # Wrap in a Series so callers get positional (0..size-1) labels.
    return pd.Series(seconds + jitter)


# Build two sample frames the same way. Each pass draws the timestamps first
# and the data second, so the shared rng is consumed in the order:
# ts_df1 timestamps, ts_df1 data, ts_df2 timestamps, ts_df2 data.
ts_df1, ts_df2 = [
    pd.DataFrame(
        {
            "timestamp": generate_date_range(10),
            "data": rng.random(size=10),
        }
    )
    for _ in range(2)
]
display(ts_df1, ts_df2)

Unnamed: 0,timestamp,data
0,2022-01-01 00:00:00.473,0.423326
1,2022-01-01 00:00:01.511,0.827703
2,2022-01-01 00:00:02.755,0.409199
3,2022-01-01 00:00:03.950,0.549594
4,2022-01-01 00:00:04.034,0.027559
5,2022-01-01 00:00:05.144,0.753513
6,2022-01-01 00:00:06.822,0.538143
7,2022-01-01 00:00:07.948,0.329732
8,2022-01-01 00:00:08.249,0.788429
9,2022-01-01 00:00:09.311,0.303195


Unnamed: 0,timestamp,data
0,2022-01-01 00:00:00.124,0.750365
1,2022-01-01 00:00:01.453,0.280409
2,2022-01-01 00:00:02.976,0.485191
3,2022-01-01 00:00:03.134,0.980737
4,2022-01-01 00:00:04.383,0.961657
5,2022-01-01 00:00:05.403,0.72479
6,2022-01-01 00:00:06.903,0.541227
7,2022-01-01 00:00:07.203,0.276891
8,2022-01-01 00:00:08.502,0.160652
9,2022-01-01 00:00:09.262,0.969925


In [23]:
# For each ts_df1 row, attach the ts_df2 row with the closest timestamp that
# is not later than it (e.g. 00:00:02.755 pairs with 00:00:01.453, not
# 00:00:02.976), provided the gap is within the 2-second tolerance.
# merge_asof expects both frames to be sorted by the key column.
pd.merge_asof(
    ts_df1,
    ts_df2,
    on="timestamp",
    tolerance=pd.Timedelta("2s"),
)

Unnamed: 0,timestamp,data_x,data_y
0,2022-01-01 00:00:00.473,0.423326,0.750365
1,2022-01-01 00:00:01.511,0.827703,0.280409
2,2022-01-01 00:00:02.755,0.409199,0.280409
3,2022-01-01 00:00:03.950,0.549594,0.980737
4,2022-01-01 00:00:04.034,0.027559,0.980737
5,2022-01-01 00:00:05.144,0.753513,0.961657
6,2022-01-01 00:00:06.822,0.538143,0.72479
7,2022-01-01 00:00:07.948,0.329732,0.276891
8,2022-01-01 00:00:08.249,0.788429,0.276891
9,2022-01-01 00:00:09.311,0.303195,0.969925
