# ================ Merging & Joining ================

In [2]:
import pandas as pd
import numpy as np

## ----------- CONCAT -----------
> Concatenate pandas objects along a particular axis

> pandas.concat(objs, *, axis=0, join='outer', ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, sort=False, copy=None)

In [17]:
# First demo frame: columns A-D, rows labelled 0-3, each cell is
# "<column letter><row label>" (e.g. "C2").
df1 = pd.DataFrame(
    {col: [f"{col}{row}" for row in range(4)] for col in "ABCD"},
    index=list(range(4)),
)
df1

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


In [18]:
# Second demo frame: same columns as df1, rows labelled 4-7, so its row
# labels do not collide with df1's.
df2 = pd.DataFrame(
    {col: [f"{col}{row}" for row in range(4, 8)] for col in "ABCD"},
    index=list(range(4, 8)),
)
df2

Unnamed: 0,A,B,C,D
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7


In [19]:
# Third demo frame holding the values 8-11. No explicit index is passed, so
# the rows get the default labels 0-3, which repeat df1's row labels.
df3 = pd.DataFrame(
    {col: [f"{col}{num}" for num in range(8, 12)] for col in "ABCD"},
    # index=[8, 9, 10, 11],
)
df3

Unnamed: 0,A,B,C,D
0,A8,B8,C8,D8
1,A9,B9,C9,D9
2,A10,B10,C10,D10
3,A11,B11,C11,D11


In [20]:
# Stack the three frames on top of each other (row axis). The original row
# labels are kept, so df3's labels 0-3 appear a second time in the result.
frames = [df1, df2, df3]
result = pd.concat(objs=frames, axis=0)
result

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7
0,A8,B8,C8,D8
1,A9,B9,C9,D9


##### ❌❌ ValueError: cannot reindex on an axis with duplicate labels ❌❌
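> `reindex` cannot be applied along the row axis when that axis contains duplicate labels (here the concatenated result repeats labels 0–3)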

In [23]:
# Demonstrates the error named in the heading above: concatenating df1, df2
# and df3 repeats the row labels 0-3, and reindex() refuses to operate on an
# axis that contains duplicate labels. The exception is caught and printed so
# the notebook still runs from top to bottom.
frames = [df1, df2, df3]
try:
    result = pd.concat(frames).reindex(frames[0].index)
except ValueError as err:
    print(f"ValueError: {err}")

In [None]:
# Frame that only partly overlaps df1: it shares columns B and D (and adds F)
# and shares row labels 2 and 3 (and adds 6 and 7).
df4 = pd.DataFrame(
    {col: [f"{col}{row}" for row in (2, 3, 6, 7)] for col in "BDF"},
    index=[2, 3, 6, 7],
)
df4

Unnamed: 0,B,D,F
2,B2,D2,F2
3,B3,D3,F3
6,B6,D6,F6
7,B7,D7,F7


In [32]:
# Row-wise concat of two frames whose columns only partly match: the result
# carries the union of the columns, with missing values where a frame lacks one.
result = pd.concat(objs=[df1, df4])
result

Unnamed: 0,A,B,C,D,F
0,A0,B0,C0,D0,
1,A1,B1,C1,D1,
2,A2,B2,C2,D2,
3,A3,B3,C3,D3,
2,,B2,,D2,F2
3,,B3,,D3,F3
6,,B6,,D6,F6
7,,B7,,D7,F7


#### ignore_index: discard the original (possibly overlapping) index labels
> The result gets a new, continuous index instead of the inputs' labels

> If True, do not use the index values along the concatenation axis. The resulting axis will be labeled 0, …, n - 1

In [34]:
# Same row-wise concat, but the input row labels are dropped and the result
# is relabelled 0..n-1.
result = pd.concat(
    objs=[df1, df4],
    ignore_index=True,
)
result

Unnamed: 0,A,B,C,D,F
0,A0,B0,C0,D0,
1,A1,B1,C1,D1,
2,A2,B2,C2,D2,
3,A3,B3,C3,D3,
4,,B2,,D2,F2
5,,B3,,D3,F3
6,,B6,,D6,F6
7,,B7,,D7,F7


In [None]:
# Column-wise concat: the frames are placed side by side and rows are aligned
# on their index labels.
result = pd.concat([df1, df4], axis="columns")
result

Unnamed: 0,A,B,C,D,B.1,D.1,F
0,A0,B0,C0,D0,,,
1,A1,B1,C1,D1,,,
2,A2,B2,C2,D2,B2,D2,F2
3,A3,B3,C3,D3,B3,D3,F3
6,,,,,B6,D6,F6
7,,,,,B7,D7,F7


In [None]:
# Plain row-wise concat again, with the defaults spelled out
# (axis=0, join="outer") for comparison with the join="inner" cell below.
result = pd.concat([df1, df4], axis=0, join="outer")
result

Unnamed: 0,A,B,C,D,F
0,A0,B0,C0,D0,
1,A1,B1,C1,D1,
2,A2,B2,C2,D2,
3,A3,B3,C3,D3,
2,,B2,,D2,F2
3,,B3,,D3,F3
6,,B6,,D6,F6
7,,B7,,D7,F7


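##### ❓❓ JOIN ❓❓
> `join` controls the labels on the *other* axis (here the columns): `"outer"` (default) keeps the union of all inputs' labels, `"inner"` keeps only the labels shared by every input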

In [53]:
# Row-wise concat with join="inner": only columns present in every input
# frame are kept.
result = pd.concat(
    objs=[df1, df4],
    join="inner",
)
result

0
1
2
3
0
1


##### keys option:::: Construct hierarchical index using the passed keys as the outermost level

In [38]:
# One key per input frame; the keys become the outermost level of the row
# index, in front of each frame's original labels.
result = pd.concat(
    objs=[df1, df4],
    keys=["s1", "s2"],
)
result

  result = pd.concat([df1, df4], keys=['s1', 's2', 's3'])


Unnamed: 0,Unnamed: 1,A,B,C,D,F
s1,0,A0,B0,C0,D0,
s1,1,A1,B1,C1,D1,
s1,2,A2,B2,C2,D2,
s1,3,A3,B3,C3,D3,
s2,2,,B2,,D2,F2
s2,3,,B3,,D3,F3
s2,6,,B6,,D6,F6
s2,7,,B7,,D7,F7


In [43]:
# With column-wise concat the keys become the outermost level of the
# *column* index instead.
result = pd.concat(
    [df1, df4],
    axis="columns",
    keys=["s1", "s2"],
)
result

Unnamed: 0_level_0,s1,s1,s1,s1,s2,s2,s2
Unnamed: 0_level_1,A,B,C,D,B,D,F
0,A0,B0,C0,D0,,,
1,A1,B1,C1,D1,,,
2,A2,B2,C2,D2,B2,D2,F2
3,A3,B3,C3,D3,B3,D3,F3
6,,,,,B6,D6,F6
7,,,,,B7,D7,F7


##### names option:::: Names for the levels in the resulting hierarchical index

In [39]:
# Name both levels of the hierarchical row index: the key level and the
# level holding the original row labels.
result = pd.concat(
    objs=[df1, df4],
    keys=["s1", "s2"],
    names=["Series name", "Row ID"],
)
result

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D,F
Series name,Row ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
s1,0,A0,B0,C0,D0,
s1,1,A1,B1,C1,D1,
s1,2,A2,B2,C2,D2,
s1,3,A3,B3,C3,D3,
s2,2,,B2,,D2,F2
s2,3,,B3,,D3,F3
s2,6,,B6,,D6,F6
s2,7,,B7,,D7,F7


In [40]:
# Supplying a single name labels only the outer (key) level; the inner level
# stays unnamed.
result = pd.concat(
    objs=[df1, df4],
    keys=["s1", "s2"],
    names=["Series name"],
)
result

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D,F
Series name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
s1,0,A0,B0,C0,D0,
s1,1,A1,B1,C1,D1,
s1,2,A2,B2,C2,D2,
s1,3,A3,B3,C3,D3,
s2,2,,B2,,D2,F2
s2,3,,B3,,D3,F3
s2,6,,B6,,D6,F6
s2,7,,B7,,D7,F7


In [46]:
# Column-wise variant: the names apply to the two levels of the column index.
result = pd.concat(
    [df1, df4],
    axis="columns",
    keys=["s1", "s2"],
    names=["Series name", "Col_ID"],
)
result

Series name,s1,s1,s1,s1,s2,s2,s2
Col_ID,A,B,C,D,B,D,F
0,A0,B0,C0,D0,,,
1,A1,B1,C1,D1,,,
2,A2,B2,C2,D2,B2,D2,F2
3,A3,B3,C3,D3,B3,D3,F3
6,,,,,B6,D6,F6
7,,,,,B7,D7,F7


##### ❌❌ ValueError: Length of names must match number of levels in MultiIndex ❌❌

In [42]:
# Demonstrates the error named in the heading above: with keys=[...] the row
# index has two levels (the key and the original row label), so a list of
# three names cannot be matched to the levels. The exception is caught and
# printed so the notebook still runs from top to bottom.
try:
    result = pd.concat([df1, df4], keys=['s1', 's2'],
                       names=['Series name', 'Row ID', 'try'])
except ValueError as err:
    print(f"ValueError: {err}")

In [47]:
# A small frame with the default row labels 0-1, placed side by side with df1.
# It is bound to its own name instead of re-using `df4`: the reindex cells
# further down still need the B/D/F frame (their outputs show columns B, D, F
# and row labels 6 and 7), and rebinding `df4` here would silently change
# what they compute when the notebook is run from top to bottom.
df_animals = pd.DataFrame([['bird', 'polly'], ['monkey', 'george']],
                          columns=['animal', 'name'])
pd.concat([df1, df_animals], axis=1)

Unnamed: 0,A,B,C,D,animal,name
0,A0,B0,C0,D0,bird,polly
1,A1,B1,C1,D1,monkey,george
2,A2,B2,C2,D2,,
3,A3,B3,C3,D3,,


##### reindex()
> Conforms the result to the given index: only the specified labels are kept, and labels that are not present in the data become all-NaN rows

In [30]:
# Column-wise concat, then keep only the rows whose labels are in df1's index.
result = pd.concat([df1, df4], axis="columns").reindex(index=df1.index)
result

Unnamed: 0,A,B,C,D,B.1,D.1,F
0,A0,B0,C0,D0,,,
1,A1,B1,C1,D1,,,
2,A2,B2,C2,D2,B2,D2,F2
3,A3,B3,C3,D3,B3,D3,F3


In [31]:
# Column-wise concat, then conform the rows to df2's index; labels from
# df2.index that the concat result lacks come back as all-NaN rows.
result = pd.concat([df1, df4], axis="columns").reindex(index=df2.index)
result

Unnamed: 0,A,B,C,D,B.1,D.1,F
4,,,,,,,
5,,,,,,,
6,,,,,B6,D6,F6
7,,,,,B7,D7,F7


## ----------- MERGE -----------
> Similar to SQL joins

> --> Works on common columns or indexes

> pandas.merge(left, right, how='inner', on=None, left_on=None, right_on=None, left_index=False, right_index=False, sort=False, suffixes=('_x', '_y'), copy=None, indicator=False, validate=None)

In [96]:
# Left-hand table for the merge examples: IDs 1-3 with a name each.
df_left = pd.DataFrame(dict(ID=[1, 2, 3], Name=list("ABC")))
df_left

Unnamed: 0,ID,Name
0,1,A
1,2,B
2,3,C


In [97]:
# Right-hand table for the merge examples: IDs 1, 2 and 4 with a score each
# (ID 3 is absent here, ID 4 is absent from df_left).
df_right = pd.DataFrame(dict(ID=[1, 2, 4], Score=[90, 85, 88]))
df_right

Unnamed: 0,ID,Score
0,1,90
1,2,85
2,4,88


In [56]:
# Inner join (the default `how`) on the shared "ID" column: only IDs present
# in both tables survive.
pd.merge(left=df_left, right=df_right, on="ID")

Unnamed: 0,ID,Name,Score
0,1,A,90
1,2,B,85


In [None]:
# Self join: the table is merged with itself on "ID"; the clashing "Name"
# column gets the default _x / _y suffixes.
pd.merge(left=df_left, right=df_left, on="ID")

Unnamed: 0,ID,Name_x,Name_y
0,1,A,A
1,2,B,B
2,3,C,C


In [57]:
# Left join: keep every row of df_left; IDs without a match in df_right get
# a missing Score.
pd.merge(
    left=df_left,
    right=df_right,
    how="left",
    on="ID",
)

Unnamed: 0,ID,Name,Score
0,1,A,90.0
1,2,B,85.0
2,3,C,


In [58]:
# Right join: keep every row of df_right; IDs without a match in df_left get
# a missing Name.
pd.merge(
    left=df_left,
    right=df_right,
    how="right",
    on="ID",
)

Unnamed: 0,ID,Name,Score
0,1,A,90
1,2,B,85
2,4,,88


In [59]:
# Outer join: keep the IDs from both tables, filling whichever side is
# missing with NaN.
pd.merge(
    left=df_left,
    right=df_right,
    how="outer",
    on="ID",
)

Unnamed: 0,ID,Name,Score
0,1,A,90.0
1,2,B,85.0
2,3,C,
3,4,,88.0


##### indicator=True ::::: Indicator Column
> Shows where each row came from

In [98]:
# Outer join with indicator=True: an extra `_merge` column records whether
# each row came from both tables, the left only, or the right only.
pd.merge(
    left=df_left,
    right=df_right,
    how="outer",
    on="ID",
    indicator=True,
)

Unnamed: 0,ID,Name,Score,_merge
0,1,A,90.0,both
1,2,B,85.0,both
2,3,C,,left_only
3,4,,88.0,right_only


##### Merge on multiple columns

In [114]:
# Employee table for the multi-column merge: IDs 1-5, two HR rows followed by
# three IT rows, salaries 100-500 in steps of 100.
df1 = pd.DataFrame(
    {
        "ID": list(range(1, 6)),
        "Dept": ["HR"] * 2 + ["IT"] * 3,
        "Salary": [100 * step for step in range(1, 6)],
    }
)
df1

Unnamed: 0,ID,Dept,Salary
0,1,HR,100
1,2,HR,200
2,3,IT,300
3,4,IT,400
4,5,IT,500


In [115]:
# Bonus table for the multi-column merge: IDs 1-4; note that ID 4 is listed
# under "Finance" here but under "IT" in df1.
df2 = pd.DataFrame(
    {
        "ID": list(range(1, 5)),
        "Dept": ["HR", "HR", "IT", "Finance"],
        "Bonus": [10 * step for step in range(1, 5)],
    }
)
df2

Unnamed: 0,ID,Dept,Bonus
0,1,HR,10
1,2,HR,20
2,3,IT,30
3,4,Finance,40


In [112]:
# Inner join on the (ID, Dept) pair: a row matches only when both columns
# agree in the two tables.
pd.merge(
    left=df1,
    right=df2,
    how="inner",
    on=["ID", "Dept"],
)

Unnamed: 0,ID,Dept,Salary,Bonus
0,1,HR,100,10
1,2,HR,200,20
2,3,IT,300,30


❓❓❓❓

##### validate="..." ::::: Merge with Validation
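> Checks that the merge keys really have the stated relationship (one-to-one, many-to-one, etc.) and raises `MergeError` if they do not; the short and long spellings below are equivalent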
- "1:1"
- "1:m"
- "m:1"
- "m:m"
- "one_to_one"
- "one_to_many"
- "many_to_one"
- "many_to_many"

In [120]:
# Merge on "ID" while asking pandas to verify that the key is unique in both
# tables (one-to-one relationship).
pd.merge(
    left=df1,
    right=df2,
    on="ID",
    validate="one_to_one",
)

Unnamed: 0,ID,Dept_x,Salary,Dept_y,Bonus
0,1,HR,100,HR,10
1,2,HR,200,HR,20
2,3,IT,300,IT,30
3,4,IT,400,Finance,40


In [124]:
# Same merge with validate="m:m" (many-to-many), the most permissive
# relationship type.
pd.merge(
    left=df1,
    right=df2,
    on="ID",
    validate="m:m",
)

Unnamed: 0,ID,Dept_x,Salary,Dept_y,Bonus
0,1,HR,100,HR,10
1,2,HR,200,HR,20
2,3,IT,300,IT,30
3,4,IT,400,Finance,40


##### suffixes: suffixes appended to overlapping (non-key) column names from the left and right frames

In [81]:
# Left table for the suffixes example: ids 1-3 with a "value" column.
df1 = pd.DataFrame(dict(id=[1, 2, 3], value=list("ABC")))
df1

Unnamed: 0,id,value
0,1,A
1,2,B
2,3,C


In [82]:
# Right table for the suffixes example: ids 2-4 with its own "value" column,
# so the non-key column name clashes with df1's.
df2 = pd.DataFrame(dict(id=[2, 3, 4], value=list("XYZ")))
df2

Unnamed: 0,id,value
0,2,X
1,3,Y
2,4,Z


In [83]:
# Both tables have a "value" column; the custom suffixes replace the default
# _x / _y when the clashing columns are renamed.
pd.merge(
    left=df1,
    right=df2,
    on="id",
    suffixes=("_left", "_right"),
)

Unnamed: 0,id,value_left,value_right
0,2,B,X
1,3,C,Y


##### left_on / right_on: merging when the key columns have different names

In [67]:
# Same scores as df_right, but the key column is called "num" instead of "ID".
df_right2 = pd.DataFrame(dict(num=[1, 2, 4], Score=[90, 85, 88]))
df_right2

Unnamed: 0,num,Score
0,1,90
1,2,85
2,4,88


In [69]:
# Outer join where the key is "ID" on the left and "num" on the right; both
# key columns are kept in the result.
pd.merge(
    left=df_left,
    right=df_right2,
    how="outer",
    left_on="ID",
    right_on="num",
)

Unnamed: 0,ID,Name,num,Score
0,1.0,A,1.0,90.0
1,2.0,B,2.0,85.0
2,3.0,C,,
3,,,4.0,88.0


## ----------- JOIN -----------
> Shortcut for merging on index

In [84]:
# Left table for the index-based join: names keyed by row labels 1-3.
df1 = pd.DataFrame(data={"Name": list("ABC")}, index=[1, 2, 3])
df1

Unnamed: 0,Name
1,A
2,B
3,C


In [85]:
# Right table for the index-based join: scores keyed by row labels 2-4.
df2 = pd.DataFrame(data={"Score": [90, 80, 70]}, index=[2, 3, 4])
df2

Unnamed: 0,Score
2,90
3,80
4,70


##### left_index=True / right_index=True: use the index (row labels) as the join key instead of a column

In [86]:
# Left join that matches rows on the index of both frames rather than on a
# column.
pd.merge(
    df1,
    df2,
    how="left",
    left_index=True,
    right_index=True,
)

Unnamed: 0,Name,Score
1,A,
2,B,90.0
3,C,80.0


In [87]:
# Default left join on index
df1.join(df2)

Unnamed: 0,Name,Score
1,A,
2,B,90.0
3,C,80.0


In [74]:
# Outer join on index
df1.join(df2, how="outer")

Unnamed: 0,Name,Score
1,A,
2,B,90.0
3,C,80.0
4,,70.0


## ----------- COMBINE -----------
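> `combine_first` fills the missing (NaN) values of the calling DataFrame with the values found at the same row/column labels in another DataFrame.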

In [89]:
# Frame with gaps: A is missing in row 1 and B is missing in row 2.
df1 = pd.DataFrame(
    data={
        "A": [1, None, 3],
        "B": [4, 5, None],
    }
)
df1

Unnamed: 0,A,B
0,1.0,4.0
1,,5.0
2,3.0,


In [92]:
# Frame that supplies the fill values: A is missing in row 0, B is complete.
df2 = pd.DataFrame(
    data={
        "A": [None, 20, 30],
        "B": [40, 50, 60],
    }
)
df2

Unnamed: 0,A,B
0,,40
1,20.0,50
2,30.0,60


In [94]:
# Keep df1's values wherever they exist and take df2's value only where df1
# has a gap.
df1.combine_first(other=df2)

Unnamed: 0,A,B
0,1.0,4.0
1,20.0,5.0
2,3.0,60.0
