In [1]:
import numpy as np
import pandas as pd

#combine and merge datasets
#pd.merge: connects based on keys
#pd.concat: stack datasets along an axis
#combine_first : splice together overlapping data to fill in missing values in one object with
#values from another

In [2]:
#pd.merge / DataFrame.merge: database-style joins
#these are SQL-like operations, similar to the joins found in relational databases
#simple example: a frame whose 'key' column contains repeated values
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b']}).assign(
    data1=pd.Series(range(7), dtype='Int64')
)

In [3]:
#second frame: every 'key' value appears exactly once
df2 = pd.DataFrame({'key': ['a', 'b', 'd']}).assign(
    data2=pd.Series(range(3), dtype='Int64')
)

In [4]:
#display df1: seven rows, with repeated values in 'key'
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [5]:
#display df2: three rows, each 'key' value appears once
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,d,2


In [6]:
#note: the data1/data2 columns use the Int64 extension dtype
#many-to-one join: df1 has several rows per key value, df2 has a single row per key value
pd.merge(left=df1, right=df2)

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [7]:
#same join with how='inner' written out explicitly
pd.merge(left=df1, right=df2, how='inner')

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [8]:
#note: the earlier merges did not name the column to join on
#pandas then uses the overlapping column names as keys, but it is good practice to name the key explicitly
pd.merge(left=df1, right=df2, on='key')

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [9]:
#same join as the previous cell, with how='inner' written out as well
#naming the key with `on` is good practice; without it pandas uses the overlapping column names as keys
pd.merge(left=df1, right=df2, how='inner', on='key')

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [10]:
#df3/df4 hold the same values as df1/df2, but the key column is named differently on each side
#('lkey' vs 'rkey'), so a merge has to be told which column to use from each frame
df3 = pd.DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b']}).assign(
    data1=pd.Series(range(7), dtype='Int64')
)

df4 = pd.DataFrame({'rkey': ['a', 'b', 'd']}).assign(
    data2=pd.Series(range(3), dtype='Int64')
)

In [11]:
#display df3: same values as df1, but the key column is named 'lkey'
df3

Unnamed: 0,lkey,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [12]:
#display df4: same values as df2, but the key column is named 'rkey'
df4

Unnamed: 0,rkey,data2
0,a,0
1,b,1
2,d,2


In [13]:
#the key columns have different names, so each side's key is given separately (left_on / right_on)
#this is an inner join
#INNER JOIN: keys are the result of the intersection, or the common set found in both tables
#we keep the 'a' and 'b' values because they are found in both DataFrames;
#'c' (only in df3) and 'd' (only in df4) are dropped
pd.merge(df3, df4, left_on='lkey', right_on='rkey')

Unnamed: 0,lkey,data1,rkey,data2
0,b,0,b,1
1,b,1,b,1
2,b,6,b,1
3,a,2,a,0
4,a,4,a,0
5,a,5,a,0


In [14]:
#same as above, with how='inner' written out explicitly
#this is an inner join
#INNER JOIN: keys are the result of the intersection, or the common set found in both tables
#we keep the 'a' and 'b' values because they are found in both DataFrames;
#'c' (only in df3) and 'd' (only in df4) are dropped
pd.merge(df3, df4, left_on='lkey', right_on='rkey', how='inner')

Unnamed: 0,lkey,data1,rkey,data2
0,b,0,b,1
1,b,1,b,1
2,b,6,b,1
3,a,2,a,0
4,a,4,a,0
5,a,5,a,0


In [16]:
#other options: left join, right join, outer join
#outer join: takes the union of the keys, combining the effect of a left join and a right join
#keys present in only one frame ('c', 'd') are kept, with missing values in the other frame's column
#the join column is named explicitly rather than inferred from the overlapping column names
pd.merge(df1, df2, how='outer', on='key')

Unnamed: 0,key,data1,data2
0,b,0.0,1.0
1,b,1.0,1.0
2,b,6.0,1.0
3,a,2.0,0.0
4,a,4.0,0.0
5,a,5.0,0.0
6,c,3.0,
7,d,,2.0
