In [1]:
import numpy as np
import pandas as pd

#combine and merge datasets
#pd.merge: connects based on keys
#pd.concat: stack datasets along an axis
#combine_first: splice together overlapping data to fill in missing values in one object with
#values from another

In [2]:
#pd.merge / DataFrame.merge: database-style joins
#these are SQL-like operations, similar to the joins found in relational databases
#left-hand table for the first examples; note that the key values repeat
df1 = pd.DataFrame(
    {
        'key': list('bbacaab'),
        'data1': pd.Series(range(7), dtype='Int64'),
    }
)

In [3]:
#right-hand table: every key value occurs exactly once
df2 = pd.DataFrame(
    {
        'key': list('abd'),
        'data2': pd.Series(range(3), dtype='Int64'),
    }
)

In [4]:
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [5]:
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,d,2


In [6]:
#Note: we are using the Int64 extension dtype
#many to one join
pd.merge(df1, df2)

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [7]:
pd.merge(df1, df2, how='inner') 

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [8]:
#note: the previous cells did not specify the column on which we are joining
#it is good to specify it explicitly (as done here with on='key'); otherwise pandas uses the
#overlapping column names as keys
pd.merge(df1, df2, on='key')

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [9]:
#SAME AS ABOVE, with the default how='inner' spelled out as well
#note: the earlier cells did not specify the column on which we are joining
#it is good to specify it explicitly; otherwise pandas uses the overlapping column names as keys
pd.merge(df1, df2, how='inner', on='key')

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [10]:
#In general the order of column output in pd.merge operations is unspecified.
#When the key column is named differently in each object ('lkey' vs 'rkey'), the join columns
#have to be named separately for each side via left_on / right_on.
df3 = pd.DataFrame(
    {
        'lkey': list('bbacaab'),
        'data1': pd.Series(range(7), dtype='Int64'),
    }
)

df4 = pd.DataFrame(
    {
        'rkey': list('abd'),
        'data2': pd.Series(range(3), dtype='Int64'),
    }
)

In [11]:
df3

Unnamed: 0,lkey,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [12]:
df4

Unnamed: 0,rkey,data2
0,a,0
1,b,1
2,d,2


In [13]:
pd.merge(df3, df4, left_on='lkey', right_on='rkey')
#we keep only the 'a' and 'b' values because they are the keys found in both DataFrames
#('c' and 'd' each appear in just one of them and are dropped)
#this is an inner join
#INNER JOIN: keys are the result of the intersection, or the common set found in both tables

Unnamed: 0,lkey,data1,rkey,data2
0,b,0,b,1
1,b,1,b,1
2,b,6,b,1
3,a,2,a,0
4,a,4,a,0
5,a,5,a,0


In [14]:
#same as above, with how='inner' spelled out
pd.merge(df3, df4, left_on='lkey', right_on='rkey', how='inner')
#we keep only the 'a' and 'b' values because they are the keys found in both DataFrames
#('c' and 'd' each appear in just one of them and are dropped)
#this is an inner join
#INNER JOIN: keys are the result of the intersection, or the common set found in both tables

Unnamed: 0,lkey,data1,rkey,data2
0,b,0,b,1
1,b,1,b,1
2,b,6,b,1
3,a,2,a,0
4,a,4,a,0
5,a,5,a,0


In [15]:
#other options: left join, right join, outer join
#outer join: union of the keys, i.e. the combined effect of applying both the left join
#and the right join
pd.merge(df1, df2, how='outer')

Unnamed: 0,key,data1,data2
0,b,0.0,1.0
1,b,1.0,1.0
2,b,6.0,1.0
3,a,2.0,0.0
4,a,4.0,0.0
5,a,5.0,0.0
6,c,3.0,
7,d,,2.0


In [16]:
#the same outer join, with the differently named key columns given via left_on / right_on
pd.merge(df3, df4, left_on='lkey', right_on='rkey', how='outer')

Unnamed: 0,lkey,data1,rkey,data2
0,b,0.0,b,1.0
1,b,1.0,b,1.0
2,b,6.0,b,1.0
3,a,2.0,a,0.0
4,a,4.0,a,0.0
5,a,5.0,a,0.0
6,c,3.0,,
7,,,d,2.0


In [17]:
#many-to-many merges: form the Cartesian product of the rows for each matching key
df1 = pd.DataFrame(
    {
        'key': list('bbacab'),
        'data1': pd.Series(range(6), dtype='Int64'),
    }
)

In [18]:
#right-hand table for the many-to-many case: 'a' and 'b' each occur twice
df2 = pd.DataFrame(
    {
        'key': list('ababd'),
        'data2': pd.Series(range(5), dtype='Int64'),
    }
)

In [19]:
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [20]:
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,a,2
3,b,3
4,d,4


In [21]:
pd.merge(df1, df2, on='key', how='left')

Unnamed: 0,key,data1,data2
0,b,0,1.0
1,b,0,3.0
2,b,1,1.0
3,b,1,3.0
4,a,2,0.0
5,a,2,2.0
6,c,3,
7,a,4,0.0
8,a,4,2.0
9,b,5,1.0


In [22]:
#since there were three "b" rows in the left DataFrame and two in the right one, there are six "b"
#rows in the result. The join method passed to the "how" keyword argument
#affects only the key values that appear in the result
pd.merge(df1, df2, how='inner')

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,0,3
2,b,1,1
3,b,1,3
4,b,5,1
5,b,5,3
6,a,2,0
7,a,2,2
8,a,4,0
9,a,4,2


In [23]:
#NOTE(review): an outer join keeps unmatched keys from both sides, so this result should also
#contain a 'c' row (data2 missing) and a 'd' row (data1 missing); the saved output below lacks
#them - re-run this cell to refresh it.
pd.merge(df1, df2, how='outer')

Unnamed: 0,key,data1,data2
0,b,0.0,1.0
1,b,0.0,3.0
2,b,1.0,1.0
3,b,1.0,3.0
4,b,5.0,1.0
5,b,5.0,3.0
6,a,2.0,0.0
7,a,2.0,2.0
8,a,4.0,0.0
9,a,4.0,2.0


In [24]:
#alternative way to request the nullable integer dtype: pass pd.Int64Dtype() instead of the
#'Int64' string
pd.DataFrame({'key1': ['foo', 'foo', 'bar'],
                    'key2': ['one', 'two', 'one'],
                    'key3': pd.Series([1, 2, 3], dtype=pd.Int64Dtype())})

Unnamed: 0,key1,key2,key3
0,foo,one,1
1,foo,two,2
2,bar,one,3


In [25]:
#another alternative: generate the values with np.arange and convert them with pd.Int64Dtype()
pd.DataFrame({'key1': ['foo', 'foo', 'bar'],
                    'key2': ['one', 'two', 'one'],
                    'key3': pd.Series(np.arange(1, 4), dtype=pd.Int64Dtype())})

Unnamed: 0,key1,key2,key3
0,foo,one,1
1,foo,two,2
2,bar,one,3


In [26]:
#construction used in the book
#to merge with multiple keys, pass a list of column names
left = pd.DataFrame(
    data={
        'key1': ['foo', 'foo', 'bar'],
        'key2': ['one', 'two', 'one'],
        'lval': pd.Series([1, 2, 3], dtype='Int64'),
    }
)

In [27]:
left

Unnamed: 0,key1,key2,lval
0,foo,one,1
1,foo,two,2
2,bar,one,3


In [28]:
#right-hand table; 'rval' uses the same nullable Int64 dtype as 'lval'
right = pd.DataFrame(
    data={
        'key1': ['foo', 'foo', 'bar', 'bar'],
        'key2': ['one', 'one', 'one', 'two'],
        'rval': pd.Series([4, 5, 6, 7], dtype='Int64'),
    }
)

In [29]:
right

Unnamed: 0,key1,key2,rval
0,foo,one,4
1,foo,one,5
2,bar,one,6
3,bar,two,7


In [30]:
#merge with multiple keys
#union of the two dfs: OUTER JOIN
pd.merge(left, right, on=['key1', 'key2'], how='outer')

Unnamed: 0,key1,key2,lval,rval
0,foo,one,1.0,4.0
1,foo,one,1.0,5.0
2,foo,two,2.0,
3,bar,one,3.0,6.0
4,bar,two,,7.0


In [31]:
#intersection of the two dfs: INNER JOIN
pd.merge(left, right, on=['key1', 'key2'], how='inner')

Unnamed: 0,key1,key2,lval,rval
0,foo,one,1,4
1,foo,one,1,5
2,bar,one,3,6


In [32]:
#think of the multiple keys as forming an array of tuples that is used as a single join key
#when joining columns on columns, the index of the passed DataFrames is discarded;
#use reset_index to turn the index into a column if we need to preserve it
pd.merge(left, right, on=['key1', 'key2'], how='outer').reset_index()
#this does not preserve anything: it only numbers the merged rows. To keep the original
#row labels we have to reset the index on each input DataFrame before merging

Unnamed: 0,index,key1,key2,lval,rval
0,0,foo,one,1.0,4.0
1,1,foo,one,1.0,5.0
2,2,foo,two,2.0,
3,3,bar,one,3.0,6.0
4,4,bar,two,,7.0


In [33]:
#reset_index on each input turns its index into an 'index' column before the merge;
#both inputs then have an 'index' column, so merge suffixes them as index_x (left) and index_y (right)
test0 = pd.merge(left.reset_index(), right.reset_index(), on=['key1', 'key2'], how='outer')
#reorder the columns so that the two preserved indexes come first
test1 = test0[['index_x','index_y', 'key1', 'key2', 'lval',  'rval']]
test1

Unnamed: 0,index_x,index_y,key1,key2,lval,rval
0,0.0,0.0,foo,one,1.0,4.0
1,0.0,1.0,foo,one,1.0,5.0
2,1.0,,foo,two,2.0,
3,2.0,2.0,bar,one,3.0,6.0
4,,3.0,bar,two,,7.0


In [34]:
#Final issue: How do we treat overlapping column names
pd.merge(left, right, on='key1')

Unnamed: 0,key1,key2_x,lval,key2_y,rval
0,foo,one,1,one,4
1,foo,one,1,one,5
2,foo,two,2,one,4
3,foo,two,2,one,5
4,bar,one,3,one,6
5,bar,one,3,two,7


In [35]:
#pandas.merge has a suffixes option for specifying strings to append to overlapping names
#on the left and right DataFrame.
pd.merge(left, right, on='key1', suffixes=('_left', '_right'))

Unnamed: 0,key1,key2_left,lval,key2_right,rval
0,foo,one,1,one,4
1,foo,one,1,one,5
2,foo,two,2,one,4
3,foo,two,2,one,5
4,bar,one,3,one,6
5,bar,one,3,two,7


In [36]:
#see table 8.2 (pg. 258) for argument reference on pd.merge
#here are some examples. 
#default = inner join
pd.merge(left, right)

Unnamed: 0,key1,key2,lval,rval
0,foo,one,1,4
1,foo,one,1,5
2,bar,one,3,6


In [37]:
#see table 8.2 (pg. 258) for argument reference on pd.merge
#here are some examples. 
pd.merge(left, right, how='inner')

Unnamed: 0,key1,key2,lval,rval
0,foo,one,1,4
1,foo,one,1,5
2,bar,one,3,6


In [38]:
pd.merge(left, right, how='outer')

Unnamed: 0,key1,key2,lval,rval
0,foo,one,1.0,4.0
1,foo,one,1.0,5.0
2,foo,two,2.0,
3,bar,one,3.0,6.0
4,bar,two,,7.0


In [39]:
pd.merge(left, right, how='left')

Unnamed: 0,key1,key2,lval,rval
0,foo,one,1,4.0
1,foo,one,1,5.0
2,foo,two,2,
3,bar,one,3,6.0


In [40]:
pd.merge(left, right, how='right')

Unnamed: 0,key1,key2,lval,rval
0,foo,one,1.0,4
1,foo,one,1.0,5
2,bar,one,3.0,6
3,bar,two,,7


In [41]:
#Merging on index: the merge key(s) can live in a DataFrame's index instead of a column
#pass left_index=True to use the left index as the join key
#pass right_index=True to use the right index as the join key
left1 = pd.DataFrame(
    {
        'key': list('abaabc'),
        'value': pd.Series(range(6), dtype='Int64'),
    }
)

In [42]:
#lookup table whose join key is its index ('a', 'b') rather than a column
right1 = pd.DataFrame(
    data={'group_val': [3.5, 7]},
    index=list('ab'),
)

In [43]:
left1

Unnamed: 0,key,value
0,a,0
1,b,1
2,a,2
3,a,3
4,b,4
5,c,5


In [44]:
right1

Unnamed: 0,group_val
a,3.5
b,7.0


In [45]:
pd.merge(left1, right1, left_on='key', right_index=True)

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0


In [46]:
#the above is technically an inner join, like so:
pd.merge(left1, right1, left_on='key', right_index=True, how='inner')
#the result keeps left1's original index labels (0, 2, 3, 1, 4) for the rows that matched

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0


In [47]:
#now an outer join using union of the keys
pd.merge(left1, right1, left_on='key', right_index=True, how='outer')

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0
5,c,5,


In [48]:
#hierarchically indexed data: the left table carries the two key levels as ordinary columns
lefth = pd.DataFrame(
    {
        'key1': ['Ohio'] * 3 + ['Nevada'] * 2,
        'key2': [2000, 2001, 2002, 2001, 2002],
        'data': pd.Series(range(5), dtype='Int64'),
    }
)

In [49]:
#two-level (state, year) row index for the right-hand table
righth_index = pd.MultiIndex.from_arrays(
    [
        ['Nevada'] * 2 + ['Ohio'] * 4,
        [2001, 2000, 2000, 2000, 2001, 2002],
    ]
)

In [50]:
#both event columns are built on righth_index, so righth gets that two-level row index;
#note that the label (Ohio, 2000) occurs twice in it
righth = pd.DataFrame({'event1': pd.Series([0, 2, 4, 6, 8, 10], dtype='Int64',
                                          index=righth_index),
                       'event2': pd.Series([1, 3, 5, 7, 9, 11], dtype='Int64',
                                          index=righth_index)})

#display the left table for comparison
lefth

Unnamed: 0,key1,key2,data
0,Ohio,2000,0
1,Ohio,2001,1
2,Ohio,2002,2
3,Nevada,2001,3
4,Nevada,2002,4


In [51]:
righth

Unnamed: 0,Unnamed: 1,event1,event2
Nevada,2001,0,1
Nevada,2000,2,3
Ohio,2000,4,5
Ohio,2000,6,7
Ohio,2001,8,9
Ohio,2002,10,11


In [52]:
#Indicate the multiple columns that we will be merging on as a list.
#Duplicate index values are handled too: (Ohio, 2000) occurs twice in righth, giving two rows.
#This call uses the default inner join; the next cell passes how='outer' to keep unmatched keys.
pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True)

Unnamed: 0,key1,key2,data,event1,event2
0,Ohio,2000,0,4,5
0,Ohio,2000,0,6,7
1,Ohio,2001,1,8,9
2,Ohio,2002,2,10,11
3,Nevada,2001,3,0,1


In [53]:
pd.merge(lefth, righth, left_on=['key1', 'key2'],
         right_index=True, how='outer')

Unnamed: 0,key1,key2,data,event1,event2
0,Ohio,2000,0.0,4.0,5.0
0,Ohio,2000,0.0,6.0,7.0
1,Ohio,2001,1.0,8.0,9.0
2,Ohio,2002,2.0,10.0,11.0
3,Nevada,2001,3.0,0.0,1.0
4,Nevada,2002,4.0,,
4,Nevada,2000,,2.0,3.0


In [54]:
#We can also merge using the indexes of both DataFrames.
left2 = pd.DataFrame(
    data=[[1., 2.], [3., 4.], [5., 6.]],
    index=list('ace'),
    columns=['Ohio', 'Nevada'],
).astype('Int64')

In [55]:
#second table, indexed by letters that only partly overlap with left2's index
right2 = pd.DataFrame(
    data=[[7., 8.], [9., 10.], [11., 12.], [13., 14.]],
    index=list('bcde'),
    columns=['Missouri', 'Alabama'],
).astype('Int64')

In [56]:
left2

Unnamed: 0,Ohio,Nevada
a,1,2
c,3,4
e,5,6


In [57]:
right2

Unnamed: 0,Missouri,Alabama
b,7,8
c,9,10
d,11,12
e,13,14


In [58]:
pd.merge(left2, right2, how='outer', left_index=True, right_index=True)

Unnamed: 0,Ohio,Nevada,Missouri,Alabama
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


In [59]:
#df.join is an instance method
#works on DataFrames with same or similar indexes but columns that do not overlap
left2.join(right2, how='outer') #same thing as above

Unnamed: 0,Ohio,Nevada,Missouri,Alabama
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


In [60]:
#df.join performs a left join on the join keys by default
#left1's index is 0..5 while right2's index is the letters b..e, so no labels match
left1.join(right2, how='left') #all the right2 values are <NA>

Unnamed: 0,key,value,Missouri,Alabama
0,a,0,,
1,b,1,,
2,a,2,,
3,a,3,,
4,b,4,,
5,c,5,,


In [61]:
#example 2:
left1.join(right2, how='right') #all the left1 values are <NA>

Unnamed: 0,key,value,Missouri,Alabama
b,,,7,8
c,,,9,10
d,,,11,12
e,,,13,14


In [62]:
#example 3: inner join
left1.join(right2, how='inner') #none of the keys are in common so the DataFrame is blank

Unnamed: 0,key,value,Missouri,Alabama


In [63]:
#we can also match a column of the caller against the index of the other DataFrame
#by passing the on argument
#here we join data INTO the object whose join method was called
left1.join(right1, on='key')

Unnamed: 0,key,value,group_val
0,a,0,3.5
1,b,1,7.0
2,a,2,3.5
3,a,3,3.5
4,b,4,7.0
5,c,5,


In [64]:
#simple index-on-index merge
#a list of DataFrames can be passed to join; this is the third table for that example
another = pd.DataFrame(
    data=[[7., 8.], [9., 10.], [11., 12.], [16., 17.]],
    index=list('acef'),
    columns=['New York', 'Oregon'],
)

In [65]:
another

Unnamed: 0,New York,Oregon
a,7.0,8.0
c,9.0,10.0
e,11.0,12.0
f,16.0,17.0


In [66]:
right2

Unnamed: 0,Missouri,Alabama
b,7,8
c,9,10
d,11,12
e,13,14


In [67]:
left2

Unnamed: 0,Ohio,Nevada
a,1,2
c,3,4
e,5,6


In [68]:
#left join (the default for join): all of left2's index labels (a, c, e) are kept
left2.join([right2, another])

Unnamed: 0,Ohio,Nevada,Missouri,Alabama,New York,Oregon
a,1,2,,,7.0,8.0
c,3,4,9.0,10.0,9.0,10.0
e,5,6,13.0,14.0,11.0,12.0


In [69]:
left2.join([right2, another], how='outer') #outer join (union of the keys)

Unnamed: 0,Ohio,Nevada,Missouri,Alabama,New York,Oregon
a,1.0,2.0,,,7.0,8.0
c,3.0,4.0,9.0,10.0,9.0,10.0
e,5.0,6.0,13.0,14.0,11.0,12.0
b,,,7.0,8.0,,
d,,,11.0,12.0,,
f,,,,,16.0,17.0


In [70]:
#left join
left2.join([right2, another], how='left')

Unnamed: 0,Ohio,Nevada,Missouri,Alabama,New York,Oregon
a,1,2,,,7.0,8.0
c,3,4,9.0,10.0,9.0,10.0
e,5,6,13.0,14.0,11.0,12.0


In [71]:
#inner join: only the labels present in all three DataFrames (c, e) remain
left2.join([right2, another], how='inner') #inner join, intersection of the keys

Unnamed: 0,Ohio,Nevada,Missouri,Alabama,New York,Oregon
c,3,4,9,10,9.0,10.0
e,5,6,13,14,11.0,12.0


In [72]:
#concatenating along an axis, i.e. stacking
np.arange(12)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [73]:
np.arange(12).reshape((3, 4))

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [74]:
arr = np.arange(12).reshape((3, 4))
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [75]:
#stack the NumPy arrays horizontally (side by side)
np.concatenate([arr, arr], axis=1)

array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])

In [76]:
#stack the NumPy arrays vertically (one on top of the other)
np.concatenate([arr, arr], axis=0)

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [77]:
#not passing an axis does the same thing, because axis=0 is the default
#stack the NumPy arrays vertically
np.concatenate([arr, arr])

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [78]:
#pd.concat function: questions to settle before concatenating
#1) if the objects are indexed differently on the other axes, do we combine the distinct
#elements (union) or use only the values in common (intersection)?
#2) do the concatenated chunks of data need to be identifiable (labelled) in the result?
#3) does the concatenation axis contain data that needs to be preserved? Often the default
#integer index labels are best discarded.
#let's start with three Series that have no overlapping index labels
s1 = pd.Series(data=[0, 1], index=list('ab'), dtype='Int64')

In [79]:
#second Series, labelled c..e
s2 = pd.Series(data=[2, 3, 4], index=list('cde'), dtype='Int64')

In [80]:
#third Series, labelled f..g
s3 = pd.Series(data=[5, 6], index=list('fg'), dtype='Int64')

In [81]:
#calling pd.concat with these objects glues the values and indexes
s1

a    0
b    1
dtype: Int64

In [82]:
s2

c    2
d    3
e    4
dtype: Int64

In [83]:
s3

f    5
g    6
dtype: Int64

In [84]:
pd.concat([s1, s2, s3])

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: Int64

In [85]:
#by default, this works along the axis "index" producing another series
pd.concat([s1, s2, s3])

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: Int64

In [86]:
pd.concat([s1, s2, s3], axis=0) #same thing as above

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: Int64

In [87]:
pd.concat([s1, s2, s3], axis='rows') #same thing as above

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: Int64

In [88]:
#if we pass axis='columns' the result will be a DataFrame.
pd.concat([s1, s2, s3], axis='columns') #the three indexes do not overlap, so each row holds one value
#outer join (the default): the row index of the result is the union of the Series indexes

Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [89]:
#same as above (axis=1 is equivalent to axis='columns')
#if we pass axis='columns' the result will be a DataFrame.
pd.concat([s1, s2, s3], axis=1) #the three indexes do not overlap, so each row holds one value
#outer join (the default): the row index of the result is the union of the Series indexes

Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [90]:
#the result above uses the union of the index labels, hence the many empty cells. We can
#instead intersect them by passing join='inner'.
#s4 overlaps with s1 on 'a' and 'b', which the next cells use to show the difference
s4 = pd.concat([s1, s3])

In [91]:
s4

a    0
b    1
f    5
g    6
dtype: Int64

In [92]:
pd.concat([s1, s4], axis='columns') #

Unnamed: 0,0,1
a,0.0,0
b,1.0,1
f,,5
g,,6


In [93]:
#same as above
pd.concat([s1, s4], axis=1)

Unnamed: 0,0,1
a,0.0,0
b,1.0,1
f,,5
g,,6


In [94]:
pd.concat([s1, s4], axis='columns', join='inner')

Unnamed: 0,0,1
a,0,0
b,1,1


In [95]:
#the concatenated pieces are not identifiable in the result. Passing keys creates a
#hierarchical index whose outer level labels each piece
result = pd.concat([s1, s2, s3], keys=['one', 'two', 'three'])

In [96]:
result

one    a    0
       b    1
two    c    2
       d    3
       e    4
three  f    5
       g    6
dtype: Int64

In [97]:
result.unstack()

Unnamed: 0,a,b,c,d,e,f,g
one,0.0,1.0,,,,,
two,,,2.0,3.0,4.0,,
three,,,,,,5.0,6.0


In [98]:
#in the case of combining Series along axis='columns' keys become DataFrame headers
pd.concat([s1, s2, s3], axis='columns', keys=['one', 'two', 'three'])


Unnamed: 0,one,two,three
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [99]:
#the same logic applies to DataFrames
df1 = pd.DataFrame(
    data=np.arange(6).reshape(3, 2),
    index=list('abc'),
    columns=['one', 'two'],
)

In [100]:
#second frame: shares row labels 'a' and 'c' with df1 but has different columns
df2 = pd.DataFrame(
    data=np.arange(4).reshape(2, 2),
    index=list('ac'),
    columns=['three', 'four'],
)

In [101]:
df1

Unnamed: 0,one,two
a,0,1
b,2,3
c,4,5


In [102]:
df2

Unnamed: 0,three,four
a,0,1
c,2,3


In [103]:
#concatenate without passing axis or keys: the default axis=0 stacks the rows
#the columns are unioned, so cells for columns a frame does not have are left empty (NaN)
pd.concat([df1, df2])

Unnamed: 0,one,two,three,four
a,0.0,1.0,,
b,2.0,3.0,,
c,4.0,5.0,,
a,,,0.0,1.0
c,,,2.0,3.0


In [104]:
#concatenate side by side without keys
#union of the row labels (outer join is the default): 'b' is kept with NaN in df2's columns
pd.concat([df1, df2], axis='columns')

Unnamed: 0,one,two,three,four
a,0,1,0.0,1.0
b,2,3,,
c,4,5,2.0,3.0


In [105]:
#concatenate side by side without keys
pd.concat([df1, df2], axis='columns', join='outer') #union of the row labels
#outer join is the default, so join='outer' does not have to be passed here

Unnamed: 0,one,two,three,four
a,0,1,0.0,1.0
b,2,3,,
c,4,5,2.0,3.0


In [106]:
pd.concat([df1, df2], axis='columns', keys=['level1', 'level2'])
#here the keys create the first (outer) level of a hierarchical column index

Unnamed: 0_level_0,level1,level1,level2,level2
Unnamed: 0_level_1,one,two,three,four
a,0,1,0.0,1.0
b,2,3,,
c,4,5,2.0,3.0


In [107]:
test1 = pd.concat([df1, df2], axis='columns', keys=['level1', 'level2'])
test1['level1']
#the keys form the first level of the hierarchical column index, so selecting 'level1'
#returns just the columns that came from df1

Unnamed: 0,one,two
a,0,1
b,2,3
c,4,5


In [108]:
#same as above, so it is an outer join
pd.concat([df1, df2], axis='columns', join='outer',  keys=['level1', 'level2'])

Unnamed: 0_level_0,level1,level1,level2,level2
Unnamed: 0_level_1,one,two,three,four
a,0,1,0.0,1.0
b,2,3,,
c,4,5,2.0,3.0


In [109]:
#We can also do an inner join: along axis='columns' this keeps the intersection of the
#row labels (a, c), while all columns of both frames are retained
#note: not in book
pd.concat([df1, df2], axis='columns', join='inner',  keys=['level1', 'level2'])

Unnamed: 0_level_0,level1,level1,level2,level2
Unnamed: 0_level_1,one,two,three,four
a,0,1,0,1
c,4,5,2,3


In [110]:
#pass a dictionary of objects instead of a list:
#keys will be used for the 'keys' option
pd.concat({'level1': df1, 'level2': df2}, axis='columns')

Unnamed: 0_level_0,level1,level1,level2,level2
Unnamed: 0_level_1,one,two,three,four
a,0,1,0.0,1.0
b,2,3,,
c,4,5,2.0,3.0


In [111]:
#same as above
#pass a dictionary of objects instead of a list:
#keys will be used for the 'keys' option
pd.concat({'level1': df1, 'level2': df2}, axis=1)

Unnamed: 0_level_0,level1,level1,level2,level2
Unnamed: 0_level_1,one,two,three,four
a,0,1,0.0,1.0
b,2,3,,
c,4,5,2.0,3.0


In [112]:
#same as above, we specify that it is an outer join
#same as above
#pass a dictionary of objects instead of a list:
#keys will be used for the 'keys' option
pd.concat({'level1': df1, 'level2': df2}, join='outer', axis=1)

Unnamed: 0_level_0,level1,level1,level2,level2
Unnamed: 0_level_1,one,two,three,four
a,0,1,0.0,1.0
b,2,3,,
c,4,5,2.0,3.0


In [113]:
#Additional argument controlling how the hierarchical index is created:
#names labels the levels of the resulting column index
pd.concat([df1, df2], axis='columns', keys=['level1', 'level2'],
         names=['upper', 'lower'])

upper,level1,level1,level2,level2
lower,one,two,three,four
a,0,1,0.0,1.0
b,2,3,,
c,4,5,2.0,3.0


In [114]:
#last scenario: the row index does not contain any relevant data
#NOTE: unseeded random draw, so the values shown in the outputs below change on every re-run
df1 = pd.DataFrame(np.random.standard_normal((3, 4)),
                  columns=['a', 'b', 'c', 'd'])

In [115]:
#second frame with a subset of the columns, in a different order
#NOTE: unseeded random draw, so the values shown in the outputs below change on every re-run
df2 = pd.DataFrame(np.random.standard_normal((2, 3)),
                  columns=['b', 'd', 'a'])

In [116]:
df1

Unnamed: 0,a,b,c,d
0,0.007382,0.702005,0.527528,-0.023832
1,-0.500029,-0.133561,0.495378,1.419129
2,-0.449504,0.17168,-0.057142,-0.168149


In [117]:
df2

Unnamed: 0,b,d,a
0,-1.071177,0.395153,-0.77852
1,-0.753676,1.55779,-0.149056


In [118]:
#here we can pass the ignore_index=True argument.
pd.concat([df1, df2], ignore_index=True)

Unnamed: 0,a,b,c,d
0,0.007382,0.702005,0.527528,-0.023832
1,-0.500029,-0.133561,0.495378,1.419129
2,-0.449504,0.17168,-0.057142,-0.168149
3,-0.77852,-1.071177,,0.395153
4,-0.149056,-0.753676,,1.55779


In [119]:
#handling overlap: a and b carry the same index labels ('a'..'f'), listed in opposite order
a = pd.Series(
    data=[np.nan, 2.5, 0.0, 3.5, 4.5, np.nan],
    index=list('fedcba'),
)

In [120]:
#second Series, labelled 'a'..'f' in alphabetical order
b = pd.Series(
    data=[0, np.nan, 2., np.nan, np.nan, 5],
    index=list('abcdef'),
)

In [121]:
a

f    NaN
e    2.5
d    0.0
c    3.5
b    4.5
a    NaN
dtype: float64

In [122]:
b

a    0.0
b    NaN
c    2.0
d    NaN
e    NaN
f    5.0
dtype: float64

In [123]:
#wherever a is null the value from b is taken, otherwise a's non-null value is kept
#np.where works purely by position: it does not align the two Series on their index labels
np.where(pd.isna(a), b, a)

array([0. , 2.5, 0. , 3.5, 4.5, 5. ])

In [124]:
#combine_first method "patches" the existing object with the new object we pass to it
a.combine_first(b)

a    0.0
b    4.5
c    3.5
d    0.0
e    2.5
f    5.0
dtype: float64

In [125]:
#combine_first on DataFrames: does the same thing, but column by column
#missing values in the calling object are patched with data from the object passed in
df1 = pd.DataFrame(
    data={
        'a': [1., np.nan, 5., np.nan],
        'b': [np.nan, 2., np.nan, 6.],
        'c': range(2, 18, 4),
    }
)

In [126]:
#patch source: one row longer than df1 and without a 'c' column
df2 = pd.DataFrame(
    data={
        'a': [5., 4., np.nan, 3., 7.],
        'b': [np.nan, 3., 4., 6., 8.],
    }
)

In [127]:
df1

Unnamed: 0,a,b,c
0,1.0,,2
1,,2.0,6
2,5.0,,10
3,,6.0,14


In [128]:
df2

Unnamed: 0,a,b
0,5.0,
1,4.0,3.0
2,,4.0
3,3.0,6.0
4,7.0,8.0


In [129]:
df1.combine_first(df2)

Unnamed: 0,a,b,c
0,1.0,,2.0
1,4.0,2.0,6.0
2,5.0,4.0,10.0
3,3.0,6.0,14.0
4,7.0,8.0,


In [131]:
df1 + df2 #only contains values in the rows where both have a value

Unnamed: 0,a,b,c
0,6.0,,
1,,5.0,
2,,,
3,,12.0,
4,,,


In [133]:
#not equivalent to df1.combine_first(df2): where both frames have a value the two are summed
#(e.g. a[0] = 1 + 5 = 6) instead of df1's value being kept; only one-sided gaps are filled
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c
0,6.0,,2.0
1,4.0,5.0,6.0
2,5.0,4.0,10.0
3,3.0,12.0,14.0
4,7.0,8.0,


In [134]:
df1.combine_first(df2)

Unnamed: 0,a,b,c
0,1.0,,2.0
1,4.0,2.0,6.0
2,5.0,4.0,10.0
3,3.0,6.0,14.0
4,7.0,8.0,
