# Pandas Merge and GroupBy

### `concat()`: stacking DataFrames row-wise (similar to `UNION ALL` in SQL)

In [4]:
import numpy as np
import pandas as pd

# 10 rows x 4 columns of standard-normal draws. A seeded Generator is used so
# that Restart & Run All reproduces exactly the same numbers on every run.
df = pd.DataFrame(np.random.default_rng(42).standard_normal((10, 4)))

In [5]:
# Display the 10x4 frame of random draws created in the previous cell.
df

Unnamed: 0,0,1,2,3
0,-0.029745,0.064979,-0.835443,-1.827547
1,-2.061194,0.235757,-0.841971,-0.239369
2,0.868756,-0.418926,-0.616612,-0.443226
3,0.331649,-1.426765,-1.880597,-0.752639
4,0.393941,0.470096,0.478746,-0.425156
5,0.018812,-0.831884,0.604974,-1.377234
6,-0.653428,-0.949277,-1.437499,-0.653472
7,-1.314268,-0.892661,0.360582,0.735922
8,0.71024,1.369886,0.465722,-1.518588
9,0.069213,0.945934,-0.973135,-0.583926


In [9]:
# Cut the frame into three consecutive row chunks: rows 0-2, rows 3-6 and rows 7 onward.
pieces = [df.iloc[:3], df.iloc[3:7], df.iloc[7:]]

In [10]:
# Show the list of three row-slices produced by the split.
pieces

[          0         1         2         3
 0 -0.029745  0.064979 -0.835443 -1.827547
 1 -2.061194  0.235757 -0.841971 -0.239369
 2  0.868756 -0.418926 -0.616612 -0.443226,
           0         1         2         3
 3  0.331649 -1.426765 -1.880597 -0.752639
 4  0.393941  0.470096  0.478746 -0.425156
 5  0.018812 -0.831884  0.604974 -1.377234
 6 -0.653428 -0.949277 -1.437499 -0.653472,
           0         1         2         3
 7 -1.314268 -0.892661  0.360582  0.735922
 8  0.710240  1.369886  0.465722 -1.518588
 9  0.069213  0.945934 -0.973135 -0.583926]

In [11]:
# Stack the chunks back on top of each other (row-wise) into a single frame.
pd.concat(pieces, axis=0)

Unnamed: 0,0,1,2,3
0,-0.029745,0.064979,-0.835443,-1.827547
1,-2.061194,0.235757,-0.841971,-0.239369
2,0.868756,-0.418926,-0.616612,-0.443226
3,0.331649,-1.426765,-1.880597,-0.752639
4,0.393941,0.470096,0.478746,-0.425156
5,0.018812,-0.831884,0.604974,-1.377234
6,-0.653428,-0.949277,-1.437499,-0.653472
7,-1.314268,-0.892661,0.360582,0.735922
8,0.71024,1.369886,0.465722,-1.518588
9,0.069213,0.945934,-0.973135,-0.583926


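### Using .append()
`DataFrame.append()` used to add rows to a frame, but it was deprecated in pandas 1.4 and removed in pandas 2.0; use `pd.concat()` instead. Even though adding a column to a DataFrame is relatively fast, adding a row requires a copy and may be expensive, so it is faster to concatenate whole DataFrames than to append rows one at a time. The cells below show another way of combining frames: a SQL-style join with `pd.merge()`.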

In [12]:
# Two small lookup tables sharing the join column `key`;
# each carries its own value column (`lval` / `rval`).
left = pd.DataFrame([('foo', 1), ('bar', 2)], columns=['key', 'lval'])
right = pd.DataFrame([('foo', 4), ('bar', 5)], columns=['key', 'rval'])

In [13]:
# Inner join of `left` and `right` on the shared `key` column
# (inner is merge's default; it is spelled out here for clarity).
left.merge(right, how='inner', on='key')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,2,5


In [14]:
# An inner join is what merge() does by default. For other join types such as outer, left or right, pass the `how` parameter.

In [15]:
# Full outer join: keeps the keys found in either frame. Both frames hold the
# same keys here, so no extra rows appear compared with the inner join.
left.merge(right, how='outer', on='key')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,2,5


## Grouping

#### By group by, we are referring to a process involving the following steps:

- Splitting the data into groups based on some criteria
- Applying a function to each group independently
- Combining the results into a data structure

In [23]:
# Toy frame for the group-by examples: two string key columns (A, B) and two
# numeric columns (C, D). C and D are drawn from a seeded Generator so the
# frame is identical on every run.
# NOTE: this rebinds `df`, replacing the 10x4 frame used in the concat section.
rng = np.random.default_rng(42)
df = pd.DataFrame({
    'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
    'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    'C': rng.standard_normal(8),
    'D': rng.standard_normal(8),
})

df

Unnamed: 0,A,B,C,D
0,foo,one,-0.540529,0.317737
1,bar,one,-1.436238,-1.219696
2,foo,two,-0.931946,1.261895
3,bar,three,-0.884532,0.209378
4,foo,two,-0.462059,-0.950376
5,bar,two,0.42059,-0.304385
6,foo,one,0.551568,0.513525
7,foo,three,-0.868709,0.048791


In [17]:
# Group the DataFrame by column A and sum the values of C and D.

In [21]:
# Sum every column within each group of A. In the saved output the string
# column B is "summed" too, i.e. its values are concatenated per group.
df.groupby(by='A').sum()

Unnamed: 0_level_0,B,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,onethreetwo,1.008215,0.317528
foo,onetwotwoonethree,0.281721,3.62212


- We can also group by multiple columns. The result is a new DataFrame whose rows are indexed by a hierarchical `MultiIndex`, with one level per grouping column.

In [19]:
# Group on (A, B) pairs; the two keys become a two-level row index in the result.
df.groupby(by=['A', 'B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.992418,-0.212261
bar,three,-0.354946,-0.16132
bar,two,0.370743,0.691109
foo,one,-1.349069,1.365345
foo,three,-0.142117,1.914279
foo,two,1.772908,0.342496


In [22]:
# Sum only the numeric columns C and D within each group of A, leaving out the
# string column B. The columns are selected explicitly before aggregating: the
# first positional argument of GroupBy.sum() is `numeric_only`, so the previous
# sum(['C', 'D']) call only worked because a non-empty list is truthy.
df.groupby('A')[['C', 'D']].sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,1.008215,0.317528
foo,0.281721,3.62212
