In [2]:
import numpy as np
import pandas as pd

# Merge

Pandas provides various facilities for easily combining Series and DataFrame objects with various kinds of set logic for the indexes and relational algebra functionality in the case of join / merge-type operations.

Concatenate pandas objects with ```concat()``` (roughly equivalent to UNION ALL in SQL, since duplicate rows are kept):

In [3]:
# Seed NumPy's global RNG once so that Restart & Run All reproduces the same
# random numbers in this cell and in every later cell that draws from np.random.
np.random.seed(42)

# 10 rows x 4 columns of standard-normal samples; index and columns get the
# default integer labels.
df = pd.DataFrame(np.random.randn(10, 4))
df

Unnamed: 0,0,1,2,3
0,-0.311585,1.137019,0.049983,0.058612
1,0.35002,0.199368,-1.130089,0.544362
2,2.322865,-0.618659,-0.09436,-0.485714
3,0.972525,0.979593,-0.343757,0.771454
4,0.53093,1.517545,-0.245307,0.573258
5,-0.843964,-0.486711,0.635018,-0.765792
6,0.253478,0.870965,0.775187,0.152897
7,-0.508949,3.070281,-0.888309,0.097856
8,-0.948921,-1.892521,0.554347,0.916818
9,-2.05926,-0.776713,0.583026,0.978814


In [5]:
# Cut the frame into three consecutive row slices, then glue them back
# together; the result has the same rows, in the same order, as df.
pieces = [df.iloc[:3], df.iloc[3:7], df.iloc[7:]]
pd.concat(pieces)

Unnamed: 0,0,1,2,3
0,-0.311585,1.137019,0.049983,0.058612
1,0.35002,0.199368,-1.130089,0.544362
2,2.322865,-0.618659,-0.09436,-0.485714
3,0.972525,0.979593,-0.343757,0.771454
4,0.53093,1.517545,-0.245307,0.573258
5,-0.843964,-0.486711,0.635018,-0.765792
6,0.253478,0.870965,0.775187,0.152897
7,-0.508949,3.070281,-0.888309,0.097856
8,-0.948921,-1.892521,0.554347,0.916818
9,-2.05926,-0.776713,0.583026,0.978814


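Older versions of pandas also provided a DataFrame method called ```.append()```; it was deprecated in pandas 1.4 and removed in pandas 2.0. Even though adding a column to a DataFrame is relatively fast, adding a row requires a copy and may be expensive, so it is faster to concatenate data-frames with ```concat()``` than to append rows one at a time.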

To join two data-frames, we use ```merge()``` in Pandas (equivalent to JOIN in SQL).

In [6]:
# Two small tables that share the join column 'key'.
left = pd.DataFrame({
    'key': ['foo', 'bar'],
    'lval': [1, 2],
})
right = pd.DataFrame({
    'key': ['foo', 'bar'],
    'rval': [4, 5],
})

# Inner join of the two tables on 'key' (how='inner' is merge's default).
left.merge(right, on='key', how='inner')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,2,5


```merge()``` performs an inner join by default. For other types of join, such as outer, left or right, pass the ```how``` parameter.

In [7]:
# Outer join: keep every key that appears in either table.
left.merge(right, how='outer', on='key')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,2,5


# Grouping

By ```group by```, we are referring to a process involving the following steps:

* Splitting the data into groups based on some criteria
* Applying a function to each group independently
* Combining the results into a data structure

In [9]:
# Two label columns (A, B) to group on and two random numeric columns (C, D)
# to aggregate.
df = pd.DataFrame({
    'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
    'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    'C': np.random.randn(8),
    'D': np.random.randn(8),
})

In [10]:
# Group the DataFrame by column A and sum the values of C and D.
# The numeric columns are selected explicitly: since pandas 2.0, sum() no
# longer drops non-numeric columns silently, so summing the whole frame would
# also concatenate the strings in column B.
df.groupby('A')[['C', 'D']].sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,0.258832,2.547901
foo,-2.983748,0.432399


In [11]:
# Group by two columns at once: the result is indexed by every (A, B) pair
# that occurs in the data.
df.groupby(by=['A', 'B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.293309,1.325227
bar,three,-0.377758,-0.444088
bar,two,0.929899,1.666761
foo,one,-3.079178,-0.235163
foo,three,1.86039,-0.637342
foo,two,-1.76496,1.304904


# ```.agg()``` method

You cannot apply different aggregation functions to different columns in a single groupby statement in Pandas without the ```.agg()``` method.

In [12]:
# Aggregate each column with its own function: the sum of C and the maximum
# of D per group. The functions are given by name because passing np.sum /
# np.max to agg() is deprecated (FutureWarning since pandas 2.1); the strings
# select the same groupby sum/max implementations.
df.groupby('A').agg({'C': 'sum', 'D': 'max'})

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,0.258832,1.666761
foo,-2.983748,2.323433


# Stack

The ```stack()``` method "compresses" a level in the DataFrame's columns. Let's see what that means :).

In [13]:
# Pair the two label lists element-wise to get the (first, second) tuples
# that make up the two-level row index.
tuples = list(zip(
    ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
    ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'],
))
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])

# Random values in columns A and B, indexed by the MultiIndex built above.
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])

# Keep only the first four rows (the 'bar' and 'baz' groups) for the demo.
df2 = df.iloc[:4]

Now, we are going to use the ```stack()``` function to "compress" the columns into the index.

In [25]:
# Move the innermost column level (level=-1, the default) into the row index.
df2.stack(level=-1)

first  second   
bar    one     A   -0.023202
               B   -2.050686
       two     A   -2.031994
               B   -0.134240
baz    one     A   -0.301071
               B    0.107153
       two     A   -0.828051
               B   -1.096428
dtype: float64

With a "stacked" DataFrame or Series (having a MultiIndex as the index), the inverse operation of stack() is unstack(), which by default unstacks the **last level**; pass a level number or name to unstack a different level:

In [26]:
# Pivot index level 0 ('first') out of the rows and into the columns.
df2.unstack(level=0)

Unnamed: 0_level_0,A,A,B,B
first,bar,baz,bar,baz
second,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,-0.023202,-0.301071,-2.050686,0.107153
two,-2.031994,-0.828051,-0.13424,-1.096428


In [24]:
# Pivot index level 1 ('second') out of the rows and into the columns.
df2.unstack(level=1)

Unnamed: 0_level_0,A,A,B,B
second,one,two,one,two
first,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
bar,-0.023202,-2.031994,-2.050686,-0.13424
baz,-0.301071,-0.828051,0.107153,-1.096428


# Pivot Tables

In [27]:
# Three label columns (A, B, C) and two random numeric columns (D, E).
df = pd.DataFrame({
    'A': ['one', 'one', 'two', 'three'] * 3,
    'B': ['A', 'B', 'C'] * 4,
    'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
    'D': np.random.randn(12),
    'E': np.random.randn(12),
})

# Rows are the (A, B) pairs, columns are the values of C, and each cell holds
# the aggregated D values (pivot_table averages by default).
df.pivot_table(values='D', index=['A', 'B'], columns=['C'])

Unnamed: 0_level_0,C,bar,foo
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,-0.883518,0.286887
one,B,-0.141815,-0.362897
one,C,0.178645,-1.230894
three,A,0.086816,
three,B,,-0.521393
three,C,-1.700078,
two,A,,0.14621
two,B,-0.84498,
two,C,,-0.108735


# Applying Functions

The appropriate method to use depends on whether your function expects to operate on an entire ```DataFrame``` or ```Series```, or row- or column-wise.
* tablewise function application: ```pipe()```
* row or column-wise function application: ```apply()```

# Tablewise function application

In [29]:
def extract_city_name(df):
    """Add a 'city_name' column holding the text before the first comma.

    Example: 'Chicago, IL' in 'city_and_code' becomes 'Chicago' in 'city_name'.

    The column is written onto ``df`` itself, and that same object is returned
    so the function can be used as a step in a chain.
    """
    parts = df['city_and_code'].str.split(",")
    df['city_name'] = parts.str[0]
    return df

def add_country_name(df, country_name=None):
    """Add a 'city_and_country' column: 'city_name' followed by country_name.

    The two strings are joined with no separator, e.g. 'Chicago' with
    country_name='US' gives 'ChicagoUS'.

    The column is written onto ``df`` itself and the same object is returned.
    NOTE(review): country_name defaults to None, which is passed straight to
    the ``+`` below, so callers are expected to supply a string.
    """
    source_column = 'city_name'
    combined = df[source_column] + country_name
    df['city_and_country'] = combined
    return df


df_p = pd.DataFrame({'city_and_code': ['Chicago, IL']})

# Plain nested calls: they read inside-out, so extract_city_name runs first
# and its result is handed to add_country_name.
add_country_name(
    extract_city_name(df_p),
    country_name='US',
)

Unnamed: 0,city_and_code,city_name,city_and_country
0,"Chicago, IL",Chicago,ChicagoUS


Pandas encourages us to use ```pipe()``` for the problem above; this style is known as 'method chaining'. ```pipe``` makes it easy to use your own or another library’s functions in method chains, alongside Pandas’ methods.

In [30]:
# The same two steps as a chain that reads top to bottom; extra arguments for
# a step are passed to pipe() after the function.
(
    df_p
    .pipe(extract_city_name)
    .pipe(add_country_name, country_name="US")
)

Unnamed: 0,city_and_code,city_name,city_and_country
0,"Chicago, IL",Chicago,ChicagoUS


# Row or Column-wise Function Application

Arbitrary functions can be applied along the axes of a DataFrame using the ```apply()``` method, which, like the descriptive statistics methods, takes an optional axis argument.

In [31]:
# Three Series with partly overlapping labels; the frame's index is the union
# of the labels, with NaN where a Series has no value for a label.
df = pd.DataFrame({
    'one': pd.Series(np.random.randn(3), index=list('abc')),
    'two': pd.Series(np.random.randn(4), index=list('abcd')),
    'three': pd.Series(np.random.randn(3), index=list('bcd')),
})

# pre-built NumPy function, applied to each column (axis=0 is the default)
df.apply(np.mean, axis=0)

one      0.682826
two     -0.367018
three   -0.388128
dtype: float64

In [32]:
# pre-built NumPy function, applied to each row this time
df.apply(np.mean, axis='columns')

a    1.083594
b    0.377213
c   -0.568426
d   -1.088764
dtype: float64

In [33]:
# our own lambda: the spread (largest minus smallest value) of each column
df.apply(lambda col: col.max() - col.min())

one      2.518826
two      1.958778
three    1.253359
dtype: float64

In [34]:
# pre-built NumPy function: running total down each column
df.apply(np.cumsum, axis=0)

Unnamed: 0,one,two,three
a,2.07633,0.090858,
b,2.490973,0.771195,0.036658
c,2.048477,-0.507245,0.052317
d,,-1.468072,-1.164384


In [35]:
# pre-built NumPy function: element-wise exponential
df.apply(np.exp, axis=0)

Unnamed: 0,one,two,three
a,7.975149,1.095113,
b,1.513831,1.974545,1.037338
c,0.642431,0.278471,1.015782
d,,0.382577,0.296206


You can use `apply()` to apply your own function:

In [36]:
def own_function(x):
    """Return x multiplied by itself."""
    squared = x * x
    return squared

# Each column is passed to own_function in turn (axis=0 is the default).
df.apply(own_function, axis=0)

Unnamed: 0,one,two,three
a,4.311147,0.008255,
b,0.171929,0.46286,0.001344
c,0.195803,1.63441,0.000245
d,,0.923187,1.480361


You may also pass additional arguments and keyword arguments to the ```apply()``` method.

In [37]:
def subtract_and_divide(x, sub, divide=1):
    """Return (x - sub) / divide; with the default divide=1 only the shift applies."""
    shifted = x - sub
    return shifted / divide

# Extra positional arguments go in the ``args`` tuple (a one-element tuple
# needs the trailing comma, e.g. args=(5,)); extra keyword arguments are
# given by name. Here sub=5 and divide=3.
df.apply(subtract_and_divide, args=(5,), divide=3)

Unnamed: 0,one,two,three
a,-0.974557,-1.636381,
b,-1.528452,-1.439887,-1.654447
c,-1.814165,-2.092814,-1.661447
d,,-1.986942,-2.072234


In [38]:
def subtract(x, sub):
    """Return x with sub subtracted from it."""
    difference = x - sub
    return difference

# The extra argument is handed to subtract by keyword: every column has 5
# subtracted from it.
df.apply(subtract, sub=5)

Unnamed: 0,one,two,three
a,-2.92367,-4.909142,
b,-4.585357,-4.319662,-4.963342
c,-5.442496,-6.278441,-4.984341
d,,-5.960826,-6.216701
