# Agenda

1. Pivot tables
2. Stack, unstack, melt
3. Joins
4. `filter` on columns

In [1]:
import numpy as np
import pandas as pd 
from pandas import Series, DataFrame

In [2]:
# Fixed seed so that "Restart Kernel -> Run All" rebuilds the same random frame
# every time; a local Generator is used so NumPy's global random state is untouched.
SEED = 0
rng = np.random.default_rng(SEED)

# 4 rows (a-d) x 5 columns (v-z) of random integers drawn from [0, 1000)
df = DataFrame(rng.integers(0, 1000, size=(4, 5)),
               index=list('abcd'),
               columns=list('vwxyz'))
df

Unnamed: 0,v,w,x,y,z
a,906,704,539,445,618
b,692,477,543,649,594
c,853,732,548,193,643
d,501,406,614,962,466


In [3]:
# Assigning through .loc to a label that is not yet in the index adds a new row.
row_e = [1, 2, 3, 4, 5]   # one value per column, in column order
df.loc['e'] = row_e
df

Unnamed: 0,v,w,x,y,z
a,906,704,539,445,618
b,692,477,543,649,594
c,853,732,548,193,643
d,501,406,614,962,466
e,1,2,3,4,5


In [4]:
# A dict is matched to columns by key; 'v' and 'y' are not mentioned here,
# so the new row 'f' has no value in those two columns.
new_row = dict(w=20, x=30, z=50)
df.loc['f'] = new_row

In [5]:
# Show the frame after adding row 'f' from a dict that only had 'w', 'x' and 'z' keys.
df

Unnamed: 0,v,w,x,y,z
a,906.0,704,539,445.0,618
b,692.0,477,543,649.0,594
c,853.0,732,548,193.0,643
d,501.0,406,614,962.0,466
e,1.0,2,3,4.0,5
f,,20,30,,50


In [6]:
# Assigning a list to an existing column overwrites it: one value for each of the six rows.
df['w'] = list(range(10, 70, 10))   # 10, 20, ..., 60
df

Unnamed: 0,v,w,x,y,z
a,906.0,10,539,445.0,618
b,692.0,20,543,649.0,594
c,853.0,30,548,193.0,643
d,501.0,40,614,962.0,466
e,1.0,50,3,4.0,5
f,,60,30,,50


In [7]:
# A dict assigned to a new column is matched on the row labels;
# rows with no matching key ('b', 'd', 'f') end up with a missing value.
df['u'] = dict(a=100, c=300, e=500)

In [8]:
# Show the frame with the new 'u' column, which was assigned from a dict keyed by 'a', 'c' and 'e'.
df

Unnamed: 0,v,w,x,y,z,u
a,906.0,10,539,445.0,618,100.0
b,692.0,20,543,649.0,594,
c,853.0,30,548,193.0,643,300.0
d,501.0,40,614,962.0,466,
e,1.0,50,3,4.0,5,500.0
f,,60,30,,50,


# Grouping

If we have a data frame with:
- One categorical column
- One numeric column

We can use `groupby` to apply an aggregation method once per category, computed over the numeric values of all rows that belong to that category.

In [9]:
# Load taxi.csv; from here on `df` refers to this data rather than the random frame above.
df = pd.read_csv('taxi.csv')

# One group per distinct passenger_count, then the mean total_amount within each group.
by_passengers = df.groupby('passenger_count')
by_passengers['total_amount'].mean()

passenger_count
0    25.570000
1    17.368569
2    18.406306
3    17.994704
4    18.881648
5    17.211269
6    17.401355
Name: total_amount, dtype: float64

In [10]:
# Grouping on two columns at once gives one mean per (VendorID, passenger_count)
# pair, returned as a series with a two-level index.

group_keys = ['VendorID', 'passenger_count']
df.groupby(group_keys)['total_amount'].mean()

VendorID  passenger_count
1         0                  25.570000
          1                  16.941386
          2                  19.076807
          3                  19.002803
          4                  20.518657
          5                  20.466667
2         1                  17.904989
          2                  17.855770
          3                  17.359076
          4                  17.927913
          5                  17.192379
          6                  17.401355
Name: total_amount, dtype: float64

# Pivot table

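A pivot table is a 2-dimensional grouping!  It's similar to grouping by two columns (which gives us a multi-index), but the result is laid out as a table, with one category along the rows and the other along the columns.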

- We need one categorical column -- this will be the index, for the rows
- A second categorical column - this will be for the columns
- A numeric column
- An aggregation method (the `aggfunc` argument; if we don't pass one, `pivot_table` uses the mean)

In [11]:
# Same means as the two-column groupby, laid out as a grid.
# Combinations that never occur in the data show up as missing cells.
df.pivot_table(
    index='VendorID',            # categories for the rows
    columns='passenger_count',   # categories for the columns
    values='total_amount',       # numeric column to aggregate
    aggfunc='mean',              # pivot_table's default, spelled out
)

passenger_count,0,1,2,3,4,5,6
VendorID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,25.57,16.941386,19.076807,19.002803,20.518657,20.466667,
2,,17.904989,17.85577,17.359076,17.927913,17.192379,17.401355


In [12]:
# The two-column groupby again, for comparison with the pivot table above:
# the same means, as a long multi-indexed series instead of a grid.
(
    df
    .groupby(by=['VendorID', 'passenger_count'])['total_amount']
    .mean()
)

VendorID  passenger_count
1         0                  25.570000
          1                  16.941386
          2                  19.076807
          3                  19.002803
          4                  20.518657
          5                  20.466667
2         1                  17.904989
          2                  17.855770
          3                  17.359076
          4                  17.927913
          5                  17.192379
          6                  17.401355
Name: total_amount, dtype: float64