# Chapter 4 (continued)
Starting on page 228 - Aggregating Data

## Imports

In [1]:
import numpy as np
import pandas as pd

## Aggregating data

In [2]:
# Load the fb_2018 price/volume data indexed by parsed date, then label each
# row's volume as low / med / high using three equal-width bins.
fb = (
    pd.read_csv('../data/fb_2018.csv', parse_dates=True, index_col='date')
    .assign(
        trading_volume=lambda df_: pd.cut(
            df_['volume'],
            bins=3,
            labels=['low', 'med', 'high'],
        )
    )
)

In [3]:
# Load the per-station weather observations, using the parsed `date` column
# as the index so later cells can slice and group by time.
weather = pd.read_csv(
    '../data/weather_by_station.csv',
    parse_dates=True,
    index_col='date',
)

In [5]:
# Render every float with two decimal places in displayed output.
pd.set_option('display.float_format', '{:.2f}'.format)

### Summarizing DataFrames

In [6]:
# One summary statistic per column for the whole frame.
# Aggregations are given by name instead of as NumPy callables: pandas 2.1+
# emits a FutureWarning when np.mean / np.max / np.min / np.sum are passed to
# `agg`, and the string form produces the same result on every version.
fb.agg({
    'open': 'mean', 'high': 'max', 'low': 'min',
    'close': 'mean', 'volume': 'sum'
})

open            171.45
high            218.62
low             123.02
close           171.51
volume   6949682394.00
dtype: float64

In [7]:
# For a single station, spread the datatype codes into columns and total the
# SNOW and PRCP readings.
(
    weather
    .query('station == "GHCND:USW00094728"')
    .pivot(columns='datatype', values='value')
    .loc[:, ['SNOW', 'PRCP']]
    .sum()
)

datatype
SNOW   1007.00
PRCP   1665.30
dtype: float64

In [8]:
# Different statistics per column: a single mean for open/close and the
# min/max pair for high/low. Cells with no matching statistic come back NaN.
fb.agg(dict(
    open='mean',
    high=['min', 'max'],
    low=['min', 'max'],
    close='mean',
))

Unnamed: 0,open,high,low,close
mean,171.45,,,171.51
min,,129.74,123.02,
max,,218.62,214.27,


### Aggregating by group

In [9]:
# Mean of every column within each trading-volume bucket.
# `trading_volume` is categorical (built with pd.cut), so `observed` is passed
# explicitly: pandas 2.1+ warns when the default is relied upon, and
# observed=False keeps a row for every bucket, matching the previous default.
fb.groupby('trading_volume', observed=False).mean()

Unnamed: 0_level_0,open,high,low,close,volume
trading_volume,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
low,171.36,173.46,169.31,171.43,24547207.71
med,175.82,179.42,172.11,175.14,79072559.12
high,167.73,170.48,161.57,168.16,141924023.33


In [10]:
# Min, max and mean of the closing price within each trading-volume bucket.
# observed=False is explicit because the key is categorical and pandas 2.1+
# warns when the default is left implicit; it keeps a row for every bucket.
(fb
 .groupby('trading_volume', observed=False)
 ['close'].agg(['min', 'max', 'mean'])
)

Unnamed: 0_level_0,min,max,mean
trading_volume,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
low,124.06,214.67,171.43
med,152.22,217.5,175.14
high,160.06,176.26,168.16


In [17]:
# Per-bucket aggregation with different statistics per column. Because some
# columns receive several statistics, the result has two-level
# (column, statistic) column labels.
# observed=False is explicit because the grouping key is categorical and
# pandas 2.1+ warns when the default is left implicit.
fb_agg = (fb
 .groupby('trading_volume', observed=False)
 .agg({
     'open': 'mean', 'high': ['min', 'max'],
     'low': ['min', 'max'], 'close': 'mean'
 })
)
fb_agg

Unnamed: 0_level_0,open,high,high,low,low,close
Unnamed: 0_level_1,mean,min,max,min,max,mean
trading_volume,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
low,171.36,129.74,216.2,123.02,212.6,171.43
med,175.82,162.85,218.62,150.75,214.27,175.14
high,167.73,161.1,180.13,149.02,173.75,168.16


In [13]:
# Minimum `low` for the 'med' bucket. A single .loc call with a
# (column, statistic) tuple addresses the two-level column directly instead of
# chaining a second [] lookup onto an intermediate Series.
fb_agg.loc['med', ('low', 'min')]

150.75

In [14]:
# Inspect the column labels of the aggregated frame: a MultiIndex whose
# entries are (column, statistic) pairs.
fb_agg.columns

MultiIndex([( 'open', 'mean'),
            ( 'high',  'min'),
            ( 'high',  'max'),
            (  'low',  'min'),
            (  'low',  'max'),
            ('close', 'mean')],
           )

In [18]:
# Flatten the (column, statistic) MultiIndex into single-level names such as
# 'open_mean'. Only tuple labels are joined, so re-running this cell after the
# columns are already flat leaves them untouched instead of splitting each
# name into underscore-separated characters.
fb_agg.columns = [
    '_'.join(col_agg) if isinstance(col_agg, tuple) else col_agg
    for col_agg in fb_agg.columns
]
fb_agg.head()

Unnamed: 0_level_0,open_mean,high_min,high_max,low_min,low_max,close_mean
trading_volume,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
low,171.36,129.74,216.2,123.02,212.6,171.43
med,175.82,162.85,218.62,150.75,214.27,175.14
high,167.73,161.1,180.13,149.02,173.75,168.16


In [22]:
# Average PRCP reading across stations for each day of October 2018.
# numeric_only=True restricts the mean to numeric columns: the frame also
# carries text columns (station, datatype, ...), and pandas >= 2.0 raises on
# them instead of silently dropping them. With only `value` left, squeeze()
# turns the one-column frame into a Series.
(weather
 .loc['2018-10']
 .query('datatype == "PRCP"')
 .groupby(level=0).mean(numeric_only=True)
 .squeeze()
 )

  .groupby(level=0).mean()


date
2018-10-01    0.01
2018-10-02    2.23
2018-10-03   19.69
2018-10-04    0.32
2018-10-05    0.97
2018-10-06    0.06
2018-10-07    0.67
2018-10-08    0.16
2018-10-09    1.04
2018-10-10    0.15
2018-10-11    3.00
2018-10-12   34.77
2018-10-13    1.90
2018-10-14    1.49
2018-10-15    1.06
2018-10-16    2.49
2018-10-17    0.03
2018-10-18    0.01
2018-10-19    0.00
2018-10-20    1.89
2018-10-21    0.54
2018-10-22    0.02
2018-10-23    0.01
2018-10-24    0.09
2018-10-25    0.00
2018-10-26    0.02
2018-10-27   18.42
2018-10-28   12.66
2018-10-29    1.76
2018-10-30    0.17
2018-10-31    0.01
Name: value, dtype: float64

In [25]:
# Total PRCP per station per quarter, with quarters spread into columns, then
# a reproducible sample of five stations.
# numeric_only=True keeps the sum to the numeric `value` column; without it
# the remaining text columns are either dropped with a warning (older pandas)
# or aggregated as well (pandas >= 2.0).
(weather
 .query('datatype == "PRCP"')
 .groupby(['station_name', pd.Grouper(freq='Q')])
 .sum(numeric_only=True)
 .unstack()
 .sample(5, random_state=1)
 )

  .sum()


Unnamed: 0_level_0,value,value,value,value
date,2018-03-31,2018-06-30,2018-09-30,2018-12-31
station_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
"WANTAGH 1.1 NNE, NY US",279.9,216.8,472.5,277.2
"STATEN ISLAND 1.4 SE, NY US",379.4,295.3,438.8,409.9
"SYOSSET 2.0 SSW, NY US",323.5,263.3,355.5,459.9
"STAMFORD 4.2 S, CT US",338.0,272.1,424.7,390.0
"WAYNE TWP 0.8 SSW, NJ US",246.2,295.3,620.9,422.0


In [29]:
# Months with the most precipitation: average PRCP across stations per day,
# total those daily averages per month, then take the largest values.
# numeric_only=True on the mean is required because the frame also holds text
# columns, which pandas >= 2.0 refuses to average.
(weather
 .query('datatype == "PRCP"')
 .groupby(level=0).mean(numeric_only=True)
 .groupby(pd.Grouper(freq='M'))
 .sum()
 .value
 .nlargest()
 )

  .groupby(level=0).mean()


date
2018-11-30   210.59
2018-09-30   193.09
2018-08-31   192.45
2018-07-31   160.98
2018-02-28   158.11
Name: value, dtype: float64

In [35]:
# Attach each day's monthly total to the day itself: transform() returns a
# result aligned to the original daily index, so every day in a month carries
# that month's sum. The displayed window straddles a month boundary.
# - numeric_only=True: the frame has text columns that pandas >= 2.0 will not
#   average.
# - 'sum' instead of np.sum: NumPy callables passed to transform are
#   deprecated in pandas 2.1+.
# - .loc makes the label-based row slice explicit.
(weather
 .query('datatype == "PRCP"')
 .rename(dict(value='prcp'), axis=1)
 .groupby(level=0)
 .mean(numeric_only=True)
 .groupby(pd.Grouper(freq='M'))
 .transform('sum')
 .loc['2018-01-28':'2018-02-03']
 )

  .mean()


Unnamed: 0_level_0,prcp
date,Unnamed: 1_level_1
2018-01-28,69.31
2018-01-29,69.31
2018-01-30,69.31
2018-01-31,69.31
2018-02-01,158.11
2018-02-02,158.11
2018-02-03,158.11


In [39]:
# Days that account for the largest share of their month's precipitation.
# - numeric_only=True: the frame has text columns that pandas >= 2.0 will not
#   average.
# - The monthly total is computed from the `prcp` Series, so the new column is
#   assigned from a Series rather than from a one-column DataFrame.
# - 'sum' instead of np.sum: NumPy callables passed to transform are
#   deprecated in pandas 2.1+.
(weather
 .query('datatype == "PRCP"')
 .rename(dict(value='prcp'), axis=1)
 .groupby(level=0).mean(numeric_only=True)
 .assign(
     total_prcp_in_month=lambda df_: df_.prcp.groupby(
         pd.Grouper(freq='M')).transform('sum'),
     pct_monthly_prcp=lambda df_: df_.prcp.div(df_.total_prcp_in_month)
 )
 .nlargest(5, 'pct_monthly_prcp')
 )

  .groupby(level=0).mean()


Unnamed: 0_level_0,prcp,total_prcp_in_month,pct_monthly_prcp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-10-12,34.77,105.63,0.33
2018-01-13,21.66,69.31,0.31
2018-03-02,38.77,137.46,0.28
2018-04-16,39.34,140.57,0.28
2018-04-17,37.3,140.57,0.27


### Pivot tables and crosstabs

In [40]:
# pg 238 (243 of PDF)