# Chapter 4 - Aggregating Pandas DataFrames

## Imports

In [32]:
import pandas as pd
import numpy as np

## Performing database-style operations on DataFrames

In [2]:
weather = pd.read_csv('../data/nyc_weather_2018.csv')
weather.head()

Unnamed: 0,date,datatype,station,attributes,value
0,2018-01-01T00:00:00,PRCP,GHCND:US1CTFR0039,",,N,",0.0
1,2018-01-01T00:00:00,PRCP,GHCND:US1NJBG0015,",,N,",0.0
2,2018-01-01T00:00:00,SNOW,GHCND:US1NJBG0015,",,N,",0.0
3,2018-01-01T00:00:00,PRCP,GHCND:US1NJBG0017,",,N,",0.0
4,2018-01-01T00:00:00,SNOW,GHCND:US1NJBG0017,",,N,",0.0


### Querying DataFrames

In [3]:
# Keep snowfall observations with a positive value that were recorded at a
# station whose ID contains "US1NY".
# NOTE: adjacent string literals are concatenated with no separator, so the
# first fragment must end with a space. Without it the expression reads
# `value > 0and station...`; a numeric literal immediately followed by a
# keyword is deprecated in recent Python versions.
snow_data = weather.query(
    'datatype == "SNOW" and value > 0 '
    'and station.str.contains("US1NY")'
)

snow_data.head()

Unnamed: 0,date,datatype,station,attributes,value
114,2018-01-01T00:00:00,SNOW,GHCND:US1NYWC0019,",,N,",25.0
789,2018-01-04T00:00:00,SNOW,GHCND:US1NYNS0007,",,N,",41.0
794,2018-01-04T00:00:00,SNOW,GHCND:US1NYNS0018,",,N,",10.0
798,2018-01-04T00:00:00,SNOW,GHCND:US1NYNS0024,",,N,",89.0
800,2018-01-04T00:00:00,SNOW,GHCND:US1NYNS0030,",,N,",102.0


In [4]:
# The same filter expressed as a boolean mask; `equals` confirms that it
# selects exactly the rows returned by the `query()` call.
weather.loc[
    weather.datatype.eq('SNOW')
    & weather.value.gt(0)
    & weather.station.str.contains('US1NY')
].equals(snow_data)

True

### Merging DataFrames

In [5]:
station_info = pd.read_csv('../data/weather_stations.csv')
station_info.head()

Unnamed: 0,id,name,latitude,longitude,elevation
0,GHCND:US1CTFR0022,"STAMFORD 2.6 SSW, CT US",41.0641,-73.577,36.6
1,GHCND:US1CTFR0039,"STAMFORD 4.2 S, CT US",41.037788,-73.568176,6.4
2,GHCND:US1NJBG0001,"BERGENFIELD 0.3 SW, NJ US",40.921298,-74.001983,20.1
3,GHCND:US1NJBG0002,"SADDLE BROOK TWP 0.6 E, NJ US",40.902694,-74.083358,16.8
4,GHCND:US1NJBG0003,"TENAFLY 1.3 W, NJ US",40.91467,-73.9775,21.6


In [6]:
station_info.id.describe()

count                   279
unique                  279
top       GHCND:US1CTFR0022
freq                      1
Name: id, dtype: object

In [7]:
weather.station.describe()

count                 78780
unique                  110
top       GHCND:USW00094789
freq                   4270
Name: station, dtype: object

In [8]:
station_info.shape[0], weather.shape[0]

(279, 78780)

In [9]:
def get_row_count(*dfs):
    """
    Return the number of rows of each object passed in.

    Parameters:
        - dfs: Any number of objects exposing a `shape` attribute
          (e.g. `DataFrame` objects).

    Returns:
        A list with the first entry of each object's `shape`,
        in the order the objects were given.
    """
    row_counts = []
    for frame in dfs:
        row_counts.append(frame.shape[0])
    return row_counts

In [10]:
get_row_count(station_info, weather)

[279, 78780]

In [11]:
inner_join = weather.merge(station_info, left_on='station', right_on='id')
inner_join.sample(5, random_state=0)

Unnamed: 0,date,datatype,station,attributes,value,id,name,latitude,longitude,elevation
10739,2018-08-07T00:00:00,SNOW,GHCND:US1NJMN0069,",,N,",0.0,GHCND:US1NJMN0069,"LONG BRANCH 1.7 SSW, NJ US",40.275368,-74.006027,9.4
45188,2018-12-21T00:00:00,TMAX,GHCND:USW00014732,",,W,2400",16.7,GHCND:USW00014732,"LAGUARDIA AIRPORT, NY US",40.77944,-73.88035,3.4
59823,2018-01-15T00:00:00,WDF5,GHCND:USW00094741,",,W,",40.0,GHCND:USW00094741,"TETERBORO AIRPORT, NJ US",40.85,-74.06139,2.7
10852,2018-10-31T00:00:00,PRCP,GHCND:US1NJMN0069,"T,,N,",0.0,GHCND:US1NJMN0069,"LONG BRANCH 1.7 SSW, NJ US",40.275368,-74.006027,9.4
46755,2018-05-05T00:00:00,SNOW,GHCND:USW00014734,",,W,",0.0,GHCND:USW00014734,"NEWARK LIBERTY INTERNATIONAL AIRPORT, NJ US",40.6825,-74.1694,2.1


In [12]:
# Renaming `id` to `station` first lets the join key be given once via `on`,
# so the result carries a single key column instead of both.
(
    weather
    .merge(station_info.rename(columns={'id': 'station'}), on='station')
    .sample(5, random_state=0)
)

Unnamed: 0,date,datatype,station,attributes,value,name,latitude,longitude,elevation
10739,2018-08-07T00:00:00,SNOW,GHCND:US1NJMN0069,",,N,",0.0,"LONG BRANCH 1.7 SSW, NJ US",40.275368,-74.006027,9.4
45188,2018-12-21T00:00:00,TMAX,GHCND:USW00014732,",,W,2400",16.7,"LAGUARDIA AIRPORT, NY US",40.77944,-73.88035,3.4
59823,2018-01-15T00:00:00,WDF5,GHCND:USW00094741,",,W,",40.0,"TETERBORO AIRPORT, NJ US",40.85,-74.06139,2.7
10852,2018-10-31T00:00:00,PRCP,GHCND:US1NJMN0069,"T,,N,",0.0,"LONG BRANCH 1.7 SSW, NJ US",40.275368,-74.006027,9.4
46755,2018-05-05T00:00:00,SNOW,GHCND:USW00014734,",,W,",0.0,"NEWARK LIBERTY INTERNATIONAL AIRPORT, NJ US",40.6825,-74.1694,2.1


In [13]:
left_join = station_info.merge(
    weather, left_on='id', right_on='station', how='left'
)

In [14]:
right_join = weather.merge(
    station_info, left_on='station', right_on='id', how='right'
)

In [15]:
right_join[right_join.datatype.isna()].head()

Unnamed: 0,date,datatype,station,attributes,value,id,name,latitude,longitude,elevation
0,,,,,,GHCND:US1CTFR0022,"STAMFORD 2.6 SSW, CT US",41.0641,-73.577,36.6
344,,,,,,GHCND:US1NJBG0001,"BERGENFIELD 0.3 SW, NJ US",40.921298,-74.001983,20.1
345,,,,,,GHCND:US1NJBG0002,"SADDLE BROOK TWP 0.6 E, NJ US",40.902694,-74.083358,16.8
718,,,,,,GHCND:US1NJBG0005,"WESTWOOD 0.8 ESE, NJ US",40.983041,-74.015858,15.8
719,,,,,,GHCND:US1NJBG0006,"RAMSEY 0.6 E, NJ US",41.058611,-74.134068,112.2


In [16]:
# A left join and a right join with the frames swapped hold the same data;
# normalise column order and row order on both sides before comparing.
(
    left_join
    .sort_index(axis=1)
    .sort_values(['date', 'station'], ignore_index=True)
    .equals(
        right_join
        .sort_index(axis=1)
        .sort_values(['date', 'station'], ignore_index=True)
    )
)

True

In [17]:
get_row_count(inner_join,left_join,right_join)

[78780, 78949, 78949]

In [18]:
# Full outer join of the weather observations with only those stations whose
# ID contains "US1NY". `indicator=True` adds a `_merge` column recording
# whether each row came from the left frame, the right frame, or both.
outer_join = weather.merge(
    station_info[station_info.id.str.contains('US1NY')],
    left_on='station', right_on='id',
    how='outer', indicator=True
)

In [19]:
outer_join

Unnamed: 0,date,datatype,station,attributes,value,id,name,latitude,longitude,elevation,_merge
0,2018-01-01T00:00:00,PRCP,GHCND:US1CTFR0039,",,N,",0.0,,,,,,left_only
1,2018-01-02T00:00:00,PRCP,GHCND:US1CTFR0039,",,N,",0.0,,,,,,left_only
2,2018-01-03T00:00:00,PRCP,GHCND:US1CTFR0039,",,N,",0.0,,,,,,left_only
3,2018-01-05T00:00:00,DAPR,GHCND:US1CTFR0039,",,N,",2.0,,,,,,left_only
4,2018-01-05T00:00:00,MDPR,GHCND:US1CTFR0039,",,N,",15.5,,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...
78797,,,,,,GHCND:US1NYSF0122,"NORTH BABYLON 0.5 E, NY US",40.731777,-73.315529,13.4,right_only
78798,,,,,,GHCND:US1NYWC0003,"WHITE PLAINS 3.1 NNW, NY US",41.063900,-73.772200,71.0,right_only
78799,,,,,,GHCND:US1NYWC0005,"HARRISON 4.1 SSW, NY US",40.963890,-73.723179,24.1,right_only
78800,,,,,,GHCND:US1NYWC0009,"NEW ROCHELLE 1.3 S, NY US",40.904000,-73.777000,21.9,right_only


In [20]:
# Draw two reproducible example rows for every value of the `_merge`
# indicator and show them together, ordered by their original index.
pd.concat([
    outer_join[outer_join._merge == kind].sample(2, random_state=0)
    for kind in outer_join._merge.unique()
]).sort_index()

Unnamed: 0,date,datatype,station,attributes,value,id,name,latitude,longitude,elevation,_merge
23634,2018-04-12T00:00:00,PRCP,GHCND:US1NYNS0043,",,N,",0.0,GHCND:US1NYNS0043,"PLAINVIEW 0.4 ENE, NY US",40.785919,-73.466873,56.7,both
25742,2018-03-25T00:00:00,PRCP,GHCND:US1NYSF0061,",,N,",0.0,GHCND:US1NYSF0061,"CENTERPORT 0.9 SW, NY US",40.891689,-73.383133,53.6,both
60645,2018-04-16T00:00:00,TMIN,GHCND:USW00094741,",,W,",3.9,,,,,,left_only
70764,2018-03-23T00:00:00,SNWD,GHCND:US1NJHD0002,",,N,",203.0,,,,,,left_only
78790,,,,,,GHCND:US1NYQN0033,"HOWARD BEACH 0.4 NNW, NY US",40.662099,-73.841345,2.1,right_only
78800,,,,,,GHCND:US1NYWC0009,"NEW ROCHELLE 1.3 S, NY US",40.904,-73.777,21.9,right_only


In [21]:
# Load the dirty data indexed by `date`, then remove fully duplicated rows
# and discard the `SNWD` column.
dirty_data = pd.read_csv(
    '../data/dirty_data.csv', index_col='date'
).drop_duplicates().drop(columns='SNWD')

dirty_data.head()

Unnamed: 0_level_0,station,PRCP,SNOW,TMAX,TMIN,TOBS,WESF,inclement_weather
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-01-01T00:00:00,?,0.0,0.0,5505.0,-40.0,,,
2018-01-02T00:00:00,GHCND:USC00280907,0.0,0.0,-8.3,-16.1,-12.2,,False
2018-01-03T00:00:00,GHCND:USC00280907,0.0,0.0,-4.4,-13.9,-13.3,,False
2018-01-04T00:00:00,?,20.6,229.0,5505.0,-40.0,,19.3,True
2018-01-05T00:00:00,?,0.3,,5505.0,-40.0,,,


In [22]:
valid_station = dirty_data.query('station != "?"')\
    .drop(columns=['WESF', 'station'])

In [23]:
station_with_wesf = dirty_data.query('station == "?"')\
    .drop(columns=['station', 'TOBS', 'TMIN', 'TMAX'])

In [24]:
# Left-join the two frames on their indices. Column names present in both
# frames keep their name on the left side and get a `_?` suffix on the right
# side. Only rows with a positive `WESF` are kept for display.
valid_station.merge(
    station_with_wesf, how='left',
    left_index=True, right_index=True,
    suffixes=('', '_?')
).query('WESF > 0').head()

Unnamed: 0_level_0,PRCP,SNOW,TMAX,TMIN,TOBS,inclement_weather,PRCP_?,SNOW_?,WESF,inclement_weather_?
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-01-30T00:00:00,0.0,0.0,6.7,-1.7,-0.6,False,1.5,13.0,1.8,True
2018-03-08T00:00:00,48.8,,1.1,-0.6,1.1,False,28.4,,28.7,
2018-03-13T00:00:00,4.1,51.0,5.6,-3.9,0.0,True,3.0,13.0,3.0,True
2018-03-21T00:00:00,0.0,0.0,2.8,-2.8,0.6,False,6.6,114.0,8.6,True
2018-04-02T00:00:00,9.1,127.0,12.8,-1.1,-1.1,True,14.0,152.0,15.2,True


In [25]:
valid_station.join(
    station_with_wesf, how='left', rsuffix='_?',
).query('WESF > 0').head()

Unnamed: 0_level_0,PRCP,SNOW,TMAX,TMIN,TOBS,inclement_weather,PRCP_?,SNOW_?,WESF,inclement_weather_?
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-01-30T00:00:00,0.0,0.0,6.7,-1.7,-0.6,False,1.5,13.0,1.8,True
2018-03-08T00:00:00,48.8,,1.1,-0.6,1.1,False,28.4,,28.7,
2018-03-13T00:00:00,4.1,51.0,5.6,-3.9,0.0,True,3.0,13.0,3.0,True
2018-03-21T00:00:00,0.0,0.0,2.8,-2.8,0.6,False,6.6,114.0,8.6,True
2018-04-02T00:00:00,9.1,127.0,12.8,-1.1,-1.1,True,14.0,152.0,15.2,True


In [27]:
# Index both frames by station ID so that Index set operations can compare
# the station identifiers directly.
# Reassign rather than using `inplace=True`: the result is the same, but the
# objects previously bound to these names are not mutated behind the scenes.
weather = weather.set_index('station')
station_info = station_info.set_index('id')

In [28]:
weather.index.intersection(station_info.index)

Index(['GHCND:US1CTFR0039', 'GHCND:US1NJBG0015', 'GHCND:US1NJBG0017',
       'GHCND:US1NJBG0018', 'GHCND:US1NJBG0023', 'GHCND:US1NJBG0030',
       'GHCND:US1NJBG0039', 'GHCND:US1NJBG0044', 'GHCND:US1NJES0018',
       'GHCND:US1NJES0024',
       ...
       'GHCND:US1NJBG0037', 'GHCND:USC00284987', 'GHCND:US1NJES0031',
       'GHCND:US1NJES0029', 'GHCND:US1NJMD0086', 'GHCND:US1NJMS0097',
       'GHCND:US1NJMN0081', 'GHCND:US1NJMD0088', 'GHCND:US1NJES0040',
       'GHCND:US1NYQN0029'],
      dtype='object', length=110)

In [29]:
weather.index.difference(station_info.index)

Index([], dtype='object')

In [30]:
station_info.index.difference(weather.index)

Index(['GHCND:US1CTFR0022', 'GHCND:US1NJBG0001', 'GHCND:US1NJBG0002',
       'GHCND:US1NJBG0005', 'GHCND:US1NJBG0006', 'GHCND:US1NJBG0008',
       'GHCND:US1NJBG0011', 'GHCND:US1NJBG0012', 'GHCND:US1NJBG0013',
       'GHCND:US1NJBG0020',
       ...
       'GHCND:USC00308322', 'GHCND:USC00308749', 'GHCND:USC00308946',
       'GHCND:USC00309117', 'GHCND:USC00309270', 'GHCND:USC00309400',
       'GHCND:USC00309466', 'GHCND:USC00309576', 'GHCND:USW00014708',
       'GHCND:USW00014786'],
      dtype='object', length=169)

In [31]:
weather.index.unique().union(station_info.index)

Index(['GHCND:US1CTFR0022', 'GHCND:US1CTFR0039', 'GHCND:US1NJBG0001',
       'GHCND:US1NJBG0002', 'GHCND:US1NJBG0003', 'GHCND:US1NJBG0005',
       'GHCND:US1NJBG0006', 'GHCND:US1NJBG0008', 'GHCND:US1NJBG0010',
       'GHCND:US1NJBG0011',
       ...
       'GHCND:USW00014708', 'GHCND:USW00014732', 'GHCND:USW00014734',
       'GHCND:USW00014786', 'GHCND:USW00054743', 'GHCND:USW00054787',
       'GHCND:USW00094728', 'GHCND:USW00094741', 'GHCND:USW00094745',
       'GHCND:USW00094789'],
      dtype='object', length=279)

## Using DataFrame operations to enrich data

In [33]:
# Reload the weather observations, this time parsing the `date` column
# into datetimes.
weather = pd.read_csv(
    '../data/nyc_weather_2018.csv', parse_dates=['date']
)

# Load `fb_2018.csv` using the `date` column as the index;
# `parse_dates=True` asks pandas to parse that index as dates.
fb = pd.read_csv(
    '../data/fb_2018.csv', index_col='date', parse_dates=True
)

### Arithmetic and statistics

In [35]:
# Flag the days whose traded volume lies more than three standard
# deviations away from the mean volume.
fb.assign(
    abs_z_score_volume=lambda df_: (
        (df_.volume - df_.volume.mean()) / df_.volume.std()
    ).abs()
).query('abs_z_score_volume > 3')

Unnamed: 0_level_0,open,high,low,close,volume,abs_z_score_volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-03-19,177.01,177.17,170.06,172.56,88140060,3.145078
2018-03-20,167.47,170.2,161.95,168.15,129851768,5.315169
2018-03-21,164.8,173.4,163.3,169.39,106598834,4.105413
2018-03-26,160.82,161.1,149.02,160.06,126116634,5.120845
2018-07-26,174.89,180.13,173.75,176.26,169803668,7.393705


In [37]:
# Rank the days by the magnitude of the day-over-day change in volume and
# show the five largest changes (rank 1 is the largest absolute change).
# Both columns are computed from the frame handed to `assign` instead of the
# global `fb`, so the chain keeps working if a step is inserted before it.
fb.assign(
    volume_pct_change=lambda df_: df_.volume.pct_change(),
    pct_change_rank=lambda df_: (
        df_.volume_pct_change.abs().rank(ascending=False)
    )
).nsmallest(5, 'pct_change_rank')

Unnamed: 0_level_0,open,high,low,close,volume,volume_pct_change,pct_change_rank
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-01-12,178.06,181.48,177.4,179.37,77551299,7.087876,1.0
2018-03-19,177.01,177.17,170.06,172.56,88140060,2.611789,2.0
2018-07-26,174.89,180.13,173.75,176.26,169803668,1.628841,3.0
2018-09-21,166.64,167.25,162.81,162.93,45994800,1.428956,4.0
2018-03-26,160.82,161.1,149.02,160.06,126116634,1.352496,5.0


In [38]:
fb['2018-01-11':'2018-01-12']

Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-01-11,188.4,188.4,187.38,187.77,9588587
2018-01-12,178.06,181.48,177.4,179.37,77551299


In [39]:
(fb > 215).any()

open       True
high       True
low       False
close      True
volume     True
dtype: bool

In [40]:
(fb > 215).all()

open      False
high      False
low       False
close     False
volume     True
dtype: bool

### Binning

In [41]:
(fb.volume.value_counts() > 1).sum()

0

In [42]:
(fb.volume.value_counts() > 1).any()

False

In [43]:
volume_binned = pd.cut(
    fb.volume, bins=3, labels=['low', 'med', 'high']
)

volume_binned.value_counts()

low     240
med       8
high      3
Name: volume, dtype: int64

In [44]:
fb[volume_binned == 'high'].sort_values('volume', ascending=False)

Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-07-26,174.89,180.13,173.75,176.26,169803668
2018-03-20,167.47,170.2,161.95,168.15,129851768
2018-03-26,160.82,161.1,149.02,160.06,126116634


In [47]:
type(volume_binned)

pandas.core.series.Series

In [49]:
fb['2018-07-25':'2018-07-26']

Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-07-25,215.715,218.62,214.27,217.5,64592585
2018-07-26,174.89,180.13,173.75,176.26,169803668


In [50]:
fb['2018-03-16':'2018-03-20']

Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-03-16,184.49,185.33,183.41,185.09,24403438
2018-03-19,177.01,177.17,170.06,172.56,88140060
2018-03-20,167.47,170.2,161.95,168.15,129851768


In [51]:
volume_qbinned = pd.qcut(
    fb.volume, q=4, labels=['q1', 'q2', 'q3', 'q4']
)

volume_qbinned.value_counts()

q1    63
q2    63
q4    63
q3    62
Name: volume, dtype: int64

### Applying functions

In [52]:
# Keep the observations of station GHCND:USW00094728 and reshape them from
# long to wide format: one row per date, one column per `datatype`, with the
# measured `value` in the cells.
central_park_weather = weather.query(
    'station == "GHCND:USW00094728"'
).pivot(index='date', columns='datatype', values='value')

In [55]:
# Standardise each selected column (subtract its mean, divide by its
# standard deviation) using the October 2018 observations only.
oct_weather_z_scores = (
    central_park_weather
    .loc['2018-10', ['TMIN', 'TMAX', 'PRCP']]
    .apply(lambda col: (col - col.mean()) / col.std())
)

oct_weather_z_scores.describe().T

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
datatype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
TMIN,31.0,-1.790682e-16,1.0,-1.339112,-0.751019,-0.474269,1.065152,1.843511
TMAX,31.0,1.951844e-16,1.0,-1.305582,-0.870013,-0.138258,1.011643,1.604016
PRCP,31.0,4.6557740000000005e-17,1.0,-0.394438,-0.394438,-0.394438,-0.240253,3.936167


In [56]:
oct_weather_z_scores.query('PRCP > 3').PRCP

date
2018-10-27    3.936167
Name: PRCP, dtype: float64

In [57]:
central_park_weather.loc['2018-10', 'PRCP'].describe()

count    31.000000
mean      2.941935
std       7.458542
min       0.000000
25%       0.000000
50%       0.000000
75%       1.150000
max      32.300000
Name: PRCP, dtype: float64

### Window calculations

#### Rolling windows

In [58]:
# Daily precipitation next to its 3-day rolling total for the first week of
# October 2018, transposed so that the dates run across the columns.
(
    central_park_weather
    .loc['2018-10']
    .assign(rolling_PRCP=lambda df_: df_.PRCP.rolling('3D').sum())
    .loc[:, ['PRCP', 'rolling_PRCP']]
    .head(7)
    .T
)

date,2018-10-01,2018-10-02,2018-10-03,2018-10-04,2018-10-05,2018-10-06,2018-10-07
datatype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
PRCP,0.0,17.5,0.0,1.0,0.0,0.0,0.0
rolling_PRCP,0.0,17.5,17.5,18.5,1.0,1.0,0.0


In [59]:
central_park_weather.loc['2018-10'].rolling('3D').mean().head(7).iloc[:,:6]

datatype,AWND,PRCP,SNOW,SNWD,TMAX,TMIN
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-10-01,0.9,0.0,0.0,0.0,24.4,17.2
2018-10-02,0.9,8.75,0.0,0.0,24.7,17.75
2018-10-03,0.966667,5.833333,0.0,0.0,24.233333,17.566667
2018-10-04,0.8,6.166667,0.0,0.0,24.233333,17.2
2018-10-05,1.033333,0.333333,0.0,0.0,23.133333,16.3
2018-10-06,0.833333,0.333333,0.0,0.0,22.033333,16.3
2018-10-07,1.066667,0.0,0.0,0.0,22.6,17.4


In [61]:
# Compare 3-day rolling aggregates for October 2018 with the raw daily values.
(central_park_weather
 .loc['2018-10']
 .rolling('3D')
 # a different aggregation for each column
 .agg({'TMAX':'max', 'TMIN':'min', 'AWND':'mean', 'PRCP':'sum'})
 .join(
     # the same columns, unaggregated; both sides share these column names,
     # so the rolling (left) side is tagged with the `_rolling` suffix
     central_park_weather[['TMAX', 'TMIN', 'AWND', 'PRCP']],
     lsuffix='_rolling'
 )
 # alphabetical column order puts each raw column next to its rolling version
 .sort_index(axis=1)
)

datatype,AWND,AWND_rolling,PRCP,PRCP_rolling,TMAX,TMAX_rolling,TMIN,TMIN_rolling
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-10-01,0.9,0.9,0.0,0.0,24.4,24.4,17.2,17.2
2018-10-02,0.9,0.9,17.5,17.5,25.0,25.0,18.3,17.2
2018-10-03,1.1,0.966667,0.0,17.5,23.3,25.0,17.2,17.2
2018-10-04,0.4,0.8,1.0,18.5,24.4,25.0,16.1,16.1
2018-10-05,1.6,1.033333,0.0,1.0,21.7,24.4,15.6,15.6
2018-10-06,0.5,0.833333,0.0,1.0,20.0,24.4,17.2,15.6
2018-10-07,1.1,1.066667,0.0,0.0,26.1,26.1,19.4,15.6
2018-10-08,1.8,1.133333,0.0,0.0,23.3,26.1,17.8,17.2
2018-10-09,0.3,1.066667,0.0,0.0,25.0,26.1,18.9,17.8
2018-10-10,1.2,1.1,0.0,0.0,26.7,26.7,21.7,17.8


#### Expanding windows

In [62]:
central_park_weather.loc['2018-06'].assign(
    TOTAL_PRCP=lambda df_: df_.PRCP.cumsum(),
    AVG_PRCP=lambda df_: df_.PRCP.expanding().mean()
).head(10)[['PRCP', 'TOTAL_PRCP', 'AVG_PRCP']].T

date,2018-06-01,2018-06-02,2018-06-03,2018-06-04,2018-06-05,2018-06-06,2018-06-07,2018-06-08,2018-06-09,2018-06-10
datatype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
PRCP,6.9,2.0,6.4,4.1,0.0,0.0,0.0,0.0,0.0,0.3
TOTAL_PRCP,6.9,8.9,15.3,19.4,19.4,19.4,19.4,19.4,19.4,19.7
AVG_PRCP,6.9,4.45,5.1,4.85,3.88,3.233333,2.771429,2.425,2.155556,1.97


In [63]:
# Expanding (cumulative) aggregates for the first week of October 2018,
# joined to the raw daily values for a side-by-side comparison.
# The aggregations are given as strings rather than NumPy callables: passing
# `np.max`/`np.min`/`np.mean`/`np.sum` to `agg()` is deprecated in recent
# pandas releases, and strings match the rolling-window cell in this notebook.
(central_park_weather
 .loc['2018-10-01':'2018-10-07']
 .expanding()
 .agg({'TMAX': 'max', 'TMIN': 'min', 'AWND': 'mean', 'PRCP': 'sum'})
 .join(
     # both sides share these column names, so the expanding (left) side is
     # tagged with the `_expanding` suffix
     central_park_weather[['TMAX', 'TMIN', 'AWND', 'PRCP']],
     lsuffix='_expanding'
 )
 .sort_index(axis=1)
)

datatype,AWND,AWND_expanding,PRCP,PRCP_expanding,TMAX,TMAX_expanding,TMIN,TMIN_expanding
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-10-01,0.9,0.9,0.0,0.0,24.4,24.4,17.2,17.2
2018-10-02,0.9,0.9,17.5,17.5,25.0,25.0,18.3,17.2
2018-10-03,1.1,0.966667,0.0,17.5,23.3,25.0,17.2,17.2
2018-10-04,0.4,0.825,1.0,18.5,24.4,25.0,16.1,16.1
2018-10-05,1.6,0.98,0.0,18.5,21.7,25.0,15.6,15.6
2018-10-06,0.5,0.9,0.0,18.5,20.0,25.0,17.2,15.6
2018-10-07,1.1,0.928571,0.0,18.5,26.1,26.1,19.4,15.6


#### Exponentially weighted moving windows

In [64]:
# Put the 30-day rolling mean of TMAX next to its exponentially weighted
# moving average (span of 30) for the days around the start of October.
(
    central_park_weather
    .assign(
        AVG=lambda df_: df_.TMAX.rolling('30D').mean(),
        EWMA=lambda df_: df_.TMAX.ewm(span=30).mean(),
    )
    .loc['2018-09-29':'2018-10-08', ['TMAX', 'EWMA', 'AVG']]
    .T
)

date,2018-09-29,2018-09-30,2018-10-01,2018-10-02,2018-10-03,2018-10-04,2018-10-05,2018-10-06,2018-10-07,2018-10-08
datatype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
TMAX,22.2,21.1,24.4,25.0,23.3,24.4,21.7,20.0,26.1,23.3
EWMA,24.410887,24.197281,24.21036,24.261304,24.199285,24.212234,24.050154,23.788854,23.93796,23.896802
AVG,24.723333,24.573333,24.533333,24.46,24.163333,23.866667,23.533333,23.07,23.143333,23.196667


### Pipes

In [65]:
def get_info(df):
    """
    Summarise a frame as a one-line string.

    Parameters:
        - df: A `DataFrame` object with a `close` column.

    Returns:
        A string giving the row count, the column count and the
        maximum of the `close` column.
    """
    template = '%d rows, %d cols and max closing Z-score: %d'
    # NOTE: `%d` formats the maximum as an integer, so any fractional part of
    # the value is dropped in the message.
    values = tuple(df.shape) + (df.close.max(),)
    return template % values

In [66]:
get_info(fb.loc['2018-Q1']
         .apply(lambda x: (x - x.mean()) / x.std()))

'61 rows, 5 cols and max closing Z-score: 1'

In [70]:
(fb
 .loc['2018-Q1']
 .apply(lambda x: (x - x.mean())/x.std())
 .pipe(get_info)
)

'61 rows, 5 cols and max closing Z-score: 1'

In [71]:
fb.pipe(pd.DataFrame.rolling, '20D').mean().equals(
    fb.rolling('20D').mean()
)

True

In [72]:
from window_calc import window_calc

In [73]:
window_calc??

[0;31mSignature:[0m [0mwindow_calc[0m[0;34m([0m[0mdf[0m[0;34m,[0m [0mfunc[0m[0;34m,[0m [0magg_dict[0m[0;34m,[0m [0;34m*[0m[0margs[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
[0;32mdef[0m [0mwindow_calc[0m[0;34m([0m[0mdf[0m[0;34m,[0m [0mfunc[0m[0;34m,[0m [0magg_dict[0m[0;34m,[0m [0;34m*[0m[0margs[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34m"""[0m
[0;34m    Run a window calculation of your choice on a `DataFrame` object.[0m
[0;34m    [0m
[0;34m    Parameters:[0m
[0;34m        - df: The `DataFrame` object to run the calculation on.[0m
[0;34m        - func: The window calculation method that takes `df`[0m
[0;34m          as the first argument.[0m
[0;34m        - agg_dict: Information to pass to `agg()`, could be a[0m
[0;34m          dictionary mapping the columns to the aggregation[0m
[0;34m          function to use

In [77]:
# Expanding median of every column. The aggregation is named with a string
# (as in the `ewm` call in this notebook) because passing a NumPy callable
# such as `np.median` to `agg()` is deprecated in recent pandas releases.
window_calc(fb, pd.DataFrame.expanding, 'median').head()

Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-01-02,177.68,181.58,177.55,181.42,18151903.0
2018-01-03,179.78,183.18,179.44,183.045,17519233.0
2018-01-04,181.88,184.78,181.33,184.33,16886563.0
2018-01-05,183.39,185.495,182.7148,184.5,15383729.5
2018-01-08,184.9,186.21,184.0996,184.67,16886563.0


In [78]:
window_calc(fb, pd.DataFrame.ewm, 'mean', span=3).head()

Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-01-02,177.68,181.58,177.55,181.42,18151900.0
2018-01-03,180.48,183.713333,180.07,183.586667,17308340.0
2018-01-04,183.005714,185.14,182.372629,184.011429,15349800.0
2018-01-05,184.384,186.078667,183.73656,185.525333,14402990.0
2018-01-08,185.837419,187.534839,185.07511,186.947097,16256790.0


In [79]:
window_calc(
    central_park_weather.loc['2018-10'],
    pd.DataFrame.rolling,
    {'TMAX':'max', 'TMIN':'min', 'AWND':'mean', 'PRCP':'sum'},
    '3D'
).head()

datatype,TMAX,TMIN,AWND,PRCP
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-10-01,24.4,17.2,0.9,0.0
2018-10-02,25.0,17.2,0.9,17.5
2018-10-03,25.0,17.2,0.966667,17.5
2018-10-04,25.0,16.1,0.8,18.5
2018-10-05,24.4,15.6,1.033333,1.0


## Aggregating data

In [80]:
# pg 229 (PDF page 235)