In [1]:
import pandas as pd
import matplotlib
%matplotlib inline 
import numpy as np

In [2]:
### Data transformation from previous notebooks
nyc = pd.read_csv('data/central-park-raw.csv', parse_dates=[0])
nyc.columns = [x.strip() for x in nyc.columns]
nyc.columns = [x.replace(' ', '_') for x in nyc.columns]
nyc.PrecipitationIn.replace("T", '0.001')
nyc.PrecipitationIn = pd.to_numeric(nyc.PrecipitationIn.replace("T", '0.001'))
nyc['Events'] = nyc.Events.fillna('')

# Grouping

Pandas allows us to perform aggregates calculations over grouped portions of ``Series`` or ``DataFrames``. The ``.groupby`` method is the low level workhorse that enables this.

In [None]:
# We can group by a column, but if it has unique values it isn't useful
nyc.groupby('EST').mean()['CloudCover']

In [None]:
# Let's get the average cloud cover each month
nyc.groupby(nyc.EST.dt.month).mean()['CloudCover']

In [None]:
# The previous aggregated over every month, 
# what if we want to group by year and month?
nyc.groupby([nyc.EST.dt.year, nyc.EST.dt.month]).mean()['CloudCover']

In [None]:
# The previous aggregated over every month, 
# what if we want to group by year and month?
nyc.groupby([nyc.EST.dt.year.rename('year'), nyc.EST.dt.month]).mean()['CloudCover']

In [None]:
nyc.groupby([nyc.EST.dt.year.rename('year'), nyc.EST.dt.month]).mean(
)['CloudCover'].plot(figsize=(14,10))

In [None]:
# With the .agg method we can apply many functions
nyc.groupby([nyc.EST.dt.year.rename('year'), nyc.EST.dt.month]).agg(['mean', 'max', 'count'])

In [None]:
# Then plot
nyc.groupby([nyc.EST.dt.year.rename('year'), nyc.EST.dt.month]).agg(
    ['mean', 'max', 'count'])['Mean_TemperatureF'].plot()

In [None]:
# Or just look at a table for a column
nyc.groupby([nyc.EST.dt.year.rename('year'), nyc.EST.dt.month]).agg(
    ['mean', 'max', 'count'])['Max_TemperatureF']

## Grouping Assignment
With the nino dataset:
* Find the mean temperature for each year
* Find the count of entries for each year
* Find the max temperature for each year

In [3]:
# Data transformation from previous notebook
# col names in tao-all2.col from website
names = '''obs
year
month
day
date
latitude
longitude
zon.winds
mer.winds
humidity
air temp.
s.s.temp.'''.split('\n')

nino = pd.read_csv('data/tao-all2.dat.gz', sep=' ', names=names, na_values='.', 
                   parse_dates=[[1,2,3]])
nino.columns = [x.replace('.', '_').replace(' ', '_') for x in nino.columns]
nino['air_temp_F'] = nino.air_temp_ * 9/5 + 32
wind_cols = [x for x in nino.columns if x.endswith('winds')]
for c in wind_cols:
    nino['{}_mph'.format(c)] = nino[c] * 2.237
pd.to_datetime(nino.date, format='%y%m%d')
nino = nino.drop('obs', axis=1)

In [15]:
##nino
nino.groupby([nino.year_month_day.dt.year.rename('Years'),nino.air_temp_]).agg('mean')

Unnamed: 0_level_0,Unnamed: 1_level_0,date,latitude,longitude,zon_winds,mer_winds,humidity,s_s_temp_,air_temp_F,zon_winds_mph,mer_winds_mph
Years,air_temp_,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1980,20.42,801014.000000,0.000000,-109.560000,-2.900000,0.400000,,20.5200,68.756,-6.487300,0.894800
1980,20.45,801012.000000,0.000000,-109.560000,-2.800000,-0.300000,,20.5100,68.810,-6.263600,-0.671100
1980,20.52,801013.000000,0.000000,-109.560000,-3.700000,0.100000,,20.7100,68.936,-8.276900,0.223700
1980,20.54,801011.000000,0.000000,-109.560000,-4.200000,1.200000,,20.7200,68.972,-9.395400,2.684400
1980,20.62,801016.000000,0.000000,-109.560000,-2.300000,1.600000,,20.4700,69.116,-5.145100,3.579200
1980,20.65,801009.000000,0.000000,-109.560000,-3.700000,2.200000,,20.7100,69.170,-8.276900,4.921400
1980,20.71,801010.000000,0.000000,-109.560000,-3.500000,1.100000,,20.7400,69.278,-7.829500,2.460700
1980,20.73,801015.000000,0.000000,-109.560000,-1.500000,-0.400000,,20.6200,69.314,-3.355500,-0.894800
1980,20.84,801017.000000,0.000000,-109.560000,-2.100000,0.100000,,20.5900,69.512,-4.697700,0.223700
1980,20.92,801018.000000,0.000000,-109.560000,-2.000000,1.300000,,20.7300,69.656,-4.474000,2.908100


# Pivoting

In [None]:
nyc.pivot_table(index=[nyc.EST.dt.year.rename('year'), nyc.EST.dt.month], aggfunc=[np.max, np.count_nonzero],
               values=['Max_Humidity', 'Max_Dew_PointF'])

In [None]:
nyc.pivot_table(index=[nyc.EST.dt.year.rename('year'), nyc.EST.dt.month], aggfunc=[np.max, np.count_nonzero],
               values=['Max_Humidity', 'Max_Dew_PointF']).plot(figsize=(14,10))

In [None]:
# We can "unstack" to pull a left index into a column (0 is the left most index)
nyc.pivot_table(index=[nyc.EST.dt.year.rename('year'), nyc.EST.dt.month], aggfunc=[np.max, np.count_nonzero],
               values=['Max_Humidity', 'Max_Dew_PointF']).unstack(0)

In [None]:
# We can "unstack" to pull a left index into a column (1 is the 2nd index)
nyc.pivot_table(index=[nyc.EST.dt.year.rename('year'), nyc.EST.dt.month], aggfunc=[np.max, np.count_nonzero],
               values=['Max_Humidity', 'Max_Dew_PointF']).unstack(1)

In [None]:
# Just use one value and one aggregation
nyc.pivot_table(index=[nyc.EST.dt.year.rename('year'), nyc.EST.dt.month], aggfunc=[np.max],
               values=['Mean_TemperatureF']).unstack(1)

In [None]:
# Just use one value and one aggregation by year
nyc.pivot_table(index=[nyc.EST.dt.year.rename('year'), nyc.EST.dt.month], aggfunc=[np.max],
               values=['Mean_TemperatureF']).unstack(1).plot(cmap='viridis', figsize=(14,10))

In [None]:
# Just use one value and one aggregation by month
nyc.pivot_table(index=[nyc.EST.dt.year.rename('year'), nyc.EST.dt.month], aggfunc=[np.max],
               values=['Mean_TemperatureF']).unstack(0).plot(cmap='viridis', figsize=(14,10))

## Pivoting Assignment
With the nino dataset:
* Pivot the nino data using the ``.pivot_table`` method. Group by year and month, the ``air_temp`` column. Reduce using the ``max``, ``min``, and ``np.mean`` functions. (You will either need to create a month column or use ``year_month_day.dt.month``)
* Plot a line plot of the previous pivot table

## Pivoting Bonus Assignment
* Using ``.groupby`` we can sometimes perform the same operation as pivot tables. Pivot the nino data using the ``.groupby`` method. Group by year and month, the ``air_temp_`` column. Reduce using the ``max``, ``min``, and ``np.mean`` functions using ``.groupby``. (Hint: Use the ``.agg`` method on the result of the group by)
* Use ``.unstack`` to see the mean ``air_temp_`` by year