In [1]:
# This notebook shows how to use pandas to aggregate eaglei_outages_2016.csv so that we get the
# mean and median of the number of outages for a particular day (in this example 2016-06-20)

# see https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html for details about pandas groupby and 
# aggregation

In [3]:
# The eaglei_outages_2016.csv file must be present in the data directory. It is too big to upload
# to GitHub, so keep your own local copy; a .gitignore entry prevents git from trying to commit it.
import pandas as pd

ei_2016_df = pd.read_csv(
    "data/eaglei_outages_2016.csv",
    sep=",",
    low_memory=False,
)

# Parse the run_start_time strings into pandas Timestamps, stored in a new "datetimeformat" column.
ei_2016_df["datetimeformat"] = pd.to_datetime(
    ei_2016_df["run_start_time"],
    format="%Y-%m-%d %H:%M:%S",
)
ei_2016_df

Unnamed: 0,fips_code,county,state,sum,run_start_time,datetimeformat
0,2122,Kenai Peninsula,Alaska,657,2016-01-01 00:00:00,2016-01-01
1,5003,Ashley,Arkansas,7,2016-01-01 00:00:00,2016-01-01
2,5029,Conway,Arkansas,2,2016-01-01 00:00:00,2016-01-01
3,5069,Jefferson,Arkansas,41,2016-01-01 00:00:00,2016-01-01
4,5081,Little River,Arkansas,61,2016-01-01 00:00:00,2016-01-01
...,...,...,...,...,...,...
13306019,54061,Monongalia,West Virginia,472,2016-12-31 00:00:00,2016-12-31
13306020,54069,Ohio,West Virginia,37,2016-12-31 00:00:00,2016-12-31
13306021,54075,Pocahontas,West Virginia,200,2016-12-31 00:00:00,2016-12-31
13306022,54099,Wayne,West Virginia,2,2016-12-31 00:00:00,2016-12-31


In [4]:
# Sanity check: the first value of the parsed column should be a pandas Timestamp.
type(ei_2016_df["datetimeformat"].iloc[0])

pandas._libs.tslibs.timestamps.Timestamp

In [5]:
# Select only the rows whose timestamp falls on 2016-06-20, i.e. the half-open interval
# [2016-06-20 00:00:00, 2016-06-21 00:00:00).
# Both bounds are combined into ONE boolean mask built on the full frame. Chaining two .loc calls
# would apply a mask indexed like the full frame to an already-filtered frame, which only works
# because pandas reindexes the mask behind the scenes, and it builds a throwaway intermediate frame.
on_20160620 = (ei_2016_df["datetimeformat"] >= "2016-06-20") & (ei_2016_df["datetimeformat"] < "2016-06-21")
ei_20160620_df = ei_2016_df.loc[on_20160620]


In [6]:
# Each fips code identifies one county, so grouping on "fips_code" lets us aggregate the
# whole day's rows county by county.
ei_fipscode_group = ei_20160620_df.groupby(by="fips_code")

In [12]:
# Peek at the first (fips_code, rows) pair to see how the data is now grouped by fips_code.
# Nothing is printed if there are no groups at all.
first_entry = next(iter(ei_fipscode_group), None)
if first_entry is not None:
    name, group = first_entry
    print(name)
    print(group)

1003
         fips_code   county    state  sum       run_start_time   
5979190       1003  Baldwin  Alabama    1  2016-06-20 08:45:00  \
5984274       1003  Baldwin  Alabama    1  2016-06-20 12:30:00   
5987838       1003  Baldwin  Alabama    1  2016-06-20 14:00:00   
5988506       1003  Baldwin  Alabama    1  2016-06-20 14:15:00   
5989204       1003  Baldwin  Alabama    2  2016-06-20 14:30:00   
5989907       1003  Baldwin  Alabama    1  2016-06-20 14:45:00   
5990614       1003  Baldwin  Alabama    1  2016-06-20 15:00:00   
5991322       1003  Baldwin  Alabama    1  2016-06-20 15:15:00   
5992029       1003  Baldwin  Alabama    1  2016-06-20 15:30:00   
5992728       1003  Baldwin  Alabama    1  2016-06-20 15:45:00   
5993424       1003  Baldwin  Alabama    8  2016-06-20 16:00:00   
5994101       1003  Baldwin  Alabama    9  2016-06-20 16:15:00   
5994779       1003  Baldwin  Alabama   10  2016-06-20 16:30:00   
5995463       1003  Baldwin  Alabama    9  2016-06-20 16:45:00   
59961

In [14]:
# Aggregate the rows in each group (i.e. in each county): keep the first county and state label,
# and compute the mean and median of the "sum" column. More aggregation functions can be added
# to the "sum" list if wanted. The result has two-level column labels (column, aggregation).
agg_spec = {
    "county": "first",
    "state": "first",
    "sum": ["mean", "median"],
}
ei_fipscode_agg = ei_fipscode_group[["county", "state", "sum"]].agg(agg_spec)
ei_fipscode_agg


Unnamed: 0_level_0,county,state,sum,sum
Unnamed: 0_level_1,first,first,mean,median
fips_code,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1003,Baldwin,Alabama,3.240000,1.0
1013,Butler,Alabama,2.384615,2.0
1031,Coffee,Alabama,10.428571,2.0
1039,Covington,Alabama,1.000000,1.0
1041,Crenshaw,Alabama,210.125000,334.0
...,...,...,...,...
55133,Waukesha,Wisconsin,9.285714,1.0
55135,Waupaca,Wisconsin,4.428571,5.0
55139,Winnebago,Wisconsin,1.000000,1.0
55141,Wood,Wisconsin,1.000000,1.0
