In [1]:
# This notebook shows how to use pandas to aggregate eaglei_outages_2016.csv so that we get the
# mean and median of the number of outages in each county for a particular day (in this example 2016-06-20)

# see https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html for details about pandas groupby and 
# aggregation

In [2]:
# The eaglei_outages_2016.csv file must be present in the data directory. It is too big to upload to
# GitHub, so keep a local copy yourself; an entry in .gitignore stops git from trying to commit it.
import pandas as pd

# Load the CSV (low_memory=False reads the file in one pass instead of in chunks) and append a
# "datetimeformat" column holding run_start_time parsed into pandas Timestamp values.
ei_2016_df = pd.read_csv("data/eaglei_outages_2016.csv", low_memory=False).assign(
    datetimeformat=lambda outages: pd.to_datetime(outages["run_start_time"], format="%Y-%m-%d %H:%M:%S")
)
ei_2016_df

Unnamed: 0,fips_code,county,state,sum,run_start_time,datetimeformat
0,2122,Kenai Peninsula,Alaska,657,2016-01-01 00:00:00,2016-01-01
1,5003,Ashley,Arkansas,7,2016-01-01 00:00:00,2016-01-01
2,5029,Conway,Arkansas,2,2016-01-01 00:00:00,2016-01-01
3,5069,Jefferson,Arkansas,41,2016-01-01 00:00:00,2016-01-01
4,5081,Little River,Arkansas,61,2016-01-01 00:00:00,2016-01-01
...,...,...,...,...,...,...
13306019,54061,Monongalia,West Virginia,472,2016-12-31 00:00:00,2016-12-31
13306020,54069,Ohio,West Virginia,37,2016-12-31 00:00:00,2016-12-31
13306021,54075,Pocahontas,West Virginia,200,2016-12-31 00:00:00,2016-12-31
13306022,54099,Wayne,West Virginia,2,2016-12-31 00:00:00,2016-12-31


In [3]:
# confirm that the converted column holds pandas Timestamp objects by inspecting its first value
type(ei_2016_df["datetimeformat"].iloc[0])

pandas._libs.tslibs.timestamps.Timestamp

In [5]:
# keep only the rows whose state is one of the three listed states (boolean mask built with isin)
ei_2016_swus_df = ei_2016_df[ei_2016_df["state"].isin(["Arizona", "California", "Nevada"])]

# keep only the rows for 2016-06-20: the interval is closed on the left and open on the right,
# so midnight of 2016-06-21 is excluded
ei_20160620_swus_df = ei_2016_swus_df[
    ei_2016_swus_df["datetimeformat"].between("2016-06-20", "2016-06-21", inclusive="left")
]

In [6]:
# Each fips code represents a county, so grouping on fips_code lets us aggregate the whole day's
# data county by county.
ei_fipscode_group = ei_20160620_swus_df.groupby(by="fips_code")

In [7]:
# Peek at how the data is grouped: iterating the groupby yields (fips_code, rows) pairs, and we
# stop after printing the first pair.
for name, group in ei_fipscode_group:
    print(name, group, sep="\n")
    break

4013
         fips_code    county    state  sum       run_start_time   
5966141       4013  Maricopa  Arizona  754  2016-06-20 00:00:00  \
5966602       4013  Maricopa  Arizona  799  2016-06-20 00:15:00   
5967038       4013  Maricopa  Arizona  798  2016-06-20 00:30:00   
5967477       4013  Maricopa  Arizona  845  2016-06-20 00:45:00   
5967927       4013  Maricopa  Arizona  904  2016-06-20 01:00:00   
...            ...       ...      ...  ...                  ...   
6007375       4013  Maricopa  Arizona   45  2016-06-20 21:30:00   
6007978       4013  Maricopa  Arizona   45  2016-06-20 21:45:00   
6008575       4013  Maricopa  Arizona   45  2016-06-20 22:00:00   
6009167       4013  Maricopa  Arizona   44  2016-06-20 22:15:00   
6009766       4013  Maricopa  Arizona   65  2016-06-20 23:00:00   

             datetimeformat  
5966141 2016-06-20 00:00:00  
5966602 2016-06-20 00:15:00  
5967038 2016-06-20 00:30:00  
5967477 2016-06-20 00:45:00  
5967927 2016-06-20 01:00:00  
...       

In [8]:
# Aggregate the data in each group (i.e. in each county): keep the first county and state name found
# in the group, and compute the mean and median of the "sum" column, i.e. the number of outages
# (other aggregations can be added to the list too if you want).

ei_fipscode_agg = ei_fipscode_group[["county", "state", "sum"]].agg(
    {
        "county": "first",
        "state": "first",
        "sum": ["mean", "median"],
    }
)
ei_fipscode_agg


Unnamed: 0_level_0,county,state,sum,sum
Unnamed: 0_level_1,first,first,mean,median
fips_code,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
4013,Maricopa,Arizona,321.747253,78.0
4019,Pima,Arizona,257.803922,25.0
4021,Pinal,Arizona,11.142857,14.0
6029,Kern,California,9.634146,6.0
6037,Los Angeles,California,4202.835165,3114.0
6059,Orange,California,2530.582418,2466.0
6065,Riverside,California,970.714286,502.0
6067,Sacramento,California,81.047619,101.0
6071,San Bernardino,California,586.318681,626.0
6073,San Diego,California,375.088889,424.0
