In [None]:
#
# Dependencies

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the measles dataset from "measles.csv" (resolved relative to the
# current working directory) and display its first five rows as a sample.
measles = pd.read_csv(filepath_or_buffer="measles.csv")
measles.head(n=5)

In [None]:
#
# Split the combined week code (YYYYWW) into separate year and
# week-number columns, then show a short sample again.
#
# The code is converted to text once, after which the vectorised
# .str accessor slices it:
#   * characters [0:4] hold the year -> new column ["year"]
#   * characters [4:7] hold the week -> new column ["week_num"]
# Both slices are cast back to integers.
#
# Finally the original ["week"] column is dropped because it is now
# redundant. .assign()/.drop() build a new dataframe rather than
# mutating the loaded one in place, and the guard lets this cell be
# run more than once: after the first run ["week"] is gone, so the
# step is skipped instead of raising a KeyError.

if "week" in measles.columns:
    week_code = measles["week"].astype(str)
    measles = measles.assign(
        year=week_code.str[0:4].astype(int),
        week_num=week_code.str[4:7].astype(int),
    ).drop(columns="week")
measles.head(3)

In [None]:
def aggregate(thisDataframe, byThisColumn):
    """Summarise case counts and incidence for each group of rows.

    Parameters
    ----------
    thisDataframe : pandas.DataFrame
        Must contain the columns ``"cases"`` and ``"incidence_per_capita"``
        in addition to every column named in ``byThisColumn``.
    byThisColumn : str or list of str
        Column label(s) to group by.

    Returns
    -------
    pandas.DataFrame
        One row per group, sorted by the group key(s), with a fresh
        0..n-1 index. Columns are the grouping column(s) followed by:

        * ``"weekly average incidence"`` - mean of ``incidence_per_capita``
        * ``"weekly average"``           - mean of ``cases``
        * ``"yearly sum"``               - sum of ``cases``

    Raises
    ------
    KeyError
        If a grouping column or one of the aggregated columns is missing.

    Notes
    -----
    The output column names are fixed because later cells refer to them.
    The "weekly"/"yearly" wording is only literally accurate when each
    input row is one week and the grouping includes the year.
    """
    # A single pass with named aggregation replaces three separate
    # aggregations stitched together by merges with implicit join keys.
    # Keyword order below fixes the output column order.
    summary = thisDataframe.groupby(by=byThisColumn).agg(
        **{
            "weekly average incidence": ("incidence_per_capita", "mean"),
            "weekly average": ("cases", "mean"),
            "yearly sum": ("cases", "sum"),
        }
    )
    # Turn the group keys back into ordinary columns.
    return summary.reset_index()

In [None]:
# Summarise cases and incidence for every (year, state) combination,
# then preview the first three rows of the result.
measlesByYearByState = aggregate(
    measles,
    byThisColumn=["year", "state_name"],
)
measlesByYearByState.head(n=3)

In [None]:
# Summarise cases and incidence per year across all states,
# then preview the first three rows of the result.
measlesByYear = aggregate(
    measles,
    byThisColumn=["year"],
)
measlesByYear.head(n=3)

In [None]:
# Scatter plot of the per-year mean of "cases" (the "weekly average"
# column) against the year. An explicit Axes is used so the figure can
# be given a title and stands on its own when the notebook is skimmed.
fig, ax = plt.subplots()
measlesByYear.plot(kind='scatter', x='year', y='weekly average', ax=ax)
ax.set_title("Weekly average measles cases by year")
plt.show()