In [9]:
#
# Dependencies

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [10]:
# Load the measles dataset from CSV and preview its first five rows
measles = pd.read_csv(filepath_or_buffer="measles.csv")
measles.head(n=5)

Unnamed: 0,week,state,state_name,disease,cases,incidence_per_capita
0,192801,AL,ALABAMA,MEASLES,97,3.67
1,192801,AR,ARKANSAS,MEASLES,76,4.11
2,192801,AZ,ARIZONA,MEASLES,8,1.9
3,192801,CA,CALIFORNIA,MEASLES,74,1.38
4,192801,CO,COLORADO,MEASLES,85,8.38


In [None]:
#
# Split the combined week code (YYYYWW) into separate year and week
# columns, then show a short sample again.
#
# The code is converted to text once and sliced with the vectorised
# .str accessor rather than a per-row lambda passed into .apply().
#
# First, characters [0:4] hold the year (YYYY); they are converted
# back to int and stored in the new column ["year"].
#
# Second, characters [4:6] hold the week of the year (WW); they are
# converted back to int and stored in the new column ["week_num"].
#
# Finally, the original week column is dropped as it is now redundant.
#
# The guard keeps this cell safe to re-run: once "week" has been
# replaced there is nothing left to transform, so the cell only
# redisplays the sample instead of raising a KeyError.

if "week" in measles.columns:
    week_code = measles["week"].astype(str)
    measles = measles.assign(
        year=week_code.str[0:4].astype(int),
        week_num=week_code.str[4:6].astype(int),
    ).drop(columns="week")
measles.head(3)

In [None]:
def summarise(thisDataframe, byThisColumn):
    """Aggregate case counts and incidence for each group of rows.

    Parameters
    ----------
    thisDataframe : pandas.DataFrame
        Must contain the columns "cases" and "incidence_per_capita" as
        well as every column named in ``byThisColumn``.
    byThisColumn : str or list of str
        Column name(s) to group by.

    Returns
    -------
    pandas.DataFrame
        One row per group, with the grouping column(s) first, followed by:

        * "avg_incidence_per_week" - mean of "incidence_per_capita"
        * "avg_cases_per_week"     - mean of "cases"
        * "total_cases_per_year"   - sum of "cases"

        The output column names are fixed; they do not change with the
        grouping columns that are passed in.
    """
    grouped = thisDataframe.groupby(by=byThisColumn)
    # A single named aggregation yields all three statistics already aligned
    # on the group keys, so no intermediate frames or merges are needed.
    summary = grouped.agg(
        avg_incidence_per_week=("incidence_per_capita", "mean"),
        avg_cases_per_week=("cases", "mean"),
        total_cases_per_year=("cases", "sum"),
    )
    # Turn the group keys back into ordinary columns.
    return summary.reset_index()

In [None]:
# Summarise cases and incidence for every (year, state) combination
measles_yearly_data = summarise(
    thisDataframe=measles,
    byThisColumn=["year", "state_name"],
)
measles_yearly_data.head(n=3)

In [None]:
# Summarise cases and incidence per year, pooling all states together
measles_year = summarise(thisDataframe=measles, byThisColumn=["year"])
measles_year.head(n=3)

In [None]:
# Scatter plot of the per-year summary: year on x, mean case count on y
measles_year.plot.scatter(x="year", y="avg_cases_per_week")
plt.show()