In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

<font size=4>The previous method was more manual and resource-intensive: it involved filtering the dates into consecutive seven-day periods, then performing calculations on those groups with a nested for loop.

The new method uses .rolling() to produce more detailed and clearer information more efficiently; instead of fixed weeks, we display a rolling seven-day average.</font>

In [2]:
# Load the raw case data from the CSV file in the resources folder.
raw_csv_path = "resources/covid19cases_test.csv"
new_cases = pd.read_csv(raw_csv_path)

# Leave the frame as the cell's last expression so the notebook displays it.
new_cases

Unnamed: 0,date,area,area_type,population,cases,cumulative_cases,deaths,cumulative_deaths,total_tests,cumulative_total_tests,positive_tests,cumulative_positive_tests,reported_cases,cumulative_reported_cases,reported_deaths,cumulative_reported_deaths,reported_tests
0,2020-02-01,Alameda,County,1685886.0,3.0,3.0,0.0,0.0,4.0,4,0.0,0,0.0,0.0,0.0,0.0,
1,2020-02-02,Alameda,County,1685886.0,0.0,3.0,0.0,0.0,1.0,5,0.0,0,0.0,0.0,0.0,0.0,
2,2020-02-03,Alameda,County,1685886.0,0.0,3.0,0.0,0.0,0.0,5,0.0,0,0.0,0.0,0.0,0.0,
3,2020-02-04,Alameda,County,1685886.0,0.0,3.0,0.0,0.0,0.0,5,0.0,0,0.0,0.0,0.0,0.0,
4,2020-02-05,Alameda,County,1685886.0,0.0,3.0,0.0,0.0,1.0,6,0.0,0,0.0,0.0,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39279,2021-11-01,Yuba,County,79290.0,16.0,10091.0,0.0,60.0,304.0,116237,18.0,11672,15.0,10066.0,0.0,83.0,192.0
39280,2021-11-02,Yuba,County,79290.0,8.0,10099.0,0.0,60.0,171.0,116408,10.0,11682,14.0,10080.0,0.0,83.0,406.0
39281,2021-11-03,Yuba,County,79290.0,6.0,10105.0,0.0,60.0,73.0,116481,6.0,11688,12.0,10092.0,0.0,83.0,247.0
39282,2021-11-04,Yuba,County,79290.0,0.0,10105.0,0.0,60.0,,116481,,11688,15.0,10107.0,0.0,83.0,370.0


<font size="5">Data Cleaning</font>

In [3]:
# Keep only the columns relevant to the analysis, renaming "area" to "county".
data = new_cases[["area", "cases", "deaths", "date", "population"]].rename(
    columns={"area": "county"}
)

# Drop rows containing NaN values, then sort by the date column, ascending.
data = data.dropna().sort_values("date")

# Build the clean dataframe by removing rows whose county is one of the
# irrelevant values "Unknown" or "Out of state".
excluded_areas = ["Unknown", "Out of state"]
clean_cases = data.loc[~data["county"].isin(excluded_areas)]

# List of distinct counties, in order of first appearance, for later iteration.
counties = clean_cases["county"].unique().tolist()

In [4]:
# display the cleaned dataframe to check the result of the cleaning step
clean_cases

Unnamed: 0,county,cases,deaths,date,population
0,Alameda,3.0,0.0,2020-02-01,1685886.0
18032,Napa,0.0,0.0,2020-02-01,139652.0
5796,El Dorado,0.0,0.0,2020-02-01,193098.0
32844,Stanislaus,0.0,0.0,2020-02-01,562303.0
10304,Kings,0.0,0.0,2020-02-01,156444.0
...,...,...,...,...,...
32842,Sonoma,0.0,0.0,2021-11-04,496668.0
1930,Amador,0.0,0.0,2021-11-04,38531.0
36706,Tuolumne,0.0,0.0,2021-11-04,52351.0
26402,San Joaquin,0.0,0.0,2021-11-04,782545.0


<font size=5>Loop through counties and apply .rolling() and per capita calculations</font>

In [5]:
# Column layout of the results dataframe: the cleaned columns followed by the
# columns that will hold the rolling-average and per-capita calculations.
column_names = [
    "county",
    "cases",
    "deaths",
    "date",
    "population",
    "average_cases",
    "cases_per_cap",
    "average_deaths",
    "deaths_per_cap",
]

# Start with a dataframe that has those columns and no rows.
weekly_df = pd.DataFrame(columns=column_names)
weekly_df

Unnamed: 0,county,cases,deaths,date,population,average_cases,cases_per_cap,average_deaths,deaths_per_cap


In [6]:
# loop through counties, apply seven-day rolling window and calculations,
# then combine the per-county results into one dataframe
#
# The results are collected in a list and concatenated once because
# DataFrame.append was removed in pandas 2.0 (and copied the whole accumulated
# frame on every iteration). Rebuilding weekly_df from scratch also makes this
# cell safe to re-run: it no longer stacks a second copy of every row.

ROLLING_WINDOW = 7        # number of consecutive rows averaged together
PER_CAPITA_SCALE = 100000  # rates are expressed per 100,000 population


def add_rolling_metrics(county_df):
    """Return a re-indexed copy of one county's rows with rolling metrics added.

    Parameters
    ----------
    county_df : pandas.DataFrame
        Rows for a single county, containing "cases", "deaths" and
        "population" columns, already in the order the window should follow.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index and four extra columns:
        "average_cases", "cases_per_cap", "average_deaths", "deaths_per_cap".
        The first ROLLING_WINDOW - 1 rows hold NaN in those columns because
        the window is not yet full.

    Notes
    -----
    The window counts rows, not calendar days.
    NOTE(review): this equals seven days only if the county has exactly one
    row per consecutive date - confirm no dates are missing upstream.
    """
    # reset_index returns a new frame, so the input is left untouched
    result = county_df.reset_index(drop=True)

    metric_columns = (
        ("cases", "average_cases", "cases_per_cap"),
        ("deaths", "average_deaths", "deaths_per_cap"),
    )
    for source_col, average_col, per_cap_col in metric_columns:
        # rolling mean over the last ROLLING_WINDOW rows
        result[average_col] = result[source_col].rolling(window=ROLLING_WINDOW).mean()
        # rolling mean scaled to a rate per PER_CAPITA_SCALE population
        result[per_cap_col] = round(
            (result[average_col] / result["population"]) * PER_CAPITA_SCALE, 3
        )
    return result


# one frame of calculations per county, in the order the counties were listed
county_frames = [
    add_rolling_metrics(clean_cases.loc[clean_cases["county"] == county])
    for county in counties
]

# pd.concat raises on an empty list, so fall back to an empty frame with the
# expected columns when there are no counties to process
if county_frames:
    weekly_df = pd.concat(county_frames)
else:
    weekly_df = pd.DataFrame(columns=column_names)

# display results
weekly_df

Unnamed: 0,county,cases,deaths,date,population,average_cases,cases_per_cap,average_deaths,deaths_per_cap
0,Alameda,3.0,0.0,2020-02-01,1685886.0,,,,
1,Alameda,0.0,0.0,2020-02-02,1685886.0,,,,
2,Alameda,0.0,0.0,2020-02-03,1685886.0,,,,
3,Alameda,0.0,0.0,2020-02-04,1685886.0,,,,
4,Alameda,0.0,0.0,2020-02-05,1685886.0,,,,
...,...,...,...,...,...,...,...,...,...
638,Lassen,2.0,0.0,2021-10-31,30065.0,7.142857,23.758,0.142857,0.475
639,Lassen,7.0,0.0,2021-11-01,30065.0,6.285714,20.907,0.000000,0.000
640,Lassen,4.0,0.0,2021-11-02,30065.0,5.857143,19.482,0.000000,0.000
641,Lassen,1.0,0.0,2021-11-03,30065.0,3.714286,12.354,0.000000,0.000


In [7]:
import json

In [8]:
# use .to_dict with the 'records' orient to turn the dataframe into our
# desired list: one dict per row, keyed by column name
# the list can be used as 'data.js', or passed to .insert_many with PyMongo
# NOTE(review): rows whose rolling averages are NaN keep those NaN values in
# the resulting dicts - confirm the downstream consumer handles them
list_d = weekly_df.to_dict(orient='records')

In [9]:
# Serialise the list of records as JSON text and save it to a text file;
# its contents are used for our data.js later.
# NOTE(review): NaN values are written as bare NaN tokens, which JavaScript
# accepts but strict JSON parsers reject - confirm the consumer is fine with it.
with open("rolling_dict.txt", "w") as out_file:
    json.dump(list_d, out_file)