In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### 3/19/2021: Discovered the data from CA Open Data Portal had been deprecated
New datasets are from CA HHS Open Data, with different organization

#### Skip the following development work to update data files

In [None]:
cases = pd.read_csv("resources/statewide_cases.csv")
cases

In [None]:
new_cases = pd.read_csv("resources/covid19cases_test.csv")
new_cases

In [None]:
# create new DataFrame to fit into old paradigm, now with population
data = new_cases[["area", "cumulative_cases", "cumulative_deaths", "cases", "deaths", "date", "population"]]

In [None]:
# renaming the columns is easier than changing the function
data.columns = ["county", "totalcountconfirmed", "totalcountdeaths", "newcountconfirmed", "newcountdeaths", "date", "population"]

In [None]:
# dropping the rows having NaN values 
data = data.dropna()

In [None]:
# dates sorted from earliest to latest (low to high)
data = data.sort_values("date")

In [None]:
# remove rows that are not counties or California
clean_cases = data.loc[ (data["county"] != "Unknown") & (data["county"] != "Out of state")]
# create list of 58 counties to iterate through
counties = clean_cases["county"].unique().tolist()

In [None]:
# given the dates in our dataset, create list of dates & array of integers to represent one-week periods
dates = clean_cases["date"].unique().tolist()
weeks = np.arange(0, len(dates), 7).tolist()

In [None]:
# create an empty DataFrame for appending calculated data to
column_names = ["County", "week", "week number", "total cases", "total deaths", "Population"]
weekly_df = pd.DataFrame(columns = column_names)

In [None]:
# for loops that iterate through each week period, collecting the total cases and deaths per county
counter = 1
for ref in weeks:
    for county in counties:
        try:
            df = clean_cases.loc[clean_cases["county"] == county]
            df = df.reset_index(drop=True)
            df = df.loc[(df.index >= ref) & (df.index <= (ref+6))]
            weekly_cases = df["newcountconfirmed"].sum()
            weekly_deaths = df["newcountdeaths"].sum()
            period = f"{dates[ref]} to {dates[ref+6]}"
            pop = df["population"].unique()[0]
            data = pd.DataFrame({"County": [county], "week": period, "week number": counter,
                                 "total cases": weekly_cases, "total deaths": weekly_deaths, "Population": pop})
            weekly_df = weekly_df.append(data)
                
        except:
            print("Not enough dates.")
        
    counter += 1

In [None]:
# create new columns for calculating weekly averages & per capita weekly averages
weekly_df["average cases"] = weekly_df["total cases"]/ 7
weekly_df["cases per cap"] = (weekly_df["average cases"]/weekly_df["Population"]) * 100000
weekly_df["average deaths"] = weekly_df["total deaths"]/ 7
weekly_df["deaths per cap"] = (weekly_df["average deaths"]/weekly_df["Population"]) * 100000

# format columns for rounded decimals and commas
weekly_df["Population"] = weekly_df["Population"].map("{:,}".format)
weekly_df["average cases"] = weekly_df["average cases"].astype(float).round(3)
weekly_df["cases per cap"] = weekly_df["cases per cap"].astype(float).round(3)
weekly_df["average deaths"] = weekly_df["average deaths"].astype(float).round(3)
weekly_df["deaths per cap"] = weekly_df["deaths per cap"].astype(float).round(3)

In [None]:
# verification
weekly_df

### Cells for updating data

In [2]:
# commentary for code above
def weekly(new_cases):
    data = new_cases[["area", "cumulative_cases", "cumulative_deaths", "cases", "deaths", "date", "population"]]
    data.columns = ["county", "totalcountconfirmed", "totalcountdeaths", "newcountconfirmed", "newcountdeaths", "date", "population"]
    data = data.dropna()
    data = data.sort_values("date")
    clean_cases = data.loc[ (data["county"] != "Unknown") & (data["county"] != "Out of state")]
    counties = clean_cases["county"].unique().tolist()
    dates = clean_cases["date"].unique().tolist()
    weeks = np.arange(0, len(dates), 7).tolist()
    column_names = ["County", "week", "week number", "total cases", "total deaths", "Population"]
    weekly_df = pd.DataFrame(columns = column_names)
    counter = 1
    for ref in weeks:
        for county in counties:
            try:
                df = clean_cases.loc[clean_cases["county"] == county]
                df = df.reset_index(drop=True)
                df = df.loc[(df.index >= ref) & (df.index <= (ref+6))]
                weekly_cases = df["newcountconfirmed"].sum()
                weekly_deaths = df["newcountdeaths"].sum()
                period = f"{dates[ref]} to {dates[ref+6]}"
                pop = df["population"].unique()[0]
                data = pd.DataFrame({"County": [county], "week": period, "week number": counter,
                                     "total cases": weekly_cases, "total deaths": weekly_deaths, "Population": pop})
                weekly_df = weekly_df.append(data)
            except:
                print("Not enough dates.")
        counter += 1
    weekly_df["average cases"] = weekly_df["total cases"]/ 7
    weekly_df["cases per cap"] = (weekly_df["average cases"]/weekly_df["Population"]) * 100000
    weekly_df["average deaths"] = weekly_df["total deaths"]/ 7
    weekly_df["deaths per cap"] = (weekly_df["average deaths"]/weekly_df["Population"]) * 100000
    weekly_df["Population"] = weekly_df["Population"].map("{:,}".format)
    weekly_df["average cases"] = weekly_df["average cases"].astype(float).round(3)
    weekly_df["cases per cap"] = weekly_df["cases per cap"].astype(float).round(3)
    weekly_df["average deaths"] = weekly_df["average deaths"].astype(float).round(3)
    weekly_df["deaths per cap"] = weekly_df["deaths per cap"].astype(float).round(3)
    return weekly_df

In [3]:
new_cases = pd.read_csv("resources/covid19cases_test.csv")
covid_weekly = weekly(new_cases)

KeyError: "['cumulative_deaths', 'cumulative_cases'] not in index"

In [None]:
covid_weekly

In [None]:
import json

In [None]:
# use .to_dict and 'records' orient to make our desired list
# can be used as 'data.js', or .insert_many with PyMongo
list_d = covid_weekly.to_dict(orient='records')

In [None]:
# write to a text file for our data.js later
with open("weekly_dict.txt", "w") as file:
    file.write(json.dumps(list_d))

### Chart generator

In [None]:
# operations here are for charting purposes
# use a dictionary to create multiple empty lists to store cases per cap
obj = {}
counties = covid_weekly["County"].unique().tolist()
for county in counties:
    obj[county] = []

# loop through counties and append cases per cap
for county in counties:
    county_df = covid_weekly.loc[covid_weekly["County"] == county]
    obj[county].append(county_df["cases per cap"].tolist())
    
x_axis = covid_weekly["week number"].unique().tolist()

In [None]:
# for loop that will plot and save a chart for each county listed
for county in counties:
    y = obj[county][0]

    plt.plot(x_axis, y, label = f"{county}")

    plt.title("Seven-Day Average COVID-19 Cases", fontweight="bold")
    plt.xlabel("Weeks: January 1, 2020 - December 2, 2020")
    plt.ylabel("Daily Rate per 100,000")
    plt.xticks(x_axis[::5])

    plt.legend(loc="best")
    plt.tight_layout()
    plt.grid()
    plt.savefig(f"Output_data/county_maps/{county}_cases.jpg", transparent=True, dpi=300)
    plt.clf()