# Stitch JHU data through various schema changes
* Reshape
* See what columns we need to derive

In [None]:
import numpy as np
import pandas as pd
import geopandas as gpd

In [None]:
# https://gist.github.com/rogerallen/1583593
# Maps a state / territory name (as it appears in the JHU data) to its postal abbreviation.
us_state_abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'American Samoa': 'AS',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'District of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Guam': 'GU',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Northern Mariana Islands': 'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY',
    # Add some other ones we found applicable
    'US Virgin Islands': 'VI',
    'United States Virgin Islands': 'VI',
    'Grand Princess': 'Grand Princess',
    'Diamond Princess': 'Diamond Princess',
    'From Diamond Princess': 'Diamond Princess',
    'Diamond Princess cruise ship': 'Diamond Princess',
}

# reverse the dict
# Several names share one abbreviation (e.g. three spellings map to 'VI').
# Keep the FIRST name listed for each abbreviation so that the canonical name
# wins over the aliases appended at the end of the dict above.
abbrev_us_state = {}
for _name, _abbrev in us_state_abbrev.items():
    abbrev_us_state.setdefault(_abbrev, _name)

## Pre 2/14/2020

In [None]:
# Archived JHU time-series CSVs (file names: time_series_2019-ncov-*), one per metric.
pre214_cases_url = (
    "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/"
    "archived_data/archived_time_series/time_series_2019-ncov-Confirmed.csv"
)
pre214_deaths_url = (
    "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/"
    "archived_data/archived_time_series/time_series_2019-ncov-Deaths.csv"
)
pre214_recovered_url = (
    "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/"
    "archived_data/archived_time_series/time_series_2019-ncov-Recovered.csv"
)

In [None]:
# Download the three pre-2/14 tables (confirmed, deaths, recovered) in that order.
cases1, deaths1, recovered1 = [
    pd.read_csv(url)
    for url in (pre214_cases_url, pre214_deaths_url, pre214_recovered_url)
]

Modified helper function: because the column headers are datetime strings, it keeps only the date portion of each header.

In [None]:
def parse_columns(df):
    """
    Split a wide table's column names into id columns and date columns,
    ready to be passed to pd.melt.

    Side effect: df's column names are rewritten in place, keeping only the
    text before the first space (so a "date time" header becomes "date").

    Returns (id_vars, dates): two lists of column names, in column order.
    A column counts as a date when its (trimmed) name ends with "20".
    """
    df.columns = df.columns.str.split(' ').str[0]

    names = list(df.columns)
    dates = [name for name in names if name.endswith("20")]
    id_vars = [name for name in names if not name.endswith("20")]
    return id_vars, dates

# Rename geography columns to be the same as future schemas
def rename_geog_cols(df):
    """
    Rename the old-schema geography columns in place and return the same frame.

    'Country/Region' -> 'Country_Region', 'Province/State' -> 'Province_State',
    'Long' -> 'Lon'. Columns that are absent are simply left alone.
    """
    new_names = {
        'Country/Region': 'Country_Region',
        'Province/State': 'Province_State',
        'Long': 'Lon',
    }
    df.rename(columns=new_names, inplace=True)
    return df

In [None]:
# Reshape confirmed cases from wide (one column per date) to long (one row per date)
id_vars, dates = parse_columns(cases1)
pre214_df = cases1.melt(id_vars=id_vars, value_vars=dates, var_name="date", value_name="cases")

# Same reshape for deaths
id_vars, dates = parse_columns(deaths1)
deaths_df = deaths1.melt(id_vars=id_vars, value_vars=dates, value_name="deaths")

# Same reshape for recovered
id_vars, dates = parse_columns(recovered1)
recovered_df = recovered1.melt(id_vars=id_vars, value_vars=dates, value_name="recovered")

# Attach deaths / recovered by row index.
# NOTE(review): this relies on the three tables melting into the same row order - confirm.
pre214_df["deaths"] = deaths_df["deaths"]
pre214_df["recovered"] = recovered_df["recovered"]

pre214_df["date"] = pd.to_datetime(pre214_df["date"])

pre214_df = rename_geog_cols(pre214_df)

## Pre 3/23
* This data overlaps with the pre-2/14 data; the duplicates are dealt with when the two are combined further down.

In [None]:
# Archived JHU time-series CSVs (file names: time_series_19-covid-*_archived_0325), one per metric.
pre323_cases_url = (
    "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/"
    "archived_data/archived_time_series/time_series_19-covid-Confirmed_archived_0325.csv"
)
pre323_deaths_url = (
    "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/"
    "archived_data/archived_time_series/time_series_19-covid-Deaths_archived_0325.csv"
)
pre323_recovered_url = (
    "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/"
    "archived_data/archived_time_series/time_series_19-covid-Recovered_archived_0325.csv"
)

In [None]:
# Download the three pre-3/23 tables (confirmed, deaths, recovered) in that order.
cases2, deaths2, recovered2 = [
    pd.read_csv(url)
    for url in (pre323_cases_url, pre323_deaths_url, pre323_recovered_url)
]

In [None]:
# Reshape confirmed cases from wide (one column per date) to long (one row per date)
id_vars, dates = parse_columns(cases2)
pre323_df = cases2.melt(id_vars=id_vars, value_vars=dates, var_name="date", value_name="cases")

# Same reshape for deaths
id_vars, dates = parse_columns(deaths2)
deaths_df2 = deaths2.melt(id_vars=id_vars, value_vars=dates, value_name="deaths")

# Same reshape for recovered
id_vars, dates = parse_columns(recovered2)
recovered_df2 = recovered2.melt(id_vars=id_vars, value_vars=dates, value_name="recovered")

# Attach deaths / recovered by row index.
# NOTE(review): this relies on the three tables melting into the same row order - confirm.
pre323_df["deaths"] = deaths_df2["deaths"]
pre323_df["recovered"] = recovered_df2["recovered"]

pre323_df["date"] = pd.to_datetime(pre323_df["date"])

pre323_df = rename_geog_cols(pre323_df)

## Pre 3/23 data is of 2 types
* Pre 3/10: county-level rows, which need to be summed up to get state totals
* 3/10-3/23: state-level rows; county-level detail is lost, except for the SCAG region

### Combine pre214_df and pre310_df and get rid of duplicates

In [None]:
# The US data shows county (with all zeros), state, and country counts.
# Filter out county (names contain a comma) because they're all zeros.
# Filter out country total, or else we double count.
# Rows with a missing Province_State are excluded too (na=True), as before.
# .copy() gives an independent frame, so adding columns to us_pre323 later
# does not write into a slice of pre323_df (SettingWithCopyWarning).
us_pre323 = pre323_df[(pre323_df.Country_Region == 'US') &
                      ~pre323_df.Province_State.str.contains(',', na=True) &
                      (pre323_df.Province_State != 'US')].copy()

world_pre323 = pre323_df[pre323_df.Country_Region != 'US']

In [None]:
# Add state abbrev
# assign() returns a new frame instead of writing a column into a filtered slice.
us_pre323 = us_pre323.assign(state_abbrev=us_pre323.Province_State.map(us_state_abbrev))

# There are some duplicates, such as US Virgin Islands or Virgin Islands. Drop as long as state abbrev is the same.
us_pre323 = us_pre323.drop_duplicates(subset = ['Country_Region', 'Lat', 'Lon', 'state_abbrev',
                                               'date', 'cases', 'deaths', 'recovered'])

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way.
pre323_df2 = pd.concat([us_pre323, world_pre323])

In [None]:
# Split at 3/10: up to and including 3/10 goes with the earlier data, the rest is handled separately.
pre310_df = pre323_df2[pre323_df2.date <= '3/10/20']
# Copy, because columns are added to post310_df further down.
post310_df = pre323_df2[pre323_df2.date > '3/10/20'].copy()

# Stack the pre-2/14 rows on top of the pre-3/10 rows and drop exact duplicate rows.
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0.)
combined_df1 = (pd.concat([pre214_df, pre310_df])
                .drop_duplicates()
                .sort_values(['Country_Region', 'Province_State', 'date']))

In [None]:
# If there are still duplicates, it's because JHU sometimes did multiple updates a day
# This is ok, we'll keep the higher values for cases, deaths, recovered.
#
# groupby leaves rows with a NaN key out of every group, so grouping directly on
# Province_State would not give countries without a province their group max
# (and fillna(0) would then turn their counts into 0). Group on a copy whose
# Province_State is filled with '' instead; combined_df1 keeps its NaN values.
dedupe_keys = ['Province_State', 'Country_Region', 'Lat', 'Lon', 'date']
keyed_df1 = combined_df1.assign(Province_State = combined_df1['Province_State'].fillna(''))

for col in ['cases', 'deaths', 'recovered']:
    combined_df1[col] = keyed_df1.groupby(dedupe_keys)[col].transform('max').fillna(0).astype(int)


combined_df1 = combined_df1.drop_duplicates(subset = dedupe_keys + ['cases', 'deaths', 'recovered'],
                                            keep = 'last')

In [None]:
# For the US, since Province/State contains county and state info, we need to derive our own state totals
# Split once on a shared mask: US rows vs. everything else.
is_us_row = combined_df1["Country_Region"] == "US"
us1 = combined_df1[is_us_row]
world1 = combined_df1[~is_us_row]

In [None]:
# Define some functions we'll use to get totals
# Calculate US State totals
def us_state_totals(df):
    """
    Add per-state, per-date totals to every row.

    Sums cases / recovered / deaths within each (Country_Region, state_abbrev,
    date) group and merges the sums back as state_cases, state_recovered and
    state_deaths. Returns a new frame; df itself is not modified.

    NOTE(review): the merge is an inner join against the grouped sums, so rows
    whose grouping key is missing (e.g. NaN state_abbrev) do not appear in the
    result - confirm that is intended.
    """
    keys = ['Country_Region', 'state_abbrev', 'date']
    new_names = {'cases': 'state_cases',
                 'recovered': 'state_recovered',
                 'deaths': 'state_deaths'}

    sums = df.groupby(keys)[['cases', 'recovered', 'deaths']].sum()
    sums = sums.rename(columns = new_names)

    return pd.merge(df, sums, on = keys)


# Calculate non-US Province_State totals
def province_totals(df):
    """
    Add per-province, per-date totals to every row.

    Sums cases / recovered / deaths within each (Country_Region,
    Province_State, date) group and merges the sums back as state_cases,
    state_recovered and state_deaths. Returns a new frame; df is not modified.

    Rows with a missing Province_State (countries reported without a
    province) are kept: they are grouped under a temporary '' key, because
    groupby would otherwise leave them out and the inner merge would then
    drop them. Province_State itself is returned unchanged (still NaN).
    """
    temp_key = '_province_key'
    keyed = df.assign(**{temp_key: df['Province_State'].fillna('')})

    province_grouping_cols = ['Country_Region', temp_key, 'date']

    totals = keyed.groupby(province_grouping_cols)[['cases', 'recovered', 'deaths']].sum()

    totals = totals.rename(columns = {'cases': 'state_cases',
                                      'recovered': 'state_recovered',
                                      'deaths': 'state_deaths'}).reset_index()

    merged = pd.merge(keyed, totals, on = province_grouping_cols)

    # The temporary key was only needed for grouping / matching.
    return merged.drop(columns = temp_key)


# Calculate country totals
def country_totals(df):
    """
    Add per-country, per-date totals to every row.

    Sums cases / recovered / deaths within each (Country_Region, date) group
    and merges the sums back as country_cases, country_recovered and
    country_deaths. Returns a new frame; df itself is not modified.
    """
    keys = ['Country_Region', 'date']
    new_names = {'cases': 'country_cases',
                 'recovered': 'country_recovered',
                 'deaths': 'country_deaths'}

    sums = df.groupby(keys)[['cases', 'recovered', 'deaths']].sum()
    sums = sums.rename(columns = new_names)

    return pd.merge(df, sums, on = keys)

In [None]:
# Add state/province/country totals
# US rows: state totals first, then country totals
us1 = country_totals(us_state_totals(us1))

# Rest of the world: province totals first, then country totals
world1 = country_totals(province_totals(world1))

In [None]:
# Append US with rest of the world
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0.)
combined_df2 = pd.concat([us1, world1])

# Show a missing province as an empty string rather than NaN.
combined_df2['Province_State'] = combined_df2.Province_State.fillna('')

## Correctly append combined_df2 and post310_df

In [None]:
# Fix the post310_df first to make sure we have all the columns we need
# Derive the state and country total columns
# Post 3/10 data is all at the state level for US & rest of the world
# Work on an explicit copy so the new columns are not written into a slice of another frame.
post310_df = post310_df.copy()

for col in ['cases', 'deaths', 'recovered']:
    # Rows are already state level, so the state total is the row's own value.
    post310_df[f"state_{col}"] = post310_df[col]


post310_df = country_totals(post310_df)


# Also, set the Province_State column to blanks, because Province_State for the US will display county, state.
# Vectorized: blank out US rows, leave every other row's value as it is.
post310_df['Province_State'] = post310_df['Province_State'].mask(
    post310_df['Country_Region'] == "US", "")

In [None]:
# Stack the pre-3/10 and post-3/10 data and order by geography, then date.
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0.)
combined_df3 = (pd.concat([combined_df2, post310_df], sort = False)
                .sort_values(['Country_Region', 'Province_State', 'date']))
combined_df3['Province_State'] = combined_df3['Province_State'].fillna('')

## Post 3/23 feature layer

In [None]:
# Query against layer 0 of the ncov_cases_US feature service: all rows (where=1=1),
# the fields listed in outFields, with geometry, returned as GeoJSON (f=pgeojson).
feature_layer_url = "https://services1.arcgis.com/0MSEUqKaxRlEPj5g/ArcGIS/rest/services/ncov_cases_US/FeatureServer/0/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&resultType=none&distance=0.0&units=esriSRUnit_Meter&returnGeodetic=false&outFields=OBJECTID%2C+Province_State%2C+Country_Region%2C+Last_Update%2C+Lat%2C+Long_%2C+Confirmed%2C+Recovered%2C+Deaths%2C+Active%2C+Admin2%2C+FIPS%2C+Combined_Key%2C+Incident_Rate%2C+People_Tested&returnGeometry=true&featureEncoding=esriDefault&multipatchOption=xyFootprint&maxAllowableOffset=&geometryPrecision=&outSR=&datumTransformation=&applyVCSProjection=false&returnIdsOnly=false&returnUniqueIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&returnQueryGeometry=false&returnDistinctValues=false&cacheHint=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&having=&resultOffset=&resultRecordCount=&returnZ=false&returnM=false&returnExceededLimitFeatures=true&quantizationParameters=&sqlFormat=none&f=pgeojson&token="

# Live read: the result is whatever the service returns at the time this cell runs.
cases326 = gpd.read_file(feature_layer_url)

# Uncomment to save the snapshot to disk.
#cases326.to_file(driver = 'GeoJSON', filename = '../data/jhu_feature_layer_3_26_2020.geojson')

In [None]:
# Load a snapshot of the feature layer from a local GeoJSON file
# (presumably saved on 3/25/2020, going by the file name).
cases325 = gpd.read_file('../data/jhu_feature_layer_3_25_2020.geojson')

Need help with the `Last_Update` column: its values come through in an ESRI-specific format rather than as readable dates.

In [None]:
# Stamp each snapshot with its date by hand.
# NOTE(review): cases326 is read live from the feature service, so the
# '3/26/2020' label is only right if that read actually happened on 3/26 - confirm.
cases325['date'], cases326['date'] = '3/25/2020', '3/26/2020'

In [None]:
# Append what we have of JHU's new layer so far
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0.)
post323_df = pd.concat([cases325, cases326])

# The date labels were added as strings; convert them to datetimes.
post323_df['date'] = pd.to_datetime(post323_df['date'])

In [None]:
def clean_jhu_post323_schema(df):
    """
    Reshape a post-3/23 JHU feature-layer table to match the earlier schemas.

    Geographic identifiers in the incoming table:
        Admin2 = County
        Province_State = US State
        Combined_Key = County, State, Country

    Steps:
      * rename Long_/Confirmed/Recovered/Deaths/Admin2 to
        Lon/cases/recovered/deaths/County
      * add state_abbrev (looked up from Province_State) and orig_county
        ("County, ST")
      * overwrite Province_State with orig_county, e.g. (Los Angeles, CA)
      * add state and country totals
      * drop the County and Active columns

    The rename and the new columns are applied to df in place; the returned
    frame (with totals, without County / Active) is a new object.
    """
    # Rename columns
    new_names = {"Long_": "Lon",
                 "Confirmed": "cases",
                 "Recovered": "recovered",
                 "Deaths": "deaths",
                 "Admin2": "County"}
    df.rename(columns = new_names, inplace = True)

    df['state_abbrev'] = df['Province_State'].map(us_state_abbrev)
    df['orig_county'] = df['County'] + ", " + df['state_abbrev']

    # Now change the columns to match with previous schemas
    # Province_State will now display county, state abbrev (Los Angeles, CA)
    df['Province_State'] = df['orig_county']

    # Add state and country totals (JHU only collecting US county data now, no more non-US country observations)
    with_totals = country_totals(us_state_totals(df))

    # Drop columns
    return with_totals.drop(columns = ['County', 'Active'])

In [None]:
# Bring the new-layer data into the same column layout as the earlier data.
post323_df = clean_jhu_post323_schema(post323_df)

## Append combined_df3 and post323_df

In [None]:
# Display the column names of the data combined so far, to compare with post323_df.
combined_df3.columns

In [None]:
# Stack the pre-3/23 data and the new-layer data and order by geography, then date.
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0.)
combined_df4 = (pd.concat([combined_df3, post323_df])
                .sort_values(['Country_Region', 'Province_State', 'date']))

In [None]:
# Add FIPS
# The lookup is merged on orig_county, so it must have exactly one row per
# orig_county. pandas matches NaN keys with each other in a merge, so rows with
# a missing orig_county (or no FIPS to offer) are removed first; otherwise every
# row of combined_df4 with a missing orig_county would be repeated once per such
# lookup row.
county_fips_crosswalk = (post323_df[['orig_county', 'FIPS']]
                         .dropna(subset = ['orig_county', 'FIPS'])
                         .drop_duplicates(subset = ['orig_county'], keep = 'last'))

combined_df4 = pd.merge(combined_df4.drop(columns = 'FIPS'), county_fips_crosswalk,
                        on = 'orig_county', how = 'left')

In [None]:
# Add Combined_Key for US
# Start from the US rows only; the following cells fill in Combined_Key for them.
us2 = combined_df4[combined_df4.Country_Region == "US"]

In [None]:
# FIPS -> Combined_Key lookup, one row per FIPS.
# Only pairs where both values are present are kept: a (FIPS, NaN) pair would
# add a second lookup row for that FIPS and duplicate rows when merged on FIPS.
combined_key_crosswalk = (us2.loc[us2.FIPS.notna() & us2.Combined_Key.notna(), ['FIPS', 'Combined_Key']]
                          .drop_duplicates(subset = ['FIPS']))

In [None]:
# Replace the existing Combined_Key column with the one looked up by FIPS
# (left merge: rows without a match keep a missing Combined_Key).
us2 = pd.merge(us2.drop(columns = 'Combined_Key'), combined_key_crosswalk, on = 'FIPS', how = 'left')

In [None]:
# Display the rows that still have no Combined_Key after the lookup.
us2[us2.Combined_Key.isna()]

In [None]:
# Build Combined_Key as "<text before the first comma in Province_State>, <state_abbrev>, <Country_Region>".
# NOTE(review): this overwrites Combined_Key on EVERY row, including the ones just
# filled in from the FIPS lookup, and gives NaN wherever state_abbrev is missing -
# confirm whether only the missing values were meant to be filled.
us2['Combined_Key'] = us2.Province_State.str.split(',').str[0] + ", " + us2.state_abbrev + ", " + us2.Country_Region

In [None]:
# Display the first rows of us2 to check the result.
us2.head()

In [None]:
# Add FIPS, state_abbrev, Combined_Key columns
# NOTE(review): `us` and `cases3` are not defined anywhere in the visible notebook;
# this cell looks like a leftover from an earlier draft - confirm before running.
us['state_abbrev'] = us.Province_State.map(us_state_abbrev)

# Lookup of distinct (orig_county, FIPS) pairs
county_fips_crosswalk = cases3[['orig_county', 'FIPS']].drop_duplicates()

# Default (inner) merge: rows of `us` without a matching orig_county are not kept.
us2 = pd.merge(us, county_fips_crosswalk, on = 'orig_county')

In [None]:
# NOTE(review): `world` is not defined anywhere in the visible notebook, and this
# reassigns combined_df2, which was already built earlier - confirm this cell is still needed.
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0.)
combined_df2 = pd.concat([us2, world])

## Append combined_df2 and cases3

In [None]:
from geoalchemy2 import Geometry, WKTElement
from shapely.geometry import Point

# Fill in geometry - may not be necessary because ESRI uses the Lat, Lon columns
# NOTE(review): `df` is not defined anywhere in the visible notebook - confirm which frame is meant.
srid = 4326  # presumably the EPSG code for WGS 84 lon/lat - confirm
# A point cannot be built without both coordinates, so drop rows missing either one.
df = df.dropna(subset=['Lat', 'Lon'])
# One WKT point per row, in (Lon, Lat) = (x, y) order, tagged with the srid above.
df["geometry"] = df.apply(
    lambda x: WKTElement(Point(x.Lon, x.Lat).wkt, srid=srid), axis=1
)

In [None]:
# Change date column to be datetime....Ian has code for this