In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline

### Read the raw dataframe

In [16]:
raw_df = pd.read_csv('data/master_data/john_hopkins_research.csv')
raw_df.drop('Unnamed: 0', axis= 1, inplace= True)

## make certain Last Update is datetime
raw_df['Last Update'] = pd.to_datetime(raw_df['Last Update'])

### US dataframe

In [17]:
us_df = raw_df[raw_df['Country/Region'] == 'US']

## Clean the US df

In [27]:
us_state_abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'American Samoa': 'AS',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'District of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Guam': 'GU',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Northern Mariana Islands':'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY'
}

# thank you to @kinghelix and @trevormarburger for this idea
abbrev_us_state = dict(map(reversed, us_state_abbrev.items()))

In [41]:
def get_admin(combined_key):
    if pd.isnull(combined_key):
        county = np.nan
    else:
        county = combined_key.split(',')[0]
    return county

In [88]:
def clean_state(state):
    if 'D.C.' in state or 'U.S.' in state:
        state = state
    elif '(From Diamond Princess)' in state:
        if ',' in state:
            state = abbrev_us_state[state.split('(')[0].split(',')[-1][1:-1]]
        else:
            state = 'Diamond Princess'
    elif ',' in state:
        state = abbrev_us_state[state.split(',')[-1].replace(' ', '')]
        
    return state

In [44]:
us_df['Admin2'] = us_df['Combined_Key'].apply(get_admin)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [89]:
us_df['Province/State'] = us_df['Province/State'].apply(clean_state)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [90]:
### US at the state level
us_states = us_df.groupby(['Province/State', 'Last Update']).sum()

In [94]:
np.unique(list(map(lambda x: x[0], us_states.index)))

array(['Alabama', 'Alaska', 'American Samoa', 'Arizona', 'Arkansas',
       'California', 'Chicago', 'Colorado', 'Connecticut', 'Delaware',
       'Diamond Princess', 'District of Columbia', 'Florida', 'Georgia',
       'Grand Princess', 'Grand Princess Cruise Ship', 'Guam', 'Hawaii',
       'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky',
       'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan',
       'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska',
       'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
       'North Carolina', 'North Dakota', 'Northern Mariana Islands',
       'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Puerto Rico',
       'Recovered', 'Rhode Island', 'South Carolina', 'South Dakota',
       'Tennessee', 'Texas', 'US', 'United States Virgin Islands', 'Utah',
       'Vermont', 'Virgin Islands', 'Virgin Islands, U.S.', 'Virginia',
       'Washington', 'Washington, D.C.', 'West Virginia', 'Wisconsin',
      

In [98]:
us_states = us_states.reset_index()

In [102]:
us_states = us_states[['Province/State', 'Last Update', 'Confirmed', 'Deaths', 'Recovered', 'Active']]

### Initialize US transformed dataframe

In [103]:
us_transformed = us_states[['Province/State']].copy()

In [106]:
us_transformed.rename(columns={'Province/State': 'State'}, inplace=True)

In [121]:
us_transformed['Date'] = us_states['Last Update'].dt.date

In [109]:
us_transformed['ConfirmedToDate'] = us_states['Confirmed']

In [132]:
us_transformed['Active'] = us_states['Active']

### New Cases

In [131]:
us_transformed['NewConfirmed'] = us_transformed.groupby('State').diff()['ConfirmedToDate']

In [133]:
us_transformed['NewConfirmed'] = np.where(np.isnan(us_transformed['NewConfirmed']), \
                                         us_transformed['ConfirmedToDate'], \
                                         us_transformed['NewConfirmed'])

In [138]:
us_transformed['PrevNewConfirmed'] = us_transformed.groupby('State').shift()['NewConfirmed'].fillna(0)

In [154]:
us_transformed.iloc[:30].groupby('State').shift()

Unnamed: 0,Date,ConfirmedToDate,NewConfirmed,Active,PrevNewConfirmed,GrowthRate
0,,,,,,
1,2020-03-11,5.0,5.0,0.0,0.0,
2,2020-03-14,6.0,1.0,0.0,5.0,0.2
3,2020-03-15,12.0,6.0,0.0,1.0,1.0
4,2020-03-16,29.0,17.0,0.0,6.0,1.416667
5,2020-03-17,39.0,10.0,0.0,17.0,0.344828
6,2020-03-18,46.0,7.0,0.0,10.0,0.179487
7,2020-03-19,78.0,32.0,0.0,7.0,0.695652
8,2020-03-20,83.0,5.0,0.0,32.0,0.064103
9,2020-03-21,131.0,48.0,0.0,5.0,0.578313
