In [1]:
!pip install sodapy

Collecting sodapy
  Downloading sodapy-2.1.0-py2.py3-none-any.whl (14 kB)
Installing collected packages: sodapy
Successfully installed sodapy-2.1.0


In [17]:
#!/usr/bin/env python

# make sure to install these packages before running:
# pip install pandas
# pip install sodapy

import pandas as pd
from sodapy import Socrata

# Query configuration: Socrata domain, dataset identifier, and the maximum
# number of rows requested from the API in a single call.
SOCRATA_DOMAIN = "data.cdc.gov"
DATASET_ID = "9mfq-cb36"
ROW_LIMIT = 10000

# Unauthenticated client only works with public data sets. Note 'None'
# in place of application token, and no username or password:
client = Socrata(SOCRATA_DOMAIN, None)

# Example authenticated client (needed for non-public datasets):
# client = Socrata("data.cdc.gov",
#                  MyAppToken,
#                  username="user@example.com",
#                  password="AFakePassword")

# First ROW_LIMIT results, returned as JSON from API / converted to Python list of
# dictionaries by sodapy.
# NOTE(review): results_df.info() further down reports exactly 10000 rows, i.e. the
# cap was reached, so the dataset presumably holds more rows than were fetched and
# any per-state maximum is computed on a partial extract - confirm, and raise
# ROW_LIMIT or page through the dataset if completeness matters.
results = client.get(DATASET_ID, limit=ROW_LIMIT)

# Convert to pandas DataFrame
results_df = pd.DataFrame.from_records(results)



There are currently 60 public health jurisdictions reporting cases of COVID-19. This includes:
- the 50 states
- District of Columbia
- New York City
- U.S. territories
    - American Samoa
    - Guam
    - the Commonwealth of the Northern Mariana Islands
    - Puerto Rico
    - the U.S. Virgin Islands
- three independent countries in compacts of free association with the United States
    - Federated States of Micronesia
    - Republic of the Marshall Islands 
    - Republic of Palau
    
Note: New York State’s reported case and death counts do not include New York City’s counts, because New York City reports nationally notifiable conditions to CDC separately.

In [71]:
# Summarize column names, non-null counts and dtypes of the fetched frame.
# NOTE(review): this cell's execution count (71) is higher than that of the cast
# cell below (70), so the int64/float64 dtypes recorded for tot_cases/tot_death
# presumably reflect that cast having already run; on a fresh top-to-bottom run
# these two columns may be reported as object - confirm.
results_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   submission_date  10000 non-null  object 
 1   state            10000 non-null  object 
 2   tot_cases        10000 non-null  int64  
 3   new_case         10000 non-null  object 
 4   tot_death        10000 non-null  float64
 5   new_death        10000 non-null  object 
 6   created_at       10000 non-null  object 
 7   consent_cases    9470 non-null   object 
 8   consent_deaths   9470 non-null   object 
 9   conf_cases       3911 non-null   object 
 10  prob_cases       3911 non-null   object 
 11  pnew_case        6672 non-null   object 
 12  conf_death       4440 non-null   object 
 13  prob_death       4440 non-null   object 
 14  pnew_death       6670 non-null   object 
dtypes: float64(1), int64(1), object(13)
memory usage: 1.1+ MB


In [70]:
# Cast the cumulative-total columns to numeric dtypes so that comparisons and
# max() operate on numbers: tot_cases becomes int, tot_death becomes float.
for column_name, target_dtype in (('tot_cases', int), ('tot_death', float)):
    results_df[column_name] = results_df[column_name].astype(target_dtype)

In [80]:
def get_max_cases_per_state(df):
    """
    Rank states by the largest 'tot_cases' value found among their rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'state' column and a numeric 'tot_cases' column.

    Returns
    -------
    pandas.DataFrame
        One row per distinct state with columns 'State' and 'Max Cases',
        sorted by 'Max Cases' in descending order. The index is named 'rank'
        and counts up from 1.

    Notes
    -----
    Rows whose 'state' is missing are ignored (pandas groupby default). A state
    whose 'tot_cases' values are all missing is kept with NaN and sorts last,
    instead of raising IndexError as the previous per-state loop did.
    """
    # A single grouped max replaces re-filtering the whole frame once per state.
    # sort=False keeps states in first-appearance order ahead of the ranking sort.
    max_case_df = (
        df.groupby('state', sort=False)['tot_cases']
        .max()
        .rename('Max Cases')
        .rename_axis('State')
        .reset_index()
        .sort_values(by=['Max Cases'], ascending=False)
        .reset_index(drop=True)
    )
    # Turn the fresh 0-based positional index into a 1-based rank.
    max_case_df.index = (max_case_df.index + 1).rename('rank')
    return max_case_df

# Show states ranked by their highest tot_cases value in the fetched data.
# NOTE(review): the recorded output below has a 0-based, unnamed index, whereas
# get_max_cases_per_state returns a 1-based index named 'rank' - the output
# looks stale; re-run this cell to refresh it.
get_max_cases_per_state(results_df)

Unnamed: 0,State,Max Cases
0,CA,850028
1,TX,795126
2,FL,726934
3,GA,332311
4,IL,324930
5,NYC,252069
6,NC,232747
7,AZ,226050
8,TN,217682
9,NJ,214097


In [86]:
def get_max_deaths_per_state(df):
    """
    Rank states by the largest 'tot_death' value found among their rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'state' column and a numeric 'tot_death' column.

    Returns
    -------
    pandas.DataFrame
        One row per distinct state with columns 'State' and 'Max Deaths',
        sorted by 'Max Deaths' in descending order. The index is named 'rank'
        and counts up from 1.

    Notes
    -----
    Rows whose 'state' is missing are ignored (pandas groupby default). A state
    whose 'tot_death' values are all missing is kept with NaN and sorts last,
    instead of raising IndexError as the previous per-state loop did.
    """
    # A single grouped max replaces re-filtering the whole frame once per state.
    # sort=False keeps states in first-appearance order ahead of the ranking sort.
    max_death_df = (
        df.groupby('state', sort=False)['tot_death']
        .max()
        .rename('Max Deaths')
        .rename_axis('State')
        .reset_index()
        .sort_values(by=['Max Deaths'], ascending=False)
        .reset_index(drop=True)
    )
    # Turn the fresh 0-based positional index into a 1-based rank.
    max_death_df.index = (max_death_df.index + 1).rename('rank')
    return max_death_df

# Show states ranked by their highest tot_death value in the fetched data.
get_max_deaths_per_state(results_df)

Unnamed: 0_level_0,State,Max Deaths
rank,Unnamed: 1_level_1,Unnamed: 2_level_1
1,NYC,23895.0
2,CA,16572.0
3,TX,16558.0
4,NJ,16175.0
5,FL,15412.0
6,MA,9609.0
7,IL,9243.0
8,NY,8863.0
9,PA,8368.0
10,GA,7429.0
