# What is the risk correlated to COVID-19?  
**US-specific population risk**  
###### Alaa Hassan

> Final: Sep 6 2021

###### Challenge Description




This challenge presents a curated collection of datasets from 20 global sources and asks you to model solutions to key questions that were developed and evaluated by a global frontline of healthcare providers, hospitals, suppliers, and policy makers.

In [None]:
import numpy as np # linear algebra
import pandas as pd 
import matplotlib.pyplot as plt #plotting, math, stats
%matplotlib inline
import seaborn as sns #plotting, regressions, stats
import folium
import json
import requests
from urllib.request import urlopen
import plotly.express as px

**What is the situation worldwide? Where does the US stand?**

In [None]:
# Load the worldwide case table.
# NOTE(review): the original comment credits the World Health Organization,
# while the folder name points at Our World in Data -- confirm the source.
world_csv = "../input/httpsourworldindataorgcoronavirussourcedata/full_data(14).csv"
World = pd.read_csv(world_csv)

# One bar per location: the largest total_cases value recorded for it.
peak_cases_by_location = World.groupby("location")['total_cases'].max()

plt.figure(figsize=(21, 8))
plt.title('Cases across the world as of April 6, 2020')
peak_cases_by_location.plot(kind='bar', color='teal')

In [None]:
# Correlate the numeric columns only: World also holds text columns such as
# 'location', and recent pandas releases raise on them instead of silently
# dropping them.
World.select_dtypes(include='number').corr().style.background_gradient(cmap='magma')

The tallest bar represents the world total, not a specific country.   
Assuming the data are accurate and consistently reported, the **US** has a markedly high total number of cases.   


In [None]:
# Load the county-level US case data.
df = pd.read_csv('../input/us-counties-covid-19-dataset/us-counties.csv')

# Drop the 'fips' and 'county' columns; neither is used by the
# state-level analysis that follows.
USA = df.drop(columns=['fips', 'county'])
USA

In [None]:
# Line plot of cases over time with one colour per state.
plt.figure(figsize=(19, 17))
plt.title('Cases by state')
sns.lineplot(data=USA, x="date", y="cases", hue="state", palette="Paired")

# Put a tick on every distinct date and rotate the labels by 90 degrees.
all_dates = USA['date'].unique()
plt.xticks(all_dates, rotation=90)
plt.show()

In [None]:
# Per-state subsets of USA, kept separate for ease of visualization.
state_column = USA['state']
NY = USA[state_column == 'New York']
LA = USA[state_column == 'Louisiana']
WA = USA[state_column == 'Washington']
IL = USA[state_column == 'Illinois']
Mich = USA[state_column == 'Michigan']
PUR = USA[state_column == 'Puerto Rico']


In [None]:
# Stack the six state subsets into one frame and order the rows by date.
selected_states = [NY, LA, WA, IL, PUR, Mich]
States = pd.concat(selected_states).sort_values(by='date')
States


In [None]:
# Same line plot as before, restricted to the six selected states.
plt.figure(figsize=(15, 9))
plt.title('COVID-19 cases comparison of WA, IL, NY, LA, PR, and Michigan')
sns.lineplot(data=States, x="date", y="cases", hue="state")

# One tick per distinct date, labels rotated by 90 degrees.
comparison_dates = States['date'].unique()
plt.xticks(comparison_dates, rotation=90)
plt.show()

In [None]:
# Per-date maximum of every remaining column of USA.
# NOTE(review): max() is applied to the 'state' column too; if it holds text
# that is the alphabetically last name, not the state with the most cases
# -- confirm this is intended.
USAg = USA.groupby('date').max()
USAg

In [None]:
# Order the per-date maxima from the smallest to the largest case count.
USAg = USAg.sort_values(by='cases')
USAg

**df on VULNERABILITIES in the US**

In [None]:
# Load the county-level social vulnerability table.
vuln_csv = "../input/datafiles/cdcs-social-vulnerability-index-svi-2016-overall-svi-county-level.csv/cdcs-social-vulnerability-index-svi-2016-overall-svi-county-level.csv"
Vuln = pd.read_csv(vuln_csv)

In [None]:
# Keep the state label plus the vulnerability measures used later.
# .copy() makes the result an independent frame, so the column assignments
# in the following cell do not trigger pandas' SettingWithCopyWarning.
vuln_columns = ['state', 'e_uninsur', 'epl_pov', 'epl_unemp',
                'epl_age65', 'epl_age17', 'epl_disabl']
Vuln = Vuln[vuln_columns].copy()

In [None]:
# Normalise the state labels to title case (lower-cased first, then
# capitalised word by word) and overwrite the column.
Vuln["state"] = Vuln["state"].str.lower().str.title()

In [None]:
# Preview the first rows of the trimmed vulnerability table.
Vuln.head()

In [None]:
# Summary statistics for the vulnerability table.
Vuln.describe()

In [None]:
# Correlate the numeric columns only: Vuln also holds the text column
# 'state', which recent pandas releases refuse to correlate.
Vuln.select_dtypes(include='number').corr().style.background_gradient(cmap='viridis')

df on **illness prevalence**

In [None]:
# Load the census-tract-level table from the 500 Cities release.
census_csv = "../input/datafiles/500-cities-census-tract-level-data-gis-friendly-format-2019-release.csv/500-cities-census-tract-level-data-gis-friendly-format-2019-release.csv"
census = pd.read_csv(census_csv)

In [None]:
# Preview the first rows of the census-tract table.
census.head()

###### Geospatial Analysis

In [None]:
# Work on an independent copy so the edits below never leak into `census`.
df = census.copy()

# Strip the parentheses from the geolocation text. regex=False treats '('
# and ')' as literal characters, and chaining the calls keeps both removals
# (previously the second call restarted from the untouched column).
df['geolocation'] = (
    df['geolocation']
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

# Row-wise sum of the crude-prevalence measures. skipna=False keeps the
# behaviour of chained `+`: a row with any missing measure yields NaN.
prevalence_columns = [
    'access2_crudeprev', 'arthritis_crudeprev', 'binge_crudeprev',
    'bphigh_crudeprev', 'bpmed_crudeprev', 'checkup_crudeprev',
    'cholscreen_crudeprev', 'colon_screen_crudeprev', 'copd_crudeprev',
    'corem_crudeprev', 'corew_crudeprev', 'csmoking_crudeprev',
    'dental_crudeprev', 'diabetes_crudeprev', 'highchol_crudeprev',
    'kidney_crudeprev', 'lpa_crudeprev', 'mammouse_crudeprev',
    'mhlth_crudeprev', 'obesity_crudeprev', 'paptest_crudeprev',
    'phlth_crudeprev', 'sleep_crudeprev', 'stroke_crudeprev',
    'teethlost_crudeprev',
]
spatial_influence = df[prevalence_columns].sum(axis=1, skipna=False)

# float32 instead of float16: half precision cannot represent sums in the
# hundreds without rounding away the fractional part.
df['Geographic_Disparity'] = spatial_influence.astype('float32')
df

In [None]:
# State-level choropleth of the Geographic_Disparity score.
state_geo = "https://raw.githubusercontent.com/Alaa8082/folium/master/tests/us-states.json"

# df has many rows per state, but a choropleth needs exactly one value per
# key. Average the score per state; the cast keeps the aggregation working
# if the column is stored in half precision.
state_disparity = (
    df['Geographic_Disparity']
    .astype('float32')
    .groupby(df['stateabbr'])
    .mean()
    .reset_index()
)

m = folium.Map(location=[48, -102], zoom_start=3)

folium.Choropleth(
    geo_data=state_geo,
    name="choropleth",
    data=state_disparity,
    columns=["stateabbr", "Geographic_Disparity"],
    key_on="feature.id",
    fill_color="BuPu",
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name="Geographical Health Disparities",
).add_to(m)

folium.LayerControl().add_to(m)

m

In [None]:
# Interactive state-level choropleth of the same score.
# df has many rows per state, so collapse it to one averaged value per
# state before colouring; the cast keeps the aggregation working if the
# column is stored in half precision.
state_level = (
    df['Geographic_Disparity']
    .astype('float32')
    .groupby(df['stateabbr'])
    .mean()
    .reset_index()
)

fig = px.choropleth_mapbox(state_level, geojson=state_geo, locations="stateabbr",
                           color='Geographic_Disparity',
                           color_continuous_scale="Viridis",
                           range_color=(500, 1200),
                           mapbox_style="carto-positron",
                           zoom=2, center={"lat": 48, "lon": -102},
                           opacity=0.5,
                           labels={'Geographic_Disparity': 'Geographical health disparities'})

fig.show()

In [None]:
# Keep only the columns needed for the point map; .copy() so the new
# columns are written to an independent frame.
df = df[['geolocation', 'stateabbr', 'Geographic_Disparity']].copy()

# Split the geolocation text at the first comma into two parts.
# - Surrounding parentheses/spaces are stripped in case they are still there.
# - n=1 / expand=True are passed by keyword (positional use was removed in
#   pandas 2) and keep the result aligned with df's own index.
coords = (
    df['geolocation']
    .str.strip('() ')
    .str.split(',', n=1, expand=True)
)

# Store the coordinates as numbers rather than text.
df['x'] = coords[0].astype(float)
df['y'] = coords[1].astype(float)

df = df.drop(columns='geolocation')
df

In [None]:
# Down-sample to at most 10000 rows to keep the point map responsive.
# - min(...) avoids an error when fewer rows are available.
# - random_state makes the sample (and therefore the map) reproducible.
df = df.sample(n=min(10000, len(df)), random_state=0)
df

In [None]:
# Point map: one small red circle per sampled row, with a popup showing the
# state label and its Geographic_Disparity value.
# NOTE(review): the 'Stamen Toner' tile set may not be available in newer
# folium releases -- confirm against the installed version.
world_map = folium.Map(location=[48, -102], zoom_start=3, tiles='Stamen Toner')

marker_rows = zip(df['x'], df['y'], df['Geographic_Disparity'], df['stateabbr'])
for point_x, point_y, disparity, state_label in marker_rows:
    popup_html = ('<strong>State</strong>: ' + str(state_label).capitalize() + '<br>'
                  '<strong>Geographical health disparities</strong>: ' + str(disparity) + '<br>')
    marker = folium.CircleMarker(
        [point_x, point_y],
        radius=2,
        popup=popup_html,
        color='red',
        fill_color='red',
        fill_opacity=0.7,
    )
    marker.add_to(world_map)
world_map

In [None]:
# Narrow the tract table to the location columns and the illness
# prevalence measures analysed below.
census_columns = [
    'stateabbr', 'placename', 'geolocation',
    'bphigh_crudeprev', 'stroke_crudeprev', 'obesity_crudeprev',
    'diabetes_crudeprev', 'arthritis_crudeprev', 'cancer_crudeprev',
    'casthma_crudeprev', 'copd_crudeprev', 'csmoking_crudeprev',
    'highchol_crudeprev', 'kidney_crudeprev',
]
census = census[census_columns]
census

In [None]:
# Bar chart of the highest COPD crude prevalence recorded in each state.
peak_copd_by_state = census.groupby("stateabbr")['copd_crudeprev'].max()
plt.figure(figsize=(19, 7))
peak_copd_by_state.plot(kind='bar', color='olive')

In [None]:
# Expand state abbreviations to full names (applied to every column).
census = census.replace({
    "ND": "North Dakota",
    "OK": "Oklahoma",
    "UT": "Utah",
    "AK": "Alaska",
    "SD": "South Dakota",
    "AL": "Alabama",
    "AR": "Arkansas",
})

In [None]:
# Expand state abbreviations to full names (applied to every column).
census = census.replace({
    "NC": "North Carolina",
    "OR": "Oregon",
    "NV": "Nevada",
    "AZ": "Arizona",
    "SC": "South Carolina",
    "CA": "California",
    "CO": "Colorado",
})

In [None]:
# Expand state abbreviations to full names (applied to every column).
# "Minnesota" is now spelled correctly so these rows share one 'state'
# label with the other tables when they are combined and grouped later.
census = census.replace({
    "MN": "Minnesota",
    "WY": "Wyoming",
    "WV": "West Virginia",
    "WI": "Wisconsin",
    "WA": "Washington",
    "VT": "Vermont",
    "VA": "Virginia",
})

In [None]:
# Expand state abbreviations to full names (applied to every column).
census = census.replace({
    "FL": "Florida",
    "NE": "Nebraska",
    "MT": "Montana",
    "HI": "Hawaii",
    "LA": "Louisiana",
    "NM": "New Mexico",
    "GA": "Georgia",
    "KS": "Kansas",
})

In [None]:
# Expand state abbreviations to full names (applied to every column).
census = census.replace({
    "NY": "New York",
    "NJ": "New Jersey",
    "OH": "Ohio",
    "RI": "Rhode Island",
    "PA": "Pennsylvania",
    "TX": "Texas",
    "ID": "Idaho",
    "KY": "Kentucky",
})

In [None]:
# Expand state abbreviations to full names (applied to every column).
# "Illinois" and "Massachusetts" are now spelled correctly so these rows
# share one 'state' label with the other tables (USA uses 'Illinois') when
# they are combined and grouped later.
census = census.replace({
    "CT": "Connecticut",
    "DC": "District of Columbia",
    "DE": "Delaware",
    "IA": "Iowa",
    "IL": "Illinois",
    "IN": "Indiana",
    "MD": "Maryland",
    "MA": "Massachusetts",
})

In [None]:
# Expand state abbreviations to full names (applied to every column).
# "NH" is included because it was the only state code left unexpanded by
# the census replacement cells; it is a no-op if the data has no NH rows.
census = census.replace({
    "ME": "Maine",
    "MI": "Michigan",
    "MO": "Missouri",
    "MS": "Mississippi",
    "TN": "Tennessee",
    "NH": "New Hampshire",
})

In [None]:
# Bar chart of the highest arthritis crude prevalence recorded in each state.
peak_arthritis_by_state = census.groupby("stateabbr")['arthritis_crudeprev'].max()
plt.figure(figsize=(19, 7))
peak_arthritis_by_state.plot(kind='bar', color='peru')

In [None]:
# Remove the place and coordinate columns, then rename 'stateabbr' to
# 'state' so the column name matches the other tables.
census = (
    census
    .drop(columns=['placename', 'geolocation'])
    .rename(columns={'stateabbr': 'state'})
)

In [None]:
# Shorter labels for the blood-pressure and stroke measures.
bp_stroke_labels = {
    'bphigh_crudeprev': 'high bp prev',
    'stroke_crudeprev': 'stroke prev',
}
census = census.rename(columns=bp_stroke_labels)


In [None]:
# Shorter labels for the diabetes, cancer and arthritis measures.
chronic_labels = {
    'diabetes_crudeprev': 'diabetes prev',
    'cancer_crudeprev': 'cancer prev',
    'arthritis_crudeprev': 'arthritis prev',
}
census = census.rename(columns=chronic_labels)

In [None]:
# Shorter labels for the asthma, COPD and smoking measures.
respiratory_labels = {
    'casthma_crudeprev': 'asthma prev',
    'copd_crudeprev': 'copd prev',
    'csmoking_crudeprev': 'smoking prev',
}
census = census.rename(columns=respiratory_labels)

In [None]:
# Shorter labels for the cholesterol and kidney measures, then show the table.
chol_kidney_labels = {
    'highchol_crudeprev': 'highChol prev',
    'kidney_crudeprev': 'kidney prev',
}
census = census.rename(columns=chol_kidney_labels)
census

In [None]:
# Summary statistics for the renamed prevalence table.
census.describe()

In [None]:
# Correlate the numeric columns only: census also holds the text column
# 'state', which recent pandas releases refuse to correlate.
census.select_dtypes(include='number').corr().style.background_gradient(cmap='cividis')

**df on chronic illnesses in the US**

In [None]:
# Load the U.S. chronic disease indicators table.
chronic_csv = "../input/uschronic/u-s-chronic-disease-indicators-cdi.csv"
chronic = pd.read_csv(chronic_csv)

In [None]:
# Print every column name of the chronic table, one per line.
for column_name in chronic.columns:
    print(column_name)

In [None]:
# Keep the four columns of interest and replace every missing value with 0.
chronic_columns = ['locationdesc', 'topic', 'question', 'datavalue']
chronic = chronic[chronic_columns].fillna(0)

In [None]:
# Give the chronic table readable column names ('state' matches the
# column name used by the other tables).
chronic_labels = {
    'locationdesc': 'state',
    'datavalue': 'rate of illness',
    'topic': 'chronic illness',
    'question': 'specific illness',
}
chronic = chronic.rename(columns=chronic_labels)

In [None]:
# Preview the first three rows of the renamed chronic table.
chronic.head(3)

In [None]:
# Count of rows per chronic-illness category.
plt.figure(figsize=(22, 6))
plt.title('US chronic illnesses')
# Name the column via x=/data=: newer seaborn releases treat the first
# positional argument as `data`, which would not produce this count plot.
sns.countplot(x='chronic illness', data=chronic)
plt.xticks(rotation=45)

In [None]:
# Summary statistics for the chronic table.
chronic.describe()

In [None]:
# Correlate the numeric columns only: chronic is mostly text columns
# ('state', illness labels), which recent pandas releases refuse to correlate.
chronic.select_dtypes(include='number').corr().style.background_gradient(cmap='cool')

**df of illness ranking**

In [None]:
# Load the 2020 US county health rankings table.
rank_csv = "../input/uscounty/us-county-health-rankings-2020.csv"
rank = pd.read_csv(rank_csv)


In [None]:
# Keep the state label, death count, and the behaviour, insurance,
# vaccination and demographic-share columns used later.
rank_columns = [
    'state', 'num_deaths', 'percent_female', 'percent_excessive_drinking',
    'num_uninsured', 'percent_vaccinated', 'percent_black',
    'percent_american_indian_alaska_native', 'percent_asian',
    'percent_native_hawaiian_other_pacific_islander', 'percent_hispanic',
    'percent_non_hispanic_white',
]
rank = rank[rank_columns]
rank.head()

In [None]:
# Bar chart of the largest num_deaths value recorded for each state.
peak_deaths_by_state = rank.groupby("state")['num_deaths'].max()
plt.figure(figsize=(16, 8))
plt.title('States pre-COVID19 morbidity ranks')
peak_deaths_by_state.plot(kind='bar', color='darkred')

In [None]:
# Summary statistics for the rankings table.
rank.describe()

In [None]:
# Correlate the numeric columns only: rank also holds the text column
# 'state', which recent pandas releases refuse to correlate.
rank.select_dtypes(include='number').corr().style.background_gradient(cmap='inferno')

**df on COVID-19 Statistics**

In [None]:
# Load the daily per-state COVID statistics and replace every missing
# value with 0.
stats_csv = "../input/covidstatistics/covid-statistics-by-us-states-daily-updates.csv"
stats = pd.read_csv(stats_csv).fillna(0)


In [None]:
# Print every column name of the stats table, one per line.
for column_name in stats.columns:
    print(column_name)

In [None]:
# Remove the 'hash', 'fips' and 'datechecked' columns, then preview the rest.
stats = stats.drop(columns=['hash', 'fips', 'datechecked'])
stats.head()

In [None]:
# Bar chart of the largest totaltestresults value recorded for each state.
peak_tests_by_state = stats.groupby("state")['totaltestresults'].max()
plt.figure(figsize=(14, 8))
plt.title('total tests')
peak_tests_by_state.plot(kind='bar', color='steelblue')

In [None]:
# Keep the date, state label and the four outcome counts.
stats_columns = ['date', 'state', 'positive', 'negative', 'hospitalized', 'death']
stats = stats[stats_columns]
stats.head()

In [None]:
# Expand the state abbreviation to its full name (applied to every column).
stats = stats.replace({"WA": "Washington"})

In [None]:
# Expand the state abbreviation to its full name (applied to every column).
stats = stats.replace({"SC": "South Carolina"})

In [None]:
# Expand state abbreviations to full names (applied to every column).
stats = stats.replace({
    "NJ": "New Jersey",
    "FL": "Florida",
    "AL": "Alabama",
    "TX": "Texas",
    "OR": "Oregon",
})

In [None]:
# Expand state abbreviations to full names (applied to every column).
stats = stats.replace({
    "AR": "Arkansas",
    "AZ": "Arizona",
    "NY": "New York",
    "CA": "California",
    "AK": "Alaska",
})

In [None]:
# Expand state abbreviations to full names (applied to every column).
stats = stats.replace({
    "MT": "Montana",
    "WI": "Wisconsin",
    "NC": "North Carolina",
    "OH": "Ohio",
    "RI": "Rhode Island",
    "VA": "Virginia",
})

In [None]:
# Expand state abbreviations to full names (applied to every column).
# "Illinois" and "Massachusetts" are now spelled correctly; with the old
# misspellings these rows could never match the 'state' values in USA
# (which uses 'Illinois') when the two tables are merged.
stats = stats.replace({
    "TN": "Tennessee",
    "GA": "Georgia",
    "IL": "Illinois",
    "NH": "New Hampshire",
    "MA": "Massachusetts",
})

In [None]:
# Expand state/territory abbreviations to full names (applied to every column).
stats = stats.replace({
    "CO": "Colorado",
    "CT": "Connecticut",
    "DC": "District of Columbia",
    "DE": "Delaware",
    "GU": "Guam",
})

In [None]:
# Expand state abbreviations to full names (applied to every column).
stats = stats.replace({
    "HI": "Hawaii",
    "IA": "Iowa",
    "ID": "Idaho",
    "IN": "Indiana",
    "KS": "Kansas",
    "KY": "Kentucky",
})

In [None]:
# Expand state abbreviations to full names (applied to every column).
# "Minnesota" and "Mississippi" are now spelled correctly; with the old
# misspellings these rows could never match the state names in the
# county-level table when the two are merged on 'state'.
stats = stats.replace({
    "LA": "Louisiana",
    "MD": "Maryland",
    "MN": "Minnesota",
    "MI": "Michigan",
    "MO": "Missouri",
    "MS": "Mississippi",
})

In [None]:
# Expand state abbreviations to full names (applied to every column).
stats = stats.replace({
    "ME": "Maine",
    "NV": "Nevada",
    "WV": "West Virginia",
    "NM": "New Mexico",
    "PA": "Pennsylvania",
    "VT": "Vermont",
})

In [None]:
# Expand state/territory abbreviations to full names (applied to every column).
stats = stats.replace({
    "ND": "North Dakota",
    "OK": "Oklahoma",
    "UT": "Utah",
    "PR": "Puerto Rico",
    "SD": "South Dakota",
})

In [None]:
# Expand state/territory abbreviations to full names (applied to every column).
stats = stats.replace({
    "VI": "Virgin Islands",
    "WY": "Wyoming",
    "NE": "Nebraska",
})

In [None]:
# Preview the first three rows after the abbreviation replacements.
stats.head(3)

In [None]:
# Summary statistics for the stats table.
stats.describe()

In [None]:
# Correlate the numeric columns only: stats also holds the text column
# 'state', which recent pandas releases refuse to correlate.
stats.select_dtypes(include='number').corr().style.background_gradient(cmap='plasma')

# What could be the risks across the US?   
By combining several of the dataframes provided by ROCHE, I visualize below some potential risk factors alongside the #COVID19 data.   
Further statistical analysis would be needed to reach scientific conclusions from this data.   
However, the presentation here could help identify future research angles on the relationship between risk factors and   
COVID-19.   

In [None]:
# Inner-join the county-level cases with the per-state statistics on
# matching state and date.
# NOTE(review): both 'date' columns must share a dtype/format for rows to
# match -- confirm against the two CSVs.
a = USA.merge(stats, how='inner', on=['state', 'date'])
a

In [None]:
# Stack the merged COVID table, the rankings and the chronic-illness table
# row-wise; sort=True orders the combined columns alphabetically.
frames_to_stack = [a, rank, chronic]
dfs1 = pd.concat(frames_to_stack, sort=True)
dfs1.head()

In [None]:
# Append the vulnerability rows below the stacked table (row-wise concat,
# not a key-based merge).
with_vulnerability = [dfs1, Vuln]
b = pd.concat(with_vulnerability, sort=False)

In [None]:
# Append the illness-prevalence rows as well (row-wise concat).
with_prevalence = [b, census]
c = pd.concat(with_prevalence, sort=False)


In [None]:
# Rows only carry values for the columns of the table they came from;
# replace every resulting gap (NaN) with 0.
c = c.fillna(value=0)
c.head()

In [None]:
# Print every column name of the combined table, one per line.
for column_name in c.columns:
    print(column_name)

In [None]:
# For every (date, state) pair keep the largest value seen in each of the
# columns below. The list replaces the hand-written dict, which named
# 'num_uninsured' twice.
max_columns = [
    'hospitalized',
    'cases',
    'deaths',
    'num_uninsured',
    'percent_vaccinated',
    'percent_american_indian_alaska_native',
    'percent_asian',
    'percent_black',
    'percent_excessive_drinking',
    'percent_female',
    'percent_hispanic',
    'percent_native_hawaiian_other_pacific_islander',
    'percent_non_hispanic_white',
    'epl_pov',
    'epl_unemp',
    'epl_age65',
    'epl_age17',
    'epl_disabl',
    'high bp prev',
    'stroke prev',
    'obesity_crudeprev',
    'diabetes prev',
    'arthritis prev',
    'cancer prev',
    'asthma prev',
    'copd prev',
    'smoking prev',
    'highChol prev',
    'kidney prev',
]

# The string 'max' selects pandas' own group-wise maximum; passing the
# Python builtin `max` is deprecated in recent pandas releases.
d = c.groupby(['date', 'state'], as_index=False).agg(
    {column: 'max' for column in max_columns}
)
d

In [None]:
# Split on the date placeholder: rows whose date is 0 (the fill value used
# for missing dates) go to sub1, all dated rows go to sub2.
has_no_date = d['date'] == 0
sub1 = d[has_no_date]
sub2 = d[~has_no_date]

In [None]:
# From the dated rows keep only the state label and the three COVID counts.
covid_columns = ['state', 'cases', 'deaths', 'hospitalized']
sub2 = sub2[covid_columns]
sub2.head()

In [None]:
# Join the undated rows (sub1) with the dated COVID counts (sub2) on state,
# then discard the placeholder 'date' column. Columns present in both
# inputs get the default _x / _y suffixes.
risks = sub1.merge(sub2, how='inner', on='state').drop(columns=['date'])


In [None]:
# Combine the two suffixed hospitalisation columns into one by adding them.
sum_column = risks["hospitalized_x"].add(risks["hospitalized_y"])
risks["hospitalized"] = sum_column


In [None]:
# The suffixed hospitalisation columns are no longer needed once summed.
risks = risks.drop(columns=['hospitalized_x', 'hospitalized_y'])

In [None]:
# Combine the suffixed case and death columns the same way, by addition.
sum_column2 = risks["cases_x"].add(risks["cases_y"])
sum_column3 = risks["deaths_x"].add(risks["deaths_y"])
risks["cases"] = sum_column2
risks["deaths"] = sum_column3

In [None]:
# Remove the suffixed case/death columns now that the totals exist.
suffixed_columns = ['cases_x', 'cases_y', 'deaths_x', 'deaths_y']
risks = risks.drop(columns=suffixed_columns)
risks

In [None]:
# Collapse to one row per state, keeping the largest value seen in each of
# the columns below (grouping is by state only). The list replaces the
# hand-written dict, which named 'num_uninsured' twice.
risk_columns = [
    'hospitalized',
    'cases',
    'deaths',
    'num_uninsured',
    'percent_vaccinated',
    'percent_american_indian_alaska_native',
    'percent_asian',
    'percent_black',
    'percent_excessive_drinking',
    'percent_female',
    'percent_hispanic',
    'percent_native_hawaiian_other_pacific_islander',
    'percent_non_hispanic_white',
    'epl_pov',
    'epl_unemp',
    'epl_age65',
    'epl_age17',
    'epl_disabl',
    'high bp prev',
    'stroke prev',
    'obesity_crudeprev',
    'diabetes prev',
    'arthritis prev',
    'cancer prev',
    'asthma prev',
    'copd prev',
    'smoking prev',
    'highChol prev',
    'kidney prev',
]

# The string 'max' selects pandas' own group-wise maximum; passing the
# Python builtin `max` is deprecated in recent pandas releases.
r = risks.groupby(['state'], as_index=False).agg(
    {column: 'max' for column in risk_columns}
)

r

In [None]:
# Summary statistics for the per-state risk table.
r.describe()

In [None]:
# Correlate the numeric columns only: r also holds the text column
# 'state', which recent pandas releases refuse to correlate.
r.select_dtypes(include='number').corr().style.background_gradient(cmap='cubehelix')

While not verified, there could be correlations among the risk factors presented above.   
For example, there seems to be a correlation between deaths, cases, and hospitalizations.      
However, even a statistically sound correlation does not imply causation.   