### Compare Subnational to SVI
Compare subnational data to SVI data in the USA.

#### 1. Clean Data

In [1]:
# Import functions
import numpy as np
import pandas as pd
import os

In [2]:
# Set the working directory to the project root so the relative data paths used
# in later cells resolve. The location can be overridden by setting the
# SVI_FLOOD_PROJECT_DIR environment variable; when it is unset, the original
# machine-specific path is used unchanged.
wd_path = os.environ.get(
    "SVI_FLOOD_PROJECT_DIR",
    "C:\\Users\\rcompos\\OneDrive - North Carolina State University\\Documents\\Research\\SVI_Flood_Project",
)
os.chdir(wd_path)
os.getcwd() # show the active working directory as the cell output
#os.listdir() # get available data

'C:\\Users\\rcompos\\OneDrive - North Carolina State University\\Documents\\Research\\SVI_Flood_Project'

In [11]:
# Load the input tables. Paths are relative to the working directory set above.
# os.path.join builds the paths with the platform's separator, so the same cell
# works on Windows (where it yields the original "Folder\\file.csv" form) and on
# macOS/Linux (where a literal backslash would not be treated as a separator).
sub = pd.read_csv(os.path.join("Subnational", "sub_pov - selections.csv"))  # subnational table
svi_cnty = pd.read_csv(os.path.join("SVI", "SVI2020_US_COUNTY.csv"))        # SVI 2020, county file
svi_tra = pd.read_csv(os.path.join("SVI", "SVI2020_US_tract.csv"))          # SVI 2020, tract file

In [4]:
# View data

#sub.head(5)
#svi_cnty.head(5)
#svi_tra.head(5)
#sub.dtypes
#svi_cnty.dtypes


In [16]:
# Prep subnational data to add to svi.
#
# Built as a single chain so that `sub_usa` is a fresh, independent frame:
#   1. keep only the rows whose 'code' is 'USA'
#   2. rename the columns to match the SVI naming ('STATE') and a short rank name
#   3. keep just the join key and the rank
#   4. cast the rank to float (astype returns a new frame, so no assignment is
#      made into a slice of `sub`, which avoids SettingWithCopyWarning)
sub_usa = (
    sub.loc[sub['code'] == 'USA']
    .rename(columns={'State': 'STATE', 'Mean Rank (SVI)': 'sub_mean_rank'})
    .loc[:, ['STATE', 'sub_mean_rank']]
    .astype({'sub_mean_rank': float})
)

#len(sub_usa) # 51, includes D.C.
#print(sub_usa[['STATE']].to_string(index=False)) # get list of states
#print(sub_usa.dtypes)
#sub_usa.head()


In [6]:
# Code to group & filter svi data

# Group svi data by state
#svi_st_m = svi_cnty.groupby('STATE').mean().reset_index()
# svi_st_m_s = svi_st_m.iloc[:,np.r_[0, 11:40]] # filter columns
#len(svi_st_m) # 51, includes D.C.
#svi_st_m_s.head(5)
#print(svi_st_m[['STATE']].to_string(index=False)) # get list of states

In [7]:
# Combine svi & subnational datasets.
#
# Four joined tables are produced, all keyed on 'STATE':
#   svi_st_med_merged - one row per sub_usa state, with the state MEDIAN of the county SVI values
#   svi_st_merged     - one row per sub_usa state, with the state MEAN of the county SVI values
#   svi_cnty_merged   - every county row, with the state's sub_mean_rank attached
#   svi_tra_merged    - every tract row, with the state's sub_mean_rank attached
#
# numeric_only=True is passed explicitly: pandas >= 2.0 no longer drops
# non-numeric columns silently and raises if the table holds text columns.
# NOTE(review): CDC SVI files reportedly encode missing values as -999; confirm
# whether those should be masked (e.g. replaced with NaN) before aggregating.
svi_by_state = svi_cnty.groupby('STATE')

# Combine by state median
svi_st_med = svi_by_state.median(numeric_only=True).reset_index()
svi_st_med_merged = pd.merge(sub_usa, svi_st_med, on=['STATE'], how='left')
#svi_st_med_merged.head()

# Combine by state mean
svi_st_m = svi_by_state.mean(numeric_only=True).reset_index()
svi_st_merged = pd.merge(sub_usa, svi_st_m, on=['STATE'], how='left')
#svi_st_merged.head()
#svi_st_merged.dtypes

# Combine by county (left join keeps every county row)
svi_cnty_merged = pd.merge(svi_cnty, sub_usa, on=['STATE'], how='left')
#svi_cnty_merged.head()

# Combine by tract (left join keeps every tract row)
svi_tra_merged = pd.merge(svi_tra, sub_usa, on=['STATE'], how='left')
#svi_tra_merged.head()


#### 2. Compare w/ simple linear correlation


*Notes:* A coefficient closer to 0 indicates a weaker linear correlation, while a coefficient closer to 1 or -1 indicates a stronger one. The datasets being compared must be the same length.

*Source:*  https://www.geeksforgeeks.org/exploring-correlation-in-python/

In [10]:
# Find the pearson correlations matrix

def _pearson_matrix(df):
    """Return the Pearson correlation matrix of the numeric/boolean columns of `df`.

    Text columns (such as the 'STATE' join key) are dropped explicitly before
    calling `corr`; pandas >= 2.0 raises on them instead of skipping them.
    """
    return df.select_dtypes(include=['number', 'bool']).corr(method='pearson')


def _top_rank_correlations(corr, n):
    """Return the `n` variables most positively correlated with 'sub_mean_rank'.

    `corr` is a full correlation matrix; the result is a one-column frame sorted
    in descending order (the first row is 'sub_mean_rank' itself, at 1.0).
    """
    return corr[['sub_mean_rank']].sort_values(by='sub_mean_rank', ascending=False).head(n)


# By state median
svi_st_med_corr = _pearson_matrix(svi_st_med_merged)
print('Rank by State svi median')
print(_top_rank_correlations(svi_st_med_corr, 7))
print()

# By state mean
svi_st_corr = _pearson_matrix(svi_st_merged)
print('Rank by State svi mean')
print(_top_rank_correlations(svi_st_corr, 7))
print()

# By county
svi_cnty_corr = _pearson_matrix(svi_cnty_merged)
print('Rank by County')
print(_top_rank_correlations(svi_cnty_corr, 3))
print()

# By tract (matrix is computed; printing is currently disabled)
svi_tra_corr = _pearson_matrix(svi_tra_merged)
#print('Rank by Tract')
#print(_top_rank_correlations(svi_tra_corr, 5))


Rank by State svi median
               sub_mean_rank
sub_mean_rank       1.000000
EP_NOHSDP           0.638436
EPL_NOHSDP          0.637651
RPL_THEME1          0.607627
SPL_THEME1          0.600100
EP_NOINT            0.579451
RPL_THEMES          0.578293

Rank by State svi mean
               sub_mean_rank
sub_mean_rank       1.000000
EPL_NOHSDP          0.642151
EP_NOHSDP           0.629543
RPL_THEME1          0.610676
RPL_THEMES          0.610454
SPL_THEMES          0.606817
SPL_THEME1          0.605124

Rank by County
               sub_mean_rank
sub_mean_rank       1.000000
EP_AFAM             0.490767
RPL_THEME1          0.473343



Definitions from "SVI2020Documentation"

EPL_NOHSDP: Percentile percentage of persons with no high school diploma (age 25+) estimate
EP_NOHSDP: Percentage of persons with no high school diploma (age 25+) estimate
RPL_THEMES: Overall percentile ranking
SPL_THEMES: Sum of series themes
RPL_THEME1: Percentile ranking for Socioeconomic Status theme summary
SPL_THEME1: Sum of series for Socioeconomic Status theme

Four Themes: Socioeconomic status, household characteristics, racial & ethnic minority status, housing type/transportation

#### 3. Add in HRLS Data

In [None]:
# Load the HRLS children-under-five table (path relative to the working directory).
# os.path.join keeps the path valid on non-Windows systems as well.
# Note: the preview in the next cell shows row labels above 124 million, so this
# read is large and may take a while / need substantial memory.
hrls_kid = pd.read_csv(os.path.join("HRLS", "USA_children_under_five_2020-03-07.csv"))

In [14]:
# Explore data: display every row except the final five. The notebook truncates
# the rendering, so only the first and last few of those rows are shown.
hrls_kid.iloc[:-5]

Unnamed: 0,latitude,longitude,population
0,29.795139,-90.838472,0.051276
1,29.795972,-90.837917,0.051276
2,29.795417,-90.838194,0.051276
3,29.796250,-90.837639,0.051276
4,29.795139,-90.838750,0.051276
...,...,...,...
124830131,42.549861,-94.179583,0.005594
124830132,42.553750,-94.187083,0.005594
124830133,42.552917,-94.179583,0.005594
124830134,42.551250,-94.180694,0.005594


#### 4. Combine HRLS & Subnational Data

In [None]:
# combine HRLS and subnational data (placeholder - not implemented yet)
# new = pd.merge(sub_usa, hrls_kid, on =['STATE'], how = 'left')
# NOTE(review): the hrls_kid preview above shows only latitude, longitude and
# population columns, so a STATE key would have to be derived before this merge
# can work - TODO confirm.

# take average of the 3 averages