## This notebook generates the HAQ-SDI data used for the HAQ frontier analysis. It covers all most-detailed locations plus the US, using age-standardized data.


In [1]:
# Stretch the notebook's .container element to the full browser width.
# `display` and `HTML` are imported from the public IPython.display module:
# importing `display` from IPython.core.display is deprecated (IPython >= 7.14).
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import pandas as pd
import numpy as np

from numpy import sign, abs, log10
from scipy.special import logit

# SDI

In [3]:
# Load the SDI covariate table (2018-04-20 extract) into `sdi`.
# The commented-out read below is the Windows J: drive spelling of the same
# path tail; the active read uses the /snfs1 mount.
#sdi = pd.read_csv('J:\\Project\\Cost_Effectiveness\\DCPN_DataViz\\Data\\GBD_data\\covariates\\2018_04_20_sdi.csv')
sdi = pd.read_csv('/snfs1/Project/Cost_Effectiveness/DCPN_DataViz/Data/GBD_data/covariates/2018_04_20_sdi.csv')

In [4]:
sdi.head()

Unnamed: 0.1,Unnamed: 0,model_version_id,covariate_id,covariate_name_short,location_id,location_name,year_id,age_group_id,age_group_name,sex_id,mean_value,lower_value,upper_value
0,1,23297,881,sdi,1,Global,2016,22,All Ages,3,0.649754,0.649754,0.649754
1,2,23297,881,sdi,4,"Southeast Asia, East Asia, and Oceania",2016,22,All Ages,3,0.683368,0.683368,0.683368
2,3,23297,881,sdi,5,East Asia,2016,22,All Ages,3,0.712705,0.712705,0.712705
3,4,23297,881,sdi,6,China,2016,22,All Ages,3,0.712014,0.712014,0.712014
4,5,23297,881,sdi,7,North Korea,2016,22,All Ages,3,0.527559,0.527559,0.527559


In [5]:
sdi['age_group_name'].unique()

array(['All Ages'], dtype=object)

In [6]:
# Keep only the location key and the SDI estimate, exposing the estimate
# under the shorter column name 'sdi'.
sdi_1 = (
    sdi[['location_id', 'mean_value']]
    .rename(columns={'mean_value': 'sdi'})
)

In [7]:
sdi_1.shape

(919, 2)

In [8]:
sbp = sdi.boxplot(column='mean_value',by='age_group_name')

In [9]:
sdi_1.head()

Unnamed: 0,location_id,sdi
0,1,0.649754
1,4,0.683368
2,5,0.712705
3,6,0.712014
4,7,0.527559


In [10]:
# Make sure UK is in data (this location is often problematic because it is usually disaggregated)
sdi_1.query('location_id == 95')

Unnamed: 0,location_id,sdi
92,95,0.840568


In [11]:
# Logit-transform SDI and store the result next to the raw value.
sdi_1['logit_sdi'] = sdi_1['sdi'].pipe(logit)

In [12]:
sdi_1.head()

Unnamed: 0,location_id,sdi,logit_sdi
0,1,0.649754,0.617958
1,4,0.683368,0.769294
2,5,0.712705,0.908556
3,6,0.712014,0.905187
4,7,0.527559,0.110348


In [13]:
sdi_1.shape

(919, 3)

In [14]:
sdi_1.describe()

Unnamed: 0,location_id,sdi,logit_sdi
count,919.0,919.0,919.0
mean,21679.757345,0.700113,0.955602
std,20759.983582,0.156078,0.783099
min,1.0,0.194042,-1.423959
25%,484.5,0.605473,0.428324
50%,4916.0,0.731881,1.004186
75%,44701.5,0.833526,1.610826
max,53432.0,0.939852,2.748912


# Age-standardized HAQ for aggregated cause_id=100: the US-only file is loaded here; the table for the 823 most detailed locations is read in further below

In [15]:
# Load the US-only HAQ results file (2016, all-cause, age-standardized per
# its filename) into `haq_by_ageStd_cause_USonly`.
# The two commented-out reads point at a different, by-age HAQ file (Windows
# and cluster spellings of the same path); they are not used in this notebook.
#haq_by_age = pd.read_csv('J:\\Project\\Cost_Effectiveness\\Access_to_care\\temp\\haq_by_age_7.20.18.csv')
#haq_by_age = pd.read_csv('/snfs1/Project/Cost_Effectiveness/Access_to_care/temp/haq_by_age_7.20.18.csv')
# NOTE(review): pandas is already imported in the import cell near the top;
# this re-import only matters when the cell is run on its own.
import pandas as pd
haq_by_ageStd_cause_USonly = pd.read_csv('/share/scratch/projects/hssa/haq/HAQ_2017/haq_US/results/haq_2016_by_allcause_ageStd_USonly.csv')

In [16]:
# Drop the 'index_mean' and 'index_geom_mean' columns, which are not used
# in the rest of the notebook.
haq_by_ageStd_cause_USonly = haq_by_ageStd_cause_USonly.drop(
    ['index_mean', 'index_geom_mean'], axis=1
)

In [17]:
haq_by_ageStd_cause_USonly.shape

(1, 8)

In [18]:
haq_by_ageStd_cause_USonly.head()

Unnamed: 0.1,Unnamed: 0,location_id,age_group_id,year_id,cause_id,index,index_lval,index_uval
0,1,102,27,2016,100,79.405234,78.752879,80.120362


In [19]:
haq_by_ageStd_cause_USonly.describe()

Unnamed: 0.1,Unnamed: 0,location_id,age_group_id,year_id,cause_id,index,index_lval,index_uval
count,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
mean,1.0,102.0,27.0,2016.0,100.0,79.405234,78.752879,80.120362
std,,,,,,,,
min,1.0,102.0,27.0,2016.0,100.0,79.405234,78.752879,80.120362
25%,1.0,102.0,27.0,2016.0,100.0,79.405234,78.752879,80.120362
50%,1.0,102.0,27.0,2016.0,100.0,79.405234,78.752879,80.120362
75%,1.0,102.0,27.0,2016.0,100.0,79.405234,78.752879,80.120362
max,1.0,102.0,27.0,2016.0,100.0,79.405234,78.752879,80.120362


In [22]:
# Natural log of the HAQ index, stored as 'ln_haq'.
# The column is literally named 'index', so it must be read with brackets:
# attribute access (.index) would return the DataFrame's row index instead.
haq_by_ageStd_cause_USonly['ln_haq'] = haq_by_ageStd_cause_USonly['index'].pipe(np.log)

# Merge SDI and age-standardized HAQ

In [23]:
# Attach logit(SDI) to each HAQ row by location_id. The join is an inner
# join (pandas' default), so only locations present in both tables survive.
haq_sdi = sdi_1[['location_id', 'logit_sdi']].merge(
    haq_by_ageStd_cause_USonly[['location_id', 'age_group_id', 'ln_haq']],
    how='inner',
    on='location_id',
)

In [24]:
haq_sdi.shape

(1, 4)

In [25]:
haq_sdi.head()

Unnamed: 0,location_id,logit_sdi,age_group_id,ln_haq
0,102,1.86831,27,4.374564


In [26]:
haq_sdi['age_group_id'].unique()

array([27])

In [27]:
len(haq_sdi['location_id'].unique())

1

In [28]:
# Load the previously saved HAQ/SDI table for the most detailed locations.
# NOTE(review): this CSV is not written by any of the visible cells --
# presumably produced by a sibling notebook; confirm provenance.
haq_sdi_ageStd_allMostDetLocs = pd.read_csv('/ihme/homes/arjuns13/notebooks/Documents/Data/haq_sdi_ageStd_aggregatedCauses_allMostDetLocs.csv')

In [29]:
# Remove the 'Unnamed: 0' column that read_csv created from the saved
# file's unnamed first column.
haq_sdi_ageStd_allMostDetLocs = haq_sdi_ageStd_allMostDetLocs.drop('Unnamed: 0', axis=1)

In [31]:
haq_sdi_ageStd_allMostDetLocs.head()

Unnamed: 0,location_id,logit_sdi,age_group_id,ln_haq
0,7,0.110348,27,3.792486
1,8,1.820419,27,4.367523
2,10,-0.093432,27,3.509703
3,12,0.046388,27,3.245486
4,13,1.13717,27,4.000794


In [32]:
haq_sdi_ageStd_allMostDetLocs.shape

(823, 4)

In [33]:
# Append the US row to the most-detailed-location rows.
# ignore_index=True renumbers the combined rows 0..n-1. Without it the US
# row keeps its own label 0, which duplicates the label of the first
# most-detailed-location row; label-based lookups then return two rows and
# any CSV written with the index carries the duplicated value.
haq_sdi_all = pd.concat(
    [haq_sdi_ageStd_allMostDetLocs, haq_sdi],
    ignore_index=True,
)

In [34]:
haq_sdi_all.head()

Unnamed: 0,location_id,logit_sdi,age_group_id,ln_haq
0,7,0.110348,27,3.792486
1,8,1.820419,27,4.367523
2,10,-0.093432,27,3.509703
3,12,0.046388,27,3.245486
4,13,1.13717,27,4.000794


In [35]:
haq_sdi_all.shape

(824, 4)

In [36]:
# Write out the combined table to CSV in the cluster home directory.
# `index=` is left at its default, so the row index is written as the first,
# unnamed column (it reads back as 'Unnamed: 0').
haq_sdi_all.to_csv('/ihme/homes/arjuns13/notebooks/Documents/Data/haq_sdi_ageStd_aggregatedCauses_allLocsIncUS.csv')

In [37]:
# Write a second copy of the combined table to the shared project folder.
# The original path was Windows-only ('J:\\...'). Off Windows the backslashes
# are ordinary filename characters, so the file would land in the working
# directory under the literal name 'J:\Project\...'. Pick the spelling that
# matches the running OS instead.
# NOTE(review): the /snfs1 equivalent of J: is taken from the SDI load cell,
# which pairs 'J:\\Project\\...' with '/snfs1/Project/...' -- confirm the
# NPC/Regression_Analysis folder is reachable through the same mount.
import os
haq_sdi_all.to_csv(
    'J:\\Project\\Cost_Effectiveness\\NPC\\Regression_Analysis\\haq_sdi_ageStd_aggregatedCauses_allLocsIncUS.csv'
    if os.name == 'nt'
    else '/snfs1/Project/Cost_Effectiveness/NPC/Regression_Analysis/haq_sdi_ageStd_aggregatedCauses_allLocsIncUS.csv'
)