## This notebook generates the HAQ-SDI data used for the HAQ-Frontier analysis. This version covers all most-detailed locations and uses age-standardized data.


In [1]:
# Widen the notebook container to the full browser width so wide DataFrame
# previews are easier to read.
# `display` and `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import HTML, display

display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import pandas as pd
import numpy as np

from numpy import sign, abs, log10
from scipy.special import logit

# SDI

In [3]:
# Load the SDI covariate extract from the cluster mount.
# Windows equivalent of the same file:
#   J:\Project\Cost_Effectiveness\DCPN_DataViz\Data\GBD_data\covariates\2018_04_20_sdi.csv
sdi = pd.read_csv(
    filepath_or_buffer='/snfs1/Project/Cost_Effectiveness/DCPN_DataViz/Data/GBD_data/covariates/2018_04_20_sdi.csv'
)

In [4]:
# Preview the first rows of the raw SDI table.
sdi.head()

Unnamed: 0.1,Unnamed: 0,model_version_id,covariate_id,covariate_name_short,location_id,location_name,year_id,age_group_id,age_group_name,sex_id,mean_value,lower_value,upper_value
0,1,23297,881,sdi,1,Global,2016,22,All Ages,3,0.649754,0.649754,0.649754
1,2,23297,881,sdi,4,"Southeast Asia, East Asia, and Oceania",2016,22,All Ages,3,0.683368,0.683368,0.683368
2,3,23297,881,sdi,5,East Asia,2016,22,All Ages,3,0.712705,0.712705,0.712705
3,4,23297,881,sdi,6,China,2016,22,All Ages,3,0.712014,0.712014,0.712014
4,5,23297,881,sdi,7,North Korea,2016,22,All Ages,3,0.527559,0.527559,0.527559


In [5]:
# List the distinct age-group labels present in the SDI table.
sdi['age_group_name'].unique()

array(['All Ages'], dtype=object)

In [6]:
# Keep only the location key and the SDI point estimate, exposing the
# estimate under the shorter column name `sdi`.
sdi_1 = sdi[['location_id', 'mean_value']].rename(columns={'mean_value': 'sdi'})

In [7]:
# Dimensions (rows, columns) of the trimmed SDI table.
sdi_1.shape

(919, 2)

In [8]:
# Box plot of the SDI values (mean_value), grouped by age_group_name; the
# object returned by the plot call is kept in `sbp`.
sbp = sdi.boxplot(column='mean_value',by='age_group_name')

In [9]:
# Preview the trimmed SDI table after the column rename.
sdi_1.head()

Unnamed: 0,location_id,sdi
0,1,0.649754
1,4,0.683368
2,5,0.712705
3,6,0.712014
4,7,0.527559


In [10]:
# Sanity check: confirm the UK (location_id 95) has a row. This location is
# often problematic because it is usually reported disaggregated.
sdi_1[sdi_1['location_id'] == 95]

Unnamed: 0,location_id,sdi
92,95,0.840568


In [11]:
# Add the logit of SDI, logit(p) = log(p / (1 - p)), as a new column.
sdi_1 = sdi_1.assign(logit_sdi=logit(sdi_1['sdi']))

In [12]:
# Preview the SDI table including the new logit_sdi column.
sdi_1.head()

Unnamed: 0,location_id,sdi,logit_sdi
0,1,0.649754,0.617958
1,4,0.683368,0.769294
2,5,0.712705,0.908556
3,6,0.712014,0.905187
4,7,0.527559,0.110348


In [13]:
# Dimensions after adding logit_sdi (one more column, same row count expected).
sdi_1.shape

(919, 3)

In [14]:
# Summary statistics for every column of sdi_1, to eyeball the range of
# sdi and logit_sdi.
sdi_1.describe()

Unnamed: 0,location_id,sdi,logit_sdi
count,919.0,919.0,919.0
mean,21679.757345,0.700113,0.955602
std,20759.983582,0.156078,0.783099
min,1.0,0.194042,-1.423959
25%,484.5,0.605473,0.428324
50%,4916.0,0.731881,1.004186
75%,44701.5,0.833526,1.610826
max,53432.0,0.939852,2.748912


# Age-standardized HAQ across all of the 823 most detailed locations for aggregated cause_id=100

In [15]:
# Load the 2016 age-standardized, all-cause HAQ results for the most
# detailed locations.
# An earlier version of this cell read a different file instead:
#   /snfs1/Project/Cost_Effectiveness/Access_to_care/temp/haq_by_age_7.20.18.csv
# pandas is already imported as `pd` in the import cell at the top of the
# notebook, so it is not re-imported here.
haq_by_ageStd_cause_allMostDetLocs = pd.read_csv('/share/scratch/projects/hssa/haq/HAQ_2017/haq_US/results/haq_2016_by_allcause_ageStd_allMostDetLocs.csv')

In [16]:
# Discard the two alternative mean-index columns; only `index` and its
# lower/upper bounds are kept.
haq_by_ageStd_cause_allMostDetLocs = haq_by_ageStd_cause_allMostDetLocs.drop(
    labels=['index_mean', 'index_geom_mean'],
    axis='columns',
)

In [17]:
# Dimensions of the HAQ table after dropping the two mean-index columns.
haq_by_ageStd_cause_allMostDetLocs.shape

(823, 8)

In [18]:
# Preview the first rows of the HAQ table.
haq_by_ageStd_cause_allMostDetLocs.head()

Unnamed: 0.1,Unnamed: 0,location_id,age_group_id,year_id,cause_id,index,index_lval,index_uval
0,1,33,27,2016,100,58.84703,57.716748,59.995031
1,2,34,27,2016,100,49.645508,47.571548,51.645469
2,3,35,27,2016,100,52.333543,50.949559,53.626168
3,4,36,27,2016,100,56.310988,54.788049,57.927503
4,5,37,27,2016,100,49.070868,47.962801,50.269645


In [19]:
# Summary statistics for the HAQ table; constant columns show up with std 0.
haq_by_ageStd_cause_allMostDetLocs.describe()

Unnamed: 0.1,Unnamed: 0,location_id,age_group_id,year_id,cause_id,index,index_lval,index_uval
count,823.0,823.0,823.0,823.0,823.0,823.0,823.0,823.0
mean,412.0,23426.015796,27.0,2016.0,100.0,60.048329,58.001686,62.090836
std,237.72393,20781.242124,0.0,0.0,0.0,23.362364,24.14423,22.584093
min,1.0,7.0,27.0,2016.0,100.0,5.143707,2.917744,8.038045
25%,206.5,506.5,27.0,2016.0,100.0,38.923975,36.669203,41.810445
50%,412.0,35449.0,27.0,2016.0,100.0,63.917383,62.369214,65.36369
75%,617.5,44717.5,27.0,2016.0,100.0,82.33828,80.96613,83.686808
max,823.0,53432.0,27.0,2016.0,100.0,92.469471,91.274874,93.647151


### For the 823 most detailed locs

In [20]:
# Number of distinct locations in the HAQ table (missing values, if any,
# count as one value, as with len(unique())).
haq_by_ageStd_cause_allMostDetLocs['location_id'].nunique(dropna=False)

823

### Only age group 27 as the data here is age standardized

In [21]:
# Number of distinct age groups in the HAQ table.
haq_by_ageStd_cause_allMostDetLocs['age_group_id'].nunique(dropna=False)

1

### aggregated cause_id=100

In [22]:
# Number of distinct cause ids in the HAQ table.
haq_by_ageStd_cause_allMostDetLocs['cause_id'].nunique(dropna=False)

1

In [23]:
# Shape of the subset of rows whose HAQ index is exactly 0; such rows would
# give -inf under the log transform applied later.
haq_by_ageStd_cause_allMostDetLocs.loc[
    haq_by_ageStd_cause_allMostDetLocs['index'].eq(0)
].shape

(0, 8)

In [24]:
# Show the rows (if any) whose HAQ index is exactly 0.
haq_by_ageStd_cause_allMostDetLocs.loc[
    haq_by_ageStd_cause_allMostDetLocs['index'].eq(0)
]

Unnamed: 0.1,Unnamed: 0,location_id,age_group_id,year_id,cause_id,index,index_lval,index_uval


In [25]:
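# Zero-replacement guard, left disabled because no row has index == 0 (see the check above).
# The chained-indexing form below assigns to a temporary copy and would not modify the frame:
#haq_by_ageStd_cause_allMostDetLocs[haq_by_ageStd_cause_allMostDetLocs['index'] == 0]['index'] = 1e-6

# If the guard is ever needed, use the single .loc assignment instead:
#haq_by_ageStd_cause_allMostDetLocs.loc[haq_by_ageStd_cause_allMostDetLocs['index'] == 0, 'index'] = 1e-6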

In [26]:
# Smallest HAQ index value; it must be strictly positive for the log
# transform to be finite.
haq_by_ageStd_cause_allMostDetLocs['index'].min()

5.1437066979969295

In [27]:
# Add the natural log of the HAQ index as a new column, ln_haq.
haq_by_ageStd_cause_allMostDetLocs = haq_by_ageStd_cause_allMostDetLocs.assign(
    ln_haq=np.log(haq_by_ageStd_cause_allMostDetLocs['index'])
)

In [28]:
# Smallest ln_haq value; a finite number here means no zero index reached the log.
haq_by_ageStd_cause_allMostDetLocs['ln_haq'].min()

1.6377739670354008

In [29]:
# Dimensions after adding ln_haq (one more column, same row count expected).
haq_by_ageStd_cause_allMostDetLocs.shape

(823, 9)

In [30]:
# Preview the HAQ table including the new ln_haq column.
haq_by_ageStd_cause_allMostDetLocs.head()

Unnamed: 0.1,Unnamed: 0,location_id,age_group_id,year_id,cause_id,index,index_lval,index_uval,ln_haq
0,1,33,27,2016,100,58.84703,57.716748,59.995031,4.074941
1,2,34,27,2016,100,49.645508,47.571548,51.645469,3.904908
2,3,35,27,2016,100,52.333543,50.949559,53.626168,3.957638
3,4,36,27,2016,100,56.310988,54.788049,57.927503,4.03089
4,5,37,27,2016,100,49.070868,47.962801,50.269645,3.893266


# Merge SDI and age-standardized HAQ

In [31]:
# Inner-join logit_sdi onto the HAQ rows by location_id; a location present
# in only one of the two tables is dropped from the result.
haq_sdi = sdi_1[['location_id', 'logit_sdi']].merge(
    haq_by_ageStd_cause_allMostDetLocs[['location_id', 'age_group_id', 'ln_haq']],
    how='inner',
    on='location_id',
)

In [32]:
# Dimensions of the merged table; the row count shows how many locations matched.
haq_sdi.shape

(823, 4)

In [33]:
# Preview the first rows of the merged table.
haq_sdi.head()

Unnamed: 0,location_id,logit_sdi,age_group_id,ln_haq
0,7,0.110348,27,3.792486
1,8,1.820419,27,4.367523
2,10,-0.093432,27,3.509703
3,12,0.046388,27,3.245486
4,13,1.13717,27,4.000794


In [34]:
# Age-group ids present in the merged table.
haq_sdi['age_group_id'].unique()

array([27])

In [35]:
# Number of distinct locations that survived the merge.
haq_sdi['location_id'].nunique(dropna=False)

823

In [36]:
# Save the merged table as CSV under the user's home directory.
# NOTE(review): with pandas' default index=True the row index is written as
# an extra unnamed first column — confirm downstream readers expect that.
haq_sdi.to_csv(
    path_or_buf='/ihme/homes/arjuns13/notebooks/Documents/Data/haq_sdi_ageStd_aggregatedCauses_allMostDetLocs.csv'
)

In [37]:
# Save a second copy of the merged table in the shared project folder.
# Every other path in this notebook is a POSIX cluster path, and the SDI
# input cell shows that J:\Project corresponds to /snfs1/Project. On a
# non-Windows host the original 'J:\\Project\\...' string is not a location
# on the J: drive; it is taken as one long file name in the current working
# directory. Use the cluster mount so the file lands in the project folder.
haq_sdi.to_csv('/snfs1/Project/Cost_Effectiveness/NPC/Regression_Analysis/haq_sdi_ageStd_aggregatedCauses_allMostDetLocs.csv')