# Notebook to create the min-max bands for the age-specific data using the 195 countries

In [1]:
import pandas as pd
import numpy as np
import pickle as pkl
import copy
import matplotlib.pyplot as plt
from db_queries import get_location_metadata
%matplotlib inline

In [2]:
# All HAQ 2017 inputs/outputs used below hang off one scratch root.
haq_2017_root = '/share/scratch/projects/hssa/haq/HAQ_2017/'

# Draw-level HAQ directory.
haq_data_dir = haq_2017_root + 'vanilla_cc_89_paf_242/draws/'
# Per-location standardized CSVs are read from here and combined outputs are written here.
my_data_dir = haq_2017_root + 'haq_US/'
# Per-location CSVs holding the cancer causes.
my_data_dir_cancer = haq_2017_root + 'haq_2017_latest_cancer/cancers/'
# Code directory that also contains the amenable cause list CSV.
prog_dir = '/homes/arjuns13/haq/haq/risk_standardized_amen_mort/'

# Estimation years: every fifth year from 1990 to 2015, plus 2016 and 2017.
years = list(range(1990, 2016, 5)) + [2016, 2017]

### Location filtering: 195 countries. HAQ min-max calculations are supposed to be done using all 195 countries

In [5]:
# Amenable cause list shipped alongside the program code.
causelist = pd.read_csv(prog_dir + 'amenable_cause_list_GBD.csv')

# Location hierarchy used further down to pick out country-level location ids.
locsdf = get_location_metadata(location_set_id=35, gbd_round_id=5)

In [6]:
# Sanity check: (rows, columns) of the amenable cause list.
causelist.shape

(32, 8)

In [7]:
# Level-3 rows of the hierarchy are the countries used for the global bands.
is_country = locsdf['level'] == 3
# Direct children of location 102 (presumably the United States -- confirm).
is_child_of_102 = locsdf['parent_id'] == 102

country_lids = locsdf.loc[is_country, 'location_id'].values
state_lids = locsdf.loc[is_child_of_102, 'location_id'].values

In [8]:
def getmaxquantile(x, q=0.99):
    """Return an upper quantile of a group's ``rsval`` column, labelled ``rs_max``.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame (typically one group passed in by ``groupby(...).apply``) that
        has an ``rsval`` column.
    q : float, default 0.99
        Quantile to compute. The default keeps the original 99th-percentile
        behaviour, so existing ``.apply(getmaxquantile)`` calls are unaffected.

    Returns
    -------
    pandas.Series
        One-element Series whose only label is ``'rs_max'``.
    """
    # Explicit column lookup instead of attribute access.
    val = x['rsval'].quantile(q=q)
    return pd.Series({'rs_max': val})

def getminquantile(x, q=0.01):
    """Return a lower quantile of a group's ``rsval`` column, labelled ``rs_min``.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame (typically one group passed in by ``groupby(...).apply``) that
        has an ``rsval`` column.
    q : float, default 0.01
        Quantile to compute. The default keeps the original 1st-percentile
        behaviour, so existing ``.apply(getminquantile)`` calls are unaffected.

    Returns
    -------
    pandas.Series
        One-element Series whose only label is ``'rs_min'``.
    """
    # Explicit column lookup instead of attribute access.
    val = x['rsval'].quantile(q=q)
    return pd.Series({'rs_min': val})

In [9]:
# Per-cause age window, keyed by cause_id. The bounds are compared (inclusive)
# against `age_group_id` when the per-location files are filtered below.
consolidated_limits = {
    297: {'age_start': 4, 'age_end': 19},
    302: {'age_start': 2, 'age_end': 7},
    322: {'age_start': 2, 'age_end': 19},
    328: {'age_start': 2, 'age_end': 19},
    338: {'age_start': 4, 'age_end': 16},
    339: {'age_start': 4, 'age_end': 7},
    340: {'age_start': 2, 'age_end': 19},
    341: {'age_start': 5, 'age_end': 7},
    366: {'age_start': 7, 'age_end': 15},
    380: {'age_start': 2, 'age_end': 5},
    849: {'age_start': 8, 'age_end': 19},
    429: {'age_start': 8, 'age_end': 19},
    432: {'age_start': 8, 'age_end': 19},
    435: {'age_start': 8, 'age_end': 13},
    441: {'age_start': 8, 'age_end': 19},
    468: {'age_start': 8, 'age_end': 19},
    484: {'age_start': 4, 'age_end': 19},
    487: {'age_start': 4, 'age_end': 13},
    492: {'age_start': 5, 'age_end': 19},
    493: {'age_start': 4, 'age_end': 19},
    494: {'age_start': 2, 'age_end': 19},
    498: {'age_start': 4, 'age_end': 19},
    508: {'age_start': 5, 'age_end': 7},
    527: {'age_start': 5, 'age_end': 19},
    529: {'age_start': 5, 'age_end': 19},
    531: {'age_start': 5, 'age_end': 19},
    534: {'age_start': 5, 'age_end': 19},
    545: {'age_start': 4, 'age_end': 19},
    587: {'age_start': 2, 'age_end': 14},
    589: {'age_start': 4, 'age_end': 19},
    643: {'age_start': 2, 'age_end': 18},
    708: {'age_start': 2, 'age_end': 19},
}

# Turn the mapping into a long table (one row per cause) so it can be merged
# onto the data by `cause_id`.
cause_limits = (
    pd.DataFrame(consolidated_limits)
    .T
    .reset_index()
    .rename(index=str, columns={'index': 'cause_id'})
)

In [10]:
# Number of causes that have an age window defined.
len(consolidated_limits)

32

In [11]:
# Show the lookup table: one row per cause_id with its age_end / age_start bounds.
cause_limits

Unnamed: 0,cause_id,age_end,age_start
0,297,19,4
1,302,7,2
2,322,19,2
3,328,19,2
4,338,16,4
5,339,7,4
6,340,19,2
7,341,7,5
8,366,15,7
9,380,5,2


### Combined global data - non cancer

In [None]:
# Causes excluded in this cell; they are picked up by the cancer cell instead.
cancer_cause_ids = [849, 429, 432, 435, 441, 468, 484, 487]

# Collect one filtered frame per (year, country) and stack them at the end.
global_data_noncancer = []
for y in years:
    print('Processing: ',y)
    for i in country_lids:
        csv_path = '{}standardized/{}/{}.csv'.format(my_data_dir, y, i)
        df = pd.read_csv(csv_path)

        # Drop the cancer causes.
        df = df.loc[~df['cause_id'].isin(cancer_cause_ids)]

        # Attach each cause's age window and keep only rows inside it (inclusive).
        df = df.merge(cause_limits, on='cause_id')
        in_age_window = df['age_group_id'].between(df['age_start'], df['age_end'])
        df = df.loc[in_age_window].drop(['age_start', 'age_end'], axis=1)

        global_data_noncancer.append(df)

# NOTE: `df` is left holding the last location's frame; later cells inspect it.
global_data_noncancer = pd.concat(global_data_noncancer)

In [None]:
# `df` is the last per-location frame left over from the loading loop above.
df.head()

In [None]:
# Peek at the stacked non-cancer data.
global_data_noncancer.head()

In [None]:
# Per-column memory footprint of a single location's frame (`df` leaks from the loading loop).
df.memory_usage()

In [None]:
# Per-column memory footprint of the full stacked non-cancer frame.
global_data_noncancer.memory_usage()

In [None]:
# Age groups outside each cause's window were already removed while loading;
# this cell only writes the combined non-cancer data to disk.
global_out_dir = my_data_dir + 'combined_data/global_data/'
print("Saving to ", global_out_dir)
global_data_noncancer.to_csv(global_out_dir + 'global_noncancer_v2.csv', index=False)

### Combined global data - cancer

In [None]:
# Causes kept in this cell; everything else is handled by the non-cancer cell.
cancer_cause_ids = [849, 429, 432, 435, 441, 468, 484, 487]

# Collect one filtered frame per (year, country) and stack them at the end.
global_data_cancer = []
for y in years:
    print('Processing: ',y)
    for i in country_lids:
        csv_path = '{}{}/{}.csv'.format(my_data_dir_cancer, y, i)
        df = pd.read_csv(csv_path)

        # Keep only the cancer causes.
        df = df.loc[df['cause_id'].isin(cancer_cause_ids)]

        # Attach each cause's age window and keep only rows inside it (inclusive).
        df = df.merge(cause_limits, on='cause_id')
        in_age_window = df['age_group_id'].between(df['age_start'], df['age_end'])
        df = df.loc[in_age_window].drop(['age_start', 'age_end'], axis=1)

        global_data_cancer.append(df)

# NOTE: `df` is left holding the last location's frame; a later cell inspects it.
global_data_cancer = pd.concat(global_data_cancer)

In [None]:
# Per-column memory footprint of a single location's cancer frame (`df` leaks from the loop above).
df.memory_usage()

In [None]:
# Per-column memory footprint of the full stacked cancer frame.
global_data_cancer.memory_usage()

In [None]:
# Age groups outside each cause's window were already removed while loading;
# this cell only writes the combined cancer data to disk.
global_out_dir = my_data_dir + 'combined_data/global_data/'
print("Saving to ", global_out_dir)
global_data_cancer.to_csv(global_out_dir + 'global_cancer_v2.csv', index=False)

### Calculating Global Minmax - By age and draw

In [3]:
# Reload the combined frames written by the two "Combined global data" sections,
# so the min/max step can run without repeating the per-location loading loops.
global_in_dir = my_data_dir + 'combined_data/global_data/'
global_data_cancer = pd.read_csv(global_in_dir + 'global_cancer_v2.csv')
global_data_noncancer = pd.read_csv(global_in_dir + 'global_noncancer_v2.csv')

In [12]:
# Bands are computed per cause, age group and draw, pooling every year and
# location present in the combined frames.
band_keys = ['cause_id', 'age_group_id', 'draw']

# The built-in grouped quantile replaces `.apply(getmaxquantile)` /
# `.apply(getminquantile)`: it yields the same 99th / 1st percentile of `rsval`
# per group without building a one-element Series in Python for every group.
# The resulting frames keep the same columns: the three keys plus rs_max / rs_min.
print("####### Non Cancer ###########")
print("Calculating Max across years and location.")
dfmax_nc = (
    global_data_noncancer.groupby(band_keys)['rsval']
    .quantile(0.99)
    .rename('rs_max')
    .reset_index()
)
print("Calculating Min across years and location.")
dfmin_nc = (
    global_data_noncancer.groupby(band_keys)['rsval']
    .quantile(0.01)
    .rename('rs_min')
    .reset_index()
)

print("####### Cancer ###########")

print("Calculating Max across years and location.")
dfmax_c = (
    global_data_cancer.groupby(band_keys)['rsval']
    .quantile(0.99)
    .rename('rs_max')
    .reset_index()
)
print("Calculating Min across years and location.")
dfmin_c = (
    global_data_cancer.groupby(band_keys)['rsval']
    .quantile(0.01)
    .rename('rs_min')
    .reset_index()
)

####### Non Cancer ###########
Calculating Max across years and location.
Calculating Min across years and location.
####### Cancer ###########
Calculating Max across years and location.
Calculating Min across years and location.


In [13]:
# Join the lower and upper bands on their shared keys and write one file per cause family.
minmax_dir = my_data_dir + 'combined_data/min_max/'
band_merge_keys = ['cause_id', 'draw', 'age_group_id']

print("Saving Non-cancer minmax")
min_max_noncancer = pd.merge(dfmin_nc, dfmax_nc, on=band_merge_keys)
min_max_noncancer.to_csv(minmax_dir + 'minmax_noncancer_v2.csv', index=False)

print("Saving Cancer minmax")
min_max_cancer = pd.merge(dfmin_c, dfmax_c, on=band_merge_keys)
min_max_cancer.to_csv(minmax_dir + 'minmax_cancer_v2.csv', index=False)

Saving Non-cancer minmax
Saving Cancer minmax


### Combined data - peer regions (Western Europe, Australasia, S. Latin America etc.)

In [14]:
# Location ids of the peer comparison set named in the section header
# (exact id-to-name mapping not shown here -- confirm against the location metadata).
peer_locs = (65,70,73,96,101,102)
cancer_cause_ids = [849, 429, 432, 435, 441, 468, 484, 487]
peer_out_dir = my_data_dir + 'combined_data/global_data/'

# ---- Non-cancer causes -------------------------------------------------------
peer_data_noncancer = []
for y in years:
    print('Processing: ',y)
    for i in peer_locs:
        csv_path = '{}standardized/{}/{}.csv'.format(my_data_dir, y, i)
        df = pd.read_csv(csv_path)

        # Drop the cancer causes.
        df = df.loc[~df['cause_id'].isin(cancer_cause_ids)]

        # Attach each cause's age window and keep only rows inside it (inclusive).
        df = df.merge(cause_limits, on='cause_id')
        in_age_window = df['age_group_id'].between(df['age_start'], df['age_end'])
        df = df.loc[in_age_window].drop(['age_start', 'age_end'], axis=1)

        peer_data_noncancer.append(df)
peer_data_noncancer = pd.concat(peer_data_noncancer)

print("Saving to ", peer_out_dir)
peer_data_noncancer.to_csv(peer_out_dir + 'peers_noncancer_v2.csv', index=False)

# ---- Cancer causes -----------------------------------------------------------
peer_data_cancer = []
for y in years:
    print('Processing: ',y)
    for i in peer_locs:
        csv_path = '{}{}/{}.csv'.format(my_data_dir_cancer, y, i)
        df = pd.read_csv(csv_path)

        # Keep only the cancer causes.
        df = df.loc[df['cause_id'].isin(cancer_cause_ids)]

        # Attach each cause's age window and keep only rows inside it (inclusive).
        df = df.merge(cause_limits, on='cause_id')
        in_age_window = df['age_group_id'].between(df['age_start'], df['age_end'])
        df = df.loc[in_age_window].drop(['age_start', 'age_end'], axis=1)

        peer_data_cancer.append(df)
peer_data_cancer = pd.concat(peer_data_cancer)

print("Saving to ", peer_out_dir)
peer_data_cancer.to_csv(peer_out_dir + 'peers_cancer_v2.csv', index=False)

Processing:  1990
Processing:  1995
Processing:  2000
Processing:  2005
Processing:  2010
Processing:  2015
Processing:  2016
Processing:  2017
Saving to  /share/scratch/projects/hssa/haq/HAQ_2017/haq_US/combined_data/global_data/
Processing:  1990
Processing:  1995
Processing:  2000
Processing:  2005
Processing:  2010
Processing:  2015
Processing:  2016
Processing:  2017
Saving to  /share/scratch/projects/hssa/haq/HAQ_2017/haq_US/combined_data/global_data/


In [16]:
# Count the distinct causes left in the peer non-cancer data after dropping the cancer causes.
peer_data_noncancer.cause_id.nunique()

24