# Notebook to create the min-max bands for the age-standardized data using the 195 countries

In [1]:
import pandas as pd
import numpy as np
import pickle as pkl
import copy
import matplotlib.pyplot as plt
from db_queries import get_location_metadata
%matplotlib inline

In [2]:
# Directory of HAQ draw files (not referenced by the other cells visible in this notebook).
haq_data_dir = '/share/scratch/projects/hssa/haq/HAQ_2017/vanilla_cc_89_paf_242/draws/'
# Working directory: per-location age-standardized CSVs are read from here and
# the min-max CSVs are written underneath it.
my_data_dir = '/share/scratch/projects/hssa/haq/HAQ_2017/haq_US/'
# Analysis code directory; the amenable cause list CSV is read from here.
prog_dir = '/homes/arjuns13/haq/haq/risk_standardized_amen_mort/'
# Per-location CSVs used for the cancer causes, laid out as <year>/<location_id>.csv.
my_data_dir_cancer = '/share/scratch/projects/hssa/haq/HAQ_2017/haq_2017_latest_cancer/cancers_age_std_all_most_det_locs/'
# Years to process. Only 2016 is active; the disabled alternative was
# [1990, 1995, 2000, 2005, 2010, 2015, 2016, 2017].
years = [2016]

### Use only the countries to compute the min-max values; the 823 most detailed locations are used for the subsequent analysis

In [3]:
# Location hierarchy for GBD round 5, location set 35.
locsdf = get_location_metadata(location_set_id=35, gbd_round_id=5)
# Amenable cause list stored next to the analysis code.
causelist = pd.read_csv(prog_dir + 'amenable_cause_list_GBD.csv')

In [4]:
# location_id column restricted to the row(s) whose location_id is 102.
loc_US = locsdf.loc[locsdf['location_id'] == 102, 'location_id']

In [5]:
# Display the selection; the captured output shows a single row holding location_id 102.
loc_US

181    102
Name: location_id, dtype: int64

In [None]:
# Level-3 locations (the countries) as a plain array of location_ids.
country_lids = locsdf.loc[locsdf['level'].eq(3), 'location_id'].values
# Direct children of location 102 as a plain array of location_ids.
state_lids = locsdf.loc[locsdf['parent_id'].eq(102), 'location_id'].values

In [None]:
# Metadata rows flagged as most detailed (most_detailed == 1).
most_det_locs = locsdf.loc[locsdf['most_detailed'].eq(1)]

In [None]:
# Series of location_ids for the most detailed locations, selected straight
# from locsdf so the cell can be re-run safely: indexing the previous value of
# most_det_locs with ['location_id'] fails once it is already a Series.
most_det_locs = locsdf.loc[locsdf['most_detailed'] == 1, 'location_id']

In [None]:
# Number of entries in most_det_locs (its location_ids when it is the Series built just before this cell).
len(list(most_det_locs))

In [None]:
# Number of level-3 locations; the notebook title refers to 195 countries.
len(country_lids)

In [None]:
def getmaxquantile(x, q=0.99):
    """Upper band for one group: a high quantile of its ``rsval`` column.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows of a single group (as handed over by ``groupby(...).apply``);
        must contain an ``rsval`` column.
    q : float, default 0.99
        Quantile to take. The default reproduces the original hard-coded
        99th percentile, so existing one-argument calls are unaffected.

    Returns
    -------
    pandas.Series
        One entry labelled ``rs_max``.
    """
    return pd.Series({'rs_max': x['rsval'].quantile(q=q)})

def getminquantile(x, q=0.01):
    """Lower band for one group: a low quantile of its ``rsval`` column.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows of a single group (as handed over by ``groupby(...).apply``);
        must contain an ``rsval`` column.
    q : float, default 0.01
        Quantile to take. The default reproduces the original hard-coded
        1st percentile, so existing one-argument calls are unaffected.

    Returns
    -------
    pandas.Series
        One entry labelled ``rs_min``.
    """
    return pd.Series({'rs_min': x['rsval'].quantile(q=q)})

In [None]:
# Age-group limits per cause_id, listed as (cause_id, age_start, age_end) and
# expanded into the nested mapping {cause_id: {'age_start': ..., 'age_end': ...}}.
consolidated_limits = {
    cause_id: {'age_start': age_start, 'age_end': age_end}
    for cause_id, age_start, age_end in [
        (297, 4, 19),
        (302, 2, 7),
        (322, 2, 19),
        (328, 2, 19),
        (338, 4, 16),
        (339, 4, 7),
        (340, 2, 19),
        (341, 5, 7),
        (366, 7, 15),
        (380, 2, 5),
        (849, 8, 19),
        (429, 8, 19),
        (432, 8, 19),
        (435, 8, 13),
        (441, 8, 19),
        (468, 8, 19),
        (484, 4, 19),
        (487, 4, 13),
        (492, 5, 19),
        (493, 4, 19),
        (494, 2, 19),
        (498, 4, 19),
        (508, 5, 7),
        (527, 5, 19),
        (529, 5, 19),
        (531, 5, 19),
        (534, 5, 19),
        (545, 4, 19),
        (587, 2, 14),
        (589, 4, 19),
        (643, 2, 18),
        (708, 2, 19),
    ]
}

# One row per cause: transpose so causes become rows, move the cause ids out
# of the index into a 'cause_id' column, and stringify the new row labels.
cause_limits = (
    pd.DataFrame(consolidated_limits)
    .T
    .reset_index()
    .rename(index=str, columns={'index':'cause_id'})
)

In [None]:
# Number of causes that have age limits defined.
len(consolidated_limits)

In [None]:
# Size check on the most-detailed-location selection.
most_det_locs.shape

### Combined global data - non-cancer causes, age-standardized, one file per country (read from the `std_all_most_det_locs_age_std` directory)

In [None]:
# Collect one filtered frame per (year, country) and concatenate once at the end.
# NOTE: `df` is deliberately left holding the last frame read; later cells inspect it.
global_data_noncancer_ageStd = []
for y in years:
    print('Processing: ',y)
    for i in country_lids:
        print('Processing loc: ',i)
        df = pd.read_csv(my_data_dir+'std_all_most_det_locs_age_std/'+str(y)+'/'+str(i)+'.csv')
        # Exclude the eight cancer cause_ids, then attach the age limits of each
        # remaining cause (inner join: causes without limits are dropped).
        df = df.loc[~df['cause_id'].isin([849, 429, 432, 435, 441, 468, 484, 487])].merge(
            cause_limits, on='cause_id'
        )
        global_data_noncancer_ageStd.append(df)

global_data_noncancer_ageStd = pd.concat(global_data_noncancer_ageStd, axis=0)

In [None]:
# First rows of the frame left in `df` by the loop above (the last year/location read).
df.head()

In [None]:
# First rows of the concatenated non-cancer frame.
global_data_noncancer_ageStd.head()

In [None]:
# Per-column memory usage of a single per-location frame.
df.memory_usage()

In [None]:
# Per-column memory usage of the concatenated non-cancer frame.
global_data_noncancer_ageStd.memory_usage()

### Combined global data - cancer

In [None]:
# Collect one filtered frame per (year, country) and concatenate once at the end.
# NOTE: `df` is deliberately left holding the last frame read; later cells inspect it.
global_data_cancer_ageStd = []
for y in years:
    print('Processing: ',y)
    for i in country_lids:
        print('Processing loc: ',i)
        df = pd.read_csv(my_data_dir_cancer+str(y)+'/'+str(i)+'.csv')
        # Keep only the eight cancer cause_ids, then attach each cause's age
        # limits (inner join: causes without limits are dropped).
        df = df.loc[df['cause_id'].isin([849, 429, 432, 435, 441, 468, 484, 487])].merge(
            cause_limits, on='cause_id'
        )
        global_data_cancer_ageStd.append(df)

global_data_cancer_ageStd = pd.concat(global_data_cancer_ageStd, axis=0)

In [None]:
# Per-column memory usage of the frame currently bound to `df`
# (after the cancer loop, the last cancer file read).
df.memory_usage()

In [None]:
# Per-column memory usage of the concatenated cancer frame.
global_data_cancer_ageStd.memory_usage()

In [None]:
# First rows of the concatenated cancer frame.
global_data_cancer_ageStd.head()

### Calculating global min-max bands (1st and 99th percentiles) - by cause, age group and draw

In [None]:
# Disabled alternative: read previously combined frames from CSV instead of
# using the frames built in the loading cells (assumes those files were saved
# earlier -- TODO confirm before enabling).
#global_data_noncancer_ageStd = pd.read_csv(my_data_dir+'combined_data/global_data/' + 'global_noncancer_mostDetLocs_ageStd.csv')
#global_data_cancer_ageStd = pd.read_csv(my_data_dir+'combined_data/global_data/' + 'global_cancer_mostDetLocs_ageStd.csv')

In [None]:
# For every (cause_id, age_group_id, draw) group, take the 99th and 1st
# percentile of rsval over all rows of the combined frame (every year and
# location that was loaded). The built-in grouped quantile is used instead of
# a Python-level apply per group: it yields the same columns
# (keys + rs_max / rs_min) without calling a function for each group.
print("####### Non Cancer ###########")
print("Calculating Max across years and location.")
dfmax_nc = (
    global_data_noncancer_ageStd
    .groupby(['cause_id','age_group_id','draw'])['rsval']
    .quantile(0.99)
    .rename('rs_max')
    .reset_index()
)
print("Calculating Min across years and location.")
dfmin_nc = (
    global_data_noncancer_ageStd
    .groupby(['cause_id','age_group_id','draw'])['rsval']
    .quantile(0.01)
    .rename('rs_min')
    .reset_index()
)

print("####### Cancer ###########")

print("Calculating Max across years and location.")
dfmax_c = (
    global_data_cancer_ageStd
    .groupby(['cause_id','age_group_id','draw'])['rsval']
    .quantile(0.99)
    .rename('rs_max')
    .reset_index()
)
print("Calculating Min across years and location.")
dfmin_c = (
    global_data_cancer_ageStd
    .groupby(['cause_id','age_group_id','draw'])['rsval']
    .quantile(0.01)
    .rename('rs_min')
    .reset_index()
)

In [None]:
# Pair each lower band with its upper band on the shared keys and write one
# CSV per cause family (row index omitted).
print("Saving Non-cancer minmax")
min_max_noncancer_ageStd = pd.merge(dfmin_nc, dfmax_nc, on=['cause_id','draw','age_group_id'])
min_max_noncancer_ageStd.to_csv(
    my_data_dir+'combined_data/min_max/minmax_noncancer_ageStd.csv',
    index=False,
)
print("Saving Cancer minmax")
min_max_cancer_ageStd = pd.merge(dfmin_c, dfmax_c, on=['cause_id','draw','age_group_id'])
min_max_cancer_ageStd.to_csv(
    my_data_dir+'combined_data/min_max/minmax_cancer_ageStd.csv',
    index=False,
)

In [None]:
# (rows, columns) of the non-cancer upper-band table: one row per (cause, age group, draw).
dfmax_nc.shape

In [None]:
# First rows of the non-cancer lower-band table.
dfmin_nc.head()

In [None]:
# (rows, columns) of the cancer upper-band table: one row per (cause, age group, draw).
dfmax_c.shape

In [None]:
# First rows of the cancer upper-band table.
dfmax_c.head()

In [None]:
# First rows of the cancer lower-band table.
dfmin_c.head()