In [1]:
import pandas as pd
import numpy as np

### NHIS


In [2]:
### NHIS
nhis_data = pd.read_csv("NIHS Prevalence - USA.csv")

# filter by nihs_available == True
nhis_data = nhis_data[nhis_data["nihs_available"] == "TRUE"]

# filter by demographic_cat = race
nhis_data = nhis_data[nhis_data["demographic_cat"] == "1: Race"]

# columns to keep = disease, disease_value_1, disease_value_2, disease_value_3, disease_value_4, disease_value_5, disease_value_6
nhis_data = nhis_data[
    [
        "disease",
        "disease_value_1",
        "disease_value_2",
        "disease_value_3",
        "disease_value_4",
        "disease_value_5",
        "disease_value_6",
    ]
]

# rename columns disease_value_1 -> white, disease_value_2 -> black, disease_value_3 -> hispanic, disease_value_4 -> asian, disease_value_5 -> indiginous, disease_value_6 -> pacific_islander
nhis_data.columns = [
    "disease",
    "white",
    "black",
    "hispanic",
    "asian",
    "indiginous",
    "pacific_islander",
]

nhis_data

Unnamed: 0,disease,white,black,hispanic,asian,indiginous,pacific_islander
20,arthritis,2200.0,2100.0,1680.0,1200.0,3060.0,1660.0
21,asthma,750.0,910.0,600.0,370.0,950.0,
26,bronchitis,410.0,370.0,240.0,,,
27,cardiovascular disease,1150.0,1000.0,820.0,770.0,1460.0,
36,chronic kidney disease,200.0,310.0,220.0,280.0,,
38,coronary artery disease,570.0,540.0,510.0,440.0,860.0,
43,deafness,1660.0,850.0,1120.0,960.0,1950.0,
47,diabetes,860.0,1310.0,1320.0,1140.0,2350.0,1980.0
76,HTN,2390.0,3220.0,2370.0,2190.0,2720.0,
92,liver failure,180.0,110.0,270.0,180.0,250.0,


In [14]:
race_combined = pd.read_csv('NIHS_race.csv')
race_combined.fillna('Na', inplace=True)
race_combined

Unnamed: 0,disease,white,black,hispanic,asian,indiginous,pacific_islander,source
0,arthritis,2200.0,2100.0,1680.0,1200.00,3060.00,1660.00,NIHS
1,asthma,750.0,910.0,600.0,370,950,Na,NIHS
2,bronchitis,410.0,370.0,240.0,Na,Na,Na,NIHS
3,cardiovascular disease,1150.0,1000.0,820.0,770,1460.00,Na,NIHS
4,chronic kidney disease,200.0,310.0,220.0,280,Na,Na,NIHS
5,coronary artery disease,570.0,540.0,510.0,440,860,Na,NIHS
6,deafness,1660.0,850.0,1120.0,960,1950.00,Na,NIHS
7,diabetes,860.0,1310.0,1320.0,1140.00,2350.00,1980.00,NIHS
8,HTN,2390.0,3220.0,2370.0,2190.00,2720.00,Na,NIHS
9,liver failure,180.0,110.0,270.0,180,250,Na,NIHS


In [16]:
# drop the disease duplicates
df = race_combined.drop_duplicates(subset='disease', keep='first')
df.to_csv('NIHS_race.csv', index=False)

In [17]:
df

Unnamed: 0,disease,white,black,hispanic,asian,indiginous,pacific_islander,source
0,arthritis,2200.0,2100.0,1680.0,1200.00,3060.00,1660.00,NIHS
1,asthma,750.0,910.0,600.0,370,950,Na,NIHS
2,bronchitis,410.0,370.0,240.0,Na,Na,Na,NIHS
3,cardiovascular disease,1150.0,1000.0,820.0,770,1460.00,Na,NIHS
4,chronic kidney disease,200.0,310.0,220.0,280,Na,Na,NIHS
5,coronary artery disease,570.0,540.0,510.0,440,860,Na,NIHS
6,deafness,1660.0,850.0,1120.0,960,1950.00,Na,NIHS
7,diabetes,860.0,1310.0,1320.0,1140.00,2350.00,1980.00,NIHS
8,HTN,2390.0,3220.0,2370.0,2190.00,2720.00,Na,NIHS
9,liver failure,180.0,110.0,270.0,180,250,Na,NIHS


### GBD


In [39]:
# Load the data from Global Burden of Disease
cols_to_keep = ["cause_name", "sex_name", "measure_name", "val"]
gbd_data = pd.read_csv("IHME-GBD_2019_DATA-a2d46dc9-1.csv", usecols=cols_to_keep)

gbd_data.head()

Unnamed: 0,measure_name,sex_name,cause_name,val
0,Prevalence,Male,Cirrhosis and other chronic liver diseases,16565.586016
1,Prevalence,Female,Cirrhosis and other chronic liver diseases,12217.625529
2,Prevalence,Both,Cirrhosis and other chronic liver diseases,14355.808452
3,Prevalence,Male,Tuberculosis,12626.474197
4,Prevalence,Female,Tuberculosis,9911.710526


In [40]:
# Standardize data format

# ## val column here is in n per 100,000 people
# ## Convert this to n per 10,000 for standardization

gbd_data["val"] = gbd_data["val"] / 10

# ## Remove "Both" group from sex_name
gbd_data = gbd_data[gbd_data["sex_name"] != "Both"]

gbd_data

Unnamed: 0,measure_name,sex_name,cause_name,val
0,Prevalence,Male,Cirrhosis and other chronic liver diseases,1656.558602
1,Prevalence,Female,Cirrhosis and other chronic liver diseases,1221.762553
3,Prevalence,Male,Tuberculosis,1262.647420
4,Prevalence,Female,Tuberculosis,991.171053
6,Prevalence,Male,Diarrheal diseases,69.401038
...,...,...,...,...
208,Incidence,Female,Alopecia areata,99.085143
210,Incidence,Male,Total burden related to hepatitis B,9.174568
211,Incidence,Female,Total burden related to hepatitis B,4.201064
213,Incidence,Male,Cardiovascular diseases,116.509885


In [41]:
# make a list select every other row of val
val_list = gbd_data['val'].tolist()
# keep only the second index
val_list = val_list[1::2]

# duplicate each element twice
val_list = [val for val in val_list for _ in (0, 1)]
gbd_data['female'] = val_list

In [42]:
gbd_data = gbd_data.rename(columns={
    'measure_name': 'measure_name',
    'sex_name': 'sex_name',
    'cause_name': 'disease',
    'val': 'male',  # assuming 'val' corresponds to 'male'
})
gbd_data

Unnamed: 0,measure_name,sex_name,disease,male,female
0,Prevalence,Male,Cirrhosis and other chronic liver diseases,1656.558602,1221.762553
1,Prevalence,Female,Cirrhosis and other chronic liver diseases,1221.762553,1221.762553
3,Prevalence,Male,Tuberculosis,1262.647420,991.171053
4,Prevalence,Female,Tuberculosis,991.171053,991.171053
6,Prevalence,Male,Diarrheal diseases,69.401038,81.280525
...,...,...,...,...,...
208,Incidence,Female,Alopecia areata,99.085143,99.085143
210,Incidence,Male,Total burden related to hepatitis B,9.174568,4.201064
211,Incidence,Female,Total burden related to hepatitis B,4.201064,4.201064
213,Incidence,Male,Cardiovascular diseases,116.509885,108.848629


In [44]:
# gbd_data reorder column into reorder and keep only disease male female
gbd_data = gbd_data[['disease', 'male', 'female']]
gbd_data = gbd_data.drop_duplicates(subset='disease', keep='first')
gbd_data.to_csv('gbd_data_gender.csv', index=False)

In [12]:
len(gbd_data.cause_name.unique())

36

In [13]:
set(gbd_data.cause_name).intersection(nhis_data.disease)

set()

In [15]:
gbd_data.cause_name.unique()

array(['Cirrhosis and other chronic liver diseases', 'Tuberculosis',
       'Diarrheal diseases', 'HIV/AIDS', 'Lower respiratory infections',
       'Upper respiratory infections', 'Gastritis and duodenitis',
       'Malaria', "Parkinson's disease", 'Multiple sclerosis',
       'Chagas disease', 'Gallbladder and biliary diseases',
       'Motor neuron disease', 'Mental disorders', 'Pancreatitis',
       "Alzheimer's disease and other dementias", 'Depressive disorders',
       'Bipolar disorder', 'Diabetes mellitus', 'Chronic kidney disease',
       'Syphilis', 'Diabetes mellitus type 2', 'Endometriosis',
       'Diabetes mellitus type 1', 'Osteoarthritis',
       'Urinary tract infections and interstitial nephritis',
       'Total burden related to hepatitis B', 'Cardiovascular diseases',
       'Ischemic heart disease', 'Endocarditis', 'Alopecia areata',
       'Hypertensive heart disease', 'Asthma',
       'Atrial fibrillation and flutter', 'Rheumatoid arthritis',
       'Acne vulgar

In [16]:
nhis_data.disease.unique()

array(['arthritis', 'asthma', 'bronchitis', 'cardiovascular disease',
       'chronic kidney disease', 'coronary artery disease', 'deafness',
       'diabetes', 'HTN', 'liver failure', 'perforated ulcer', 'tinnitus',
       'visual anomalies'], dtype=object)