# 1. Compare DC to all 500 cities

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

cities = pd.read_csv("CHDB_data_city_all v7_1.csv")
print("Dataset dimensions: " + str(cities.shape))

Dataset dimensions: (60500, 22)


  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Take a look at the number of unique values for each column
for col_name in cities.columns:
    column = cities[col_name]
    print(col_name + ": "  + str(column.nunique()))

state_abbr: 51
state_fips: 51
place_fips: 397
stpl_fips: 500
city_name: 474
metric_name: 36
group_name: 14
metric_number: 36
group_number: 14
num: 9469
denom: 13957
est: 3217
lci: 2839
uci: 3133
county_indicator: 4
educ_indicator: 3
multiplier_indicator: 2
data_yr_type: 12
geo_level: 1
date_export: 1
version: 1
NOTE - NCHS Disclaimer: 1


In [74]:
# eliminate all extraneous columns (year of collection, state code, etc.)
metrics = cities[['metric_name', 'num', 'denom', 'est']]

# create descrtive statistics for all cities
avg_metrics = metrics.groupby(['metric_name']).mean()

# create dataset specificalle for dc
dc_total = cities[(cities['state_abbr'] == 'DC') & (cities['group_name'] == 'total population')]
dc_vals = dc_total[['metric_name', 'est']]
dc_vals.set_index('metric_name', inplace=True)

#drop gender/ethnic subcategories, keep total population
metrics_totalpop = cities[cities['group_name'] == 'total population']
overall_stats = metrics_totalpop[['metric_name', 'est']].groupby(['metric_name']).describe()

# in what categories is DC an outlier?
# calculate dc's z-score for each metrics
overall_stats.columns = overall_stats.columns.get_level_values(1)
overall_stats['dc_avg'] = dc_vals['est']
overall_stats['dc_z-score'] = (overall_stats['dc_avg'] - overall_stats['mean']) / overall_stats['std']
print("Stats for all 500 cities:")
overall_stats


Stats for all 500 cities:


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max,dc_avg,dc_z-score
metric_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Absenteeism,500.0,18.0732,9.341169,0.0,11.8,16.25,22.5,65.8,41.1,2.465088
Air pollution - particulate matter,498.0,9.191165,1.692985,4.9,7.9,9.15,10.3,15.7,10.0,0.477757
Binge drinking,500.0,17.6562,2.684346,9.1,16.0,17.55,19.2,27.4,24.3,2.475016
Breast cancer deaths,492.0,24.428049,6.056955,11.3,20.3,23.7,27.9,50.6,28.0,0.589727
Cardiovascular disease deaths,494.0,210.440486,58.725131,46.7,171.35,202.75,236.825,515.2,233.5,0.392669
Children in Poverty,500.0,22.6254,10.899536,2.4,14.375,22.15,29.725,60.0,25.5,0.263736
Colorectal cancer deaths,492.0,16.099187,4.207326,4.1,13.5,15.7,18.325,34.3,16.7,0.142802
Dental care,500.0,63.196,7.546653,42.3,57.6,63.3,68.7,81.8,74.7,1.524384
Diabetes,500.0,9.9978,2.397196,4.2,8.275,9.8,11.5,21.6,8.9,-0.457952
Frequent mental distress,500.0,12.8332,2.061543,7.9,11.4,12.9,14.3,18.4,11.0,-0.889237


In [75]:
# transform to pivot table for correlation analysis
df = metrics_totalpop[['city_name', 'metric_name', 'est']]
pivot = pd.pivot_table(df, index='city_name', columns='metric_name', values='est')
pivot.shape


(474, 36)

In [76]:
# Drop all determinants except Absenteeism, Binge Drinking, and Segregation
# include the eleven health outcomes
pivot = pivot[['Absenteeism', 'Breast cancer deaths', 'Cardiovascular disease deaths', 
               'Colorectal cancer deaths', 'Diabetes', 'Frequent mental distress', 
               'High blood pressure', 'Life expectancy', 'Low birthweight', 
               'Obesity', 'Opioid overdose deaths', 'Premature deaths (all causes)', 
               'Binge drinking', 'Neighborhood racial/ethnic segregation']]

# print correlation coefficient matrix
corr = pivot.corr()
corr.style.background_gradient(cmap='coolwarm')

metric_name,Absenteeism,Breast cancer deaths,Cardiovascular disease deaths,Colorectal cancer deaths,Diabetes,Frequent mental distress,High blood pressure,Life expectancy,Low birthweight,Obesity,Opioid overdose deaths,Premature deaths (all causes),Binge drinking,Neighborhood racial/ethnic segregation
metric_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Absenteeism,1.0,0.0847477,0.141748,0.13306,0.295375,0.397355,0.303157,-0.432124,0.414155,0.388406,0.42859,0.402769,-0.054962,0.29856
Breast cancer deaths,0.0847477,1.0,0.601428,0.584159,0.211436,0.250258,0.227572,-0.337499,0.339383,0.209285,0.0812774,0.539204,-0.129569,0.189355
Cardiovascular disease deaths,0.141748,0.601428,1.0,0.719364,0.485293,0.542964,0.426949,-0.5673,0.433209,0.450437,0.159796,0.723895,-0.252892,0.268307
Colorectal cancer deaths,0.13306,0.584159,0.719364,1.0,0.430219,0.462394,0.367155,-0.518995,0.433328,0.3693,0.157482,0.669625,-0.194426,0.276904
Diabetes,0.295375,0.211436,0.485293,0.430219,1.0,0.643288,0.856901,-0.666802,0.627521,0.72431,0.318892,0.638199,-0.578695,0.526489
Frequent mental distress,0.397355,0.250258,0.542964,0.462394,0.643288,1.0,0.557447,-0.781074,0.566533,0.72043,0.458298,0.715763,-0.348813,0.349271
High blood pressure,0.303157,0.227572,0.426949,0.367155,0.856901,0.557447,1.0,-0.72463,0.738385,0.734432,0.340757,0.646212,-0.469111,0.510187
Life expectancy,-0.432124,-0.337499,-0.5673,-0.518995,-0.666802,-0.781074,-0.72463,1.0,-0.760371,-0.79897,-0.454029,-0.829765,0.321531,-0.488023
Low birthweight,0.414155,0.339383,0.433209,0.433328,0.627521,0.566533,0.738385,-0.760371,1.0,0.672716,0.388349,0.689986,-0.284499,0.654301
Obesity,0.388406,0.209285,0.450437,0.3693,0.72431,0.72043,0.734432,-0.79897,0.672716,1.0,0.306702,0.668678,-0.298689,0.472192


# 2. Analysis of tracts within DC

In [72]:
# read in csv file
tracts = pd.read_csv('CHDB_data_tract_DC v7_1.csv')

# eliminate extraneous columns, keep only tract, metric name, and metric value
tracts = tracts[['tract_code', 'metric_name', 'est']]

#create pivot table
tracts_pivot = pd.pivot_table(tracts, index='tract_code', columns='metric_name', values='est')
tracts_pivot.columns

Index(['Air pollution - particulate matter', 'Binge drinking',
       'Children in Poverty', 'Dental care', 'Diabetes',
       'Frequent mental distress', 'Frequent physical distress',
       'High blood pressure', 'Housing cost, excessive',
       'Housing with potential lead risk', 'Income Inequality',
       'Lead exposure risk index', 'Life expectancy',
       'Limited access to healthy foods', 'Obesity', 'Physical inactivity',
       'Preventive services', 'Racial/ethnic diversity', 'Smoking',
       'Unemployment', 'Uninsured'],
      dtype='object', name='metric_name')

In [64]:
# print correlation coefficient matrix
tracts_corr = tracts_pivot.corr()
tracts_corr.style.background_gradient(cmap='coolwarm')

metric_name,Air pollution - particulate matter,Binge drinking,Children in Poverty,Dental care,Diabetes,Frequent mental distress,Frequent physical distress,High blood pressure,"Housing cost, excessive",Housing with potential lead risk,Income Inequality,Lead exposure risk index,Life expectancy,Limited access to healthy foods,Obesity,Physical inactivity,Preventive services,Racial/ethnic diversity,Smoking,Unemployment,Uninsured
metric_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Air pollution - particulate matter,1.0,0.246272,-0.512427,0.501706,-0.317022,-0.52951,-0.440332,-0.243303,-0.424176,0.501888,0.439881,0.109938,0.495667,-0.139455,-0.45186,-0.445168,0.394858,0.584949,-0.540547,-0.490796,0.136192
Binge drinking,0.246272,1.0,-0.606637,0.755995,-0.959014,-0.6315,-0.893418,-0.961253,-0.498042,0.258245,0.733092,-0.245455,0.58314,-0.425699,-0.873136,-0.873834,0.760648,0.648172,-0.704789,-0.676719,-0.30054
Children in Poverty,-0.512427,-0.606637,1.0,-0.804725,0.642904,0.813851,0.765431,0.538996,0.639014,-0.419872,-0.78781,0.253649,-0.654432,0.165065,0.759428,0.772997,-0.762175,-0.537087,0.797995,0.623648,0.193374
Dental care,0.501706,0.755995,-0.804725,1.0,-0.815867,-0.967356,-0.951824,-0.719548,-0.691089,0.402078,0.86771,-0.307912,0.779757,-0.319188,-0.935561,-0.973196,0.955765,0.656464,-0.991787,-0.833618,-0.330671
Diabetes,-0.317022,-0.959014,0.642904,-0.815867,1.0,0.677856,0.941774,0.978845,0.527848,-0.265174,-0.782094,0.259459,-0.681995,0.387657,0.939888,0.922592,-0.822218,-0.699842,0.778692,0.745231,0.328576
Frequent mental distress,-0.52951,-0.6315,0.813851,-0.967356,0.677856,1.0,0.871351,0.559045,0.719817,-0.430172,-0.854316,0.304649,-0.757908,0.295177,0.840041,0.90218,-0.903389,-0.616628,0.970882,0.784103,0.260699
Frequent physical distress,-0.440332,-0.893418,0.765431,-0.951824,0.941774,0.871351,1.0,0.874167,0.63726,-0.347108,-0.857636,0.309663,-0.734649,0.380121,0.974373,0.993662,-0.907783,-0.733869,0.9328,0.830479,0.325006
High blood pressure,-0.243303,-0.961253,0.538996,-0.719548,0.978845,0.559045,0.874167,1.0,0.420747,-0.184774,-0.674121,0.242959,-0.617701,0.400325,0.886432,0.850068,-0.74363,-0.651195,0.679373,0.685539,0.320832
"Housing cost, excessive",-0.424176,-0.498042,0.639014,-0.691089,0.527848,0.719817,0.63726,0.420747,1.0,-0.564551,-0.814152,0.012242,-0.568325,0.224783,0.613797,0.65795,-0.663628,-0.414132,0.679919,0.530881,0.192103
Housing with potential lead risk,0.501888,0.258245,-0.419872,0.402078,-0.265174,-0.430172,-0.347108,-0.184774,-0.564551,1.0,0.482333,0.663964,0.330951,-0.160515,-0.304311,-0.364475,0.353949,0.291029,-0.393062,-0.314273,0.0113742
