Code used to clean the data. Uploaded mainly for the d3 color-mapping code about two-thirds of the way down, which makes it easier to dynamically change color schemes within a d3 visualization.

In [293]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [294]:
from IPython.display import display, HTML

# Cosmetic only: inject a <style> element into the rendered notebook page that
# overrides the widths of the #notebook-container, #menubar-container and
# #maintoolbar-container elements.
display(HTML("""
<style>
    div#notebook-container    { width: 95%; }
    div#menubar-container     { width: 65%; }
    div#maintoolbar-container { width: 99%; }
</style>
"""))

In [295]:
# Load the base table from county_health.csv; the following cells merge further
# sources onto df_main and add derived columns to it.
df_main = pd.read_csv('county_health.csv')

In [296]:
### Attach the county key table, which later merges rely on

# Joined on the county FIPS code. No `how` is given, so this is pandas' default
# inner join: rows without a matching FIPS code in county_key.csv are dropped.
df_county_key = pd.read_csv('county_key.csv')
df_main = df_main.merge(df_county_key, on='Locale County FIPS Code')

In [297]:
### Attach the religious survey data (religion_counties_2010.csv)

# NOTE(review): the join key is 'County or Equivalent' alone. If that column
# holds bare county names, same-named counties in different states would
# cross-match and duplicate rows -- confirm the key is unique per county.
df_religion = pd.read_csv('religion_counties_2010.csv')
df_main = df_main.merge(df_religion, on='County or Equivalent')

In [298]:
### Adds Guttmacher data

# Loaded into its own variable instead of overwriting df_county_key, so the
# county key table keeps its meaning and each name refers to exactly one source.
# Joined on the county FIPS code with pandas' default inner join.
df_guttmacher = pd.read_csv('gutt_clean.csv')
df_main = pd.merge(df_main, df_guttmacher, on='Locale County FIPS Code')

In [None]:
### Attach the 2008 presidential voting data

# Joined on the county FIPS code (default inner join, so counties missing from
# political_party.csv are dropped).
df_party_key = pd.read_csv('political_party.csv')
df_main = df_main.merge(df_party_key, on='Locale County FIPS Code')

In [299]:
# Columns removed from df_main by the drop step that follows.
drop_list = [
    'County or Equivalent',
    'State_y',
    'County_y',
    'Population',
    'PopRank',
    'Adherents',
    'AdhRank',
    'Adh% Rank',
    'Congregations',
    'ConRank',
    'Con Per 10K Rank',
]

In [None]:
# Remove the columns named in drop_list.
# NOTE: not re-runnable as written -- df_main is rebound to the reduced frame, so
# running this cell a second time raises KeyError for the already-dropped labels.
df_main = df_main.drop(labels=drop_list, axis='columns')

In [300]:
### Binary flag for elevated teen pregnancy: 1 when 'Teen Pregnancy' > 56, else 0.
### The cutoff is hard-coded; per the original note it corresponds to half a
### standard deviation from the mean (not recomputed in this notebook).
df_main['High Risk'] = (df_main['Teen Pregnancy'] > 56).astype('int64')

In [301]:
### Binary flag for elevated chlamydia rates: 1 when the rate per 100,000 is > 375, else 0.
### The cutoff is hard-coded; per the original note it corresponds to half a
### standard deviation from the mean (not recomputed in this notebook).
df_main['Elevated Chlamydia'] = (df_main['Chlamydia Rates per 100,000'] > 375).astype('int64')

In [302]:
### Binary flag for elevated child poverty: 1 when '% Children in Poverty' > 25, else 0.
### The cutoff is hard-coded; per the original note it corresponds to half a
### standard deviation from the mean (not recomputed in this notebook).
df_main['Elevated Poverty'] = (df_main['% Children in Poverty'] > 25).astype('int64')

In [303]:
### Binary flag for elevated religiosity: 1 when 'Adherents %' > 70, else 0.
### (The previous comment on this cell described high school graduation rates;
### the code has always flagged religious adherence.)
### NOTE(review): the cutoff 70 is hard-coded -- confirm how it was derived.
df_main['Elevated Religiosity'] = (df_main['Adherents %'] > 70).astype('int64')

In [304]:
### Binary flag for an elevated share of uninsured women in need of publicly
### funded contraceptive services: 1 when the percentage is > 21, else 0.
### (The previous comment on this cell described high school graduation rates;
### the code has always flagged the uninsured percentage.)
### NOTE(review): the cutoff 21 is hard-coded -- confirm how it was derived.
### The trailing comma in the column name is part of the actual column label.
df_main['Elevated Uninsured'] = (
    df_main['Total % uninsured women in need of publicly funded contraceptive services,'] > 21
).astype('int64')

In [364]:
# Composite score: the number of feature flags set for each row (0-4).
# 'High Risk' (the teen-pregnancy flag) is not included in the sum.
df_main['Feature Risk Factor'] = df_main[
    ['Elevated Chlamydia', 'Elevated Poverty', 'Elevated Religiosity', 'Elevated Uninsured']
].sum(axis=1)

In [365]:
# Preview the first 10 rows to check the merged columns and the new flag columns.
df_main.head(10)

Unnamed: 0,Locale County FIPS Code,State_x,County_x,Teen Pregnancy,% Smokers,% Obese,% Binge Drinking,"Chlamydia Rates per 100,000",Primary Care Physician Rate,No of Medicare enrollees,...,Women aged 20-44 and >250% of poverty level in need of contraceptive services and supplies,Women aged 20-44 and between 100-137% federal poverty level in need of contraceptive services and supplies,Women aged 20-44 and between 138-199% federal poverty level in need of contraceptive services and supplies,Women aged 20-44 and between 200-249% federal poverty level in need of contraceptive services and supplies,High Risk,Elevated Chlamydia,Elevated Poverty,Elevated Religiosity,Elevated Uninsured,Feature Risk Factor
0,1001,Alabama,Autauga,53.0,28.0,30,14.0,370.0,74,9407.0,...,3420,0,550,0,0,0,0,0,0,0
1,1003,Alabama,Baldwin,49.0,23.0,25,18.0,246.0,103,43980.0,...,7300,0,2060,0,0,0,0,0,0,0
2,1005,Alabama,Barbour,75.0,23.0,36,10.0,586.0,50,6546.0,...,700,160,380,130,1,1,1,0,0,2
3,1007,Alabama,Bibb,68.0,,32,10.0,284.0,61,4033.0,...,960,250,310,160,1,0,0,0,0,0
4,1009,Alabama,Blount,57.0,23.0,32,6.0,85.0,35,8110.0,...,0,490,550,440,1,0,0,0,0,0
5,1011,Alabama,Bullock,85.0,,37,5.0,724.0,83,1837.0,...,230,0,140,50,1,1,1,0,1,3
6,1013,Alabama,Butler,66.0,28.0,36,9.0,634.0,54,6433.0,...,0,0,0,110,1,1,1,0,0,2
7,1015,Alabama,Calhoun,60.0,27.0,33,12.0,535.0,110,31000.0,...,0,0,0,0,1,1,0,1,0,2
8,1017,Alabama,Chambers,71.0,17.0,36,10.0,634.0,71,11440.0,...,0,250,0,290,1,1,0,0,1,2
9,1019,Alabama,Cherokee,72.0,27.0,31,15.0,153.0,44,6404.0,...,0,190,0,160,1,0,1,0,1,2


In [366]:
# Persist the merged and flagged table. to_csv is called with its defaults, so the
# DataFrame index is written out as the first, unnamed column.
df_main.to_csv('mcnulty.csv')

### Code to map RGB codes by nonile for d3 visualization

Rather than tying the visualization directly to the data values, this creates a .csv file that maps a color to each value. This makes it easier to change color gradients (e.g. a red scheme or a purple scheme) in the d3 code: instead of creating a new chart each time we want to change color schemes, we feed different RGB codes into an existing chart, changing only the colors, not the data or the chart.

In [358]:
# Nine-step colour palettes, one per mapped variable. Each list is indexed by a
# 0-based bucket number when the *_color columns are built.
teen_preg_list = [
    "rgb(247,251,255)", "rgb(222,235,247)", "rgb(198,219,239)",
    "rgb(158,202,225)", "rgb(107,174,214)", "rgb(66,146,198)",
    "rgb(33,113,181)", "rgb(8,81,156)", "rgb(8,48,107)",
]
chlamydia_list = [
    "rgb(200,229,109)", "rgb(189,224,80)", "rgb(179,219,51)",
    "rgb(161,200,36)", "rgb(138,171,31)", "rgb(114,142,25)",
    "rgb(91,114,20)", "rgb(68,85,15)", "rgb(45,56,10)",
]
uninsured_list = [
    "rgb(237,187,153)", "rgb(232,168,124)", "rgb(227,149,95)",
    "rgb(222,130,66)", "rgb(216,111,38)", "rgb(187,96,33)",
    "rgb(158,81,28)", "rgb(129,66,23)", "rgb(100,51,18)",
]
poverty_list = [
    "rgb(200,252,255)", "rgb(132,247,255)", "rgb(13,240,255)",
    "rgb(0,219,234)", "rgb(0,188,200)", "rgb(0,156,166)",
    "rgb(0,124,132)", "rgb(0,92,98)", "rgb(0,60,64)",
]
adherents_list = [
    "rgb(190,109,229)", "rgb(177,80,224)", "rgb(164,51,219)",
    "rgb(147,36,200)", "rgb(125,31,171)", "rgb(104,25,142)",
    "rgb(83,20,114)", "rgb(62,15,85)", "rgb(41,10,56)",
]
# The first eight buckets share one neutral grey; only the top bucket is coloured.
max_risk_list = ["rgb(220,220,220)"] * 8 + ["rgb(120,32,21)"]

In [367]:
# Load the table that the colour columns are attached to.
# NOTE(review): later cells read *_rate and *_index columns from this frame;
# which of those already exist in map.csv is not visible here -- confirm.
df_colors = pd.read_csv('map.csv')

In [368]:
# Fill missing teen-pregnancy rates with the column median so every row can be
# ranked and coloured. Only 'preg_rate' is filled; other rate columns are left as-is.
preg_rate_median = df_colors['preg_rate'].median()
df_colors['preg_rate'] = df_colors['preg_rate'].fillna(preg_rate_median)

In [359]:
# Translate each 0-based bucket number in 'preg_rate_index' into its RGB string.
# NOTE(review): in file order this runs before the cells further down that
# (re)compute the *_index columns, so on a fresh Run All it uses whatever
# 'preg_rate_index' df_colors already holds -- confirm the intended cell order.
# NOTE: a bucket of -1 would not fail; Python list indexing wraps it to the last
# (darkest) colour.
df_colors['preg_rate_color'] = [teen_preg_list[bucket] for bucket in df_colors['preg_rate_index']]

In [361]:
# (bucket column, output colour column, palette) for each remaining variable.
# NOTE(review): the last entry reads 'risk_rate_index', while the index list
# defined later in this notebook names the column 'feature_risk_rate_index' --
# confirm that 'risk_rate_index' exists in df_colors and is the intended source.
# NOTE(review): as with 'preg_rate_color', these lookups run (in file order)
# before the *_index columns are recomputed further down.
for index_col, color_col, palette in [
    ('chlamydia_rate_index', 'chlamydia_rate_color', chlamydia_list),
    ('women_uninsured_rate_index', 'women_uninsured_rate_color', uninsured_list),
    ('child_poverty_rate_index', 'child_poverty_rate_color', poverty_list),
    ('adherents_rate_index', 'adherents_rate_color', adherents_list),
    ('risk_rate_index', 'feature_rate_color', max_risk_list),
]:
    df_colors[color_col] = [palette[bucket] for bucket in df_colors[index_col]]

In [369]:
# Rate columns that get bucketed, in processing order.
col_list = [
    'preg_rate',
    'chlamydia_rate',
    'child_poverty_rate',
    'adherents_rate',
    'women_uninsured_rate',
    'feature_risk_rate',
]

# Matching bucket-index column names: each rate column name plus '_index'.
ind_list = [rate_col + '_index' for rate_col in col_list]

In [370]:
def pct_rank_qcut(series, n):
    """Bucket values into ``n`` equal-width bands of their percentile rank.

    Each value is converted to a percentile rank with ``series.rank(pct=True)``
    and then mapped to the position of the first edge in
    ``[0, 1/n, 2/n, ..., 1]`` that is greater than or equal to that rank.

    Parameters
    ----------
    series : pandas.Series
        Values to rank and bucket.
    n : int
        Number of buckets. ``n == 0`` raises ``ZeroDivisionError``.

    Returns
    -------
    pandas.Series
        Integer bucket numbers with the same index and name as ``series``.
        Ranked values get a 1-based bucket in ``1..n``. Missing values have no
        rank and get bucket ``0``.

    Notes
    -----
    Callers that shift the result to 0-based by subtracting 1 turn the missing
    value bucket into ``-1``, which Python list indexing silently resolves to
    the last element. Fill or filter missing values first if that matters.
    """
    edges = [float(i) / n for i in range(n + 1)]
    pct = series.rank(pct=True).values
    # One vectorized lookup instead of building and comparing a pandas Series per
    # element. side='left' returns the first position whose edge is >= the rank.
    buckets = np.searchsorted(edges, pct, side='left')
    # searchsorted places NaN after every edge; keep the established "0" bucket
    # for missing values instead.
    buckets[np.isnan(pct)] = 0
    return pd.Series(buckets, index=series.index, name=series.name)

In [371]:
# Bucket every rate column into 9 percentile-rank bands (one per palette colour)
# and store the result in '<rate column>_index'. Values here are 1-based (1..9).
for rate_col in col_list:
    df_colors[str(rate_col) + '_index'] = pct_rank_qcut(df_colors[rate_col], 9)

In [348]:
# Shift every bucket column down by one so it can index the 0-based palettes.
# NOTE: not idempotent -- each run of this cell subtracts another 1, so run it
# exactly once after the *_index columns are computed.
for index_col in ind_list:
    df_colors[index_col] = df_colors[index_col] - 1

In [None]:
# Write the colour-annotated table to map_colors.csv. to_csv is called with its
# defaults, so the DataFrame index is written out as the first, unnamed column.
df_colors.to_csv('map_colors.csv')