In [1]:
# Dependencies and Setup
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import json
from scipy.stats import linregress
from scipy import stats
import pingouin as pg # Install pingouin stats package (pip install pingouin)
import seaborn as sns # Install seaborn data visualization library (pip install seaborn)
from scipy.stats import pearsonr

# Calendar years 2005 through 2015 (inclusive)
yr_list = list(range(2005, 2016))

# Silence every warning for the rest of the notebook session
import warnings
warnings.filterwarnings('ignore')

# Location of the CDI csv file, relative to the notebook
CDI_data_to_load = "CDI_data.csv"

# Load the csv into a DataFrame
CDI_data_pd = pd.read_csv(CDI_data_to_load)

# Last expression of the cell: rendered as the preview table
CDI_data_pd

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,Response,DataValueUnit,DataValueTypeID,...,TopicID,QuestionID,ResponseID,LocationID,StratificationCategoryID1,StratificationID1,StratificationCategoryID2,StratificationID2,StratificationCategoryID3,StratificationID3
0,2013,2013,CA,California,YRBSS,Alcohol,Alcohol use among youth,,%,CrdPrev,...,ALC,ALC1_1,,6,OVERALL,OVR,,,,
1,2013,2013,CO,Colorado,YRBSS,Alcohol,Alcohol use among youth,,%,CrdPrev,...,ALC,ALC1_1,,8,OVERALL,OVR,,,,
2,2013,2013,CT,Connecticut,YRBSS,Alcohol,Alcohol use among youth,,%,CrdPrev,...,ALC,ALC1_1,,9,OVERALL,OVR,,,,
3,2013,2013,DC,District of Columbia,YRBSS,Alcohol,Alcohol use among youth,,%,CrdPrev,...,ALC,ALC1_1,,11,OVERALL,OVR,,,,
4,2013,2013,DE,Delaware,YRBSS,Alcohol,Alcohol use among youth,,%,CrdPrev,...,ALC,ALC1_1,,10,OVERALL,OVR,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237956,2012,2012,WI,Wisconsin,BRFSS,Older Adults,Proportion of older adults aged 50-64 years wh...,,%,AgeAdjPrev,...,OLD,OLD3_2,,55,RACE,MRC,,,,
237957,2012,2012,WY,Wyoming,BRFSS,Older Adults,Proportion of older adults aged 50-64 years wh...,,%,AgeAdjPrev,...,OLD,OLD3_2,,56,RACE,MRC,,,,
237958,2012,2012,GU,Guam,BRFSS,Older Adults,Proportion of older adults aged 50-64 years wh...,,%,AgeAdjPrev,...,OLD,OLD3_2,,66,RACE,MRC,,,,
237959,2012,2012,PR,Puerto Rico,BRFSS,Older Adults,Proportion of older adults aged 50-64 years wh...,,%,AgeAdjPrev,...,OLD,OLD3_2,,72,RACE,MRC,,,,


In [23]:
# Extract the cancer rows from the CDI data.
#
# Only the last expression of a cell is rendered, so the bare mid-cell
# expressions that used to sit here (including `cancer_df[[]]`, which built
# and discarded an empty-column frame) had no effect and have been removed.

# Group the full CDI table by its 'Topic' column. The GroupBy object keeps
# its original name so any other cell that refers to it still works.
topic_sorted_df = CDI_data_pd.groupby('Topic')

# Pull out the 'Cancer' group and order its rows by state/territory name.
# get_group raises KeyError if no row has Topic == 'Cancer'.
cancer_df = topic_sorted_df.get_group('Cancer').sort_values('LocationDesc')

# Narrow to the columns of interest; .copy() makes new_cancer_df an
# independent frame rather than a slice of cancer_df.
# NOTE(review): the rendered DataValue column shows blank entries — confirm
# its dtype before doing numeric work on it.
new_cancer_df = cancer_df[['LocationAbbr', 'LocationDesc', 'Topic',
                           'Question', 'DataValueType', 'DataValue']].copy()
new_cancer_df

Unnamed: 0,LocationAbbr,LocationDesc,Topic,Question,DataValueType,DataValue
49892,AL,Alabama,Cancer,Recent Papanicolaou smear use among women aged...,Crude Prevalence,
50608,AL,Alabama,Cancer,"Fecal occult blood test, sigmoidoscopy, or col...",Crude Prevalence,61.8
50607,AL,Alabama,Cancer,"Fecal occult blood test, sigmoidoscopy, or col...",Age-adjusted Prevalence,62.6
50606,AL,Alabama,Cancer,"Fecal occult blood test, sigmoidoscopy, or col...",Crude Prevalence,64
48310,AL,Alabama,Cancer,"Cancer of the prostate, mortality",Average Annual Age-adjusted Rate,26.4
...,...,...,...,...,...,...
48305,WY,Wyoming,Cancer,"Invasive cancer of the prostate, incidence",Average Annual Crude Rate,138.4
48304,WY,Wyoming,Cancer,"Invasive cancer of the prostate, incidence",Average Annual Age-adjusted Rate,127.1
48150,WY,Wyoming,Cancer,"Cancer of the oral cavity and pharynx, mortality",Average Annual Number,17
49865,WY,Wyoming,Cancer,Papanicolaou smear use among adult women aged ...,Crude Prevalence,


In [24]:
# Melanoma incidence: keep rows whose Question is the invasive-melanoma
# incidence indicator AND whose DataValueType is 'Average Annual Number'.
is_melanoma_incidence = new_cancer_df['Question'] == 'Invasive melanoma, incidence'
is_incidence_annual_number = new_cancer_df['DataValueType'] == 'Average Annual Number'

incidence_df = new_cancer_df.loc[is_melanoma_incidence & is_incidence_annual_number]

# Preview the first five matching rows
incidence_df.head()

Unnamed: 0,LocationAbbr,LocationDesc,Topic,Question,DataValueType,DataValue
53122,AL,Alabama,Cancer,"Invasive melanoma, incidence",Average Annual Number,1128
53119,AK,Alaska,Cancer,"Invasive melanoma, incidence",Average Annual Number,81
53128,AZ,Arizona,Cancer,"Invasive melanoma, incidence",Average Annual Number,1135
53125,AR,Arkansas,Cancer,"Invasive melanoma, incidence",Average Annual Number,534
53131,CA,California,Cancer,"Invasive melanoma, incidence",Average Annual Number,7740


In [26]:
# Melanoma mortality: keep rows whose Question is the melanoma mortality
# indicator AND whose DataValueType is 'Average Annual Number'.
is_melanoma_mortality = new_cancer_df['Question'] == 'Melanoma, mortality'
is_mortality_annual_number = new_cancer_df['DataValueType'] == 'Average Annual Number'

mortality_df = new_cancer_df.loc[is_melanoma_mortality & is_mortality_annual_number]

# Preview the first five matching rows
mortality_df.head()

Unnamed: 0,LocationAbbr,LocationDesc,Topic,Question,DataValueType,DataValue
53280,AL,Alabama,Cancer,"Melanoma, mortality",Average Annual Number,151
53277,AK,Alaska,Cancer,"Melanoma, mortality",Average Annual Number,12
53286,AZ,Arizona,Cancer,"Melanoma, mortality",Average Annual Number,203
53283,AR,Arkansas,Cancer,"Melanoma, mortality",Average Annual Number,93
53289,CA,California,Cancer,"Melanoma, mortality",Average Annual Number,943


In [39]:

# Second input file: the UV csv, relative to the notebook
UV_data_to_load = "UV_data.csv"

# Read the UV file and collapse it to one row per STATENAME holding the mean
# of the "UV_ Wh/m²" column. as_index=False keeps STATENAME as an ordinary
# column instead of moving it into the index.
UV_data_df = (
    pd.read_csv(UV_data_to_load)
    .groupby("STATENAME", as_index=False)["UV_ Wh/m²"]
    .mean()
)

# Last expression of the cell: rendered as the per-state table
UV_data_df

Unnamed: 0,STATENAME,UV_ Wh/m²
0,Alabama,4505.164179
1,Arizona,5528.466667
2,Arkansas,4515.346667
3,California,4871.413793
4,Colorado,4802.730159
5,Connecticut,3832.5
6,Delaware,4074.0
7,District of Columbia,4100.0
8,Florida,4743.671642
9,Georgia,4563.974843


In [None]:
# CLEANING WITH PANDAS
# MONGODB
# FLASK APP
# VISUALIZATIONS (JS)
# WEB DEPLOYMENT