# ETL & Visualization Project

## UV Exposure & Melanoma Rates Correlation in the United States

### Extract: UV Exposure and Melanoma Data (csv)

In [3]:
# Dependencies and Setup
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import json
# from scipy.stats import linregress
# from scipy import stats
# import pingouin as pg # Install pingouin stats package (pip install pingouin)
# import seaborn as sns # Install seaborn data visualization library (pip install seaborn)
# from scipy.stats import pearsonr

# Calendar years 2005 through 2015, inclusive.
# NOTE(review): presumably the years the analysis covers; not referenced by the cells shown here.
yr_list = list(range(2005, 2016))

# Silence all Python warnings for the rest of the session so they don't clutter cell output.
# NOTE(review): this is a blanket filter, so it also hides any warning pandas emits while parsing the CSV below.
import warnings
warnings.filterwarnings('ignore')

# Path of the CDI CSV, relative to the notebook's working directory
CDI_data_to_load = "CDI_data.csv"

# Load the whole CDI table into a DataFrame
CDI_data_pd = pd.read_csv(CDI_data_to_load)

# Preview the table (last expression in the cell, so Jupyter renders it)
CDI_data_pd

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,Response,DataValueUnit,DataValueTypeID,...,TopicID,QuestionID,ResponseID,LocationID,StratificationCategoryID1,StratificationID1,StratificationCategoryID2,StratificationID2,StratificationCategoryID3,StratificationID3
0,2013,2013,CA,California,YRBSS,Alcohol,Alcohol use among youth,,%,CrdPrev,...,ALC,ALC1_1,,6,OVERALL,OVR,,,,
1,2013,2013,CO,Colorado,YRBSS,Alcohol,Alcohol use among youth,,%,CrdPrev,...,ALC,ALC1_1,,8,OVERALL,OVR,,,,
2,2013,2013,CT,Connecticut,YRBSS,Alcohol,Alcohol use among youth,,%,CrdPrev,...,ALC,ALC1_1,,9,OVERALL,OVR,,,,
3,2013,2013,DC,District of Columbia,YRBSS,Alcohol,Alcohol use among youth,,%,CrdPrev,...,ALC,ALC1_1,,11,OVERALL,OVR,,,,
4,2013,2013,DE,Delaware,YRBSS,Alcohol,Alcohol use among youth,,%,CrdPrev,...,ALC,ALC1_1,,10,OVERALL,OVR,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237956,2012,2012,WI,Wisconsin,BRFSS,Older Adults,Proportion of older adults aged 50-64 years wh...,,%,AgeAdjPrev,...,OLD,OLD3_2,,55,RACE,MRC,,,,
237957,2012,2012,WY,Wyoming,BRFSS,Older Adults,Proportion of older adults aged 50-64 years wh...,,%,AgeAdjPrev,...,OLD,OLD3_2,,56,RACE,MRC,,,,
237958,2012,2012,GU,Guam,BRFSS,Older Adults,Proportion of older adults aged 50-64 years wh...,,%,AgeAdjPrev,...,OLD,OLD3_2,,66,RACE,MRC,,,,
237959,2012,2012,PR,Puerto Rico,BRFSS,Older Adults,Proportion of older adults aged 50-64 years wh...,,%,AgeAdjPrev,...,OLD,OLD3_2,,72,RACE,MRC,,,,


In [4]:
# Extracting cancer data
# Group the CDI rows by Topic, take the 'Cancer' group and order it by
# location name. get_group raises KeyError if no row has Topic == 'Cancer'.
topic_sorted_df = CDI_data_pd.groupby('Topic')
cancer_df = topic_sorted_df.get_group('Cancer').sort_values('LocationDesc')

# Keep only the columns the melanoma filters below rely on. The .copy() makes
# new_cancer_df an independent frame rather than a slice of cancer_df.
new_cancer_df = cancer_df[['LocationAbbr', 'LocationDesc', 'Topic',
                           'Question', 'DataValueType', 'DataValue']].copy()
new_cancer_df

Unnamed: 0,LocationAbbr,LocationDesc,Topic,Question,DataValueType,DataValue
49892,AL,Alabama,Cancer,Recent Papanicolaou smear use among women aged...,Crude Prevalence,
50608,AL,Alabama,Cancer,"Fecal occult blood test, sigmoidoscopy, or col...",Crude Prevalence,61.8
50607,AL,Alabama,Cancer,"Fecal occult blood test, sigmoidoscopy, or col...",Age-adjusted Prevalence,62.6
50606,AL,Alabama,Cancer,"Fecal occult blood test, sigmoidoscopy, or col...",Crude Prevalence,64
48310,AL,Alabama,Cancer,"Cancer of the prostate, mortality",Average Annual Age-adjusted Rate,26.4
...,...,...,...,...,...,...
48305,WY,Wyoming,Cancer,"Invasive cancer of the prostate, incidence",Average Annual Crude Rate,138.4
48304,WY,Wyoming,Cancer,"Invasive cancer of the prostate, incidence",Average Annual Age-adjusted Rate,127.1
48150,WY,Wyoming,Cancer,"Cancer of the oral cavity and pharynx, mortality",Average Annual Number,17
49865,WY,Wyoming,Cancer,Papanicolaou smear use among adult women aged ...,Crude Prevalence,


In [5]:
# Melanoma incidence counts: rows of new_cancer_df whose Question is
# 'Invasive melanoma, incidence' AND whose DataValueType is
# 'Average Annual Number', selected with a single combined boolean mask.
# NOTE(review): DataValue is left untouched here; the record dumps further down
# show it as a string (e.g. '1128'), so convert it before any numeric use.
incidence_question = new_cancer_df['Question'] == 'Invasive melanoma, incidence'
incidence_annual_count = new_cancer_df['DataValueType'] == 'Average Annual Number'
incidence_df = new_cancer_df.loc[incidence_question & incidence_annual_count]
incidence_df.head()

Unnamed: 0,LocationAbbr,LocationDesc,Topic,Question,DataValueType,DataValue
53122,AL,Alabama,Cancer,"Invasive melanoma, incidence",Average Annual Number,1128
53119,AK,Alaska,Cancer,"Invasive melanoma, incidence",Average Annual Number,81
53128,AZ,Arizona,Cancer,"Invasive melanoma, incidence",Average Annual Number,1135
53125,AR,Arkansas,Cancer,"Invasive melanoma, incidence",Average Annual Number,534
53131,CA,California,Cancer,"Invasive melanoma, incidence",Average Annual Number,7740


In [6]:
# Melanoma mortality counts: rows of new_cancer_df whose Question is
# 'Melanoma, mortality' AND whose DataValueType is 'Average Annual Number',
# selected with a single combined boolean mask.
# NOTE(review): DataValue is left untouched here; the record dumps further down
# show it as a string (e.g. '151'), so convert it before any numeric use.
mortality_question = new_cancer_df['Question'] == 'Melanoma, mortality'
mortality_annual_count = new_cancer_df['DataValueType'] == 'Average Annual Number'
mortality_df = new_cancer_df.loc[mortality_question & mortality_annual_count]
mortality_df.head()

Unnamed: 0,LocationAbbr,LocationDesc,Topic,Question,DataValueType,DataValue
53280,AL,Alabama,Cancer,"Melanoma, mortality",Average Annual Number,151
53277,AK,Alaska,Cancer,"Melanoma, mortality",Average Annual Number,12
53286,AZ,Arizona,Cancer,"Melanoma, mortality",Average Annual Number,203
53283,AR,Arkansas,Cancer,"Melanoma, mortality",Average Annual Number,93
53289,CA,California,Cancer,"Melanoma, mortality",Average Annual Number,943


In [10]:

# Second input file: the UV exposure CSV, relative to the working directory
UV_data_to_load = "UV_data.csv"

# Read the UV table and collapse it to one row per STATENAME holding the mean
# of the "UV_ Wh/m²" column. as_index=False keeps STATENAME as an ordinary
# column instead of moving it into the index.
UV_data_df = (
    pd.read_csv(UV_data_to_load)
    .groupby("STATENAME", as_index=False)["UV_ Wh/m²"]
    .mean()
)

# Preview the last few aggregated rows
UV_data_df.tail()

Unnamed: 0,STATENAME,UV_ Wh/m²
44,Virginia,4181.985075
45,Washington,3594.102564
46,West Virginia,3892.363636
47,Wisconsin,3810.708333
48,Wyoming,4350.521739


### Load: Database (MongoDB)

In [9]:
# Dependencies
import pymongo
import pandas as pd

In [11]:
# Create a PyMongo client for the MongoDB server at localhost:27017
# (the URI carries no credentials). The cells below reach the database through `client`.
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

#### Upload Clean Data to Database

#### 1. Melanoma Incidence Data

In [12]:
# Target for the melanoma incidence upload: database `uv_melanoma_db`,
# collection `melanoma_incidence`.
# `collection` is rebound for each dataset further down, so it only points
# here until the mortality / UV setup cells run.
db = client.uv_melanoma_db
collection = db.melanoma_incidence

In [13]:
# Turn the melanoma incidence frame into a list with one dict per row
# (column name -> value), the shape PyMongo inserts as documents.
# Despite its name, incidence_dict is a list of dicts.
incidence_dict = incidence_df.to_dict(orient="records")
incidence_dict

[{'LocationAbbr': 'AL',
  'LocationDesc': 'Alabama',
  'Topic': 'Cancer',
  'Question': 'Invasive melanoma, incidence',
  'DataValueType': 'Average Annual Number',
  'DataValue': '1128'},
 {'LocationAbbr': 'AK',
  'LocationDesc': 'Alaska',
  'Topic': 'Cancer',
  'Question': 'Invasive melanoma, incidence',
  'DataValueType': 'Average Annual Number',
  'DataValue': '81'},
 {'LocationAbbr': 'AZ',
  'LocationDesc': 'Arizona',
  'Topic': 'Cancer',
  'Question': 'Invasive melanoma, incidence',
  'DataValueType': 'Average Annual Number',
  'DataValue': '1135'},
 {'LocationAbbr': 'AR',
  'LocationDesc': 'Arkansas',
  'Topic': 'Cancer',
  'Question': 'Invasive melanoma, incidence',
  'DataValueType': 'Average Annual Number',
  'DataValue': '534'},
 {'LocationAbbr': 'CA',
  'LocationDesc': 'California',
  'Topic': 'Cancer',
  'Question': 'Invasive melanoma, incidence',
  'DataValueType': 'Average Annual Number',
  'DataValue': '7740'},
 {'LocationAbbr': 'CO',
  'LocationDesc': 'Colorado',
  'Top

In [14]:
# Upload melanoma incidence data to MongoDB.
# The target collection is named explicitly (as the read-back cell below does)
# instead of going through the shared `collection` variable, which is rebound
# for every dataset and would misroute records if cells run out of order.
# A single bulk insert_many replaces the per-record insert_one loop;
# insert_many rejects an empty document list, so an empty result is skipped.
# NOTE(review): not idempotent; re-running the conversion and upload cells
# appends a second copy of every record.
if incidence_dict:
    db.melanoma_incidence.insert_many(incidence_dict)

In [15]:
# Read every document back from the melanoma_incidence collection and print
# each one, to confirm what is now stored in MongoDB.
for stored_incidence in db.melanoma_incidence.find():
    print(stored_incidence)

{'_id': ObjectId('5e810f68f0d19231df806547'), 'LocationAbbr': 'AL', 'LocationDesc': 'Alabama', 'Topic': 'Cancer', 'Question': 'Invasive melanoma, incidence', 'DataValueType': 'Average Annual Number', 'DataValue': '1128'}
{'_id': ObjectId('5e810f68f0d19231df806548'), 'LocationAbbr': 'AK', 'LocationDesc': 'Alaska', 'Topic': 'Cancer', 'Question': 'Invasive melanoma, incidence', 'DataValueType': 'Average Annual Number', 'DataValue': '81'}
{'_id': ObjectId('5e810f68f0d19231df806549'), 'LocationAbbr': 'AZ', 'LocationDesc': 'Arizona', 'Topic': 'Cancer', 'Question': 'Invasive melanoma, incidence', 'DataValueType': 'Average Annual Number', 'DataValue': '1135'}
{'_id': ObjectId('5e810f68f0d19231df80654a'), 'LocationAbbr': 'AR', 'LocationDesc': 'Arkansas', 'Topic': 'Cancer', 'Question': 'Invasive melanoma, incidence', 'DataValueType': 'Average Annual Number', 'DataValue': '534'}
{'_id': ObjectId('5e810f68f0d19231df80654b'), 'LocationAbbr': 'CA', 'LocationDesc': 'California', 'Topic': 'Cancer', 'Q

#### 2. Melanoma Mortality Data

In [16]:
# Target for the melanoma mortality upload: database `uv_melanoma_db`,
# collection `melanoma_mortality`.
# This rebinds `collection`, which pointed at the incidence collection before this cell.
db = client.uv_melanoma_db
collection = db.melanoma_mortality

In [17]:
# Turn the melanoma mortality frame into a list with one dict per row
# (column name -> value), the shape PyMongo inserts as documents.
# Despite its name, mortality_dict is a list of dicts.
mortality_dict = mortality_df.to_dict(orient="records")
mortality_dict

[{'LocationAbbr': 'AL',
  'LocationDesc': 'Alabama',
  'Topic': 'Cancer',
  'Question': 'Melanoma, mortality',
  'DataValueType': 'Average Annual Number',
  'DataValue': '151'},
 {'LocationAbbr': 'AK',
  'LocationDesc': 'Alaska',
  'Topic': 'Cancer',
  'Question': 'Melanoma, mortality',
  'DataValueType': 'Average Annual Number',
  'DataValue': '12'},
 {'LocationAbbr': 'AZ',
  'LocationDesc': 'Arizona',
  'Topic': 'Cancer',
  'Question': 'Melanoma, mortality',
  'DataValueType': 'Average Annual Number',
  'DataValue': '203'},
 {'LocationAbbr': 'AR',
  'LocationDesc': 'Arkansas',
  'Topic': 'Cancer',
  'Question': 'Melanoma, mortality',
  'DataValueType': 'Average Annual Number',
  'DataValue': '93'},
 {'LocationAbbr': 'CA',
  'LocationDesc': 'California',
  'Topic': 'Cancer',
  'Question': 'Melanoma, mortality',
  'DataValueType': 'Average Annual Number',
  'DataValue': '943'},
 {'LocationAbbr': 'CO',
  'LocationDesc': 'Colorado',
  'Topic': 'Cancer',
  'Question': 'Melanoma, mortality

In [18]:
# Upload melanoma mortality data to MongoDB.
# The target collection is named explicitly (as the read-back cell below does)
# instead of going through the shared `collection` variable, which is rebound
# for every dataset and would misroute records if cells run out of order.
# A single bulk insert_many replaces the per-record insert_one loop;
# insert_many rejects an empty document list, so an empty result is skipped.
# NOTE(review): not idempotent; re-running the conversion and upload cells
# appends a second copy of every record.
if mortality_dict:
    db.melanoma_mortality.insert_many(mortality_dict)

In [19]:
# Read every document back from the melanoma_mortality collection and print
# each one, to confirm what is now stored in MongoDB.
for stored_mortality in db.melanoma_mortality.find():
    print(stored_mortality)

{'_id': ObjectId('5e81103df0d19231df80657b'), 'LocationAbbr': 'AL', 'LocationDesc': 'Alabama', 'Topic': 'Cancer', 'Question': 'Melanoma, mortality', 'DataValueType': 'Average Annual Number', 'DataValue': '151'}
{'_id': ObjectId('5e81103df0d19231df80657c'), 'LocationAbbr': 'AK', 'LocationDesc': 'Alaska', 'Topic': 'Cancer', 'Question': 'Melanoma, mortality', 'DataValueType': 'Average Annual Number', 'DataValue': '12'}
{'_id': ObjectId('5e81103df0d19231df80657d'), 'LocationAbbr': 'AZ', 'LocationDesc': 'Arizona', 'Topic': 'Cancer', 'Question': 'Melanoma, mortality', 'DataValueType': 'Average Annual Number', 'DataValue': '203'}
{'_id': ObjectId('5e81103df0d19231df80657e'), 'LocationAbbr': 'AR', 'LocationDesc': 'Arkansas', 'Topic': 'Cancer', 'Question': 'Melanoma, mortality', 'DataValueType': 'Average Annual Number', 'DataValue': '93'}
{'_id': ObjectId('5e81103df0d19231df80657f'), 'LocationAbbr': 'CA', 'LocationDesc': 'California', 'Topic': 'Cancer', 'Question': 'Melanoma, mortality', 'DataV

#### 3. UV Exposure Data

In [20]:
# Target for the UV exposure upload: database `uv_melanoma_db`, collection `uv`.
# This rebinds `collection`, which pointed at the mortality collection before this cell.
db = client.uv_melanoma_db
collection = db.uv

In [22]:
# Turn the per-state UV averages into a list with one dict per row
# ('STATENAME' and 'UV_ Wh/m²' keys), the shape PyMongo inserts as documents.
# Despite its name, UV_dict is a list of dicts.
UV_dict = UV_data_df.to_dict(orient="records")
UV_dict

[{'STATENAME': 'Alabama', 'UV_ Wh/m²': 4505.164179104478},
 {'STATENAME': 'Arizona', 'UV_ Wh/m²': 5528.466666666666},
 {'STATENAME': 'Arkansas', 'UV_ Wh/m²': 4515.346666666666},
 {'STATENAME': 'California', 'UV_ Wh/m²': 4871.413793103448},
 {'STATENAME': 'Colorado', 'UV_ Wh/m²': 4802.730158730159},
 {'STATENAME': 'Connecticut', 'UV_ Wh/m²': 3832.5},
 {'STATENAME': 'Delaware', 'UV_ Wh/m²': 4074.0},
 {'STATENAME': 'District of Columbia', 'UV_ Wh/m²': 4100.0},
 {'STATENAME': 'Florida', 'UV_ Wh/m²': 4743.671641791045},
 {'STATENAME': 'Georgia', 'UV_ Wh/m²': 4563.974842767296},
 {'STATENAME': 'Idaho', 'UV_ Wh/m²': 4170.545454545455},
 {'STATENAME': 'Illinois', 'UV_ Wh/m²': 4117.4607843137255},
 {'STATENAME': 'Indiana', 'UV_ Wh/m²': 4019.3804347826085},
 {'STATENAME': 'Iowa', 'UV_ Wh/m²': 4053.5454545454545},
 {'STATENAME': 'Kansas', 'UV_ Wh/m²': 4572.047619047619},
 {'STATENAME': 'Kentucky', 'UV_ Wh/m²': 4113.825},
 {'STATENAME': 'Louisiana', 'UV_ Wh/m²': 4557.875},
 {'STATENAME': 'Maine', 

In [23]:
# Upload UV exposure data to MongoDB.
# The target collection is named explicitly (as the read-back cell below does)
# instead of going through the shared `collection` variable, which is rebound
# for every dataset and would misroute records if cells run out of order.
# A single bulk insert_many replaces the per-record insert_one loop;
# insert_many rejects an empty document list, so an empty result is skipped.
# NOTE(review): not idempotent; re-running the conversion and upload cells
# appends a second copy of every record.
if UV_dict:
    db.uv.insert_many(UV_dict)

In [24]:
# Read every document back from the uv collection and print each one,
# to confirm what is now stored in MongoDB.
for stored_uv in db.uv.find():
    print(stored_uv)

{'_id': ObjectId('5e8110a8f0d19231df8065af'), 'STATENAME': 'Alabama', 'UV_ Wh/m²': 4505.164179104478}
{'_id': ObjectId('5e8110a9f0d19231df8065b0'), 'STATENAME': 'Arizona', 'UV_ Wh/m²': 5528.466666666666}
{'_id': ObjectId('5e8110a9f0d19231df8065b1'), 'STATENAME': 'Arkansas', 'UV_ Wh/m²': 4515.346666666666}
{'_id': ObjectId('5e8110a9f0d19231df8065b2'), 'STATENAME': 'California', 'UV_ Wh/m²': 4871.413793103448}
{'_id': ObjectId('5e8110a9f0d19231df8065b3'), 'STATENAME': 'Colorado', 'UV_ Wh/m²': 4802.730158730159}
{'_id': ObjectId('5e8110a9f0d19231df8065b4'), 'STATENAME': 'Connecticut', 'UV_ Wh/m²': 3832.5}
{'_id': ObjectId('5e8110a9f0d19231df8065b5'), 'STATENAME': 'Delaware', 'UV_ Wh/m²': 4074.0}
{'_id': ObjectId('5e8110a9f0d19231df8065b6'), 'STATENAME': 'District of Columbia', 'UV_ Wh/m²': 4100.0}
{'_id': ObjectId('5e8110a9f0d19231df8065b7'), 'STATENAME': 'Florida', 'UV_ Wh/m²': 4743.671641791045}
{'_id': ObjectId('5e8110a9f0d19231df8065b8'), 'STATENAME': 'Georgia', 'UV_ Wh/m²': 4563.9748

PLAN:
- Dropdown for each state
- Map showing the UV exposure and layers for incidence and mortality


In [None]:
# CLEANING WITH PANDAS - DONE
# MONGODB - DONE
# FLASK APP
# VISUALIZATIONS (JS)
# WEB DEPLOYMENT