## Extract data from CSV files

In [None]:
# Dependencies
import pandas as pd

In [None]:
# Read the source CSV (path relative to the notebook) into a dataframe
csv_path = "Resources/Nutrition__Physical_Activity__and_Obesity_-_Women__Infant__and_Child.csv"
nutr_df = pd.read_csv(csv_path)

# Show the first rows as a quick sanity check of the load
nutr_df.head()

## Transform data

In [None]:
# Show every column label of the raw dataframe as a plain list
nutr_df.columns.tolist()

In [None]:
# Columns that the later filtering and loading steps rely on
keep_cols = [
    "YearEnd",
    "LocationDesc",
    "Question",
    "Data_Value",
    "StratificationID1",
]

# Narrow the dataframe to those columns and preview the result
nutr_df2 = nutr_df[keep_cols]
nutr_df2.head()

Unnamed: 0,YearEnd,LocationDesc,Question,Data_Value,StratificationID1
0,2008,Alabama,Percent of WIC children aged 2 to 4 years who ...,15.4,OVERALL
1,2008,Alabama,Percent of WIC children aged 2 to 4 years who ...,15.5,MALE
2,2008,Alabama,Percent of WIC children aged 2 to 4 years who ...,15.3,FEMALE
3,2008,Alabama,Percent of WIC children aged 2 to 4 years who ...,15.3,AGEMO2435
4,2008,Alabama,Percent of WIC children aged 2 to 4 years who ...,14.9,AGEMO3647


In [None]:
# Keep only the rows whose YearEnd is 2010, 2012 or 2014
Years = [2010, 2012, 2014]

# Boolean mask: True where the row's YearEnd is one of the selected years
in_selected_years = nutr_df2["YearEnd"].isin(Years)
nutr_df3 = nutr_df2[in_selected_years]
nutr_df3.head()

Unnamed: 0,YearEnd,LocationDesc,Question,Data_Value,StratificationID1
11,2010,Alabama,Percent of WIC children aged 2 to 4 years who ...,16.0,OVERALL
12,2010,Alabama,Percent of WIC children aged 2 to 4 years who ...,16.3,MALE
13,2010,Alabama,Percent of WIC children aged 2 to 4 years who ...,15.8,FEMALE
14,2010,Alabama,Percent of WIC children aged 2 to 4 years who ...,15.5,AGEMO2435
15,2010,Alabama,Percent of WIC children aged 2 to 4 years who ...,16.3,AGEMO3647


In [None]:
# Keep only the un-stratified rows (StratificationID1 == "OVERALL") that
# answer the obesity question for WIC children aged 2 to 4 years.
#
# Explicit boolean masks are used instead of a query() string: the previous
# expression depended on a backslash line-continuation inside a string
# literal, which embeds the indentation in the expression and breaks as soon
# as any character follows the backslash.
obesity_question = "Percent of WIC children aged 2 to 4 years who have obesity"

is_overall = nutr_df3["StratificationID1"] == "OVERALL"
is_obesity_question = nutr_df3["Question"] == obesity_question

# Both conditions must hold for a row to be kept
nutr_df4 = nutr_df3[is_overall & is_obesity_question]
nutr_df4.head()

Unnamed: 0,YearEnd,LocationDesc,Question,Data_Value,StratificationID1
58,2010,Alabama,Percent of WIC children aged 2 to 4 years who ...,15.8,OVERALL
66,2012,Alabama,Percent of WIC children aged 2 to 4 years who ...,15.6,OVERALL
82,2014,Alabama,Percent of WIC children aged 2 to 4 years who ...,16.3,OVERALL
155,2010,Alaska,Percent of WIC children aged 2 to 4 years who ...,21.2,OVERALL
166,2012,Alaska,Percent of WIC children aged 2 to 4 years who ...,20.6,OVERALL


In [None]:
# Map each column that is kept to the name it is stored under.
# Dict order defines the column order of the resulting dataframe.
final_names = {
    "YearEnd": "Year",
    "LocationDesc": "US_State",
    "Data_Value": "Obese_Children_%",
}

# Drop every other column, then apply the new names in one step
nutr_df4 = nutr_df4[list(final_names)].rename(columns=final_names)

# Preview the cleaned dataframe
nutr_df4.head()

In [None]:
# Build a single-column table holding each US state name once
state_names = nutr_df4[["US_State"]]
location = state_names.drop_duplicates()

# Display the full table
location

## Load data into database

In [9]:
# Dependencies
from sqlalchemy import create_engine
from config import password

In [10]:
# Create a connection to the database
from urllib.parse import quote_plus

# The password is imported from the separate `config` module. It is
# URL-encoded before being placed in the connection URL so that characters
# such as "@", "/", ":" or "%" in the password cannot be mistaken for URL
# delimiters when SQLAlchemy parses the string.
conn = f"root:{quote_plus(password)}@localhost:3306/diabetes_db"
engine = create_engine(f"mysql://{conn}")

In [11]:
# Confirm presence of tables
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the inspector returns the same list of table names and works on both
# older and newer releases.
from sqlalchemy import inspect as sa_inspect

sa_inspect(engine).get_table_names()

  cursor.execute(statement, parameters)


['diabetes', 'location', 'obesity']

In [12]:
# Write both dataframes to the database.
# Options shared by both writes: use the engine created earlier, replace
# the table if it already exists, and do not store the dataframe index.
write_opts = {"con": engine, "if_exists": "replace", "index": False}

location.to_sql(name="location", **write_opts)
nutr_df4.to_sql(name="obesity", **write_opts)

In [14]:
# Query the location table back from the database and preview its first rows
location_rows = pd.read_sql_query("select * from location", con=engine)
location_rows.head()

Unnamed: 0,US_State
0,Alabama
1,Alaska
2,Arizona
3,Arkansas
4,California


In [15]:
# Query the obesity table back from the database and preview its first rows
obesity_rows = pd.read_sql_query("select * from obesity", con=engine)
obesity_rows.head()

Unnamed: 0,Year,US_State,Obese_Children_%
0,2010,Alabama,15.8
1,2012,Alabama,15.6
2,2014,Alabama,16.3
3,2010,Alaska,21.2
4,2012,Alaska,20.6
