## Extract data from csv files
The data on weight status for children aged 3 months to 4 years old was derived from the Women, Infants, and Children Participant and Program Characteristics (WIC-PC). The data is organised by state (USA); the CSV file was downloaded from [data.gov](https://catalog.data.gov/dataset/nutrition-physical-activity-and-obesity-women-infant-and-child-dfe5d/resource/415dca15-b90a-46c3-8d13-70322ee4628e).

In [1]:
# Dependencies
import pandas as pd

In [2]:
# Read the WIC nutrition / obesity CSV into a dataframe.
# Note: the Resources folder is not part of the repository because it exceeds
# the GitHub size limit; the technical report links to a Google Drive copy.
nutr_df = pd.read_csv(
    "Resources/Nutrition__Physical_Activity__and_Obesity_-_Women__Infant__and_Child.csv"
)
nutr_df.head(5)

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,Datasource,Class,Topic,Question,Data_Value_Unit,Data_Value_Type,...,GeoLocation,ClassID,TopicID,QuestionID,DataValueTypeID,LocationID,StratificationCategory1,Stratification1,StratificationCategoryId1,StratificationID1
0,2008,2008,AL,Alabama,"Women, Infants, and Children Participant and P...",Obesity / Weight Status,Obesity / Weight Status,Percent of WIC children aged 2 to 4 years who ...,,Value,...,"(32.84057112200048, -86.63186076199969)",OWS,OWS1,Q040,VALUE,1,Total,Total,OVR,OVERALL
1,2008,2008,AL,Alabama,"Women, Infants, and Children Participant and P...",Obesity / Weight Status,Obesity / Weight Status,Percent of WIC children aged 2 to 4 years who ...,,Value,...,"(32.84057112200048, -86.63186076199969)",OWS,OWS1,Q040,VALUE,1,Gender,Male,GEN,MALE
2,2008,2008,AL,Alabama,"Women, Infants, and Children Participant and P...",Obesity / Weight Status,Obesity / Weight Status,Percent of WIC children aged 2 to 4 years who ...,,Value,...,"(32.84057112200048, -86.63186076199969)",OWS,OWS1,Q040,VALUE,1,Gender,Female,GEN,FEMALE
3,2008,2008,AL,Alabama,"Women, Infants, and Children Participant and P...",Obesity / Weight Status,Obesity / Weight Status,Percent of WIC children aged 2 to 4 years who ...,,Value,...,"(32.84057112200048, -86.63186076199969)",OWS,OWS1,Q040,VALUE,1,Age (months),24 - 35,AGEMO,AGEMO2435
4,2008,2008,AL,Alabama,"Women, Infants, and Children Participant and P...",Obesity / Weight Status,Obesity / Weight Status,Percent of WIC children aged 2 to 4 years who ...,,Value,...,"(32.84057112200048, -86.63186076199969)",OWS,OWS1,Q040,VALUE,1,Age (months),36 - 47,AGEMO,AGEMO3647


## Transform data

In [3]:
# Show every column header of the raw dataframe as a plain Python list
nutr_df.columns.tolist()

['YearStart',
 'YearEnd',
 'LocationAbbr',
 'LocationDesc',
 'Datasource',
 'Class',
 'Topic',
 'Question',
 'Data_Value_Unit',
 'Data_Value_Type',
 'Data_Value',
 'Data_Value_Alt',
 'Data_Value_Footnote_Symbol',
 'Data_Value_Footnote',
 'Low_Confidence_Limit',
 'High_Confidence_Limit ',
 'Sample_Size',
 'Total',
 'Age(months)',
 'Gender',
 'Race/Ethnicity',
 'GeoLocation',
 'ClassID',
 'TopicID',
 'QuestionID',
 'DataValueTypeID',
 'LocationID',
 'StratificationCategory1',
 'Stratification1',
 'StratificationCategoryId1',
 'StratificationID1']

In [4]:
# Keep only the columns needed downstream: year, state, question text,
# the reported value, and the stratification identifier
nutr_df2 = nutr_df.loc[
    :,
    ["YearEnd", "LocationDesc", "Question", "Data_Value", "StratificationID1"],
]
nutr_df2.head(5)

Unnamed: 0,YearEnd,LocationDesc,Question,Data_Value,StratificationID1
0,2008,Alabama,Percent of WIC children aged 2 to 4 years who ...,15.4,OVERALL
1,2008,Alabama,Percent of WIC children aged 2 to 4 years who ...,15.5,MALE
2,2008,Alabama,Percent of WIC children aged 2 to 4 years who ...,15.3,FEMALE
3,2008,Alabama,Percent of WIC children aged 2 to 4 years who ...,15.3,AGEMO2435
4,2008,Alabama,Percent of WIC children aged 2 to 4 years who ...,14.9,AGEMO3647


In [5]:
# Restrict the data to the years 2010, 2012, and 2014
Years = [2010, 2012, 2014]

# Boolean mask on YearEnd, applied through .loc
nutr_df3 = nutr_df2.loc[nutr_df2["YearEnd"].isin(Years)]
nutr_df3.head(5)

Unnamed: 0,YearEnd,LocationDesc,Question,Data_Value,StratificationID1
11,2010,Alabama,Percent of WIC children aged 2 to 4 years who ...,16.0,OVERALL
12,2010,Alabama,Percent of WIC children aged 2 to 4 years who ...,16.3,MALE
13,2010,Alabama,Percent of WIC children aged 2 to 4 years who ...,15.8,FEMALE
14,2010,Alabama,Percent of WIC children aged 2 to 4 years who ...,15.5,AGEMO2435
15,2010,Alabama,Percent of WIC children aged 2 to 4 years who ...,16.3,AGEMO3647


In [6]:
# Keep only the unstratified rows (StratificationID1 equal to "OVERALL")
# whose question is the obesity percentage for WIC children aged 2 to 4 years
nutr_df4 = nutr_df3[
    (nutr_df3["StratificationID1"] == "OVERALL")
    & (nutr_df3["Question"] == "Percent of WIC children aged 2 to 4 years who have obesity")
]
nutr_df4.head(5)

Unnamed: 0,YearEnd,LocationDesc,Question,Data_Value,StratificationID1
58,2010,Alabama,Percent of WIC children aged 2 to 4 years who ...,15.8,OVERALL
66,2012,Alabama,Percent of WIC children aged 2 to 4 years who ...,15.6,OVERALL
82,2014,Alabama,Percent of WIC children aged 2 to 4 years who ...,16.3,OVERALL
155,2010,Alaska,Percent of WIC children aged 2 to 4 years who ...,21.2,OVERALL
166,2012,Alaska,Percent of WIC children aged 2 to 4 years who ...,20.6,OVERALL


In [7]:
# Further clean the dataset
# Rename first, then select by the NEW names. rename() silently ignores keys
# that are not present, so this cell can be re-run after the columns have
# already been renamed without raising a KeyError on the old names.
nutr_df4 = nutr_df4.rename(columns = {"YearEnd": "Year",
                                      "LocationDesc": "US_State",
                                      "Data_Value": "Obese_Children_Percent"})

# Keep only the year, state, and obesity percentage columns
nutr_df4 = nutr_df4[["Year", "US_State", "Obese_Children_Percent"]]

# Preview the dataframe
nutr_df4.head()

Unnamed: 0,Year,US_State,Obese_Children_Percent
58,2010,Alabama,15.8
66,2012,Alabama,15.6
82,2014,Alabama,16.3
155,2010,Alaska,21.2
166,2012,Alaska,20.6


## Load data into database

In [8]:
# Dependencies
from urllib.parse import quote_plus

from sqlalchemy import create_engine
from sqlalchemy import inspect
from config import password

In [9]:
# Create a connection to the database
# The password is imported from a separate file (config). It is percent-encoded
# before being embedded in the URL so that reserved characters such as "@",
# ":" or "/" in the password cannot corrupt the connection string.
# NOTE(review): assumes config.password holds the raw (not already
# percent-encoded) password; confirm.
conn = "root:{0}@localhost:3306/diabetes_db".format(quote_plus(password))
engine = create_engine(f"mysql://{conn}")

In [10]:
# Confirm presence of tables
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the inspector returns the same list of table names.
inspect(engine).get_table_names()

  cursor.execute(statement, parameters)


['diabetes', 'obesity']

In [11]:
# Write the cleaned dataframe to the "obesity" table, replacing the table
# if it already exists and leaving the dataframe index out
nutr_df4.to_sql(
    "obesity",
    con=engine,
    index=False,
    if_exists="replace",
)

In [12]:
# Read the obesity table back from the database and preview its first rows
pd.read_sql_query(
    sql="select * from obesity",
    con=engine,
).head(5)

Unnamed: 0,Year,US_State,Obese_Children_Percent
0,2010,Alabama,15.8
1,2012,Alabama,15.6
2,2014,Alabama,16.3
3,2010,Alaska,21.2
4,2012,Alaska,20.6
