In [None]:
import pandas as pd
from sqlalchemy import *
from sqlalchemy import inspect

### Load CSV into a DataFrame

In [None]:
# Load the state- and year-wise data on children enrolled in, or having completed, college education.
csv_file = "Resources/college_enrolled_completed.csv"
raw_college_data_df = pd.read_csv(csv_file)
# Preview the first rows of the raw frame.
raw_college_data_df.head()

In [None]:
# Show the column labels of the raw college frame.
raw_college_data_df.columns

In [None]:
# Show the dtype pandas inferred for each column of the raw college frame.
raw_college_data_df.dtypes

### Create a new DataFrame with selected columns

In [None]:
# Keep only the rows whose LocationType is 'State'.
college_data_df = raw_college_data_df.loc[
    lambda frame: frame['LocationType'].eq('State')
]
college_data_df.head()

In [None]:
# Discard the rows whose DataFormat is 'Number'; rows with any other DataFormat are retained.
college_data_df = college_data_df.loc[
    lambda frame: frame['DataFormat'].ne('Number')
]
college_data_df

### Clean the DataFrame

In [None]:
# Order the rows by TimeFrame (newest first), keep only the 2016 rows, renumber the
# index, and reduce the frame to the three columns used for the analysis.
college_data_df = (
    college_data_df
    .sort_values(by='TimeFrame', ascending=False)
    .loc[lambda frame: frame['TimeFrame'] == 2016]
    .reset_index()
    .loc[:, ['Location', 'TimeFrame', 'Data']]
)
# Render the Data values as percentage strings with one decimal place.
college_data_df = college_data_df.assign(
    Data=college_data_df['Data'].map("{:,.1%}".format)
)
college_data_df

In [None]:
# Load the state- and year-wise child health indicator data, focusing only on obesity.

csv_file = "Resources/obesity.csv"
raw_obesity_data_df = pd.read_csv(csv_file)
# Preview the first rows of the raw frame.
raw_obesity_data_df.head()

In [None]:
# Keep only the rows whose LocationType is 'State'.
obesity_data_df = raw_obesity_data_df.loc[
    lambda frame: frame['LocationType'].eq('State')
]
obesity_data_df.head()

In [None]:
# Order the state-level obesity rows by TimeFrame, descending.
obesity_data_df = obesity_data_df.sort_values('TimeFrame', ascending=False)

obesity_data_df.head()

In [None]:
# Reduce the frame to the three columns used for the analysis.
obesity_data_df = obesity_data_df.loc[:, ['Location', 'TimeFrame', 'Data']]
obesity_data_df

In [None]:
# Sum Data for every (Location, TimeFrame) pair, render the sums as one-decimal
# percentage strings, and replace TimeFrame with the second piece of its '-'-separated
# value (a value shaped like '2016-2017' would become '2017').
obesity_df = (
    obesity_data_df
    .groupby(["Location", "TimeFrame"])
    .sum()
    .reset_index()
    .assign(
        Data=lambda frame: frame['Data'].map("{:,.1%}".format),
        TimeFrame=lambda frame: frame['TimeFrame'].str.split('-').str[1],
    )
)
obesity_df

In [None]:
xls = 'Resources/NSDUHsaeTotals2017.xlsx'

# Read sheet 'Table 7', which provides the cocaine-use data.
drug_df = pd.read_excel(xls, 'Table 7')

# The row at position 4 of the raw frame carries the real column labels.
new_header = drug_df.iloc[4]

# Keep the rows from position 10 onward: this skips the label row and the five rows
# directly beneath it.
# NOTE(review): the original note described this as removing the total-US row, but five
# rows are dropped -- presumably national/regional aggregates; confirm against the sheet.
drug_df = drug_df.iloc[10:].reset_index(drop=True)
drug_df.columns = new_header
drug_df

In [None]:
# Reduce the frame to the state name and the two age-group estimate columns.
drug_df = drug_df.loc[:, ['State', '12-17\nEstimate', '18-25\nEstimate']]
drug_df

In [None]:
# Add the 12-17 and 18-25 estimates to get one cocaine-use total per state for the
# 12-25 age group (2017 data).
# The two estimate columns are selected by name: a positional slice such as
# iloc[:, -3:] would also pull in the text 'State' column of this three-column frame
# and, if the cell is run again, the previously computed total.
estimate_columns = ['12-17\nEstimate', '18-25\nEstimate']

# The sheet was read with its title rows still in place, so these cells may not be stored
# with a numeric dtype; convert before summing. Cells that cannot be parsed become NaN,
# which sum() skips.
age_group_estimates = drug_df[estimate_columns].apply(pd.to_numeric, errors='coerce')

# assign() returns a new frame, so no column is written into a slice of an earlier frame.
drug_df = drug_df.assign(
    **{'Total Drug Use (Age Group: 12-25)': age_group_estimates.sum(axis=1)}
)

drug_df

In [None]:
# Reduce the frame to the state name and the combined 12-25 total.
drug_df = drug_df.loc[:, ['State', 'Total Drug Use (Age Group: 12-25)']]
drug_df

In [None]:
# Tag every row with the survey year in a TimeFrame column.
year = 2017
drug_df = drug_df.assign(TimeFrame=year)
drug_df

In [None]:
# Create the SQLAlchemy engine for the SQLite database file parental_impact_db.sqlite.
engine = create_engine('sqlite:///parental_impact_db.sqlite')

In [None]:

# List the tables present in the database before loading.
# Engine.table_names() was deprecated in SQLAlchemy 1.4 and is not available in 2.0;
# the Inspector API is the supported way to list table names.
inspect(engine).get_table_names()

In [None]:
# Write the cleaned college frame (index included) to the 'education_enrollment_rate' table.
# NOTE(review): if_exists='append' adds rows to an existing table, so running this cell
# again adds the same rows a second time -- confirm that is intended.
college_data_df.to_sql('education_enrollment_rate', con=engine, if_exists='append', index = True)

In [None]:
# Write the aggregated obesity frame (index included) to the 'obesity_rate' table.
# NOTE(review): if_exists='append' adds rows to an existing table, so running this cell
# again adds the same rows a second time -- confirm that is intended.
obesity_df.to_sql('obesity_rate', con=engine, if_exists='append', index = True)

In [None]:
# Write the drug-use frame (index included) to the 'drug_consumption_rate' table.
# NOTE(review): if_exists='append' adds rows to an existing table, so running this cell
# again adds the same rows a second time -- confirm that is intended.
drug_df.to_sql('drug_consumption_rate', con=engine, if_exists='append', index = True)

In [None]:
# List the tables present in the database after loading.
# Engine.table_names() was deprecated in SQLAlchemy 1.4 and is not available in 2.0;
# the Inspector API is the supported way to list table names.
inspect(engine).get_table_names()