# ETL Project

In [1]:
# Import Dependencies

import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import psycopg2

# Get the password from config file
from config import password

## U.S. State Data

- U.S. State Tables (CSV file). Data source: http://www.whypad.com/posts/excel-spreadsheet-of-us-states/583/
- Quality Of Life by State (CSV file). Data source: https://worldpopulationreview.com/state-rankings/quality-of-life-by-state

### Extraction

In [2]:
# File path
states_csv = 'Resources/Raw_data/us_states.csv'

In [3]:
# Read csv and store in to Pandas Dataframe
col_Names=["STATE", "state_name", "state_abr"]
states_df= pd.read_csv(states_csv,names=col_Names)
states_df.head()

Unnamed: 0,STATE,state_name,state_abr
0,ALABAMA,Alabama,AL
1,ALASKA,Alaska,AK
2,ARIZONA,Arizona,AZ
3,ARKANSAS,Arkansas,AR
4,CALIFORNIA,California,CA


In [4]:
# File path
ranking_csv = 'Resources/Raw_data/state_ranking.csv'

In [5]:
# Read csv and store in to Pandas Dataframe
ranking_df = pd.read_csv(ranking_csv)
ranking_df.head()

Unnamed: 0,State,lifeQualityRank,healthCareRank,educationRank,economyRank
0,Washington,1,4,4,3
1,New Hampshire,2,16,5,13
2,Minnesota,3,10,17,18
3,Utah,4,9,10,2
4,Vermont,5,11,8,29


### Transformation

In [6]:
states_df = states_df.drop(columns=['STATE'])
states_df.head()

Unnamed: 0,state_name,state_abr
0,Alabama,AL
1,Alaska,AK
2,Arizona,AZ
3,Arkansas,AR
4,California,CA


In [7]:
ranking_df = ranking_df.rename(columns={'State':'state_name'})
ranking_df.head()

Unnamed: 0,state_name,lifeQualityRank,healthCareRank,educationRank,economyRank
0,Washington,1,4,4,3
1,New Hampshire,2,16,5,13
2,Minnesota,3,10,17,18
3,Utah,4,9,10,2
4,Vermont,5,11,8,29


In [8]:
state = pd.merge(left=states_df, right=ranking_df, how='left', left_on='state_name', right_on='state_name')
state.head()

Unnamed: 0,state_name,state_abr,lifeQualityRank,healthCareRank,educationRank,economyRank
0,Alabama,AL,49,46,50,45
1,Alaska,AK,44,25,47,46
2,Arizona,AZ,34,23,40,10
3,Arkansas,AR,45,49,42,43
4,California,CA,19,7,21,4


In [9]:
state.index.names = ['state_id']
state.head()

Unnamed: 0_level_0,state_name,state_abr,lifeQualityRank,healthCareRank,educationRank,economyRank
state_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,Alabama,AL,49,46,50,45
1,Alaska,AK,44,25,47,46
2,Arizona,AZ,34,23,40,10
3,Arkansas,AR,45,49,42,43
4,California,CA,19,7,21,4


In [10]:
# Export Data
state.to_csv(r'Resources/Clean_data/state.csv')

## Public school data
- Public School Data (CSV file) Data Source: https://www.kaggle.com/carlosaguayo/usa-public-schools

### Extraction

In [11]:
# File path
school_csv = 'Resources/Raw_data/public_schools.csv'

In [12]:
# Read csv and store in to Pandas DataFrame
schools_df = pd.read_csv(school_csv)
schools_df.head()

Unnamed: 0,X,Y,OBJECTID,NCESID,NAME,ADDRESS,CITY,STATE,ZIP,ZIP4,...,VAL_METHOD,VAL_DATE,WEBSITE,LEVEL_,ENROLLMENT,ST_GRADE,END_GRADE,DISTRICTID,FT_TEACHER,SHELTER_ID
0,-81.050895,29.022271,2002,120192008041,SAMSULA ACADEMY,248 N SAMSULA DR,NEW SMYRNA,FL,32168,8762,...,IMAGERY,2014-05-20T00:00:00.000Z,http://nces.ed.gov/GLOBALLOCATOR/sch_info_popu...,1,222,KG,5,1201920,13,NOT AVAILABLE
1,-92.507288,31.180659,2003,220129002344,CAROLINE DORMON JUNIOR HIGH SCHOOL,8906 HWY 165 SOUTH,WOODWORTH,LA,71485,NOT AVAILABLE,...,IMAGERY/OTHER,2015-06-19T00:00:00.000Z,http://nces.ed.gov/GLOBALLOCATOR/sch_info_popu...,1,312,PK,8,2201290,21,NOT AVAILABLE
2,-69.97188,43.908147,2004,230378023129,HARRIET BEECHER STOWE ELEMENTARY,44 MCKEEN STREET,BRUNSWICK,ME,4011,NOT AVAILABLE,...,IMAGERY,2014-05-07T00:00:00.000Z,http://nces.ed.gov/GLOBALLOCATOR/sch_info_popu...,1,739,02,5,2303780,52,NOT AVAILABLE
3,-89.542799,32.728496,2005,280252001118,LEAKE CENTRAL ELEMENTARY SCHOOL,603 HWY. 16 WEST,CARTHAGE,MS,39051,NOT AVAILABLE,...,IMAGERY/OTHER,2010-07-06T00:00:00.000Z,http://nces.ed.gov/GLOBALLOCATOR/sch_info_popu...,1,1159,PK,5,2802520,63,NOT AVAILABLE
4,-94.361775,39.364359,2006,291645000891,KEARNEY ELEM.,902 S JEFFERSON,KEARNEY,MO,64060,8518,...,IMAGERY/OTHER,2016-07-18T00:00:00.000Z,http://nces.ed.gov/GLOBALLOCATOR/sch_info_popu...,1,274,KG,5,2916450,22,NOT AVAILABLE


In [13]:
# File path
state_csv = 'Resources/Clean_data/state.csv'

In [14]:
# Read csv and store in to Pandas DataFrame
state_df = pd.read_csv(state_csv)
state_df.head()

Unnamed: 0,state_id,state_name,state_abr,lifeQualityRank,healthCareRank,educationRank,economyRank
0,0,Alabama,AL,49,46,50,45
1,1,Alaska,AK,44,25,47,46
2,2,Arizona,AZ,34,23,40,10
3,3,Arkansas,AR,45,49,42,43
4,4,California,CA,19,7,21,4


### Transformation

In [15]:
# Cleaning up the DataFrame
# Selecting the columns to be displayed
clean_schools_df = schools_df[['STATE', 'CITY', 'NAME', 'ADDRESS', 'ZIP', 'X', 'Y', 'DISTRICTID', 'ST_GRADE', 'END_GRADE']]
clean_schools_df

Unnamed: 0,STATE,CITY,NAME,ADDRESS,ZIP,X,Y,DISTRICTID,ST_GRADE,END_GRADE
0,FL,NEW SMYRNA,SAMSULA ACADEMY,248 N SAMSULA DR,32168,-81.050895,29.022271,1201920,KG,05
1,LA,WOODWORTH,CAROLINE DORMON JUNIOR HIGH SCHOOL,8906 HWY 165 SOUTH,71485,-92.507288,31.180659,2201290,PK,08
2,ME,BRUNSWICK,HARRIET BEECHER STOWE ELEMENTARY,44 MCKEEN STREET,4011,-69.971880,43.908147,2303780,02,05
3,MS,CARTHAGE,LEAKE CENTRAL ELEMENTARY SCHOOL,603 HWY. 16 WEST,39051,-89.542799,32.728496,2802520,PK,05
4,MO,KEARNEY,KEARNEY ELEM.,902 S JEFFERSON,64060,-94.361775,39.364359,2916450,KG,05
...,...,...,...,...,...,...,...,...,...,...
102365,MI,DETROIT,COVENANT HOUSE ACADEMY DETROIT - SOUTHWEST SITE,1450 25TH ST,48216,-83.085229,42.320632,2600322,09,12
102366,MI,SOUTH ROCKWOOD,FRED W. RITTER ELEMENTARY SCHOOL,5650 CARLETON ROCKWOOD RD,48179,-83.272599,42.062038,2601980,KG,04
102367,MS,DIBERVILLE,DIBERVILLE ELEM,4540 BRODIE ROAD,39540,-88.914089,30.436478,2801770,KG,03
102368,MO,KANSAS CITY,DAVIDSON ELEM.,5100 N HIGHLAND,64118,-94.558365,39.187941,2922800,PK,05


In [16]:
# Dropping null values
clean_schools_df = clean_schools_df.dropna()
clean_schools_df

Unnamed: 0,STATE,CITY,NAME,ADDRESS,ZIP,X,Y,DISTRICTID,ST_GRADE,END_GRADE
0,FL,NEW SMYRNA,SAMSULA ACADEMY,248 N SAMSULA DR,32168,-81.050895,29.022271,1201920,KG,05
1,LA,WOODWORTH,CAROLINE DORMON JUNIOR HIGH SCHOOL,8906 HWY 165 SOUTH,71485,-92.507288,31.180659,2201290,PK,08
2,ME,BRUNSWICK,HARRIET BEECHER STOWE ELEMENTARY,44 MCKEEN STREET,4011,-69.971880,43.908147,2303780,02,05
3,MS,CARTHAGE,LEAKE CENTRAL ELEMENTARY SCHOOL,603 HWY. 16 WEST,39051,-89.542799,32.728496,2802520,PK,05
4,MO,KEARNEY,KEARNEY ELEM.,902 S JEFFERSON,64060,-94.361775,39.364359,2916450,KG,05
...,...,...,...,...,...,...,...,...,...,...
102365,MI,DETROIT,COVENANT HOUSE ACADEMY DETROIT - SOUTHWEST SITE,1450 25TH ST,48216,-83.085229,42.320632,2600322,09,12
102366,MI,SOUTH ROCKWOOD,FRED W. RITTER ELEMENTARY SCHOOL,5650 CARLETON ROCKWOOD RD,48179,-83.272599,42.062038,2601980,KG,04
102367,MS,DIBERVILLE,DIBERVILLE ELEM,4540 BRODIE ROAD,39540,-88.914089,30.436478,2801770,KG,03
102368,MO,KANSAS CITY,DAVIDSON ELEM.,5100 N HIGHLAND,64118,-94.558365,39.187941,2922800,PK,05


In [17]:
# Renaiming Columns
clean_schools_df = clean_schools_df.rename(columns={'STATE':'state_abr', 'CITY':'City', 'NAME':'SchoolName', 'ADDRESS':'Address',
                                                    'ZIP':'Zip', 'DISTRICTID':'DistrictID', 'ST_GRADE':'StartGrade', 'END_GRADE':'EndGrade'})
clean_schools_df.head()

Unnamed: 0,state_abr,City,SchoolName,Address,Zip,X,Y,DistrictID,StartGrade,EndGrade
0,FL,NEW SMYRNA,SAMSULA ACADEMY,248 N SAMSULA DR,32168,-81.050895,29.022271,1201920,KG,5
1,LA,WOODWORTH,CAROLINE DORMON JUNIOR HIGH SCHOOL,8906 HWY 165 SOUTH,71485,-92.507288,31.180659,2201290,PK,8
2,ME,BRUNSWICK,HARRIET BEECHER STOWE ELEMENTARY,44 MCKEEN STREET,4011,-69.97188,43.908147,2303780,02,5
3,MS,CARTHAGE,LEAKE CENTRAL ELEMENTARY SCHOOL,603 HWY. 16 WEST,39051,-89.542799,32.728496,2802520,PK,5
4,MO,KEARNEY,KEARNEY ELEM.,902 S JEFFERSON,64060,-94.361775,39.364359,2916450,KG,5


In [18]:
# Merge table for state_id

public_schools_merge = pd.merge(left=state_df, right=clean_schools_df, how='right', left_on='state_abr', right_on='state_abr')
public_schools_merge.head()

Unnamed: 0,state_id,state_name,state_abr,lifeQualityRank,healthCareRank,educationRank,economyRank,City,SchoolName,Address,Zip,X,Y,DistrictID,StartGrade,EndGrade
0,0.0,Alabama,AL,49.0,46.0,50.0,45.0,LEEDS,LEEDS HIGH SCH,1500 GREENWAVE DRIVE,35094,-86.535935,33.548162,100011,09,12
1,0.0,Alabama,AL,49.0,46.0,50.0,45.0,CULLMAN,GOOD HOPE MIDDLE SCH,216 GOOD HOPE SCHOOL RD.,35057,-86.881057,34.102738,101020,06,8
2,0.0,Alabama,AL,49.0,46.0,50.0,45.0,GUNTERSVILLE,BRINDLEE MT MIDDLE SCH,1050 SCANT CITY ROAD,35976,-86.422337,34.377158,100006,06,8
3,0.0,Alabama,AL,49.0,46.0,50.0,45.0,SMITHS STATION,SMITH STATION FRESHMAN CTR,1150 LEE ROAD 298,36877,-85.099268,32.53659,102070,09,9
4,0.0,Alabama,AL,49.0,46.0,50.0,45.0,CLEVELAND,CLEVELAND ELEM SCH,115 STADIUM DR,35049,-86.572919,33.993417,100420,KG,6


In [19]:
# Selecting the columns to be displayed
public_schools = public_schools_merge[['state_id', 'City', 'SchoolName', 'Address', 'Zip', 'X', 'Y', 'DistrictID', 'StartGrade', 'EndGrade']]
public_schools

Unnamed: 0,state_id,City,SchoolName,Address,Zip,X,Y,DistrictID,StartGrade,EndGrade
0,0.0,LEEDS,LEEDS HIGH SCH,1500 GREENWAVE DRIVE,35094,-86.535935,33.548162,100011,09,12
1,0.0,CULLMAN,GOOD HOPE MIDDLE SCH,216 GOOD HOPE SCHOOL RD.,35057,-86.881057,34.102738,101020,06,08
2,0.0,GUNTERSVILLE,BRINDLEE MT MIDDLE SCH,1050 SCANT CITY ROAD,35976,-86.422337,34.377158,100006,06,08
3,0.0,SMITHS STATION,SMITH STATION FRESHMAN CTR,1150 LEE ROAD 298,36877,-85.099268,32.536590,102070,09,09
4,0.0,CLEVELAND,CLEVELAND ELEM SCH,115 STADIUM DR,35049,-86.572919,33.993417,100420,KG,06
...,...,...,...,...,...,...,...,...,...,...
102363,,SAINT THOMAS,JOSEPH GOMEZ ELEMENTARY SCHOOL,142 ANNAS RETREAT,802,-64.919936,18.338171,7800030,KG,06
102364,,SAINT JOHN,JULIUS E SPRAUVE,14 18 ESTATE ENIGHED,831,-64.793916,18.330464,7800030,KG,08
102365,,SAINT THOMAS,LEONARD DOBER ELEMENTARY SCHOOL,9A 10B KRONPRINDSENS GADE,802,-64.925850,18.342520,7800030,04,06
102366,,SAINT CROIX,RICARDO RICHARDS ELEMENTARY SCCHOOL,491 BARREN SPOT,850,-64.760782,17.725168,7800002,KG,06


In [20]:
# Dropping null values
public_schools = public_schools.dropna()
public_schools

Unnamed: 0,state_id,City,SchoolName,Address,Zip,X,Y,DistrictID,StartGrade,EndGrade
0,0.0,LEEDS,LEEDS HIGH SCH,1500 GREENWAVE DRIVE,35094,-86.535935,33.548162,100011,09,12
1,0.0,CULLMAN,GOOD HOPE MIDDLE SCH,216 GOOD HOPE SCHOOL RD.,35057,-86.881057,34.102738,101020,06,08
2,0.0,GUNTERSVILLE,BRINDLEE MT MIDDLE SCH,1050 SCANT CITY ROAD,35976,-86.422337,34.377158,100006,06,08
3,0.0,SMITHS STATION,SMITH STATION FRESHMAN CTR,1150 LEE ROAD 298,36877,-85.099268,32.536590,102070,09,09
4,0.0,CLEVELAND,CLEVELAND ELEM SCH,115 STADIUM DR,35049,-86.572919,33.993417,100420,KG,06
...,...,...,...,...,...,...,...,...,...,...
100614,49.0,EVANSTON,HORIZON ALTERNATIVE SCHOOL,164 YELLOWCREEK RD,82931,-110.980300,41.259840,5602760,07,12
100615,49.0,CHEYENNE,ANDERSON ELEMENTARY,2204 PLAIN VIEW RD,82009,-104.786858,41.170001,5601980,KG,06
100616,49.0,EVANSTON,DAVIS MIDDLE SCHOOL,837 NO NAME ST,82931,-110.972237,41.260919,5602760,06,08
100617,49.0,SHERIDAN,FT. MACKENZIE,620 LEWIS ST,82801,-106.965030,44.803526,5605695,09,12


In [21]:
# Export Data
public_schools.to_csv(r'Resources/Clean_data/public_schools.csv', index = False)

## Load

In [22]:
# Create engine and connection to  db.

engine = create_engine(f'postgresql://postgres:{password}@localhost:5432/US_quality_db')
connection = engine.connect()

- Postgresql Database table state 

In [23]:
state.to_sql(name='state', con=engine, if_exists='append', index=False)

- Data Postgresql Database table Public School

In [24]:
public_schools.to_sql(name='public_schools', con=engine, if_exists='append', index=False)

In [25]:
connection.close()