In [1]:
# Importing dependencies
import os
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine

In [2]:
# Establishing SQL connection
# The connection settings are read from the environment so that no password is
# stored in the notebook. Only PGPASSWORD is mandatory; the remaining settings
# fall back to the local development values.
db_user = os.environ.get("PGUSER", "postgres")
db_password = os.environ.get("PGPASSWORD")
if db_password is None:
    raise RuntimeError("Set the PGPASSWORD environment variable before running this notebook.")
db_host = os.environ.get("PGHOST", "localhost")
db_port = os.environ.get("PGPORT", "5432")
db_name = os.environ.get("PGDATABASE", "education_taxes_db")

# quote_plus escapes characters such as '@' or '/' in the credentials that would
# otherwise be parsed as URL delimiters.
rds_connection_string = f"{quote_plus(db_user)}:{quote_plus(db_password)}@{db_host}:{db_port}/{db_name}"
engine = create_engine(f'postgresql://{rds_connection_string}')

In [3]:
# Read the raw tax CSV into a DataFrame and preview its first five rows.
csv = 'tax_data.csv'
dirty_taxes = pd.read_csv(filepath_or_buffer=csv)
dirty_taxes.head(n=5)

Unnamed: 0,STATEFIPS,STATE,zipcode,agi_stub,N1,ELF,CPREP,PREP,DIR_DEP,N2,NUMDEP,TOTAL_VITA,VITA,TCE,VITA_EIC
0,1,AL,0,1,768120,696930,37470,399160,559820,1180240,439980,24700,16610,8100,5300
1,1,AL,0,2,503430,457510,23180,266880,371440,977450,348420,12230,8350,3880,350
2,1,AL,0,3,274590,248630,13210,157800,179490,587740,182370,3110,1590,1520,0
3,1,AL,0,4,174830,159190,6830,102340,110320,429360,131170,990,510,480,0
4,1,AL,0,5,245150,224280,10500,145010,123560,665630,213100,770,730,40,0


In [4]:
# Keep only the rows whose STATE is 'IN', remove the unneeded columns,
# and renumber the index from zero.
is_indiana = dirty_taxes["STATE"] == 'IN'
unneeded_columns = ['STATEFIPS', 'agi_stub', 'STATE', 'VITA', 'TCE', 'VITA_EIC']
ind_tax = (
    dirty_taxes[is_indiana]
    .drop(columns=unneeded_columns)
    .reset_index(drop=True)
)
ind_tax.head()

Unnamed: 0,zipcode,N1,ELF,CPREP,PREP,DIR_DEP,N2,NUMDEP,TOTAL_VITA
0,0,1086910,984770,54800,458270,737010,1393890,403520,38090
1,0,777250,720010,29210,355760,595250,1416310,465680,13090
2,0,451080,416800,16870,232700,318230,971370,305680,2850
3,0,298350,275970,10310,162530,204370,750610,236570,210
4,0,396630,367030,15440,223710,224300,1108230,371700,0


In [5]:
# Replace the source column codes with descriptive names.
descriptive_names = {
    'zipcode': 'ZIP',
    'N1': 'Total_Returns',
    'ELF': 'Electronically_Filed',
    'CPREP': 'Computer_Prepared_Paper',
    'PREP': 'Paid_Preparer',
    'DIR_DEP': 'Direct_Deposit',
    'N2': 'Total_Persons',
    'NUMDEP': 'Total_Dependents',
    'TOTAL_VITA': 'Volunteer_Prepared',
}
clean_tax = ind_tax.rename(columns=descriptive_names)
clean_tax.head()

Unnamed: 0,ZIP,Total_Returns,Electronically_Filed,Computer_Prepared_Paper,Paid_Preparer,Direct_Deposit,Total_Persons,Total_Dependents,Volunteer_Prepared
0,0,1086910,984770,54800,458270,737010,1393890,403520,38090
1,0,777250,720010,29210,355760,595250,1416310,465680,13090
2,0,451080,416800,16870,232700,318230,971370,305680,2850
3,0,298350,275970,10310,162530,204370,750610,236570,210
4,0,396630,367030,15440,223710,224300,1108230,371700,0


In [6]:
# Drop the rows carrying the placeholder ZIP values (0 and 99999), then
# sum every remaining column per ZIP and restore ZIP as a regular column.
placeholder_zips = [0, 99999]
clean_tax = clean_tax[~clean_tax['ZIP'].isin(placeholder_zips)]
clean_tax = clean_tax.groupby('ZIP').sum().reset_index()
clean_tax

Unnamed: 0,ZIP,Total_Returns,Electronically_Filed,Computer_Prepared_Paper,Paid_Preparer,Direct_Deposit,Total_Persons,Total_Dependents,Volunteer_Prepared
0,46001,5010,4690,170,2590,3630,9570,2850,0
1,46011,8150,7390,400,3690,5390,15560,4610,20
2,46012,8910,8080,370,3770,6240,15970,4160,80
3,46013,9010,8270,390,3690,6500,15810,4460,80
4,46016,6840,6200,300,2670,5290,12990,5120,60
...,...,...,...,...,...,...,...,...,...
669,47991,480,440,30,290,320,960,280,0
670,47992,730,650,30,350,460,1390,400,0
671,47993,1840,1710,70,1070,1170,3650,1100,0
672,47994,290,250,0,180,170,550,160,0


# Tax Data Normalization

In [7]:
# Split ZIP / Total_Returns into its own frame and append it to the
# 'total_returns' table in the PostgreSQL database.
total_returns = clean_tax.loc[:, ['ZIP', 'Total_Returns']]
total_returns.to_sql('total_returns', engine, index=False, if_exists="append")
total_returns

Unnamed: 0,ZIP,Total_Returns
0,46001,5010
1,46011,8150
2,46012,8910
3,46013,9010
4,46016,6840
...,...,...
669,47991,480
670,47992,730
671,47993,1840
672,47994,290


In [8]:
# Split ZIP / Electronically_Filed into its own frame and append it to the
# 'electronically_filed' table in the PostgreSQL database.
elf = clean_tax.loc[:, ['ZIP', 'Electronically_Filed']]
elf.to_sql('electronically_filed', engine, index=False, if_exists="append")
elf

Unnamed: 0,ZIP,Electronically_Filed
0,46001,4690
1,46011,7390
2,46012,8080
3,46013,8270
4,46016,6200
...,...,...
669,47991,440
670,47992,650
671,47993,1710
672,47994,250


In [9]:
# Split ZIP / Computer_Prepared_Paper into its own frame and append it to the
# 'computer_prepared_paper' table in the PostgreSQL database.
cpp = clean_tax.loc[:, ['ZIP', 'Computer_Prepared_Paper']]
cpp.to_sql('computer_prepared_paper', engine, index=False, if_exists="append")
cpp

Unnamed: 0,ZIP,Computer_Prepared_Paper
0,46001,170
1,46011,400
2,46012,370
3,46013,390
4,46016,300
...,...,...
669,47991,30
670,47992,30
671,47993,70
672,47994,0


In [10]:
# Split ZIP / Paid_Preparer into its own frame and append it to the
# 'paid_preparer' table in the PostgreSQL database.
pp = clean_tax.loc[:, ['ZIP', 'Paid_Preparer']]
pp.to_sql('paid_preparer', engine, index=False, if_exists="append")
pp

Unnamed: 0,ZIP,Paid_Preparer
0,46001,2590
1,46011,3690
2,46012,3770
3,46013,3690
4,46016,2670
...,...,...
669,47991,290
670,47992,350
671,47993,1070
672,47994,180


In [11]:
# Split ZIP / Direct_Deposit into its own frame and append it to the
# 'direct_deposit' table in the PostgreSQL database.
dd = clean_tax.loc[:, ['ZIP', 'Direct_Deposit']]
dd.to_sql('direct_deposit', engine, index=False, if_exists="append")
dd

Unnamed: 0,ZIP,Direct_Deposit
0,46001,3630
1,46011,5390
2,46012,6240
3,46013,6500
4,46016,5290
...,...,...
669,47991,320
670,47992,460
671,47993,1170
672,47994,170


In [12]:
# Split ZIP / Total_Persons into its own frame and append it to the
# 'total_persons' table in the PostgreSQL database.
tp = clean_tax.loc[:, ['ZIP', 'Total_Persons']]
tp.to_sql('total_persons', engine, index=False, if_exists="append")
tp

Unnamed: 0,ZIP,Total_Persons
0,46001,9570
1,46011,15560
2,46012,15970
3,46013,15810
4,46016,12990
...,...,...
669,47991,960
670,47992,1390
671,47993,3650
672,47994,550


In [13]:
# Split ZIP / Total_Dependents into its own frame and append it to the
# 'total_dependents' table in the PostgreSQL database.
td = clean_tax.loc[:, ['ZIP', 'Total_Dependents']]
td.to_sql('total_dependents', engine, index=False, if_exists="append")
td

Unnamed: 0,ZIP,Total_Dependents
0,46001,2850
1,46011,4610
2,46012,4160
3,46013,4460
4,46016,5120
...,...,...
669,47991,280
670,47992,400
671,47993,1100
672,47994,160


In [14]:
# Split ZIP / Volunteer_Prepared into its own frame and append it to the
# 'volunteer_prepared' table in the PostgreSQL database.
vp = clean_tax.loc[:, ['ZIP', 'Volunteer_Prepared']]
vp.to_sql('volunteer_prepared', engine, index=False, if_exists="append")
vp

Unnamed: 0,ZIP,Volunteer_Prepared
0,46001,0
1,46011,20
2,46012,80
3,46013,80
4,46016,60
...,...,...
669,47991,0
670,47992,0
671,47993,0
672,47994,0


# K12 Data Normalization

In [15]:
# Read the already-cleaned K12 CSV into a DataFrame and display it.
k12 = 'Indiana_k12.csv'
k12_data = pd.read_csv(filepath_or_buffer=k12)
k12_data

Unnamed: 0.1,Unnamed: 0,NAME,ADDRESS,CITY,STATE,ZIP,LEVEL,ENROLLMENT,START_GRADE,END_GRADE,DISTRICT_ID
0,0,LOGANSPORT JUVENILE COR FAC,LOGANSPORT ST HOSP-WEST SIDE,LOGANSPORT,IN,46947,3,122,07,12,1800006
1,1,NORTHEASTERN ELEMENTARY SCH,534 W WALLACE RD,FOUNTAIN CITY,IN,47341,1,551,PK,05,1808190
2,2,HAZEL DELL ELEMENTARY SCHOOL,3025 WESTFIELD RD,NOBLESVILLE,IN,46062,1,702,PK,05,1807650
3,3,BROWNSBURG WEST MIDDLE SCHOOL,1555 S ODELL ST,BROWNSBURG,IN,46112,2,914,06,08,1801020
4,4,FRANKLIN ELEMENTARY SCHOOL,410 W MIAMI AVE,LOGANSPORT,IN,46947,1,398,KG,05,1806030
...,...,...,...,...,...,...,...,...,...,...,...
1868,1868,BEN DAVIS UNIVERSITY HIGH SCHOOL,1155 S HIGH SCHOOL RD,INDIANAPOLIS,IN,46241,3,358,10,12,1812810
1869,1869,MADISON CONSOLIDATED JR HIGH SCH,701 8TH ST,MADISON,IN,47250,2,654,06,08,1806120
1870,1870,HAUBSTADT COMMUNITY SCHOOL,158 E 1025 S,HAUBSTADT,IN,47639,1,340,KG,08,1810350
1871,1871,THORPE CREEK ELEMENTARY,14642 E 126TH ST,FISHERS,IN,46038,1,893,KG,04,1810650


In [16]:
# Sum ENROLLMENT per ZIP, remove ZIP 47989, and append the result to the
# 'k12_enrollment_count' table in the PostgreSQL database.
# NOTE(review): the reason ZIP 47989 is excluded is not recorded here - confirm and document it.
enrollment_count = (
    k12_data.loc[:, ['ZIP', 'ENROLLMENT']]
    .groupby('ZIP')
    .sum()
    .drop(index=[47989])
    .reset_index()
)
enrollment_count.to_sql('k12_enrollment_count', engine, index=False, if_exists="append")
enrollment_count

Unnamed: 0,ZIP,ENROLLMENT
0,46001,1582
1,46011,942
2,46012,2761
3,46013,2721
4,46016,1202
...,...,...
511,47978,1693
512,47987,1121
513,47991,862
514,47993,168


In [17]:
# Count the K12 schools in each ZIP and append the counts to the
# 'k12_schools_zip_count' table in the PostgreSQL database.
# reset_index(name=...) names the count column explicitly. DataFrame.value_counts()
# returns an unnamed Series on older pandas (column 0 once converted to a frame)
# but a Series named 'count' on pandas >= 2.0, so renaming column 0 afterwards
# silently leaves the column called 'count' on newer versions.
school_count = k12_data[['ZIP']].value_counts().reset_index(name='Count')
school_count.to_sql(name='k12_schools_zip_count', con=engine, if_exists="append", index=False)
school_count

Unnamed: 0,ZIP,Count
0,46224,17
1,46383,15
2,47201,14
3,46901,13
4,46123,13
...,...,...
512,46348,1
513,47351,1
514,46346,1
515,46785,1


# Census Data Normalization

In [18]:
# Read the census CSV and rename the columns that are uploaded later so that
# their names contain no spaces.
census = 'census_data_indiana_2014.csv'
census_column_names = {
    'Zipcode': 'ZIP',
    'Household Income': 'Household_Income',
    'Per Capita Income': 'Per_Capita_Income',
    'Poverty Rate': 'Poverty_Rate',
}
census_csv = pd.read_csv(census).rename(columns=census_column_names)
census_csv

Unnamed: 0,ZIP,Population,Median Age,Household_Income,Per_Capita_Income,Poverty Count,Poverty_Rate
0,46511,4254.0,46.9,52570.0,32929.0,348.0,8.180536
1,46526,32513.0,36.0,54187.0,24312.0,4495.0,13.825239
2,46528,27165.0,32.6,63537.0,26499.0,2439.0,8.978465
3,46544,31359.0,38.0,47778.0,25268.0,4324.0,13.788705
4,46553,3212.0,36.7,62829.0,24039.0,78.0,2.428394
...,...,...,...,...,...,...,...
770,47166,1540.0,44.1,44159.0,24089.0,308.0,20.000000
771,47224,642.0,37.4,45136.0,15881.0,162.0,25.233645
772,47244,737.0,39.3,49659.0,23775.0,157.0,21.302578
773,47274,30970.0,38.1,49644.0,24801.0,4684.0,15.124314


In [19]:
# Split ZIP / Population into its own frame and append it to the
# 'census_population' table in the PostgreSQL database.
census_pop = census_csv.loc[:, ['ZIP', 'Population']]
census_pop.to_sql('census_population', engine, index=False, if_exists="append")
census_pop

Unnamed: 0,ZIP,Population
0,46511,4254.0
1,46526,32513.0
2,46528,27165.0
3,46544,31359.0
4,46553,3212.0
...,...,...
770,47166,1540.0
771,47224,642.0
772,47244,737.0
773,47274,30970.0


In [20]:
# Split ZIP / Household_Income into its own frame and append it to the
# 'census_household_income' table in the PostgreSQL database.
census_hincome = census_csv.loc[:, ['ZIP', 'Household_Income']]
census_hincome.to_sql('census_household_income', engine, index=False, if_exists="append")
census_hincome

Unnamed: 0,ZIP,Household_Income
0,46511,52570.0
1,46526,54187.0
2,46528,63537.0
3,46544,47778.0
4,46553,62829.0
...,...,...
770,47166,44159.0
771,47224,45136.0
772,47244,49659.0
773,47274,49644.0


In [21]:
# Split ZIP / Per_Capita_Income into its own frame and append it to the
# 'census_per_capita_income' table in the PostgreSQL database.
census_pincome = census_csv.loc[:, ['ZIP', 'Per_Capita_Income']]
census_pincome.to_sql('census_per_capita_income', engine, index=False, if_exists="append")
census_pincome

Unnamed: 0,ZIP,Per_Capita_Income
0,46511,32929.0
1,46526,24312.0
2,46528,26499.0
3,46544,25268.0
4,46553,24039.0
...,...,...
770,47166,24089.0
771,47224,15881.0
772,47244,23775.0
773,47274,24801.0


In [22]:
# Split ZIP / Poverty_Rate into its own frame and append it to the
# 'census_poverty_rate' table in the PostgreSQL database.
census_poverty = census_csv.loc[:, ['ZIP', 'Poverty_Rate']]
census_poverty.to_sql('census_poverty_rate', engine, index=False, if_exists="append")
census_poverty

Unnamed: 0,ZIP,Poverty_Rate
0,46511,8.180536
1,46526,13.825239
2,46528,8.978465
3,46544,13.788705
4,46553,2.428394
...,...,...
770,47166,20.000000
771,47224,25.233645
772,47244,21.302578
773,47274,15.124314


# College Data Normalization

In [23]:
# Read the college CSV into a DataFrame and preview its first five rows.
college = 'colleges_unis_with_zips.csv'
college_df = pd.read_csv(filepath_or_buffer=college)
college_df.head(n=5)

Unnamed: 0,School,City,State,Lat,Lng,place_id,Zip Code
0,ANCILLA COLLEGE,DONALDSON,IN,41.364484,-86.444133,ChIJNykO2EVaEYgR6iuHCTAnQGc,46563
1,ANDERSON UNIVERSITY,ANDERSON,IN,40.10532,-85.680254,ChIJvRvrHQzZFIgRaG89vy-ipXg,46012
2,BALL STATE UNIVERSITY,MUNCIE,IN,40.193377,-85.38636,ChIJIR58aWY9FYgR9ImfGJvu4OQ,47306
3,BETHEL COLLEGE,MISHAWAKA,IN,41.661993,-86.158616,ChIJf2dC58HNFogRb1W460Xop8g,46545
4,BUTLER UNIVERSITY,INDIANAPOLIS,IN,39.768403,-86.158068,ChIJr8OliPpTa4gRPkUtyy7TxQM,46208


In [24]:
# Count the colleges in each ZIP and append the counts to the 'college_count'
# table in the PostgreSQL database.
schools_by_zip = college_df["Zip Code"].value_counts()
# Name both output columns explicitly instead of renaming whatever reset_index()
# produces. On older pandas the reset columns are 'index' / 'Zip Code', while on
# pandas >= 2.0 they are 'Zip Code' / 'count'; a rename keyed on the old names
# would then label the ZIP codes as 'Count' and never create a 'ZIP' column.
schools_by_zip_df = schools_by_zip.rename_axis('ZIP').reset_index(name='Count')
schools_by_zip_df.to_sql(name='college_count', con=engine, if_exists="append", index=False)
schools_by_zip_df

Unnamed: 0,ZIP,Count
0,46805,4
1,46556,3
2,46208,2
3,47374,2
4,47710,2
5,46590,2
6,46989,1
7,46307,1
8,46545,1
9,46803,1
