In [1]:
import os
from urllib.parse import quote_plus

import pandas as pd 
from sqlalchemy import create_engine

In [2]:
# Build the PostgreSQL connection from environment variables so that no
# credentials are stored in (or committed with) the notebook.
# PGPASSWORD is required: a KeyError is raised when it is not set. The other
# settings fall back to the local defaults this notebook used previously.
db_user = os.environ.get("PGUSER", "postgres")
# URL-escape the password so characters such as '@' or '/' cannot break the URL.
db_password = quote_plus(os.environ["PGPASSWORD"])
db_host = os.environ.get("PGHOST", "localhost")
db_port = os.environ.get("PGPORT", "5432")
db_name = os.environ.get("PGDATABASE", "education_taxes_db")
rds_connection_string = f"{db_user}:{db_password}@{db_host}:{db_port}/{db_name}"
engine = create_engine(f'postgresql://{rds_connection_string}')

In [3]:
# Load the raw tax CSV into a DataFrame and preview its first five rows.
csv = "tax_data.csv"
dirty_taxes = pd.read_csv(filepath_or_buffer=csv)
dirty_taxes.head(n=5)

Unnamed: 0,STATEFIPS,STATE,zipcode,agi_stub,N1,ELF,CPREP,PREP,DIR_DEP,N2,NUMDEP,TOTAL_VITA,VITA,TCE,VITA_EIC
0,1,AL,0,1,768120,696930,37470,399160,559820,1180240,439980,24700,16610,8100,5300
1,1,AL,0,2,503430,457510,23180,266880,371440,977450,348420,12230,8350,3880,350
2,1,AL,0,3,274590,248630,13210,157800,179490,587740,182370,3110,1590,1520,0
3,1,AL,0,4,174830,159190,6830,102340,110320,429360,131170,990,510,480,0
4,1,AL,0,5,245150,224280,10500,145010,123560,665630,213100,770,730,40,0


In [4]:
# Keep only the rows whose STATE is 'IN', remove the columns that are not
# needed downstream, and renumber the index from zero.
ind_tax = (
    dirty_taxes[dirty_taxes["STATE"] == 'IN']
    .drop(columns=['STATEFIPS', 'STATE', 'VITA', 'TCE', 'VITA_EIC'])
    .reset_index(drop=True)
)
ind_tax.head()

Unnamed: 0,zipcode,agi_stub,N1,ELF,CPREP,PREP,DIR_DEP,N2,NUMDEP,TOTAL_VITA
0,0,1,1086910,984770,54800,458270,737010,1393890,403520,38090
1,0,2,777250,720010,29210,355760,595250,1416310,465680,13090
2,0,3,451080,416800,16870,232700,318230,971370,305680,2850
3,0,4,298350,275970,10310,162530,204370,750610,236570,210
4,0,5,396630,367030,15440,223710,224300,1108230,371700,0


In [5]:
# Give every remaining column a descriptive name (one mapping per line).
clean_tax = ind_tax.rename(
    {
        'zipcode': 'ZIP',
        'agi_stub': 'AGI',
        'N1': 'Total_Returns',
        'ELF': 'Electronically_Filed',
        'CPREP': 'Computer_Prepared_Paper',
        'PREP': 'Paid_Preparer',
        'DIR_DEP': 'Direct_Deposit',
        'N2': 'Total_Persons',
        'NUMDEP': 'Total_Dependents',
        'TOTAL_VITA': 'Volunteer_Prepared',
    },
    axis="columns",
)
clean_tax.head()

Unnamed: 0,ZIP,AGI,Total_Returns,Electronically_Filed,Computer_Prepared_Paper,Paid_Preparer,Direct_Deposit,Total_Persons,Total_Dependents,Volunteer_Prepared
0,0,1,1086910,984770,54800,458270,737010,1393890,403520,38090
1,0,2,777250,720010,29210,355760,595250,1416310,465680,13090
2,0,3,451080,416800,16870,232700,318230,971370,305680,2850
3,0,4,298350,275970,10310,162530,204370,750610,236570,210
4,0,5,396630,367030,15440,223710,224300,1108230,371700,0


In [6]:
# Drop rows carrying the erroneous placeholder ZIP value 99999.
# The original index labels are kept (no reset), so gaps may appear.
clean_tax = clean_tax.loc[clean_tax["ZIP"].ne(99999)]
clean_tax.head()

Unnamed: 0,ZIP,AGI,Total_Returns,Electronically_Filed,Computer_Prepared_Paper,Paid_Preparer,Direct_Deposit,Total_Persons,Total_Dependents,Volunteer_Prepared
0,0,1,1086910,984770,54800,458270,737010,1393890,403520,38090
1,0,2,777250,720010,29210,355760,595250,1416310,465680,13090
2,0,3,451080,416800,16870,232700,318230,971370,305680,2850
3,0,4,298350,275970,10310,162530,204370,750610,236570,210
4,0,5,396630,367030,15440,223710,224300,1108230,371700,0


In [7]:
# Begin normalizing: split each measure into its own (ZIP, AGI, value) table.
# NOTE(review): if_exists="append" inserts the rows again on every re-run of
# this cell — confirm the target table is emptied between runs.
total_returns = clean_tax.loc[:, ['ZIP', 'AGI', 'Total_Returns']]
total_returns.to_sql(
    name='total_returns',
    con=engine,
    if_exists="append",
    index=False,
)
total_returns

Unnamed: 0,ZIP,AGI,Total_Returns
0,0,1,1086910
1,0,2,777250
2,0,3,451080
3,0,4,298350
4,0,5,396630
...,...,...,...
4045,47995,2,190
4046,47995,3,140
4047,47995,4,80
4048,47995,5,90


In [8]:
# Electronically filed returns per ZIP / AGI, appended to their own table.
# NOTE(review): if_exists="append" inserts the rows again on every re-run.
elf = clean_tax.loc[:, ['ZIP', 'AGI', 'Electronically_Filed']]
elf.to_sql(
    name='electronically_filed',
    con=engine,
    if_exists="append",
    index=False,
)
elf

Unnamed: 0,ZIP,AGI,Electronically_Filed
0,0,1,984770
1,0,2,720010
2,0,3,416800
3,0,4,275970
4,0,5,367030
...,...,...,...
4045,47995,2,180
4046,47995,3,130
4047,47995,4,80
4048,47995,5,90


In [9]:
# Computer-prepared paper returns per ZIP / AGI, appended to their own table.
# NOTE(review): if_exists="append" inserts the rows again on every re-run.
cpp = clean_tax.loc[:, ['ZIP', 'AGI', 'Computer_Prepared_Paper']]
cpp.to_sql(
    name='computer_prepared_paper',
    con=engine,
    if_exists="append",
    index=False,
)
cpp

Unnamed: 0,ZIP,AGI,Computer_Prepared_Paper
0,0,1,54800
1,0,2,29210
2,0,3,16870
3,0,4,10310
4,0,5,15440
...,...,...,...
4045,47995,2,0
4046,47995,3,0
4047,47995,4,0
4048,47995,5,0


In [10]:
# Paid-preparer returns per ZIP / AGI, appended to their own table.
# NOTE(review): if_exists="append" inserts the rows again on every re-run.
pp = clean_tax.loc[:, ['ZIP', 'AGI', 'Paid_Preparer']]
pp.to_sql(
    name='paid_preparer',
    con=engine,
    if_exists="append",
    index=False,
)
pp

Unnamed: 0,ZIP,AGI,Paid_Preparer
0,0,1,458270
1,0,2,355760
2,0,3,232700
3,0,4,162530
4,0,5,223710
...,...,...,...
4045,47995,2,100
4046,47995,3,80
4047,47995,4,60
4048,47995,5,70


In [11]:
# Direct-deposit returns per ZIP / AGI, appended to their own table.
# NOTE(review): if_exists="append" inserts the rows again on every re-run.
dd = clean_tax.loc[:, ['ZIP', 'AGI', 'Direct_Deposit']]
dd.to_sql(
    name='direct_deposit',
    con=engine,
    if_exists="append",
    index=False,
)
dd

Unnamed: 0,ZIP,AGI,Direct_Deposit
0,0,1,737010
1,0,2,595250
2,0,3,318230
3,0,4,204370
4,0,5,224300
...,...,...,...
4045,47995,2,130
4046,47995,3,100
4047,47995,4,50
4048,47995,5,60


In [12]:
# Total persons per ZIP / AGI, appended to their own table.
# NOTE(review): if_exists="append" inserts the rows again on every re-run.
tp = clean_tax.loc[:, ['ZIP', 'AGI', 'Total_Persons']]
tp.to_sql(
    name='total_persons',
    con=engine,
    if_exists="append",
    index=False,
)
tp

Unnamed: 0,ZIP,AGI,Total_Persons
0,0,1,1393890
1,0,2,1416310
2,0,3,971370
3,0,4,750610
4,0,5,1108230
...,...,...,...
4045,47995,2,350
4046,47995,3,320
4047,47995,4,230
4048,47995,5,280


In [13]:
# Total dependents per ZIP / AGI, appended to their own table.
# NOTE(review): if_exists="append" inserts the rows again on every re-run.
td = clean_tax.loc[:, ['ZIP', 'AGI', 'Total_Dependents']]
td.to_sql(
    name='total_dependents',
    con=engine,
    if_exists="append",
    index=False,
)
td

Unnamed: 0,ZIP,AGI,Total_Dependents
0,0,1,403520
1,0,2,465680
2,0,3,305680
3,0,4,236570
4,0,5,371700
...,...,...,...
4045,47995,2,110
4046,47995,3,110
4047,47995,4,80
4048,47995,5,120


In [14]:
# Volunteer-prepared returns per ZIP / AGI, appended to their own table.
# NOTE(review): if_exists="append" inserts the rows again on every re-run.
vp = clean_tax.loc[:, ['ZIP', 'AGI', 'Volunteer_Prepared']]
vp.to_sql(
    name='volunteer_prepared',
    con=engine,
    if_exists="append",
    index=False,
)
vp

Unnamed: 0,ZIP,AGI,Volunteer_Prepared
0,0,1,38090
1,0,2,13090
2,0,3,2850
3,0,4,210
4,0,5,0
...,...,...,...
4045,47995,2,0
4046,47995,3,0
4047,47995,4,0
4048,47995,5,0
