In [1]:
import pandas as pd
from plotnine import *

# importing statsmodels (for linear regression in python)
import statsmodels.formula.api as smf

from datetime import date
today = date.today()
today = pd.to_datetime(today)
import numpy as np
import re



In [2]:
pd.set_option("display.max_columns",None)

## Load Data

#### Michigan air polluter documents & permit data
These are violation notices, activity reports (inspection), compliance evaluations etc...

In [3]:
doc_df = pd.read_csv("EGLE-AQD-documents.csv")

In [4]:
doc_df.head()

Unnamed: 0,source_id,doc_type,date,doc_url
0,N8277,ACO,20151217,https://www.egle.state.mi.us/aps/downloads/srn...
1,N8274,ACO,20151217,https://www.egle.state.mi.us/aps/downloads/srn...
2,U04060035,SAR,20190716,https://www.egle.state.mi.us/aps/downloads/srn...
3,U04060035,SAR,20200507,https://www.egle.state.mi.us/aps/downloads/srn...
4,N7824,FCE,20160525,https://www.egle.state.mi.us/aps/downloads/srn...


In [5]:
pti_df = pd.read_csv("ptis-address-clean-042722.csv")
pti_df.head()

Unnamed: 0,company,srn,address,city,zip_code,county,pti_no.,approved,notes,flag,full_address
0,MENASHA CORPORATION,A0023,320 N FARMER STREET,OTSEGO,49078,ALLEGAN,152-96B,4/12/01,Rolled into ROP,0,"320 N FARMER STREET, OTSEGO, MI 49078"
1,MENASHA CORPORATION,A0023,320 N FARMER STREET,OTSEGO,49078,ALLEGAN,222-94A,10/18/02,Rolled into ROP,0,"320 N FARMER STREET, OTSEGO, MI 49078"
2,"OTSEGO PAPER, INC",A0023,320 N FARMER STREET,OTSEGO,49078,ALLEGAN,60-03A,5/14/07,Rolled into ROP,0,"320 N FARMER STREET, OTSEGO, MI 49078"
3,"OTSEGO PAPER, INC",A0023,320 N FARMER STREET,OTSEGO,49078,ALLEGAN,193-19A,4/22/21,Rolled into ROP,0,"320 N FARMER STREET, OTSEGO, MI 49078"
4,"SODUS HARD CHROME, INC",A0083,3085 YORE AVENUE,SODUS,49126,BERRIEN,2-00A,1/14/15,,0,"3085 YORE AVENUE, SODUS, MI 49126"


In [6]:
# Looking for sources that have "Rolled into ROP" in the notes, because those are Major Sources
major_sources = pti_df.query('notes.str.contains("ROP",na=False)')

In [7]:
major_sources.approved = pd.to_datetime(major_sources.approved,format="%m/%d/%y")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [8]:
major_sources.head()

Unnamed: 0,company,srn,address,city,zip_code,county,pti_no.,approved,notes,flag,full_address
0,MENASHA CORPORATION,A0023,320 N FARMER STREET,OTSEGO,49078,ALLEGAN,152-96B,2001-04-12,Rolled into ROP,0,"320 N FARMER STREET, OTSEGO, MI 49078"
1,MENASHA CORPORATION,A0023,320 N FARMER STREET,OTSEGO,49078,ALLEGAN,222-94A,2002-10-18,Rolled into ROP,0,"320 N FARMER STREET, OTSEGO, MI 49078"
2,"OTSEGO PAPER, INC",A0023,320 N FARMER STREET,OTSEGO,49078,ALLEGAN,60-03A,2007-05-14,Rolled into ROP,0,"320 N FARMER STREET, OTSEGO, MI 49078"
3,"OTSEGO PAPER, INC",A0023,320 N FARMER STREET,OTSEGO,49078,ALLEGAN,193-19A,2021-04-22,Rolled into ROP,0,"320 N FARMER STREET, OTSEGO, MI 49078"
32,ESCANABA PAPER COMPANY,A0884,"SEC 1, T39N, R23W",WELLS TOWNSHIP,49829,DELTA,16-70N,2001-03-16,Rolled into ROP,0,"SEC 1, T39N, R23W, WELLS TOWNSHIP, MI 49829"


In [9]:
major_sources.zip_code = major_sources.zip_code.astype("str").str[:5]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [10]:
# Filtering out duplicate records for sources, unless that source has a building in a different zip code.
# Saving the oldest permit on file
major_sources = major_sources.sort_values(['srn','approved']).drop_duplicates(subset=['srn','zip_code'])

In [14]:
fixed_zips = {
    "49005":"49006",
    "48674":"48640",
    "48686":"48640",
    "48121":"48120",
    "48901":"48910",
    "48795":"48759",
    "48824":"48910",
    "48090":"48092",
    "48859":"48858",
    "49937":"49337",
    "49041":"49048",
    "49152":"49512"
    
}

In [15]:
major_sources = major_sources.replace({"zip_code": fixed_zips})

In [11]:
# List of major sources
major_srn = major_sources.srn.to_list()

In [12]:
# Filter doc df to look at only facilities that are major polluters
major_doc_df = doc_df[doc_df.source_id.isin(major_srn)]
major_doc_df

Unnamed: 0,source_id,doc_type,date,doc_url
96,B6636,FCE,20200928,https://www.egle.state.mi.us/aps/downloads/srn...
97,B6636,RVN,20191003,https://www.egle.state.mi.us/aps/downloads/srn...
98,B6636,RVN,20200831,https://www.egle.state.mi.us/aps/downloads/srn...
99,B6636,SAR,20200130,https://www.egle.state.mi.us/aps/downloads/srn...
100,B6636,TEST,20130924,https://www.egle.state.mi.us/aps/downloads/srn...
...,...,...,...,...
18582,A0023,TEST,20220325,https://www.egle.state.mi.us/aps/downloads/srn...
18583,B2796,TEST,20220317,https://www.egle.state.mi.us/aps/downloads/srn...
18584,N2901,FCE,20220113,https://www.egle.state.mi.us/aps/downloads/srn...
18585,N2901,SAR,20220113,https://www.egle.state.mi.us/aps/downloads/srn...


In [18]:
major_sources['oldest_permit_age'] = today - major_sources.approved
major_sources['oldest_permit_age'] = (major_sources.oldest_permit_age/np.timedelta64(1, 'Y')).round(2)

In [19]:
major_sources

Unnamed: 0,company,srn,address,city,zip_code,county,pti_no.,approved,notes,flag,full_address,oldest_permit_age
0,MENASHA CORPORATION,A0023,320 N FARMER STREET,OTSEGO,49078,ALLEGAN,152-96B,2001-04-12,Rolled into ROP,0,"320 N FARMER STREET, OTSEGO, MI 49078",21.05
43,MEAD PAPER COMPANY,A0884,7100 COUNTY ROAD 426,ESCANABA,49829,DELTA,800-88A,1998-04-23,Rolled into ROP,0,"7100 COUNTY ROAD 426, ESCANABA, MI 49829",24.02
70,GENERAL MOTORS CORPORATION,A1641,920 TOWNSEND STREET,LANSING,48933,INGHAM,69-84A,2000-12-19,Rolled into ROP,0,"920 TOWNSEND STREET, LANSING, MI 48933",21.36
81,"KALSEC, INC",A1991,3713 W MAIN STREET,KALAMAZOO,49006,KALAMAZOO,104-95A,1997-01-16,Rolled into ROP,0,"3713 W MAIN STREET, KALAMAZOO, MI 49005",25.28
126,E I DUPONT DE NEMOURS &,A3569,400 GROESBECK HIGHWAY,MOUNT CLEMENS,48043,MACOMB,646-85A,1997-01-22,Rolled into ROP,0,"400 GROESBECK HIGHWAY, MOUNT CLEMENS, MI 48043",25.27
...,...,...,...,...,...,...,...,...,...,...,...,...
4621,KAWASAKI MOTORS CORP USA,P0677,5080 36TH STREET SE,GRAND RAPIDS,49512,KENT,230-15,2016-03-10,Rolled into ROP,0,"5080 36TH STREET SE, GRAND RAPIDS, MI 49152",6.14
4648,TOEFCO ENGINEERED COATING,P0708,1919 INDUSTRIAL DRIVE,NILES,49120,BERRIEN,86-16,2016-07-22,Rolled into ROP,0,"1919 INDUSTRIAL DRIVE, NILES, MI 49120",5.77
4816,"TRINSEO, LLC",P1025,433W BUILDING,MIDLAND,48667,MIDLAND,33-16,2016-04-14,Rolled into ROP,0,"433W BUILDING, MIDLAND, MI 48667",6.04
4817,DDP SPECIALTY ELECTRONIC,P1027,3400 S SAGINAW ROAD,MIDLAND,48667,MIDLAND,167-19,2020-01-27,Rolled into ROP,0,"3400 S SAGINAW ROAD, MIDLAND, MI 48667",2.26


In [20]:
# I only have 9 years worth of data so if a permit is longer than 9 years 
# we are going to just say 9 to keep the rate consistent with the data
def years_convert(years):
    if years > 9:
        return 9.0
    else: 
        return years

In [21]:
major_sources['oldest_permit_age_converted'] = major_sources.oldest_permit_age.apply(years_convert)

In [22]:
major_sources

Unnamed: 0,company,srn,address,city,zip_code,county,pti_no.,approved,notes,flag,full_address,oldest_permit_age,oldest_permit_age_converted
0,MENASHA CORPORATION,A0023,320 N FARMER STREET,OTSEGO,49078,ALLEGAN,152-96B,2001-04-12,Rolled into ROP,0,"320 N FARMER STREET, OTSEGO, MI 49078",21.05,9.00
43,MEAD PAPER COMPANY,A0884,7100 COUNTY ROAD 426,ESCANABA,49829,DELTA,800-88A,1998-04-23,Rolled into ROP,0,"7100 COUNTY ROAD 426, ESCANABA, MI 49829",24.02,9.00
70,GENERAL MOTORS CORPORATION,A1641,920 TOWNSEND STREET,LANSING,48933,INGHAM,69-84A,2000-12-19,Rolled into ROP,0,"920 TOWNSEND STREET, LANSING, MI 48933",21.36,9.00
81,"KALSEC, INC",A1991,3713 W MAIN STREET,KALAMAZOO,49006,KALAMAZOO,104-95A,1997-01-16,Rolled into ROP,0,"3713 W MAIN STREET, KALAMAZOO, MI 49005",25.28,9.00
126,E I DUPONT DE NEMOURS &,A3569,400 GROESBECK HIGHWAY,MOUNT CLEMENS,48043,MACOMB,646-85A,1997-01-22,Rolled into ROP,0,"400 GROESBECK HIGHWAY, MOUNT CLEMENS, MI 48043",25.27,9.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4621,KAWASAKI MOTORS CORP USA,P0677,5080 36TH STREET SE,GRAND RAPIDS,49512,KENT,230-15,2016-03-10,Rolled into ROP,0,"5080 36TH STREET SE, GRAND RAPIDS, MI 49152",6.14,6.14
4648,TOEFCO ENGINEERED COATING,P0708,1919 INDUSTRIAL DRIVE,NILES,49120,BERRIEN,86-16,2016-07-22,Rolled into ROP,0,"1919 INDUSTRIAL DRIVE, NILES, MI 49120",5.77,5.77
4816,"TRINSEO, LLC",P1025,433W BUILDING,MIDLAND,48667,MIDLAND,33-16,2016-04-14,Rolled into ROP,0,"433W BUILDING, MIDLAND, MI 48667",6.04,6.04
4817,DDP SPECIALTY ELECTRONIC,P1027,3400 S SAGINAW ROAD,MIDLAND,48667,MIDLAND,167-19,2020-01-27,Rolled into ROP,0,"3400 S SAGINAW ROAD, MIDLAND, MI 48667",2.26,2.26


In [13]:
major_doc_df.doc_type = major_doc_df.doc_type.str.replace("FCE1","FCE").str.replace("FCE2","FCE")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [14]:
def sar_clean(doc_type):
    query = re.findall(r"SAR",doc_type)
    if len(query) > 0:
        return "SAR"
    else:
        return doc_type

In [15]:
def vn_clean(doc_type):
    query = re.findall(r"^VN",doc_type)
    if len(query) > 0:
        return "VN"
    else:
        return doc_type

In [16]:
major_doc_df.doc_type = major_doc_df.doc_type.astype("str")

In [17]:
major_doc_df.doc_type = major_doc_df.doc_type.apply(sar_clean)

In [18]:
major_doc_df.doc_type = major_doc_df.doc_type.apply(vn_clean)

In [20]:
major_doc_df.date = pd.to_datetime(doc_df.date, format="%Y%m%d", errors='coerce')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [39]:
major_doc_df

Unnamed: 0,source_id,doc_type,date,doc_url
96,B6636,FCE,2020-09-28,https://www.egle.state.mi.us/aps/downloads/srn...
97,B6636,RVN,2019-10-03,https://www.egle.state.mi.us/aps/downloads/srn...
98,B6636,RVN,2020-08-31,https://www.egle.state.mi.us/aps/downloads/srn...
99,B6636,SAR,2020-01-30,https://www.egle.state.mi.us/aps/downloads/srn...
100,B6636,TEST,2013-09-24,https://www.egle.state.mi.us/aps/downloads/srn...
...,...,...,...,...
18582,A0023,TEST,2022-03-25,https://www.egle.state.mi.us/aps/downloads/srn...
18583,B2796,TEST,2022-03-17,https://www.egle.state.mi.us/aps/downloads/srn...
18584,N2901,FCE,2022-01-13,https://www.egle.state.mi.us/aps/downloads/srn...
18585,N2901,SAR,2022-01-13,https://www.egle.state.mi.us/aps/downloads/srn...


In [37]:
most_recent_docs = major_doc_df.sort_values(['source_id','doc_type','date'],ascending=False) \
    .drop_duplicates(subset=['source_id','doc_type'],keep='first')

In [50]:
# Filtering to just the docs I want
doc_list = ['VN','SAR','FCE']
most_recent_docs = most_recent_docs[most_recent_docs.doc_type.isin(doc_list)]

In [54]:

pd.pivot_table(most_recent_docs, index="source_id",columns=["doc_type"],values='date')\
    .reset_index().to_csv('major-source-recent-docs.csv',index=False)

In [51]:
major_doc_df.doc_type.value_counts()

TEST      1737
SAR       1247
FCE        944
VN         676
RVN        500
ACO         59
ENFN        44
TEST2       33
TEST1       30
test        22
STIP         9
RVN2         4
RVN1         4
SEM          2
TEST01       2
TEST02       2
AFO          1
CJ1          1
CJ2          1
RVN3         1
AQD          1
Name: doc_type, dtype: int64

In [25]:
fce_count = pd.crosstab(major_doc_df.source_id, major_doc_df.doc_type)[['FCE']].reset_index()
major_sources = major_sources.merge(fce_count, how="outer",right_on="source_id",left_on="srn")

In [52]:
vn_count = pd.crosstab(major_doc_df.source_id, major_doc_df.doc_type)[['VN']].reset_index()
major_sources = major_sources.merge(vn_count, how="outer",right_on="source_id",left_on="srn")

In [53]:
sar_count = pd.crosstab(major_doc_df.source_id, major_doc_df.doc_type)[['SAR']].reset_index()
major_sources = major_sources.merge(sar_count, how="outer",right_on="source_id",left_on="srn")

In [54]:
major_sources

Unnamed: 0,company,srn,address,city,zip_code,county,pti_no.,approved,notes,flag,full_address,oldest_permit_age,oldest_permit_age_converted,source_id_x,FCE,source_id_y,VN,source_id,SAR
0,MENASHA CORPORATION,A0023,320 N FARMER STREET,OTSEGO,49078,ALLEGAN,152-96B,2001-04-12,Rolled into ROP,0,"320 N FARMER STREET, OTSEGO, MI 49078",21.05,9.00,A0023,2.0,A0023,1.0,A0023,4.0
1,MEAD PAPER COMPANY,A0884,7100 COUNTY ROAD 426,ESCANABA,49829,DELTA,800-88A,1998-04-23,Rolled into ROP,0,"7100 COUNTY ROAD 426, ESCANABA, MI 49829",24.02,9.00,A0884,3.0,A0884,4.0,A0884,3.0
2,GENERAL MOTORS CORPORATION,A1641,920 TOWNSEND STREET,LANSING,48933,INGHAM,69-84A,2000-12-19,Rolled into ROP,0,"920 TOWNSEND STREET, LANSING, MI 48933",21.36,9.00,A1641,4.0,A1641,0.0,A1641,5.0
3,"KALSEC, INC",A1991,3713 W MAIN STREET,KALAMAZOO,49006,KALAMAZOO,104-95A,1997-01-16,Rolled into ROP,0,"3713 W MAIN STREET, KALAMAZOO, MI 49005",25.28,9.00,A1991,3.0,A1991,1.0,A1991,3.0
4,E I DUPONT DE NEMOURS &,A3569,400 GROESBECK HIGHWAY,MOUNT CLEMENS,48043,MACOMB,646-85A,1997-01-22,Rolled into ROP,0,"400 GROESBECK HIGHWAY, MOUNT CLEMENS, MI 48043",25.27,9.00,A3569,4.0,A3569,2.0,A3569,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
280,KAWASAKI MOTORS CORP USA,P0677,5080 36TH STREET SE,GRAND RAPIDS,49512,KENT,230-15,2016-03-10,Rolled into ROP,0,"5080 36TH STREET SE, GRAND RAPIDS, MI 49152",6.14,6.14,P0677,3.0,P0677,4.0,P0677,3.0
281,TOEFCO ENGINEERED COATING,P0708,1919 INDUSTRIAL DRIVE,NILES,49120,BERRIEN,86-16,2016-07-22,Rolled into ROP,0,"1919 INDUSTRIAL DRIVE, NILES, MI 49120",5.77,5.77,P0708,2.0,P0708,1.0,P0708,2.0
282,"TRINSEO, LLC",P1025,433W BUILDING,MIDLAND,48667,MIDLAND,33-16,2016-04-14,Rolled into ROP,0,"433W BUILDING, MIDLAND, MI 48667",6.04,6.04,,,,,,
283,DDP SPECIALTY ELECTRONIC,P1027,3400 S SAGINAW ROAD,MIDLAND,48667,MIDLAND,167-19,2020-01-27,Rolled into ROP,0,"3400 S SAGINAW ROAD, MIDLAND, MI 48667",2.26,2.26,P1027,0.0,P1027,2.0,P1027,4.0


In [55]:
# No documents for 5 companies. HMMM
major_sources.query('FCE.isnull()')

Unnamed: 0,company,srn,address,city,zip_code,county,pti_no.,approved,notes,flag,full_address,oldest_permit_age,oldest_permit_age_converted,source_id_x,FCE,source_id_y,VN,source_id,SAR
282,"TRINSEO, LLC",P1025,433W BUILDING,MIDLAND,48667,MIDLAND,33-16,2016-04-14,Rolled into ROP,0,"433W BUILDING, MIDLAND, MI 48667",6.04,6.04,,,,,,


In [56]:
zip_count = major_sources.zip_code.value_counts().to_frame().reset_index().rename({'index':'zip_code','zip_code':'major_sources_in_zip'},axis=1)

In [57]:
zip_count

Unnamed: 0,zip_code,major_sources_in_zip
0,49601,7
1,48640,6
2,49646,6
3,49464,6
4,49423,5
...,...,...
182,48124,1
183,48006,1
184,48207,1
185,48441,1


In [58]:
major_sources = major_sources.merge(zip_count,how='outer',left_on='zip_code',right_on='zip_code')

In [59]:
major_sources

Unnamed: 0,company,srn,address,city,zip_code,county,pti_no.,approved,notes,flag,full_address,oldest_permit_age,oldest_permit_age_converted,source_id_x,FCE,source_id_y,VN,source_id,SAR,major_sources_in_zip
0,MENASHA CORPORATION,A0023,320 N FARMER STREET,OTSEGO,49078,ALLEGAN,152-96B,2001-04-12,Rolled into ROP,0,"320 N FARMER STREET, OTSEGO, MI 49078",21.05,9.00,A0023,2.0,A0023,1.0,A0023,4.0,1
1,MEAD PAPER COMPANY,A0884,7100 COUNTY ROAD 426,ESCANABA,49829,DELTA,800-88A,1998-04-23,Rolled into ROP,0,"7100 COUNTY ROAD 426, ESCANABA, MI 49829",24.02,9.00,A0884,3.0,A0884,4.0,A0884,3.0,2
2,DELTA SOLID WASTE,N6035,5701 19TH AVENUE NORTH,ESCANABA,49829,DELTA,30-03,2003-04-17,Rolled into ROP,0,"5701 19TH AVENUE NORTH, ESCANABA, MI 49829",19.04,9.00,N6035,2.0,N6035,0.0,N6035,4.0,2
3,GENERAL MOTORS CORPORATION,A1641,920 TOWNSEND STREET,LANSING,48933,INGHAM,69-84A,2000-12-19,Rolled into ROP,0,"920 TOWNSEND STREET, LANSING, MI 48933",21.36,9.00,A1641,4.0,A1641,0.0,A1641,5.0,1
4,"KALSEC, INC",A1991,3713 W MAIN STREET,KALAMAZOO,49006,KALAMAZOO,104-95A,1997-01-16,Rolled into ROP,0,"3713 W MAIN STREET, KALAMAZOO, MI 49005",25.28,9.00,A1991,3.0,A1991,1.0,A1991,3.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
280,"HARBOR FOAM, INC",N7754,2950 PRAIRIE STREET SW,GRANDVILLE,49418,KENT,35-07B,2012-07-30,Rolled into ROP,0,"2950 PRAIRIE STREET SW, GRANDVILLE, MI 49418",9.75,9.00,N7754,4.0,N7754,0.0,N7754,4.0,1
281,HYUNDAI-KIA AMERICA TECHNICAL,N7886,6800 GEDDES ROAD,SUPERIOR TOWNSHIP,48198,WASHTENAW,285-08C,2013-02-06,Rolled into ROP,0,"6800 GEDDES ROAD, SUPERIOR TOWNSHIP, MI 48198",9.23,9.00,N7886,4.0,N7886,0.0,N7886,4.0,1
282,VECTOR PIPELINE LP,N8151,"W NE SEC 8, T4S",ATHENS TOWNSHIP,49011,CALHOUN,288-08,2008-12-10,Rolled into ROP,0,"W NE SEC 8, T4S, ATHENS TOWNSHIP, MI 49011",13.39,9.00,N8151,4.0,N8151,0.0,N8151,5.0,1
283,"PLASAN NORTH AMERICA, INC",P0374,3195 WILSON DRIVE NW,WALKER,49534,KENT,35-16,2016-05-10,Rolled into ROP,0,"3195 WILSON DRIVE NW, WALKER, MI 49534",5.97,5.97,P0374,5.0,P0374,3.0,P0374,5.0,1


#### ACS Zip Code Poverty Level for Michigan

In [60]:
pov_df = pd.read_csv("ACS_MI_Zip_12MonthsPovertyLevelIncome_Pct.csv")

In [61]:
pov_df.head()

Unnamed: 0.1,Unnamed: 0,name,Total,"Total, Error",Income Poverty Level,"Income Poverty Level, Error"
0,0,Michigan,9741628,0.0,14.4,0.1
1,1,48001,11818,0.9,9.9,2.0
2,2,48002,3212,1.1,7.7,3.5
3,3,48003,6047,3.0,9.4,3.2
4,4,48005,5439,1.7,4.9,2.0


#### ACS Zip Code Race Demographics for Michigan


In [62]:
race_df = pd.read_csv("ACS_MI_Zip_RacePct.csv")

In [63]:
race_df.head()

Unnamed: 0,name,white_pct,black_pct,am_indian_ak_native_pct,asian_pct,hi_pc_islander_pct,other_pct
0,Michigan,78.4,13.8,0.5,3.1,0.0,4.1
1,48001,97.0,0.1,0.6,0.6,0.1,1.6
2,48002,98.7,0.0,0.0,0.0,0.0,1.3
3,48003,98.7,0.0,0.1,1.0,0.0,0.2
4,48005,97.0,0.8,0.4,0.2,0.0,1.7


## Clean data

#### Renaming columns

In [64]:
pov_df.columns = pov_df.columns.str.lower().str.replace(" ","_")

In [65]:
race_df.columns = race_df.columns.str.lower().str.replace(" ","_")

In [66]:
major_sources.columns = major_sources.columns.str.lower().str.replace(" ","_")

## Creating New Data Frames

#### Compliance Evaluations -- Zip Code & Source

### Merging Data by Source & Demographic Information

In [67]:
major_sources = major_sources.merge(pov_df, how='left',left_on='zip_code',right_on='name').drop(['unnamed:_0','total,_error','income_poverty_level,_error'],axis=1)

In [68]:
major_sources = major_sources.merge(race_df, how='left',left_on='zip_code',right_on='name').drop(['source_id','name_x','name_y'],axis=1)

In [72]:
major_sources = major_sources.drop(['source_id_x','source_id_y'],axis=1)