In [162]:
import pandas as pd
from plotnine import *

# importing statsmodels (for linear regression in python)
import statsmodels.formula.api as smf

from datetime import date
today = date.today()
today = pd.to_datetime(today)
import numpy as np

In [163]:
pd.set_option("display.max_columns",None)

## Load Data

#### Michigan air polluter documents & permit data
These are violation notices, activity reports (inspection), compliance evaluations etc...

In [164]:
doc_df = pd.read_csv("EGLE-AQD-documents.csv")

In [165]:
doc_df.head()

Unnamed: 0,source_id,doc_type,date,doc_url
0,N8277,ACO,20151217,https://www.egle.state.mi.us/aps/downloads/srn...
1,N8274,ACO,20151217,https://www.egle.state.mi.us/aps/downloads/srn...
2,U04060035,SAR,20190716,https://www.egle.state.mi.us/aps/downloads/srn...
3,U04060035,SAR,20200507,https://www.egle.state.mi.us/aps/downloads/srn...
4,N7824,FCE,20160525,https://www.egle.state.mi.us/aps/downloads/srn...


In [166]:
pti_df = pd.read_csv("pti-list-clean.csv")
pti_df.head()

Unnamed: 0,company,srn,address,city,zip_code,pti_no.,approved,notes
0,"RIVERSIDE ENERGY MICHIGAN, LLC",N8170,SECTION 32 - MOUNT MARIA CPF,ALCONA TOWNSHIP,48721.0,329-08,12/17/2008,
1,"LAMBDA ENERGY RESOURCES, LLC",N7470,"SW SW NW SEC 10, T28N - CALEDONIA",CALEDONIA TOWNSHIP,49735.0,109-05,7/6/2005,
2,TRENDWELL ENERGY CORPORATION,N7901,"NW NE SEC 20, T8N, R6E - WOLF CREEK",CALEDONIA TOWNSHIP,48762.0,349-07,1/9/2008,
3,RIVERSIDE ENERGY MICHIGAN,N8070,"NW NE SEC 20, T8N, R6E",CALEDONIA TOWNSHIP,48619.0,188-08,6/30/2008,
4,"LAMBDA ENERGY RESOURCES, LLC",N8074,"SW 1/4 SE 1/4 NE 1/4 SEC 8, T28N, R8E -",CALEDONIA TOWNSHIP,49316.0,199-08A,1/14/2011,


In [167]:
# Looking for sources that have "Rolled into ROP" in the notes, because those are Major Sources
major_sources = pti_df.query('notes.str.contains("ROP",na=False)')

In [168]:
major_sources.approved = pd.to_datetime(major_sources.approved,format="%m/%d/%Y")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [169]:
major_sources.zip_code = major_sources.zip_code.astype("str").str[:5]

In [170]:
# Filtering out duplicate records for sources, unless that source has a building in a different zip code.
# Saving the oldest permit on file
major_sources = major_sources.sort_values(['srn','approved']).drop_duplicates(subset=['srn','zip_code'])

In [171]:
fixed_zips = {
    "49005":"49006",
    "48674":"48640",
    "48686":"48640",
    "48121":"48120",
    "48901":"48910",
    "48795":"48759",
    "48824":"48910",
    "48090":"48092",
    "48859":"48858",
    "49937":"49337",
    "49041":"49048",
    "49152":"49512"
    
}

In [172]:
major_sources = major_sources.replace({"zip_code": fixed_zips})

In [173]:
# List of major sources
major_srn = major_sources.srn.to_list()

In [174]:
# Filter doc df to look at only facilities that are major polluters
major_doc_df = doc_df[doc_df.source_id.isin(major_srn)]
major_doc_df

Unnamed: 0,source_id,doc_type,date,doc_url
96,B6636,FCE,20200928,https://www.egle.state.mi.us/aps/downloads/srn...
97,B6636,RVN,20191003,https://www.egle.state.mi.us/aps/downloads/srn...
98,B6636,RVN,20200831,https://www.egle.state.mi.us/aps/downloads/srn...
99,B6636,SAR,20200130,https://www.egle.state.mi.us/aps/downloads/srn...
100,B6636,TEST,20130924,https://www.egle.state.mi.us/aps/downloads/srn...
...,...,...,...,...
18582,A0023,TEST,20220325,https://www.egle.state.mi.us/aps/downloads/srn...
18583,B2796,TEST,20220317,https://www.egle.state.mi.us/aps/downloads/srn...
18584,N2901,FCE,20220113,https://www.egle.state.mi.us/aps/downloads/srn...
18585,N2901,SAR,20220113,https://www.egle.state.mi.us/aps/downloads/srn...


In [175]:
major_sources['oldest_permit_age'] = today - major_sources.approved
major_sources['oldest_permit_age'] = (major_sources.oldest_permit_age/np.timedelta64(1, 'Y')).round(2)

In [176]:
major_sources

Unnamed: 0,company,srn,address,city,zip_code,pti_no.,approved,notes,oldest_permit_age
69,MENASHA CORPORATION,A0023,320 N FARMER STREET - PAPERBOARD,OTSEGO,49078,152-96B,2001-04-12,Rolled into ROP,21.05
734,MEAD PAPER COMPANY,A0884,7100 COUNTY ROAD 426,ESCANABA,49829,800-88A,1998-04-23,Rolled into ROP,24.02
1176,GENERAL MOTORS CORPORATION,A1641,920 TOWNSEND STREET - BOC LANSING,LANSING,48933,69-84A,2000-12-19,Rolled into ROP,21.36
1430,"KALSEC, INC",A1991,3713 W MAIN STREET,KALAMAZOO,49006,104-95A,1997-01-16,Rolled into ROP,25.28
2158,E I DUPONT DE NEMOURS & COMPANY,A3569,400 GROESBECK HIGHWAY - MOUNT,MOUNT CLEMENS,48043,646-85A,1997-01-22,Rolled into ROP,25.27
...,...,...,...,...,...,...,...,...,...
1801,KAWASAKI MOTORS CORP USA,P0677,5080 36TH STREET SE,GRAND RAPIDS,49512,230-15,2016-03-10,Rolled into ROP,6.14
310,TOEFCO ENGINEERED COATING SYSTEMS,P0708,1919 INDUSTRIAL DRIVE,NILES,49120,86-16,2016-07-22,Rolled into ROP,5.77
2729,"TRINSEO, LLC",P1025,433W BUILDING,MIDLAND,48667,33-16,2016-04-14,Rolled into ROP,6.04
2730,DDP SPECIALTY ELECTRONIC MATERIALS,P1027,3400 S SAGINAW ROAD - UNIT 96,MIDLAND,48667,167-19,2020-01-27,Rolled into ROP,2.25


In [177]:
# I only have 9 years worth of data so if a permit is longer than 9 years 
# we are going to just say 9 to keep the rate consistent with the data
def years_convert(years):
    if years > 9:
        return 9.0
    else: 
        return years

In [178]:
major_sources['oldest_permit_age_converted'] = major_sources.oldest_permit_age.apply(years_convert)

In [179]:
major_sources

Unnamed: 0,company,srn,address,city,zip_code,pti_no.,approved,notes,oldest_permit_age,oldest_permit_age_converted
69,MENASHA CORPORATION,A0023,320 N FARMER STREET - PAPERBOARD,OTSEGO,49078,152-96B,2001-04-12,Rolled into ROP,21.05,9.00
734,MEAD PAPER COMPANY,A0884,7100 COUNTY ROAD 426,ESCANABA,49829,800-88A,1998-04-23,Rolled into ROP,24.02,9.00
1176,GENERAL MOTORS CORPORATION,A1641,920 TOWNSEND STREET - BOC LANSING,LANSING,48933,69-84A,2000-12-19,Rolled into ROP,21.36,9.00
1430,"KALSEC, INC",A1991,3713 W MAIN STREET,KALAMAZOO,49006,104-95A,1997-01-16,Rolled into ROP,25.28,9.00
2158,E I DUPONT DE NEMOURS & COMPANY,A3569,400 GROESBECK HIGHWAY - MOUNT,MOUNT CLEMENS,48043,646-85A,1997-01-22,Rolled into ROP,25.27,9.00
...,...,...,...,...,...,...,...,...,...,...
1801,KAWASAKI MOTORS CORP USA,P0677,5080 36TH STREET SE,GRAND RAPIDS,49512,230-15,2016-03-10,Rolled into ROP,6.14,6.14
310,TOEFCO ENGINEERED COATING SYSTEMS,P0708,1919 INDUSTRIAL DRIVE,NILES,49120,86-16,2016-07-22,Rolled into ROP,5.77,5.77
2729,"TRINSEO, LLC",P1025,433W BUILDING,MIDLAND,48667,33-16,2016-04-14,Rolled into ROP,6.04,6.04
2730,DDP SPECIALTY ELECTRONIC MATERIALS,P1027,3400 S SAGINAW ROAD - UNIT 96,MIDLAND,48667,167-19,2020-01-27,Rolled into ROP,2.25,2.25


In [180]:
major_doc_df.doc_type = major_doc_df.doc_type.str.replace("FCE1","FCE").str.replace("FCE2","FCE")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [181]:
major_sources

Unnamed: 0,company,srn,address,city,zip_code,pti_no.,approved,notes,oldest_permit_age,oldest_permit_age_converted
69,MENASHA CORPORATION,A0023,320 N FARMER STREET - PAPERBOARD,OTSEGO,49078,152-96B,2001-04-12,Rolled into ROP,21.05,9.00
734,MEAD PAPER COMPANY,A0884,7100 COUNTY ROAD 426,ESCANABA,49829,800-88A,1998-04-23,Rolled into ROP,24.02,9.00
1176,GENERAL MOTORS CORPORATION,A1641,920 TOWNSEND STREET - BOC LANSING,LANSING,48933,69-84A,2000-12-19,Rolled into ROP,21.36,9.00
1430,"KALSEC, INC",A1991,3713 W MAIN STREET,KALAMAZOO,49006,104-95A,1997-01-16,Rolled into ROP,25.28,9.00
2158,E I DUPONT DE NEMOURS & COMPANY,A3569,400 GROESBECK HIGHWAY - MOUNT,MOUNT CLEMENS,48043,646-85A,1997-01-22,Rolled into ROP,25.27,9.00
...,...,...,...,...,...,...,...,...,...,...
1801,KAWASAKI MOTORS CORP USA,P0677,5080 36TH STREET SE,GRAND RAPIDS,49512,230-15,2016-03-10,Rolled into ROP,6.14,6.14
310,TOEFCO ENGINEERED COATING SYSTEMS,P0708,1919 INDUSTRIAL DRIVE,NILES,49120,86-16,2016-07-22,Rolled into ROP,5.77,5.77
2729,"TRINSEO, LLC",P1025,433W BUILDING,MIDLAND,48667,33-16,2016-04-14,Rolled into ROP,6.04,6.04
2730,DDP SPECIALTY ELECTRONIC MATERIALS,P1027,3400 S SAGINAW ROAD - UNIT 96,MIDLAND,48667,167-19,2020-01-27,Rolled into ROP,2.25,2.25


In [182]:
fce_count = pd.crosstab(major_doc_df.source_id, major_doc_df.doc_type)[['FCE']].reset_index()
major_sources = major_sources.merge(fce_count, how="outer",right_on="source_id",left_on="srn")

In [183]:
# No documents for 5 companies. HMMM
major_sources.query('FCE.isnull()')

Unnamed: 0,company,srn,address,city,zip_code,pti_no.,approved,notes,oldest_permit_age,oldest_permit_age_converted,source_id,FCE
17,DOTT MANUFACTURING COMPANY,A6466,3768 N MAIN STREET,DECKERVILLE,48427,390-92B,1999-12-01,Rolled into ROP,22.41,9.0,,
73,DETROIT EDISON COMPANY,B2813,301 GRATIOT BLVD,MARYSVILLE,48040,523-88A,2000-08-09,Rolled into ROP,21.72,9.0,,
149,SMELTZER ENTERPRISES,K0291,10600 GRATIOT AVENUE,DETROIT,48213,62-97,1997-12-01,Rolled into ROP,24.41,9.0,,
264,MICHIGAN SOUTH CENTRAL POWER,N6885,SE CORNER OF E STATE STREET,COLDWATER,49036,320-00,2000-12-01,Rolled into ROP,21.41,9.0,,
294,"TRINSEO, LLC",P1025,433W BUILDING,MIDLAND,48667,33-16,2016-04-14,Rolled into ROP,6.04,6.04,,


In [184]:
zip_count = major_sources.zip_code.value_counts().to_frame().reset_index().rename({'index':'zip_code','zip_code':'major_sources_in_zip'},axis=1)

In [185]:
zip_count

Unnamed: 0,zip_code,major_sources_in_zip
0,49601,7
1,49464,6
2,48640,6
3,49423,6
4,49646,6
...,...,...
187,49659,1
188,48381,1
189,49348,1
190,49733,1


In [186]:
major_sources.merge(zip_count,how='outer',left_on='zip_code',right_on='zip_code')

Unnamed: 0,company,srn,address,city,zip_code,pti_no.,approved,notes,oldest_permit_age,oldest_permit_age_converted,source_id,FCE,major_sources_in_zip
0,MENASHA CORPORATION,A0023,320 N FARMER STREET - PAPERBOARD,OTSEGO,49078,152-96B,2001-04-12,Rolled into ROP,21.05,9.00,A0023,2.0,1
1,MEAD PAPER COMPANY,A0884,7100 COUNTY ROAD 426,ESCANABA,49829,800-88A,1998-04-23,Rolled into ROP,24.02,9.00,A0884,3.0,2
2,DELTA SOLID WASTE MANAGEMENT,N6035,5701 19TH AVENUE NORTH,ESCANABA,49829,30-03,2003-04-17,Rolled into ROP,19.03,9.00,N6035,2.0,2
3,GENERAL MOTORS CORPORATION,A1641,920 TOWNSEND STREET - BOC LANSING,LANSING,48933,69-84A,2000-12-19,Rolled into ROP,21.36,9.00,A1641,4.0,1
4,"KALSEC, INC",A1991,3713 W MAIN STREET,KALAMAZOO,49006,104-95A,1997-01-16,Rolled into ROP,25.28,9.00,A1991,3.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,"HARBOR FOAM, INC",N7754,2950 PRAIRIE STREET SW - SUITE 300,GRANDVILLE,49418,35-07B,2012-07-30,Rolled into ROP,9.75,9.00,N7754,4.0,1
293,HYUNDAI-KIA AMERICA TECHNICAL,N7886,6800 GEDDES ROAD,SUPERIOR TOWNSHIP,48198,285-08C,2013-02-06,Rolled into ROP,9.22,9.00,N7886,4.0,1
294,VECTOR PIPELINE LP,N8151,"W NE SEC 8, T4S",ATHENS TOWNSHIP,49011,288-08,2008-12-10,Rolled into ROP,13.38,9.00,N8151,4.0,1
295,"PLASAN NORTH AMERICA, INC",P0374,3195 WILSON DRIVE NW,WALKER,49534,35-16,2016-05-10,Rolled into ROP,5.97,5.97,P0374,5.0,1


#### ACS Zip Code Poverty Level for Michigan

In [187]:
pov_df = pd.read_csv("ACS_MI_Zip_12MonthsPovertyLevelIncome_Pct.csv")

In [188]:
pov_df.head()

Unnamed: 0.1,Unnamed: 0,name,Total,"Total, Error",Income Poverty Level,"Income Poverty Level, Error"
0,0,Michigan,9741628,0.0,14.4,0.1
1,1,48001,11818,0.9,9.9,2.0
2,2,48002,3212,1.1,7.7,3.5
3,3,48003,6047,3.0,9.4,3.2
4,4,48005,5439,1.7,4.9,2.0


#### ACS Zip Code Race Demographics for Michigan


In [189]:
race_df = pd.read_csv("ACS_MI_Zip_RacePct.csv")

In [190]:
race_df.head()

Unnamed: 0,name,white_pct,black_pct,am_indian_ak_native_pct,asian_pct,hi_pc_islander_pct,other_pct
0,Michigan,78.4,13.8,0.5,3.1,0.0,4.1
1,48001,97.0,0.1,0.6,0.6,0.1,1.6
2,48002,98.7,0.0,0.0,0.0,0.0,1.3
3,48003,98.7,0.0,0.1,1.0,0.0,0.2
4,48005,97.0,0.8,0.4,0.2,0.0,1.7


## Clean data

#### Renaming columns

In [191]:
pov_df.columns = pov_df.columns.str.lower().str.replace(" ","_")

In [192]:
race_df.columns = race_df.columns.str.lower().str.replace(" ","_")

#### Change zip codes to str

## Creating New Data Frames

#### Compliance Evaluations -- Zip Code & Source

### Merging Data by Source & Demographic Information

In [193]:
major_sources = major_sources.merge(pov_df, how='left',left_on='zip_code',right_on='name').drop(['unnamed:_0','total,_error','income_poverty_level,_error'],axis=1)

In [194]:
major_sources = major_sources.merge(race_df, how='left',left_on='zip_code',right_on='name').drop(['source_id','name_x','name_y'],axis=1)

In [195]:
major_sources

Unnamed: 0,company,srn,address,city,zip_code,pti_no.,approved,notes,oldest_permit_age,oldest_permit_age_converted,FCE,total,income_poverty_level,white_pct,black_pct,am_indian_ak_native_pct,asian_pct,hi_pc_islander_pct,other_pct
0,MENASHA CORPORATION,A0023,320 N FARMER STREET - PAPERBOARD,OTSEGO,49078,152-96B,2001-04-12,Rolled into ROP,21.05,9.00,2.0,9316,10.7,94.5,0.7,0.1,0.6,0.0,4.1
1,MEAD PAPER COMPANY,A0884,7100 COUNTY ROAD 426,ESCANABA,49829,800-88A,1998-04-23,Rolled into ROP,24.02,9.00,3.0,16443,16.3,93.1,0.6,2.1,0.6,0.0,3.6
2,GENERAL MOTORS CORPORATION,A1641,920 TOWNSEND STREET - BOC LANSING,LANSING,48933,69-84A,2000-12-19,Rolled into ROP,21.36,9.00,4.0,2636,39.9,48.1,34.2,1.2,3.3,0.5,12.6
3,"KALSEC, INC",A1991,3713 W MAIN STREET,KALAMAZOO,49006,104-95A,1997-01-16,Rolled into ROP,25.28,9.00,3.0,24556,36.4,74.0,14.4,0.2,4.0,0.0,7.5
4,E I DUPONT DE NEMOURS & COMPANY,A3569,400 GROESBECK HIGHWAY - MOUNT,MOUNT CLEMENS,48043,646-85A,1997-01-22,Rolled into ROP,25.27,9.00,4.0,14857,20.1,70.3,24.6,0.4,0.1,0.0,4.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,KAWASAKI MOTORS CORP USA,P0677,5080 36TH STREET SE,GRAND RAPIDS,49512,230-15,2016-03-10,Rolled into ROP,6.14,6.14,3.0,17528,8.7,62.6,28.4,0.2,6.7,0.2,1.9
293,TOEFCO ENGINEERED COATING SYSTEMS,P0708,1919 INDUSTRIAL DRIVE,NILES,49120,86-16,2016-07-22,Rolled into ROP,5.77,5.77,2.0,35993,14.8,88.5,6.6,0.6,0.6,0.0,3.7
294,"TRINSEO, LLC",P1025,433W BUILDING,MIDLAND,48667,33-16,2016-04-14,Rolled into ROP,6.04,6.04,,0,,,,,,,
295,DDP SPECIALTY ELECTRONIC MATERIALS,P1027,3400 S SAGINAW ROAD - UNIT 96,MIDLAND,48667,167-19,2020-01-27,Rolled into ROP,2.25,2.25,0.0,0,,,,,,,


In [196]:
major_sources.query('total.isnull()')

Unnamed: 0,company,srn,address,city,zip_code,pti_no.,approved,notes,oldest_permit_age,oldest_permit_age_converted,FCE,total,income_poverty_level,white_pct,black_pct,am_indian_ak_native_pct,asian_pct,hi_pc_islander_pct,other_pct


### Getting permit start date and duration

In [95]:
pti_df.head()

Unnamed: 0,company,srn,address,city,zip_code,pti_no.,approved,notes
0,"RIVERSIDE ENERGY MICHIGAN, LLC",N8170,SECTION 32 - MOUNT MARIA CPF,ALCONA TOWNSHIP,48721.0,329-08,12/17/2008,
1,"LAMBDA ENERGY RESOURCES, LLC",N7470,"SW SW NW SEC 10, T28N - CALEDONIA",CALEDONIA TOWNSHIP,49735.0,109-05,7/6/2005,
2,TRENDWELL ENERGY CORPORATION,N7901,"NW NE SEC 20, T8N, R6E - WOLF CREEK",CALEDONIA TOWNSHIP,48762.0,349-07,1/9/2008,
3,RIVERSIDE ENERGY MICHIGAN,N8070,"NW NE SEC 20, T8N, R6E",CALEDONIA TOWNSHIP,48619.0,188-08,6/30/2008,
4,"LAMBDA ENERGY RESOURCES, LLC",N8074,"SW 1/4 SE 1/4 NE 1/4 SEC 8, T28N, R8E -",CALEDONIA TOWNSHIP,49316.0,199-08A,1/14/2011,


In [101]:
from datetime import date
today = date.today()
today = pd.to_datetime(today)
import numpy as np

In [96]:
# Converting permit approval date to datetime
pti_df.approved = pd.to_datetime(pti_df.approved, format="%m/%d/%Y")

In [97]:
# Multiple permits per source so let's take the earliest permit date for each source
pti_approved_date = pti_df.groupby("srn").approved.min().to_frame().reset_index()

In [102]:
# Converting to years
pti_approved_date['years'] = today - pti_approved_date.approved
pti_approved_date.years = pti_approved_date.years/np.timedelta64(1, 'Y')

In [103]:
# I only have 9 years worth of data so if a permit is longer than 9 years 
# we are going to just say 9 to keep the rate consistent with the data
def years_convert(years):
    if years > 9:
        return 9.0
    else: 
        return years

In [104]:
pti_approved_date['years_converted'] = pti_approved_date.years.apply(years_convert)

In [105]:
pti_approved_date

Unnamed: 0,srn,approved,years,years_converted
0,A0023,2001-04-12,20.969630,9.000000
1,A0083,2015-01-14,7.211647,7.211647
2,A0085,2016-04-27,5.927569,5.927569
3,A0098,2004-11-29,17.336427,9.000000
4,A0099,2000-08-08,21.645893,9.000000
...,...,...,...,...
2991,P1249,2021-12-14,0.295694,0.295694
2992,P1251,2022-02-01,0.161537,0.161537
2993,P1252,2022-02-01,0.161537,0.161537
2994,P1253,2022-02-08,0.142371,0.142371


In [108]:
df = df.merge(pti_approved_date,how="left",right_on="srn", left_on="id")

In [110]:
df['fce_per_year'] = df.fce / df.years_converted

In [111]:
df.head()

Unnamed: 0,id,name_x,zip_code,county,full_address,geometry,fce,sar,violations,total,income_poverty_level,white_pct,black_pct,am_indian_ak_native_pct,asian_pct,hi_pc_islander_pct,other_pct,srn,approved,years,years_converted,fce_per_year
0,A6260,ALGONAC CAST PRODUCTS INC,48001,SAINT CLAIR,"9300 STONE ROAD, ALGONAC, MI","[-82.54572189999999, 42.6299997]",0.0,1.0,0.0,11818.0,9.9,97.0,0.1,0.6,0.6,0.1,1.6,A6260,2014-01-23,8.186342,8.186342,0.0
1,N6769,SUNSATION PRODUCTS INC,48001,SAINT CLAIR,"9635 KRETZ DR, ALGONAC, MI","[-82.5700738, 42.6222489]",0.0,2.0,0.0,11818.0,9.9,97.0,0.1,0.6,0.6,0.1,1.6,N6769,2014-09-18,7.53472,7.53472,0.0
2,P1024,ALTA EQUIPMENT COMPANY,48001,KENT,"8840 BYRON COMMERCE DRIVE, BYRON CENTER, MI","[-85.6703763, 42.80414830000001]",0.0,0.0,0.0,11818.0,9.9,97.0,0.1,0.6,0.6,0.1,1.6,P1024,2019-05-13,2.885754,2.885754,0.0
3,P1089,CARL SCHEGEL,48001,JACKSON,"4500 MANN ROAD, CONCORD, MI","[-84.68608139999999, 42.1971472]",0.0,0.0,0.0,11818.0,9.9,97.0,0.1,0.6,0.6,0.1,1.6,P1089,2019-10-29,2.423048,2.423048,0.0
4,P1015,"ENBRIDGE ENERGY, LP",48001,IRON,"SEC 35, T43N, R32W, CRYSTAL FALLS, MI","[-88.3340242, 46.0980066]",0.0,0.0,0.0,11818.0,9.9,97.0,0.1,0.6,0.6,0.1,1.6,P1015,2021-05-25,0.851489,0.851489,0.0


In [112]:
df.to_csv("documents-by-source-demographics.csv",index=False)

#### Merge all zip code level data

In [81]:
zip_df = source_df.zip_code.value_counts().to_frame().reset_index().rename({'index':'zip_code','zip_code':'sources'},axis=1)

In [83]:
# Merge FCE
zip_df = zip_df.merge(zip_fce,how="outer",left_on="zip_code",right_on="zip_code")

In [85]:
# Merge SAR
zip_df = zip_df.merge(zip_sar,how="outer",left_on="zip_code",right_on="zip_code")

In [86]:
# Merge Violations
zip_df = zip_df.merge(zip_vn,how="outer",left_on="zip_code",right_on="zip_code")

In [87]:
# Merge poverty data
zip_df = zip_df.merge(pov_df,how='left',left_on='zip_code',right_on="name")

In [88]:
# Merge race data
zip_df = zip_df.merge(race_df, how="left",left_on="zip_code",right_on="name")

In [92]:
zip_df = zip_df.drop(['unnamed:_0','name_x','total,_error','name_y','income_poverty_level,_error'],axis=1)

In [93]:
zip_df.to_csv("violations-fce-sar-by-zip.csv",index=False)