In [72]:
#setting up my usual packages
from __future__ import print_function, division

import io
import json
import operator
import os
import sys

import geopandas as gpd
import geopandas.tools
import matplotlib as plt
import numpy as np
import pandas as pd
import pylab as pl
import requests
import scipy as sp
import scipy.stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
from fiona.crs import from_epsg
from IPython.display import HTML
from pandas import DataFrame
from scipy import stats
from shapely.geometry import Point

try: 
    import urllib2 as urllib
except ImportError:
    import urllib.request as urllib
%pylab inline

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [43]:
# Directory used for cached data files, taken from the PUIDATA environment
# variable (None when the variable is not set).
PUIdata = os.environ.get('PUIDATA')
print(PUIdata)

/home/cusp/rxl204/PUIdata


## Read in Data
For this study, I will be using the following sources of data: 
1. Street Tree Census
2. Asthma Discharge Rates
3. DOB Construction Permits
4. ACS Household Median Income 
5. 311 Cleanliness Related Complaints

### 1. Street Tree Census

In [44]:
# Fetch the Street Tree Census CSV once and cache it as <PUIdata>/tree.csv.
# The download is done in Python rather than through `wget`/`mv` shell
# strings: the old `mv rows.csv?accessType=DOWNLOAD` relied on the shell
# treating the unquoted `?` as a glob, and a failed download went unnoticed.
url = 'https://data.cityofnewyork.us/api/views/uvpi-gqnh/rows.csv?accessType=DOWNLOAD'
filename = 'tree.csv'
tree_path = os.path.join(PUIdata, filename)

if not os.path.isfile(tree_path):
    print('Downloading...')
    # Write to a temporary name and rename only after the transfer finished,
    # so an interrupted download is never mistaken for a complete cached file.
    partial_path = tree_path + '.part'
    response = requests.get(url, stream=True)
    try:
        # Surface HTTP errors here instead of saving an error page as CSV.
        response.raise_for_status()
        with open(partial_path, 'wb') as out:
            for chunk in response.iter_content(chunk_size=1 << 20):
                out.write(chunk)
    finally:
        response.close()
    os.rename(partial_path, tree_path)

print('File in place, proceed!')


File in place, proceed!


In [45]:
# Load the cached tree census CSV from the PUIdata directory and preview it.
df = pd.read_csv('/'.join([PUIdata, 'tree.csv']))
df.head()

Unnamed: 0,tree_id,block_id,created_at,tree_dbh,stump_diam,curb_loc,status,health,spc_latin,spc_common,...,boro_ct,state,latitude,longitude,x_sp,y_sp,council district,census tract,bin,bbl
0,180683,348711,08/27/2015,3,0,OnCurb,Alive,Fair,Acer rubrum,red maple,...,4073900,New York,40.723092,-73.844215,1027431.148,202756.7687,29.0,739.0,4052307.0,4022210000.0
1,200540,315986,09/03/2015,21,0,OnCurb,Alive,Fair,Quercus palustris,pin oak,...,4097300,New York,40.794111,-73.818679,1034455.701,228644.8374,19.0,973.0,4101931.0,4044750000.0
2,204026,218365,09/05/2015,3,0,OnCurb,Alive,Good,Gleditsia triacanthos var. inermis,honeylocust,...,3044900,New York,40.717581,-73.936608,1001822.831,200716.8913,34.0,449.0,3338310.0,3028870000.0
3,204337,217969,09/05/2015,10,0,OnCurb,Alive,Good,Gleditsia triacanthos var. inermis,honeylocust,...,3044900,New York,40.713537,-73.934456,1002420.358,199244.2531,34.0,449.0,3338342.0,3029250000.0
4,189565,223043,08/30/2015,21,0,OnCurb,Alive,Good,Tilia americana,American linden,...,3016500,New York,40.666778,-73.975979,990913.775,182202.426,39.0,165.0,3025654.0,3010850000.0


In [46]:
# Rank tree species by how often they occur.
# `value_counts()` already returns the counts in descending order, so no
# extra sort is needed.  The former `.sort(ascending=[False])` call used the
# deprecated `DataFrame.sort` (removed in later pandas releases) and, with no
# sort columns given, was not what produced this ordering.
tree_sort = df['spc_common'].value_counts().reset_index()
# Name the columns explicitly; the default labels produced by
# `reset_index()` differ between pandas versions.
tree_sort.columns = ['Tree Type', 'Count']
tree_sort.head(5)

  from ipykernel import kernelapp as app


Unnamed: 0,Tree Type,Count
0,London planetree,87014
1,honeylocust,64264
2,Callery pear,58931
3,pin oak,53185
4,Norway maple,34189


In [47]:
# Use each species' position in the frequency ranking as its numeric ID.
tree_sort = tree_sort.assign(ID=tree_sort.index)
tree_sort.head()

Unnamed: 0,Tree Type,Count,ID
0,London planetree,87014,0
1,honeylocust,64264,1
2,Callery pear,58931,2
3,pin oak,53185,3
4,Norway maple,34189,4


In [48]:
# Lookup table mapping a species' common name to its ID.
tree_id = tree_sort.loc[:, ['Tree Type', 'ID']]
tree_id.head()

Unnamed: 0,Tree Type,ID
0,London planetree,0
1,honeylocust,1
2,Callery pear,2
3,pin oak,3
4,Norway maple,4


In [49]:
# Attach the species ID to every tree record by joining on the common name.
df = pd.merge(df, tree_id, left_on='spc_common', right_on='Tree Type')
df.columns

Index([u'tree_id', u'block_id', u'created_at', u'tree_dbh', u'stump_diam',
       u'curb_loc', u'status', u'health', u'spc_latin', u'spc_common',
       u'steward', u'guards', u'sidewalk', u'user_type', u'problems',
       u'root_stone', u'root_grate', u'root_other', u'trunk_wire',
       u'trnk_light', u'trnk_other', u'brch_light', u'brch_shoe',
       u'brch_other', u'address', u'postcode', u'zip_city', u'community board',
       u'borocode', u'borough', u'cncldist', u'st_assem', u'st_senate', u'nta',
       u'nta_name', u'boro_ct', u'state', u'latitude', u'longitude', u'x_sp',
       u'y_sp', u'council district', u'census tract', u'bin', u'bbl',
       u'Tree Type', u'ID'],
      dtype='object')

In [50]:
# Restrict the census to London planetree records.
df = df[df['spc_common'].eq('London planetree')]
df.head()



Unnamed: 0,tree_id,block_id,created_at,tree_dbh,stump_diam,curb_loc,status,health,spc_latin,spc_common,...,latitude,longitude,x_sp,y_sp,council district,census tract,bin,bbl,Tree Type,ID
148225,192755,207508,08/31/2015,21,0,OffsetFromCurb,Alive,Fair,Platanus x acerifolia,London planetree,...,40.586357,-73.969744,992653.7,152903.6306,47.0,37402.0,3320727.0,3072350000.0,London planetree,0
148226,203719,302371,09/05/2015,11,0,OnCurb,Alive,Good,Platanus x acerifolia,London planetree,...,40.782428,-73.911171,1008850.0,224349.0366,22.0,105.0,4019061.0,4008710000.0,London planetree,0
148227,203726,302371,09/05/2015,8,0,OnCurb,Alive,Poor,Platanus x acerifolia,London planetree,...,40.781735,-73.91202,1008615.0,224096.274,22.0,105.0,4019059.0,4008710000.0,London planetree,0
148228,195202,415896,09/01/2015,13,0,OnCurb,Alive,Fair,Platanus x acerifolia,London planetree,...,40.557103,-74.16267,939048.0,142285.9579,51.0,14607.0,5072852.0,5054910000.0,London planetree,0
148229,189465,219493,08/30/2015,22,0,OnCurb,Alive,Good,Platanus x acerifolia,London planetree,...,40.694733,-73.968211,993065.3,192388.0651,35.0,191.0,3054331.0,3018880000.0,London planetree,0


In [51]:
# Count the remaining tree records per postcode.
dfg = df['spc_common'].groupby(df['postcode']).count()
dfg.head()

postcode
83        54
10001     21
10002    388
10003    133
10004      8
Name: spc_common, dtype: int64

In [52]:
# Turn the per-postcode counts into a two-column DataFrame
# (postcode, spc_common) with a default integer index.
dfg = dfg.reset_index()
dfg.head()

Unnamed: 0,postcode,spc_common
0,83,54
1,10001,21
2,10002,388
3,10003,133
4,10004,8


### 2. Asthma Discharge Rates

In [None]:
# Write `asthma` to 'asthma.csv' in the current working directory, UTF-8 encoded.
# NOTE(review): `asthma` is not defined in any cell visible in this part of the
# notebook — presumably it was built in a cell that has since been removed;
# confirm before relying on Restart & Run All.
asthma.to_csv('asthma.csv', encoding='utf-8')

In [None]:
# Load the asthma CSV from the 'data' sub-directory and preview it.
# NOTE(review): the preceding cell writes 'asthma.csv' without the 'data/'
# prefix — confirm which location is the intended one.
dfas = pd.read_csv('data/asthma.csv')
dfas.head()

### 3. DOB Construction Permits

In [53]:
# Read the DOB permit data straight from the NYC Open Data CSV export.
df_con = pd.read_csv(
    'https://data.cityofnewyork.us/api/views/upjv-ar2g/rows.csv'
    '?accessType=DOWNLOAD'
)
df_con.head()

Unnamed: 0,BOROUGH,Bin #,House #,Street Name,Job #,Job doc. #,Job Type,Self_Cert,Block,Lot,...,Owner’s House State,Owner’s House Zip Code,Owner's Phone #,DOBRunDate,PERMIT_SI_NO,LATITUDE,LONGITUDE,COUNCIL_DISTRICT,CENSUS_TRACT,NTA_NAME
0,QUEENS,4311785,87-77,PARSONS BLVD.,440154440,1,A2,Y,9765,41,...,NY,11354,7183539000.0,11/03/2017 12:00:00 AM,2745274,40.707312,-73.802539,24.0,44602.0,Jamaica
1,MANHATTAN,1089326,49,ANN STREET,120029511,1,A2,N,92,14,...,NY,10018,2123343000.0,11/03/2017 12:00:00 AM,2833706,40.710306,-74.006828,1.0,1501.0,Battery Park City-Lower Manhattan
2,QUEENS,4200517,118-04,101 AVE,420829125,1,A2,N,9486,2,...,NY,11420,7184407000.0,11/03/2017 12:00:00 AM,2573400,40.689746,-73.826697,28.0,106.0,South Ozone Park
3,MANHATTAN,1014236,282,7TH AVENUE,121895280,2,A2,Y,776,40,...,NY,10001,2122178000.0,11/03/2017 12:00:00 AM,2696921,40.746147,-73.994168,3.0,95.0,Midtown-Midtown South
4,MANHATTAN,1087167,233,BROADWAY,121402981,1,A2,Y,123,7501,...,NY,10004,2122333000.0,11/03/2017 12:00:00 AM,2453788,40.712265,-74.007892,1.0,21.0,SoHo-TriBeCa-Civic Center-Little Italy


In [54]:
# Display the column labels of the permits frame.
df_con.columns

Index([u'BOROUGH', u'Bin #', u'House #', u'Street Name', u'Job #',
       u'Job doc. #', u'Job Type', u'Self_Cert', u'Block', u'Lot',
       u'Community Board', u'Zip Code', u'Bldg Type', u'Residential',
       u'Special District 1', u'Special District 2', u'Work Type',
       u'Permit Status', u'Filing Status', u'Permit Type',
       u'Permit Sequence #', u'Permit Subtype', u'Oil Gas', u'Site Fill',
       u'Filing Date', u'Issuance Date', u'Expiration Date', u'Job Start Date',
       u'Permittee's First Name', u'Permittee's Last Name',
       u'Permittee's Business Name', u'Permittee's Phone #',
       u'Permittee's License Type', u'Permittee's License #',
       u'Act as Superintendent', u'Permittee's Other Title', u'HIC License',
       u'Site Safety Mgr's First Name', u'Site Safety Mgr's Last Name',
       u'Site Safety Mgr Business Name', u'Superintendent First & Last Name',
       u'Superintendent Business Name', u'Owner's Business Type',
       u'Non-Profit', u'Owner's Business

In [55]:
# Parse the job start dates so they can be used in date arithmetic.
df_con['Job Start Date'] = df_con['Job Start Date'].pipe(pd.to_datetime)

In [56]:
# Parse the permit expiration dates so they can be used in date arithmetic.
df_con['Expiration Date'] = df_con['Expiration Date'].pipe(pd.to_datetime)

In [57]:
# Permit duration: time elapsed between the job start and the expiration date.
df_con['construction_days'] = df_con['Expiration Date'].sub(df_con['Job Start Date'])
df_con.head()

Unnamed: 0,BOROUGH,Bin #,House #,Street Name,Job #,Job doc. #,Job Type,Self_Cert,Block,Lot,...,Owner’s House Zip Code,Owner's Phone #,DOBRunDate,PERMIT_SI_NO,LATITUDE,LONGITUDE,COUNCIL_DISTRICT,CENSUS_TRACT,NTA_NAME,construction_days
0,QUEENS,4311785,87-77,PARSONS BLVD.,440154440,1,A2,Y,9765,41,...,11354,7183539000.0,11/03/2017 12:00:00 AM,2745274,40.707312,-73.802539,24.0,44602.0,Jamaica,219 days
1,MANHATTAN,1089326,49,ANN STREET,120029511,1,A2,N,92,14,...,10018,2123343000.0,11/03/2017 12:00:00 AM,2833706,40.710306,-74.006828,1.0,1501.0,Battery Park City-Lower Manhattan,1471 days
2,QUEENS,4200517,118-04,101 AVE,420829125,1,A2,N,9486,2,...,11420,7184407000.0,11/03/2017 12:00:00 AM,2573400,40.689746,-73.826697,28.0,106.0,South Ozone Park,365 days
3,MANHATTAN,1014236,282,7TH AVENUE,121895280,2,A2,Y,776,40,...,10001,2122178000.0,11/03/2017 12:00:00 AM,2696921,40.746147,-73.994168,3.0,95.0,Midtown-Midtown South,336 days
4,MANHATTAN,1087167,233,BROADWAY,121402981,1,A2,Y,123,7501,...,10004,2122333000.0,11/03/2017 12:00:00 AM,2453788,40.712265,-74.007892,1.0,21.0,SoHo-TriBeCa-Civic Center-Little Italy,200 days


In [58]:
# Express each permit's duration as a number of days.
# The value is recomputed from the two parsed date columns rather than by
# converting the existing `construction_days` column in place: the in-place
# form raised AttributeError when the cell was run a second time, because
# `.dt` is not available on the already-converted numeric column.
df_con['construction_days'] = (df_con['Expiration Date'] - df_con['Job Start Date']).dt.days

In [63]:
# Remove permits that have no zip code.
# `df_con['Zip Code'].dropna(inplace=True)` acted on the Series returned by
# the column lookup and left the rows of `df_con` untouched, so the missing
# values were still present afterwards.  Dropping on the frame itself removes
# those rows; `.copy()` makes the result an independent frame so that later
# column assignments do not trigger SettingWithCopyWarning.
df_con = df_con.dropna(subset=['Zip Code']).copy()

In [64]:
# Store the zip codes as integers.
df_con['Zip Code'] = df_con['Zip Code'].astype(int)

In [65]:
# Keep only the two columns used in the aggregation below.
df_con = df_con.loc[:, ['construction_days', 'Zip Code']]

In [66]:
# Total permit days per zip code, with the zip code kept as a regular column.
df_condays = df_con.groupby('Zip Code')['construction_days'].sum().reset_index()

In [68]:
# Make sure the zip codes in the aggregated table are integers, then preview.
df_condays['Zip Code'] = df_condays['Zip Code'].astype('int')
df_condays.head()

Unnamed: 0,Zip Code,construction_days
0,10000,10143.0
1,10001,3902072.0
2,10002,2157231.0
3,10003,3924700.0
4,10004,1088428.0


### 4. ACS Household Median Income

In [78]:
# Load the Census API key from the first line of $PUIDATA/FactFinderAPI.txt.
# The former `import readline` was dropped: that module is unrelated to the
# file method `readline()` used here and is not available on every platform.
censusAPI = os.path.join(os.getenv('PUIDATA'), 'FactFinderAPI.txt')
# The context manager closes the file handle, which was previously left open.
with open(censusAPI) as f:
    # Strip the trailing newline so the key can be appended to a request URL.
    myAPI = f.readline().strip()

In [73]:
# Fetch the list of variables published for the 2014 ACS 5-year data set
# (the URL targets `acs5`, not the 1-year product).
url = "https://api.census.gov/data/2014/acs5/variables.json"
resp = requests.request('GET', url)
# Fail with the HTTP error itself rather than an opaque JSON decoding error
# when the API answers with an error page.
resp.raise_for_status()
aff1y = json.loads(resp.text)

In [74]:
# Collect the variable codes in a numpy array.
# Iterating the dict yields its keys; wrapping in list() gives the same
# result on Python 2 and Python 3.
affkeys = np.array(list(aff1y['variables']))

In [75]:
# List (code, label) pairs for the B07011 variables whose label mentions
# 'Median income'.
[
    (k, aff1y['variables'][k]['label'])
    for k in affkeys
    if k.startswith("B07011")
    and 'Median income' in aff1y['variables'][k]['label']
]

[(u'B07011_005E',
  u'Median income in the past 12 months --!! Moved from different state'),
 (u'B07011_005M',
  u'Margin of Error for!!Median income in the past 12 months --!! Moved from different state'),
 (u'B07011PR_002E',
  u'Median income in the past 12 months --!! Same house 1 year ago'),
 (u'B07011PR_002M',
  u'Margin of Error for!!Median income in the past 12 months --!! Same house 1 year ago'),
 (u'B07011_002M',
  u'Margin of Error for!!Median income in the past 12 months --!! Same house 1 year ago'),
 (u'B07011_002E',
  u'Median income in the past 12 months --!! Same house 1 year ago'),
 (u'B07011PR_005E',
  u'Median income in the past 12 months --!! Moved from the United States'),
 (u'B07011PR_005M',
  u'Margin of Error for!!Median income in the past 12 months --!! Moved from the United States'),
 (u'B07011PR_006M',
  u'Margin of Error for!!Median income in the past 12 months --!! Moved from elsewhere'),
 (u'B07011PR_006E',
  u'Median income in the past 12 months --!! Moved

In [76]:
# Variable code B07011_001E; the metadata displayed below labels it
# 'Median income in the past 12 months --!!Total:'.
# NOTE(review): the name `keyNhouseholds` suggests a household count, but the
# code refers to a median-income variable — presumably a leftover name.
keyNhouseholds = 'B07011_001E'
aff1y['variables'][keyNhouseholds]

{u'concept': u'B07011. Median Income in the Past 12 Months (in 2014 Inflation-Adjusted Dollars) by Geographical Mobility in the Past Year for Current Residence in the United States',
 u'group': u'N/A',
 u'label': u'Median income in the past 12 months --!!Total:',
 u'limit': 0,
 u'predicateType': u'int',
 u'validValues': []}

In [77]:
# Variable code used in the API query further down. It is the same code as
# `keyNhouseholds` (B07011_001E, median income total), not a broadband variable.
# NOTE(review): the name `keyNBB` looks like a leftover from a broadband
# analysis — confirm and rename if so.
keyNBB = 'B07011_001E'
aff1y['variables'][keyNBB]

{u'concept': u'B07011. Median Income in the Past 12 Months (in 2014 Inflation-Adjusted Dollars) by Geographical Mobility in the Past Year for Current Residence in the United States',
 u'group': u'N/A',
 u'label': u'Median income in the past 12 months --!!Total:',
 u'limit': 0,
 u'predicateType': u'int',
 u'validValues': []}

In [None]:
# Request the `keyNBB` variable (B07011_001E, median income) for every public
# use microdata area in state 36.
# The key is stripped because it is read from a text file and a trailing
# newline would otherwise end up inside the query string.
url = ("https://api.census.gov/data/2014/acs/acs1?get=" + keyNBB +
       ",NAME&for=public%20use%20microdata%20area:*&in=state:36&key=" + myAPI.strip())
resp = requests.request('GET', url).content
# The response is a JSON array of rows; removing the brackets leaves
# comma-separated lines that `read_csv` can parse.  `io` is imported in the
# notebook's import cell (it was previously used here without being imported).
pumaBB = pd.read_csv(io.StringIO(resp.decode('utf-8').replace('[','').replace(']','')))

pumaBB.head()

In [None]:
# Example Census API request at zip code tabulation area level (insert the API key):
# http://api.census.gov/data/2010/sf1?key={enter key here}&get=P0010001,P0040003&for=zip+code+tabulation+area:*&in=state:02

In [None]:
# TODO: stray text pasted from a web page, not Python — remove this cell:
# javascript:showDownloadDialog(downloadValidator)

### 5. 311 Cleanliness Related Complaints