In [346]:
import pandas as pd
import numpy as np
import scipy.stats as scs

This module we'll be looking at the New York City tree census. This data was provided by a volunteer driven census in 2015, and we'll be accessing it via the socrata API. The main site for the data is [here](https://data.cityofnewyork.us/Environment/2015-Street-Tree-Census-Tree-Data/uvpi-gqnh), and on the upper right hand side you'll be able to see the link to the API.

The data is conveniently available in json format, so we should be able to just read it directly in to Pandas:

In [347]:
url = 'https://data.cityofnewyork.us/resource/nwxe-4ae8.json'
trees = pd.read_json(url)
trees.head(10)

Unnamed: 0,address,bbl,bin,block_id,boro_ct,borocode,boroname,brch_light,brch_other,brch_shoe,...,tree_dbh,tree_id,trnk_light,trnk_other,trunk_wire,user_type,x_sp,y_sp,zip_city,zipcode
0,108-005 70 AVENUE,4022210000.0,4052307.0,348711,4073900,4,Queens,No,No,No,...,3,180683,No,No,No,TreesCount Staff,1027431.0,202756.7687,Forest Hills,11375
1,147-074 7 AVENUE,4044750000.0,4101931.0,315986,4097300,4,Queens,No,No,No,...,21,200540,No,No,No,TreesCount Staff,1034456.0,228644.8374,Whitestone,11357
2,390 MORGAN AVENUE,3028870000.0,3338310.0,218365,3044900,3,Brooklyn,No,No,No,...,3,204026,No,No,No,Volunteer,1001823.0,200716.8913,Brooklyn,11211
3,1027 GRAND STREET,3029250000.0,3338342.0,217969,3044900,3,Brooklyn,No,No,No,...,10,204337,No,No,No,Volunteer,1002420.0,199244.2531,Brooklyn,11211
4,603 6 STREET,3010850000.0,3025654.0,223043,3016500,3,Brooklyn,No,No,No,...,21,189565,No,No,No,Volunteer,990913.8,182202.426,Brooklyn,11215
5,8 COLUMBUS AVENUE,1011310000.0,1076229.0,106099,1014500,1,Manhattan,No,No,No,...,11,190422,No,No,No,Volunteer,988418.7,219825.5227,New York,10023
6,120 WEST 60 STREET,1011310000.0,1076229.0,106099,1014500,1,Manhattan,No,No,No,...,11,190426,No,No,No,Volunteer,988311.2,219885.2785,New York,10023
7,311 WEST 50 STREET,1010410000.0,1086093.0,103940,1012700,1,Manhattan,No,No,No,...,9,208649,No,No,No,Volunteer,987769.1,217157.8561,New York,10019
8,65 JEROME AVENUE,,,407443,5006400,5,Staten Island,No,No,No,...,6,209610,No,No,No,TreesCount Staff,963073.2,156635.5542,Staten Island,10305
9,638 AVENUE Z,3072350000.0,3320727.0,207508,3037402,3,Brooklyn,No,No,No,...,21,192755,No,No,No,TreesCount Staff,992653.7,152903.6306,Brooklyn,11223


In [348]:
city = pd.DataFrame()
boros = ["'Bronx'","'Staten Island'", "'Manhattan'", "'Queens'", "'Brooklyn'"]
for boro in boros:
    soql_url = ('https://data.cityofnewyork.us/resource/nwxe-4ae8.json?' +\
            '$select=health,count(tree_id)' +\
            '&$where=boroname='+boro +\
            'AND NOT health=\'NaN\'' +\
            '&$group=health').replace(' ', '%20')

    soql_health = pd.read_json(soql_url)
    soql_health['boroname'] = boro
    city = city.append(soql_health)

city


Unnamed: 0,count_tree_id,health,boroname
0,66603,Good,'Bronx'
1,10887,Fair,'Bronx'
2,3095,Poor,'Bronx'
0,82669,Good,'Staten Island'
1,14535,Fair,'Staten Island'
2,4238,Poor,'Staten Island'
0,47358,Good,'Manhattan'
1,11460,Fair,'Manhattan'
2,3609,Poor,'Manhattan'
0,34549,Fair,'Queens'


In [349]:
# With steward, Good Heatlth

city2 = pd.DataFrame()
boros = ["'Bronx'","'Staten Island'", "'Manhattan'", "'Queens'", "'Brooklyn'"]
for boro in boros:
    soql_url2 = ('https://data.cityofnewyork.us/resource/nwxe-4ae8.json?' +\
        '$select=steward, count(health)' +\
        '&$where=boroname='+boro +\
        'AND NOT health=\'NaN\'' +\
        'AND NOT steward=\'NaN\'' +\
        'AND NOT steward=\'None\'' +\
        'AND NOT health=\'Poor\'' +\
        '&$group=steward').replace(' ', '%20')
    soql_steward = pd.read_json(soql_url2)
    soql_steward['boroname'] = boro
    city2 = city2.append(soql_steward)
city2 = city2.replace('3or4', 1)
city2 = city2.replace('4orMore', 1)
city2 = city2.replace('1or2', 1)

city2 = city2.groupby('boroname').sum()
city2 = city2.drop(columns = 'steward')
city2

Unnamed: 0_level_0,count_health
boroname,Unnamed: 1_level_1
'Bronx',15051
'Brooklyn',48669
'Manhattan',30637
'Queens',43301
'Staten Island',19613


In [350]:
# No Steward, Good Health

city3 = pd.DataFrame()
boros = ["'Bronx'","'Staten Island'", "'Manhattan'", "'Queens'", "'Brooklyn'"]
for boro in boros:
    soql_url2 = ('https://data.cityofnewyork.us/resource/nwxe-4ae8.json?' +\
        '$select=steward, count(health)' +\
        '&$where=boroname='+boro +\
        'AND NOT health=\'NaN\'' +\
        'AND steward=\'None\'' +\
        'AND NOT steward=\'NaN\'' +\
        'AND NOT health=\'Poor\'' +\
        '&$group=steward').replace(' ', '%20')
    soql_steward = pd.read_json(soql_url2)
    soql_steward['boroname'] = boro
    city3 = city3.append(soql_steward)

city3 = city3.replace({'None': 0})
city3 = city3.groupby('boroname').sum()
city3 = city3.drop(columns = 'steward')
city3


Unnamed: 0_level_0,count_health
boroname,Unnamed: 1_level_1
'Bronx',62439
'Brooklyn',114616
'Manhattan',28181
'Queens',185256
'Staten Island',77591


In [351]:
# With steward, Bad Health

city4 = pd.DataFrame()
boros = ["'Bronx'","'Staten Island'", "'Manhattan'", "'Queens'", "'Brooklyn'"]
for boro in boros:
    soql_url2 = ('https://data.cityofnewyork.us/resource/nwxe-4ae8.json?' +\
        '$select=steward, count(health)' +\
        '&$where=boroname='+boro +\
        'AND NOT health=\'NaN\'' +\
        'AND NOT steward=\'NaN\'' +\
        'AND NOT steward=\'None\'' +\
        'AND health=\'Poor\'' +\
        '&$group=steward').replace(' ', '%20')
    soql_steward = pd.read_json(soql_url2)
    soql_steward['boroname'] = boro
    city4 = city4.append(soql_steward)
city4 = city4.replace('3or4', 1)
city4 = city4.replace('4orMore', 1)
city4 = city4.replace('1or2', 1)

city4 = city4.groupby('boroname').sum()
city4 = city4.drop(columns = 'steward')
city4

Unnamed: 0_level_0,count_health
boroname,Unnamed: 1_level_1
'Bronx',683
'Brooklyn',1791
'Manhattan',1909
'Queens',1972
'Staten Island',724


In [394]:
# No Steward, Bad Health

city5 = pd.DataFrame()
boros = ["'Bronx'","'Staten Island'", "'Manhattan'", "'Queens'", "'Brooklyn'"]
for boro in boros:
    soql_url2 = ('https://data.cityofnewyork.us/resource/nwxe-4ae8.json?' +\
        '$select=steward, count(health)' +\
        '&$where=boroname='+boro +\
        'AND NOT health=\'NaN\'' +\
        'AND steward=\'None\'' +\
        'AND NOT steward=\'NaN\'' +\
        'AND health=\'Poor\'' +\
        '&$group=steward').replace(' ', '%20')
    soql_steward = pd.read_json(soql_url2)
    soql_steward['boroname'] = boro
    city5 = city5.append(soql_steward)

city5 = city5.replace({'None': 0})
city5 = city5.groupby('boroname').sum()
city5 = city5.drop(columns = 'steward')
city5

Unnamed: 0_level_0,count_health
boroname,Unnamed: 1_level_1
'Bronx',2412
'Brooklyn',4668
'Manhattan',1700
'Queens',7445
'Staten Island',3514


In [398]:
boro = "'Bronx'"

steward_good = city2
steward_bad = city4

no_good = city3
no_bad = city5
big = pd.DataFrame()

steward_good


big['steward good'] = steward_good['count_health']
big['steward bad']  = steward_bad['count_health']
big['no steward good'] = no_good['count_health']
big['no steward bad']  = no_bad['count_health']

big = big.transpose()
big



boroname,'Bronx','Brooklyn','Manhattan','Queens','Staten Island'
steward good,15051,48669,30637,43301,19613
steward bad,683,1791,1909,1972,724
no steward good,62439,114616,28181,185256,77591
no steward bad,2412,4668,1700,7445,3514


In [399]:
obs = [big[boro][0], big[boro][1] ]
exp = [big[boro][2], big[boro][3] ]

obs_total = np.sum(obs)
exp_total = np.sum(exp)

obs = obs/obs_total * 100
exp = exp/exp_total * 100

result = scs.chisquare(obs,exp)

Power_divergenceResult(statistic=0.10790806987169146, pvalue=0.7425384541871657)

In [402]:
import plotly
import plotly.graph_objs as go

plotly.tools.set_credentials_file(username='meyerscr', api_key='lTe7HwU3PJu5CUO7DNPQ')
trace1 = go.Bar(
    x=["Good Health", "Bad Health"],
    y= [big[boro][0], big[boro][1]],
    name='With Steward'
)
trace2 = go.Bar(
    x= ["Good Health", "Bad Health"],
    y= [big[boro][2], big[boro][3]],
    name='Without Steward'
)

data = [trace1, trace2]
layout = go.Layout(
    barmode='group',
    xaxis = dict(
    title = 'Trees in Good and Poor Health'
    ),
    yaxis = dict(
    title = 'Number of Trees in Selected Boro'
    )
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='grouped-bar')

In [328]:
exp = city2['count_health']
obs = city3['count_health']
result = scs.chisquare(obs, exp)
result

Power_divergenceResult(statistic=18.60157247958606, pvalue=0.000941001321982332)

We have sufficient evidence to suggest that stewardship is useful in promoting plant health