In [10]:
from __future__ import print_function, division

# Standard imports to work with datasets, plots, etc.
import pylab as pl
import pandas as pd
import geopandas as gpd
import numpy as np
import os
import json
import requests
import urllib2
from pandas.tools.plotting import scatter_matrix
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.interpolate import *

# Load matplotlib rcParams from a JSON template
# (created from Federica's fbb_matplotlibrc.json).
url = 'https://s3.amazonaws.com/sb-public/sbg389_matplotlibrc.json'
# A timeout keeps the notebook from hanging indefinitely if the host is unreachable.
resp = requests.get(url=url, timeout=30)
# Fail with the HTTP status right here, instead of with a confusing JSON
# decode error on the next line, when the template cannot be downloaded.
resp.raise_for_status()
data = json.loads(resp.text)
# Apply the downloaded settings to matplotlib's global rcParams object.
pl.rcParams.update(data)

# IPython magic: populates the interactive namespace from numpy and matplotlib
# (see the message printed below). The unqualified `log10` call in the last
# cleanup cell of this notebook relies on the names this line brings in.
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [None]:
# Download the Manhattan MapPLUTO shapefile with shell commands and load it.
# All PLUTO files are stored in a MNMapPluto folder under $PUIDATA.

# Fail early with a clear message: concatenating None into a path would
# otherwise raise an unrelated-looking TypeError further down.
puidata = os.getenv("PUIDATA")
if not puidata:
    raise RuntimeError("The PUIDATA environment variable must be set")

# mv with several source files requires the destination folder to exist.
plutoDir = os.path.join(puidata, "MNMapPluto")
if not os.path.isdir(plutoDir):
    os.makedirs(plutoDir)

# Use os.system to invoke the shell commands that download (curl), unzip and move the files
plutoCommands = [
    # curl -O: write the output to a local file named like the remote file;
    # -f: exit non-zero on an HTTP error instead of saving the error page;
    # -L: follow redirects.
    "curl -fLO https://www1.nyc.gov/assets/planning/download/zip/data-maps/open-data/mn_mappluto_16v1.zip",
    # Extract the members that start with MN. The pattern is quoted so that
    # unzip, not the shell, matches it against the archive contents.
    "unzip -jn mn_mappluto_16v1.zip 'MN*'",
    # Move the extracted files to the MNMapPluto subfolder of PUIDATA.
    'mv MN* "%s"' % plutoDir,
]
for command in plutoCommands:
    # os.system reports failures only through its return value, so check it
    # rather than carrying on with a missing or partial download.
    status = os.system(command)
    if status != 0:
        raise RuntimeError("Command failed with status %s: %s" % (status, command))

# Read the Manhattan SHP file that was obtained from the PLUTO dataset
# (geopandas reading from a local file).
bsize = gpd.read_file(os.path.join(plutoDir, "MNMapPLUTO.shp"))
bsize.head(2)


In [22]:
# pandas (and geopandas) can load a CSV straight from a URL; no local copy is needed.
nrg = pd.read_csv(
    filepath_or_buffer="https://data.cityofnewyork.us/api/views/rgfe-8y2z/rows.csv"
)

In [14]:
# Obtain data from a JSON API, load it into a dictionary and read a specific
# value: the number of vehicles reported for one bus line.

# The API key is taken from the MTAKEY environment variable so it is never
# stored in the notebook; it falls back to the empty string when unset.
MTAKEY = os.getenv('MTAKEY', '')
BUSLINE = 'B62'

# Build the URI for the API call from the key and the bus line
url = "http://bustime.mta.info/api/siri/vehicle-monitoring.json?key=%s&VehicleMonitoringDetailLevel=calls&" \
      "LineRef=%s"%(MTAKEY, BUSLINE)

# Get the response and load its string representation into a dictionary.
# The connection is closed explicitly, even if reading fails.
response = urllib2.urlopen(url)
try:
    mtadataString = response.read().decode("utf-8")
finally:
    response.close()
mtadata = json.loads(mtadataString)

vehicleActivityArray = mtadata['Siri']['ServiceDelivery']['VehicleMonitoringDelivery']
# NOTE(review): presumably the API omits 'VehicleActivity' (or returns no
# delivery at all) when no bus is active on the line -- confirm against the
# API documentation. Either way, count zero buses instead of raising.
firstDelivery = vehicleActivityArray[0] if vehicleActivityArray else {}
numberOfActiveBuses = len(firstDelivery.get('VehicleActivity', []))

print (numberOfActiveBuses)

7


In [36]:
# Tidy the nrg dataframe: inspect its column names, rename two of them and
# keep only the columns that are needed.

# list(...) turns the column index into a plain list of names, which can
# then be used to drop or select columns.
print (list(nrg.columns))

# Short names can be accessed with dot notation and avoid errors caused by
# non-ASCII characters in the original headers. The Borough, Block and Lot
# column becomes BBL so it can work as a merge key.
columnAliases = {
    'NYC Borough, Block, and Lot (BBL)': 'BBL',
    'Reported Property Floor Area (Building(s)) (ft²)': 'reportedArea',
}
nrg.rename(columns=columnAliases, inplace=True)

print ('')
print (list(nrg.columns))

# Keep the needed columns by selecting them (rather than dropping the rest).
keptColumns = ['BBL', 'Site EUI(kBtu/ft2)', 'reportedArea']
nrg = nrg[keptColumns]
print (list(nrg.columns))

['BBL', 'Site EUI(kBtu/ft2)', 'reportedArea']

['BBL', 'Site EUI(kBtu/ft2)', 'reportedArea']
['BBL', 'Site EUI(kBtu/ft2)', 'reportedArea']


In [46]:
# Cleanup dataframe II: eliminate non-numeric / missing values

# Work on an independent copy: nrg was produced by a column selection, and
# assigning into such a selection can trigger pandas' SettingWithCopyWarning.
nrg = nrg.copy()

# Use pd.to_numeric with errors='coerce' rather than a custom converter:
# every value that cannot be parsed as a number is replaced with NaN.
for column in ('Site EUI(kBtu/ft2)', 'reportedArea'):
    nrg[column] = pd.to_numeric(nrg[column], errors='coerce')

# Now eliminate the NaNs: a row is dropped if at least one of the considered
# columns is NaN. dropna returns a new frame, so the result is assigned back
# (a bare call without inplace=True would be silently discarded).

# On all columns
nrg = nrg.dropna(axis=0)

# On specific columns (a no-op after the call above; kept to show the subset form)
nrg = nrg.dropna(subset=['reportedArea', 'BBL'])

# Show only a preview instead of dumping the whole frame as the cell output.
nrg.head()

In [54]:
# Cleanup dataframe III: selecting the rows that match a condition with a
# boolean mask, and adding a log10-transformed column.

# Create a boolean mask that is True for the rows that match the condition
nrgMask = (nrg.reportedArea > 300000) & (nrg.reportedArea < 500000)

# Use the mask as a row selector. The selection is stored under its own name
# so it can be inspected; nrg itself is left unfiltered.
nrgInRange = nrg[nrgMask]

# The same selection can be written inline, but the code is less readable,
# especially when the boolean conditions are long.
nrgInRangeInline = nrg[(nrg.reportedArea > 300000) & (nrg.reportedArea < 500000)]

# A log10 scale is useful when an X or Y variable spans several orders of
# magnitude and most of the data would otherwise be squeezed against an axis.
# np.log10 is called explicitly instead of relying on the bare log10 name
# that only exists because of the %pylab magic.
nrg = nrg.assign(log10reportedArea=np.log10(nrg.reportedArea))
nrg.head()

Unnamed: 0,BBL,Site EUI(kBtu/ft2),reportedArea,log10reportedArea
16,1013110000.0,125.4,330000.0,5.518514
30,2022150000.0,6224.5,342048.0,5.534087
44,1010140000.0,154.9,841709.0,5.925162
45,4002520000.0,46.0,390400.0,5.59151
56,4004370000.0,66.2,528060.0,5.722683
