In [7]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

In [2]:
import os
os.environ['PYSPARK_PYTHON'] = '/anaconda/bin/python'

In [3]:
import findspark
findspark.init()
print findspark.find()
# Depending on your setup you might have to change this line of code
#findspark makes sure I dont need the below on homebrew.
#os.environ['SPARK_HOME']="/usr/local/Cellar/apache-spark/1.5.1/libexec/"
#the below actually broke my spark, so I removed it. 
#Depending on how you started the notebook, you might need it.
#os.environ['PYSPARK_SUBMIT_ARGS']="--master local pyspark --executor-memory 4g"

/usr/local/opt/apache-spark/libexec


In [4]:
import pyspark
conf = (pyspark.SparkConf()
    .setMaster('local')
    .setAppName('pyspark')
    .set("spark.driver.memory", "5g"))
sc = pyspark.SparkContext(conf=conf)

In [5]:
from pyspark.sql import SQLContext
sqlsc=SQLContext(sc)

In [6]:
raw_data = sc.textFile('yellow_tripdata_2015-02.csv').map(lambda line: tuple(line.split(','))).zipWithIndex()

In [7]:
column_names = raw_data.take(1)[0]
print column_names

((u'VendorID', u'tpep_pickup_datetime', u'tpep_dropoff_datetime', u'passenger_count', u'trip_distance', u'pickup_longitude', u'pickup_latitude', u'RateCodeID', u'store_and_fwd_flag', u'dropoff_longitude', u'dropoff_latitude', u'payment_type', u'fare_amount', u'extra', u'mta_tax', u'tip_amount', u'tolls_amount', u'improvement_surcharge', u'total_amount'), 0)


In [16]:
def col_extracter(row):
    # helper function that strips the index and grabs the following columns:
    # u'tpep_pickup_datetime', u'tpep_dropoff_datetime', u'pickup_longitude', u'pickup_latitude', u'dropoff_longitude', u'dropoff_latitude'
    return (row[1], row[2], row[5], row[6], row[9], row[10])
        

In [17]:
#get rid of header row and indices
data = raw_data.filter(lambda (row,index): index > 0).keys().map(col_extracter)

In [18]:
data.take(1)

[(u'2015-02-08 11:33:46',
  u'2015-02-08 11:37:45',
  u'-73.949905395507813',
  u'40.717437744140625',
  u'-73.95001220703125',
  u'40.724010467529297')]

In [2]:
'''
This is data collected from http://www.findlatitudeandlongitude.com/?loc=manhattan%2C+New+York#.VlVCnmSrRL8
Each coordinate corresponds a the corner of a square city block in manhattan.  
Using this data, we will discretize manhattan into enumerated blocks
NW Corner:
    Lat: 40.769155
    Long: -73.963201
NE Corner:
    Lat: 40.768448
    Long: -73.961517
SE Corner:
    Lat: 40.767148
    Long: -73.962466
SW Corner:
    Lat: 40.767855
    Long: -73.964129
'''
#each cordinate is (Latitude, Longitude)
block_NW = (40.769155, -73.963201)
block_NE = (40.768448, -73.961517)
block_SE = (40.767148, -73.962466)
block_SW = (40.767855, -73.964129)

# Calculate width of a block by averaging estimated difference in longitude between east and west corners
# Note that moving East is positive, moving North is positive
block_w = abs(block_NW[1] - block_NE[1] + block_SW[1] - block_SE[1])/2
# Calculate width of a block by averaging estimated difference in longitude between south and north corners
block_h = abs(block_NW[0] - block_SW[0] + block_NE[0] - block_SE[0])/2
print "Block Width:", block_w
print "Block Height:", block_h

Block Width: 0.0016735
Block Height: 0.0013


In [13]:
'''
Now we determine the latitude and longitude boundaries for NYC so that we can enumerate the blocks
Data collected from http://www.findlatitudeandlongitude.com/?loc=manhattan%2C+New+York#.VlVCnmSrRL8
Boundaries were selected in include all 3 major airports.

SW Corner:
    Lat: 40.619686
    Long: -74.229813
SE Corner: 
    Lat: 40.619686
    Long: -73.729248
NE Corner:
    Lat: 40.874065
    Long: -73.729248
NW Corner:
    Lat: 40.874065
    Long: -74.229813
'''    
city_NW = (40.874065, -74.229813)
city_NE = (40.874065, -73.729248)
city_SE = (40.619686, -73.729248)
city_SW = (40.619686, -74.229813)
#adjust estimated city corners so that the height and width are an integer number of blocks
num_h_blocks = abs(np.ceil((city_NW[0]-city_SW[0])/block_h))
num_w_blocks = abs(np.ceil((city_NW[1]-city_NE[1])/block_w))
print "Height of city (in blocks)", num_h_blocks
print "Width of city (in blocks)", num_w_blocks
city_NE = (city_SE[0]+num_h_blocks * block_h, city_SE[1])
city_SW = (city_SE[0], city_SE[1] - num_w_blocks * block_w)
city_NW = (city_NE[0], city_SW[1])
print "City NW:", city_NW
print "City NE:", city_NE
print "City SE:", city_SE
print "City SW:", city_SW

Height of city (in blocks) 196.0
Width of city (in blocks) 299.0
City NW: (40.874486000000104, -74.229624499998636)
City NE: (40.874486000000104, -73.729248)
City SE: (40.619686, -73.729248)
City SW: (40.619686, -74.229624499998636)


In [None]:
# Now for each pickup location and dropoff location we want to assign an index, which corresponds to the block
# that the pickup/dropoff occurs in.
# To do this, we will enumerate all blocks starting with 0 at SE corner.
# Note we start at the SE corner so that indices increase as longitude and latitude increase