# Notes for the project: learning/playing with the spatial data

In [7]:
#%matplotlib inline
import numpy as np
import scipy as sp
import pandas as pd
# Notebook-wide pandas display settings: wide console output, up to 100
# visible columns, and HTML rendering of DataFrames.
for option_name, option_value in (
    ('display.width', 500),
    ('display.max_columns', 100),
    ('display.notebook_repr_html', True),
):
    pd.set_option(option_name, option_value)


In [31]:
!pip install geopy

Collecting geopy
  Downloading geopy-1.11.0-py2.py3-none-any.whl (66kB)
Installing collected packages: geopy
Successfully installed geopy-1.11.0


In [33]:
import geopy as gp

## Notes on processing geo spatial data

We assume the lat/long entries in the NYC taxi data set are WGS 84 coordinates, since they come from GPS devices.

The table below shows the dimensions of a geohash cell for each hash length in the worst case, i.e. at the equator, where cells are at their widest.

Source: [Wikipedia](https://en.wikipedia.org/wiki/Geohash)



| Geohash Length | Width of cell | Height of cell |
|----------------|---------------|----------------|
| 1              | 5,009.4km     | 4,992.6km      |
| 2              | 1,252.3km     | 624.1km        |
| 3              | 156.5km       | 156km          |
| 4              | 39.1km        | 19.5km         |
| 5              | 4.9km         | 4.9km          |
| 6              | 1.2km         | 609.4m         |
| 7              | 152.9m        | 152.4m         |
| 8              | 38.2m         | 19m            |
| 9              | 4.8m          | 4.8m           |
| 10             | 1.2m          | 59.5cm         |
| 11             | 14.9cm        | 14.9cm         |
| 12             | 3.7cm         | 1.9cm          |

In [25]:
import geohash as gh

# Sample point (latitude, longitude); it matches the pickup coordinates of
# row 0 in the taxi data shown further down.
sampleLat = 40.750111
sampleLong = -73.993896

# Encode the same point at every geohash length from 1 to 12 characters.
# Each longer hash extends the previous one by a single character.
for precision in range(1, 13):
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print(gh.encode(sampleLat, sampleLong, precision))

# Sanity check: http://geohash.org/dr5ru4r69720 should resolve to the same lat/long.

d
dr
dr5
dr5r
dr5ru
dr5ru4
dr5ru4r
dr5ru4r6
dr5ru4r69
dr5ru4r697
dr5ru4r6972
dr5ru4r69720


In [29]:
# Round trip: decode the 12-character hash produced for the sample point.
# The decoded coordinates differ slightly from the original lat/long, so
# notice the small errors introduced by decoding.
full_precision_hash = "dr5ru4r69720"
gh.decode(full_precision_hash)

(40.750110978260636, -73.99389607831836)

## Check if nearby lat/long hash to the same cell

In [66]:
# Check if nearby lat/long pairs hash to the same geohash cell.

# NOTE: `vincenty` exists in geopy 1.x (1.11.0 was installed above); it is
# deprecated and gone in geopy 2.x, where `geodesic` is the replacement.
from geopy.distance import vincenty

# Reference point (pickup coordinates of row 0 in the taxi data); not used below.
sampleLat1 = 40.750111
sampleLong1 = -73.993896

# Three points on the same latitude: locC is a few metres from locA,
# locB is a few hundred metres away.
locA = (40.750111, -73.99389)
locB = (40.750111, -73.989)
locC = (40.750111, -73.9938)

# Single-argument print(...) with %-formatting gives identical text on
# Python 2 and Python 3 (the original print statements are Python 2 only).
print("Distance between locA and locB in Meters:  %s" % vincenty(locA, locB).meters)
print("Distance between locA and locC in Meters:  %s" % vincenty(locA, locC).meters)

# Encode every point at each precision so the length at which the hashes
# start to diverge is visible in the output.
named_points = (("LocA", locA), ("LocB", locB), ("LocC", locC))
for precision in range(1, 10):
    for label, point in named_points:
        code = gh.encode(point[0], point[1], precision)
        print("Precision  %s  geocode for %s:  %s" % (precision, label, code))

Distance between locA and locB in Meters:  412.970969318
Distance between locA and locC in Meters:  7.60066447988
Precision  1  geocode for LocA:  d
Precision  1  geocode for LocB:  d
Precision  1  geocode for LocC:  d
Precision  2  geocode for LocA:  dr
Precision  2  geocode for LocB:  dr
Precision  2  geocode for LocC:  dr
Precision  3  geocode for LocA:  dr5
Precision  3  geocode for LocB:  dr5
Precision  3  geocode for LocC:  dr5
Precision  4  geocode for LocA:  dr5r
Precision  4  geocode for LocB:  dr5r
Precision  4  geocode for LocC:  dr5r
Precision  5  geocode for LocA:  dr5ru
Precision  5  geocode for LocB:  dr5ru
Precision  5  geocode for LocC:  dr5ru
Precision  6  geocode for LocA:  dr5ru4
Precision  6  geocode for LocB:  dr5ru6
Precision  6  geocode for LocC:  dr5ru4
Precision  7  geocode for LocA:  dr5ru4r
Precision  7  geocode for LocB:  dr5ru66
Precision  7  geocode for LocC:  dr5ru4r
Precision  8  geocode for LocA:  dr5ru4r6
Precision  8  geocode for LocB:  dr5ru66f
Prec

## Distance between two points: Actual and Geohashed

In [69]:
# Two points roughly two degrees of latitude apart (40.75 vs 42.75).
locA = (40.750111, -73.99389)
locB = (42.750111, -73.989)

# Single-argument print(...) with %-formatting gives identical text on
# Python 2 and Python 3 (the original print statements are Python 2 only).
print("Actual distance between locA and locB in Meters:  %s" % vincenty(locA, locB).meters)

# Encode each point at the given precision, decode it back to coordinates,
# and measure the distance between the decoded points. When both points
# share a geohash (short hashes), they decode to the same coordinates and
# the reported distance is 0.0.
for precision in range(1, 13):
    ghLocA = gh.decode(gh.encode(locA[0], locA[1], precision))
    ghLocB = gh.decode(gh.encode(locB[0], locB[1], precision))
    print("Precision =  %s , Distance based on geohash (in meters):  %s"
          % (precision, vincenty(ghLocA, ghLocB).meters))

Actual distance between locA and locB in Meters:  222137.267177
Precision =  1 , Distance based on geohash (in meters):  0.0
Precision =  2 , Distance based on geohash (in meters):  0.0
Precision =  3 , Distance based on geohash (in meters):  312365.562867
Precision =  4 , Distance based on geohash (in meters):  234284.925235
Precision =  5 , Distance based on geohash (in meters):  219642.113731
Precision =  6 , Distance based on geohash (in meters):  222084.567987
Precision =  7 , Distance based on geohash (in meters):  222082.926417
Precision =  8 , Distance based on geohash (in meters):  222140.220258
Precision =  9 , Distance based on geohash (in meters):  222135.466682
Precision =  10 , Distance based on geohash (in meters):  222137.25434
Precision =  11 , Distance based on geohash (in meters):  222137.253906
Precision =  12 , Distance based on geohash (in meters):  222137.272556


## Play with pandas (incomplete)

In [8]:
# Load the yellow-cab trip records for 2015-01 from the local data directory.
taxi_csv_path = "./tmplocaldata/yellow_tripdata_2015-01.csv"
df = pd.read_csv(taxi_csv_path)

In [9]:
# Preview the first five trips to eyeball the columns and value formats.
df.head(n=5)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2015-01-15 19:05:39,2015-01-15 19:23:42,1,1.59,-73.993896,40.750111,1,N,-73.974785,40.750618,1,12.0,1.0,0.5,3.25,0,0.3,17.05
1,1,2015-01-10 20:33:38,2015-01-10 20:53:28,1,3.3,-74.001648,40.724243,1,N,-73.994415,40.759109,1,14.5,0.5,0.5,2.0,0,0.3,17.8
2,1,2015-01-10 20:33:38,2015-01-10 20:43:41,1,1.8,-73.963341,40.802788,1,N,-73.95182,40.824413,2,9.5,0.5,0.5,0.0,0,0.3,10.8
3,1,2015-01-10 20:33:39,2015-01-10 20:35:31,1,0.5,-74.009087,40.713818,1,N,-74.004326,40.719986,2,3.5,0.5,0.5,0.0,0,0.3,4.8
4,1,2015-01-10 20:33:39,2015-01-10 20:52:58,1,3.0,-73.971176,40.762428,1,N,-74.004181,40.742653,2,15.0,0.5,0.5,0.0,0,0.3,16.3


In [18]:
# Scratch check of range() bounds: the stop value is exclusive, so this
# yields 1..11. list(...) makes the values display on Python 3 as well,
# where a bare range() is lazy and would show as `range(1, 12)`.
list(range(1, 12))

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

In [73]:
# Show the dtype pandas assigned to each column. In the output below the two
# tpep_*_datetime columns come back as object rather than a datetime dtype.
df.dtypes

VendorID                   int64
tpep_pickup_datetime      object
tpep_dropoff_datetime     object
passenger_count            int64
trip_distance            float64
pickup_longitude         float64
pickup_latitude          float64
RateCodeID                 int64
store_and_fwd_flag        object
dropoff_longitude        float64
dropoff_latitude         float64
payment_type               int64
fare_amount              float64
extra                    float64
mta_tax                  float64
tip_amount               float64
tolls_amount             float64
improvement_surcharge    float64
total_amount             float64
dtype: object