In [1]:
import numpy as np
import scipy as sp
import pandas as pd
from pyspark.sql import SQLContext
from pyspark.sql.types import *

import TaxiSparkSchema
import geohash

from datetime import *
from dateutil.parser import parse

# Pandas display tweaks so wide frames are shown in full inside the notebook.
pandasDisplayOptions = [
    ('display.width', 500),
    ('display.max_columns', 100),
    ('display.notebook_repr_html', True),
]
for optionName, optionValue in pandasDisplayOptions:
    pd.set_option(optionName, optionValue)

# `sc` is not created anywhere in this notebook; presumably the pyspark shell
# provides it. Leaving it as the last expression echoes the context object.
sc

<pyspark.context.SparkContext at 0x7fdfb16d4e10>

## 0. Prerequisites

1. Setup the spark cluster
2. Upload (a) this notebook (b) TaxiSparkSchema.py (c) geohash.py
3. When you start pyspark, use the following command instead. This is required for loading the spark-csv framework used by data frames. This library is used in the last section for sanity checking.
>pyspark --packages com.databricks:spark-csv_2.10:1.3.0

## 1. Setup the schema and load required libraries

In [2]:
# Fetch the Spark schemas for both cab datasets from the local helper module.
# Yellow is requested first and green second, the same order as before, so any
# output the helpers print (see the column counts below) keeps its order.
yCabSchema, gCabSchema = (
    TaxiSparkSchema.getYellowCabSchema(),
    TaxiSparkSchema.getGreenCabSchema(),
)

# of columns:  20
# of columns:  22


In [3]:
# Register the two local helper modules with the SparkContext, in the same
# order as before (schema helper first, then the geohash encoder).
for helperModule in ("TaxiSparkSchema.py", "geohash.py"):
    sc.addPyFile(helperModule)

In [4]:
sqlContext = SQLContext(sc)


def loadCabCsv(path, schema):
    """Read header-less CSV part files into a DataFrame via spark-csv.

    path   -- location of the CSV part files (the callers below pass S3 globs).
    schema -- schema for the columns, handed straight to `load`.

    Returns the DataFrame produced by the reader. The reader is configured
    with mode="PERMISSIVE" and header='false', the same options both loads
    used when they were written out by hand.
    """
    # `sqlContext.read` is looked up on every call, so each load gets its own reader.
    reader = sqlContext.read.format('com.databricks.spark.csv').options(mode="PERMISSIVE", header='false')
    return reader.load(path, schema = schema)


# Yellow- and green-cab trip records, each loaded with its own schema.
yCabDF = loadCabCsv('s3://testsetu/nyc/final/yellow/consolidated/part*', yCabSchema)
gCabDF = loadCabCsv('s3://testsetu/nyc/final/green/consolidated/pa*', gCabSchema)

## 2. Some basic cleanup

There seem to be 325 records in the Yellow Cab data with an **incorrect latitude** (outside the -90 to 90 degree range). The code below filters them out.

In [26]:
# Keep only rows whose pickup latitude lies strictly between -90 and 90 degrees.
pickupLat = yCabDF.pickup_latitude
yCabDF = yCabDF.filter((pickupLat > -90.0) & (pickupLat < 90.0))

In [22]:
totalRecordsYellowCab = 407403053
print "# dirty latitude records: ", totalRecordsYellowCab - yFil.count()

# dirty latitude records:  325


## 3. Checking out the data structure

In [11]:
# Show the column names, types and nullability of the yellow-cab DataFrame.
yCabDF.printSchema()

root
 |-- cab_company: string (nullable = true)
 |-- vendor_id: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: float (nullable = true)
 |-- pickup_longitude: float (nullable = true)
 |-- pickup_latitude: float (nullable = true)
 |-- rate_code_id: string (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- dropoff_longitude: float (nullable = true)
 |-- dropoff_latitude: float (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- fare_amount: float (nullable = true)
 |-- extra: float (nullable = true)
 |-- mta_tax: float (nullable = true)
 |-- tip_amount: float (nullable = true)
 |-- tolls_amount: float (nullable = true)
 |-- improvement_surcharge: float (nullable = true)
 |-- total_amount: float (nullable = true)



In [23]:
# Number of yellow-cab records. %time reports how long the count takes
# (roughly five minutes of wall time in the recorded run below).
%time yCabDF.count()

CPU times: user 20 ms, sys: 12 ms, total: 32 ms
Wall time: 5min 10s


407403053

In [5]:
# Number of green-cab records, printed with a label. %time reports how long
# the count takes (about half a minute of wall time in the recorded run below).
%time print "Count of # records from data frame: ", gCabDF.count()

Count of # records from data frame:  26869879
CPU times: user 4 ms, sys: 4 ms, total: 8 ms
Wall time: 32.5 s


## 4. Checking out the GeoSpatial data

In [9]:
precision = 6
gCabPickupGeoHash = gCabDF.select("pickup_latitude", "pickup_longitude").map(lambda latLong: geohash.encode(latLong[0], latLong[1], precision))
%time print gCabPickupGeoHash.distinct().count()

3968
CPU times: user 28 ms, sys: 4 ms, total: 32 ms
Wall time: 1min 2s


In [27]:
precision = 6
yCabPickupGeoHash = yCabDF.select("pickup_latitude", "pickup_longitude").map(lambda latLong: geohash.encode(latLong[0], latLong[1], precision))
%time print yCabPickupGeoHash.distinct().count()

30365
CPU times: user 120 ms, sys: 64 ms, total: 184 ms
Wall time: 14min 13s


## 5. Feature Extraction

In [33]:
# TODO: add an extra `pickup_geohash` column to gCabDF.
# The two drafts below are unfinished and kept only as notes; the first one
# has unbalanced parentheses as written and will not parse if uncommented.
# NOTE(review): withColumn presumably needs a Column expression, so
# geohash.encode would likely have to be wrapped in a UDF — confirm before enabling.
#gCabDF.withColumn("pickup_geohash", geohash.encode(col(gCabDF.pickup_latitude, col(gCabDF.pickup_longitude), precision))
#gCabDF.map(lambda record: )
