# Data Prep for Destination Visualization & Prediction

In [3]:
import numpy as np
import scipy as sp
import pandas as pd
from pyspark.sql import SQLContext
from pyspark.sql.types import *

import geohash

from datetime import *
from dateutil.parser import parse

# Widen pandas' output so wide frames are not wrapped or column-truncated,
# and keep the HTML table rendering in the notebook.
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)

# Echo the SparkContext that the notebook kernel provides.
sc

<pyspark.context.SparkContext at 0x7fe56905ded0>

In [4]:
# File needed for geohash routines: addPyFile registers geohash.py as a
# dependency of the tasks run on this SparkContext.
sc.addPyFile("geohash.py")

## 1. Parse the CSV and construct RDDs

In [5]:
def yCabParse(strRecord):
    """Pick the trip fields out of one comma-split yellow-cab row.

    Columns read from strRecord:
        2: pickup datetime
        6: pickup long
        7: pickup lat
        10: dropoff long
        11: dropoff lat

    Returns (pickup datetime, pickup long, pickup lat, dropoff long,
    dropoff lat); the timestamp goes through ``parse`` and the four
    coordinates through ``float``.
    """
    pickupTime = parse(strRecord[2])
    coords = [float(strRecord[col]) for col in (6, 7, 10, 11)]
    return (pickupTime,) + tuple(coords)

# Read every yellow-cab part file, split each line on commas, then reduce
# each row to the parsed trip tuple.
yCabRDD = (
    sc.textFile("s3://testsetu/nyc/final/yellow/consolidated/pa*")
    .map(lambda line: tuple(line.split(',')))
    .map(yCabParse)
)

In [6]:
def gCabParse(strRecord):
    """Pick the trip fields out of one comma-split green-cab row.

    Columns read from strRecord:
        2: pickup datetime
        6: pickup long
        7: pickup lat
        8: dropoff long
        9: dropoff lat

    Returns (pickup datetime, pickup long, pickup lat, dropoff long,
    dropoff lat); the timestamp goes through ``parse`` and the four
    coordinates through ``float``.
    """
    pickupTime = parse(strRecord[2])
    coords = [float(strRecord[col]) for col in (6, 7, 8, 9)]
    return (pickupTime,) + tuple(coords)

# Read every green-cab part file, split each line on commas, then reduce
# each row to the parsed trip tuple.
gCabRDD = (
    sc.textFile("s3://testsetu/nyc/final/green/consolidated/pa*")
    .map(lambda line: tuple(line.split(',')))
    .map(gCabParse)
)

In [7]:
# Both parsers emit the same 5-field tuple, so the yellow and green trips can
# be concatenated into one RDD.
combinedRDD = yCabRDD.union(gCabRDD)

In [8]:
def prepData(record, onlyLocationAgg=False, geohashAccuracy=6, minsPerBin=48):
    """Turn one parsed trip into a ``(key, 1)`` pair ready for ``reduceByKey``.

    record layout (the tuple produced by yCabParse / gCabParse):
        0: pickup datetime
        1: pickup long
        2: pickup lat
        3: dropoff long
        4: dropoff lat

    Parameters:
        record: the 5-field trip tuple described above.
        onlyLocationAgg: when left at False the key carries the time slot
            and weekday; any other value selects the location/weekend key.
        geohashAccuracy: precision passed to ``geohash.encode``.
        minsPerBin: width of a time slot in minutes.

    Returns:
        None when either end of the trip falls outside the bounding box
        (35 < lat < 50 and -80 < long < -50), otherwise
        ``((pickupGeohash, dropOffGeohash, time_num, weekday), 1)`` or
        ``((pickupGeohash, dropOffGeohash, weekend), 1)``.
        ``time_num`` is the midpoint of the pickup's time slot expressed as
        a fraction of the day; ``weekend`` is 1 for Saturday/Sunday, else 0.
    """
    pickupDatetime = record[0]
    pickupLong = record[1]
    pickupLat = record[2]
    dropOffLong = record[3]
    dropOffLat = record[4]

    # Reject the trip before doing any encoding if either end is out of range.
    if not (35 < pickupLat < 50 and -80 < pickupLong < -50):
        return None
    if not (35 < dropOffLat < 50 and -80 < dropOffLong < -50):
        return None

    pickupGeohash = geohash.encode(pickupLat, pickupLong, geohashAccuracy)
    dropOffGeohash = geohash.encode(dropOffLat, dropOffLong, geohashAccuracy)

    totalMinsPerDay = 1440
    elapsMins = pickupDatetime.hour * 60 + pickupDatetime.minute

    # Floor division is spelled out so the slot index stays an integer under
    # both Python 2 and Python 3 division rules.
    currentBin = elapsMins // minsPerBin
    time_num = (currentBin * minsPerBin + minsPerBin / 2.0) / totalMinsPerDay

    weekday = pickupDatetime.weekday()
    weekend = 1 if weekday in (5, 6) else 0

    if onlyLocationAgg is False:
        return ((pickupGeohash, dropOffGeohash, time_num, weekday), 1)
    return ((pickupGeohash, dropOffGeohash, weekend), 1)

## 2. Create aggregations across time & locations

#### The output of this is used in the predictions notebook. Here we aggregate the data by pickup location, drop-off location, day of week & time slot.

In [18]:
# Key every trip by (pickup, dropoff, time slot, weekday). prepData returns
# None for trips outside its bounding box, so those are filtered out here.
combinedCleanRDD = (
    combinedRDD
    .map(prepData)
    .filter(lambda keyedTrip: keyedTrip is not None)
)

In [19]:
# Every record is (key, 1); summing the values gives the trip count per key.
groupedRDD = combinedCleanRDD.reduceByKey(lambda a,b: a+b)

In [20]:
# Mark the aggregate for caching so later actions on it can reuse the result.
groupedRDD.cache()

PythonRDD[29] at RDD at PythonRDD.scala:43

In [None]:
def toCSVLine(record):
    """Render one ((pickup, dropoff, time_num, weekday), count) pair as CSV.

    The first four key fields are written in order, followed by the count,
    each converted with ``str`` and joined by commas.
    """
    fields = [record[0][pos] for pos in range(4)]
    fields.append(record[1])
    return ','.join(map(str, fields))

# Serialise each aggregated pair as one CSV line, then collapse to a single
# partition before writing.
csvRDD = groupedRDD.map(toCSVLine)
csvRDD.repartition(1).saveAsTextFile('s3://testsetu/nyc/final/groupbydestn/singlefile')

## 3. Aggregations only by locations

#### Here we aggregate all the data by pickup & drop-off locations only (split by a weekend/weekday flag)

In [9]:
# Key every trip by (pickup, dropoff, weekend flag). prepData returns None
# for trips outside its bounding box, so those are filtered out here.
combinedCleanRDD = (
    combinedRDD
    .map(lambda record: prepData(record, onlyLocationAgg=True))
    .filter(lambda keyedTrip: keyedTrip is not None)
)

In [10]:
# Every record is (key, 1); summing the values gives the trip count per key.
groupedRDD = combinedCleanRDD.reduceByKey(lambda a,b: a+b)

In [11]:
# cache() only marks the RDD; the timed count() below is the action that
# actually runs the aggregation.
groupedRDD.cache()
%time groupedRDD.count()

CPU times: user 148 ms, sys: 116 ms, total: 264 ms
Wall time: 32min 56s


1091619

In [12]:
# Peek at two records: ((pickupGeohash, dropOffGeohash, weekend), count).
groupedRDD.take(2)

[(('dr72p9', 'dr5rec', 0), 68), (('dr5rvf', 'dr5rmn', 0), 8)]

In [13]:
def toCSVLineOnlyLocnAgg(record):
    """Render one ((pickupGeohash, dropOffGeohash, weekend), count) pair as CSV.

    The three key fields are written in order, followed by the count, each
    converted with ``str`` and joined by commas.
    """
    fields = [record[0][pos] for pos in range(3)]
    fields.append(record[1])
    return ','.join(map(str, fields))

# Serialise each aggregated pair as one CSV line, then collapse to a single
# partition before writing.
csvRDD = groupedRDD.map(toCSVLineOnlyLocnAgg)
csvRDD.repartition(1).saveAsTextFile('s3://testsetu/nyc/final/groupbydestn_only_locn/singlefile')

In [14]:
# Read back the first few lines of the written output as a sanity check.
sc.textFile("s3://testsetu/nyc/final/groupbydestn_only_locn/singlefile/p*").take(5)

[u'dr72jh,dr5rm3,1,5',
 u'dr5rgy,dr5rvr,1,1',
 u'dr5x1r,dr72jj,0,1',
 u'dr5ru2,dr72p9,1,11',
 u'dr5rs5,dr5ruc,1,35']

In [None]:
# Need to fix the CSV file: lat/long needs to be included instead of geohash.
# The aggregated part file is read from a local path here, not from S3.
names = ["pickup_geohash","dropoff_geohash","weekend", "count"]
df=pd.read_csv("./tmplocaldata/final/groupbydestn_only_locn/singlefile/part-00000", header=None, names = names)

def decodegeo(geo, which):
    """Return one component of a decoded geohash.

    geo: the geohash string.
    which: index into the value returned by ``geohash.decode``; the callers
        in this notebook pass 0 for latitude and 1 for longitude.

    Hashes shorter than 6 characters are not decoded; 0 is returned instead.
    """
    if len(geo) < 6:
        return 0
    return geohash.decode(geo)[which]
    
def further_data_prep(df):
    """Add decoded lat/long columns for the pickup and dropoff geohashes.

    Writes ``pickup_lat``, ``pickup_long``, ``dropoff_lat`` and
    ``dropoff_long`` onto ``df`` in place (in that order) and returns the
    same frame.
    """
    for prefix in ('pickup', 'dropoff'):
        hashColumn = prefix + '_geohash'
        df[prefix + '_lat'] = df[hashColumn].apply(lambda geo: decodegeo(geo, 0))
        df[prefix + '_long'] = df[hashColumn].apply(lambda geo: decodegeo(geo, 1))
    return df

# Decode the geohashes, discard the hash columns, fix the column order and
# write the frame that the visualization reads.
df = further_data_prep(df)
for hashColumn in ('pickup_geohash', 'dropoff_geohash'):
    df.drop(hashColumn, axis=1, inplace=True)
outputColumns = ["pickup_lat","pickup_long", "dropoff_lat","dropoff_long","weekend","count"]
df = df[outputColumns]
df.to_csv("pickup_dropoff_aggregated.csv", index=False)

#### Tableau visualization from the above CSV is available [here](https://public.tableau.com/profile/publish/pickup-destination-coupling/Dashboard1#!/publish-confirm)

In [8]:
%%html
<script type='text/javascript' src='https://public.tableau.com/javascripts/api/viz_v1.js'></script><div class='tableauPlaceholder' style='width: 804px; height: 519px;'><noscript><a href='#'><img alt='Where do people go from where? ' src='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;pi&#47;pickup-destination-coupling&#47;Dashboard1&#47;1_rss.png' style='border: none' /></a></noscript><object class='tableauViz' width='804' height='519' style='display:none;'><param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' /> <param name='site_root' value='' /><param name='name' value='pickup-destination-coupling&#47;Dashboard1' /><param name='tabs' value='no' /><param name='toolbar' value='yes' /><param name='static_image' value='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;pi&#47;pickup-destination-coupling&#47;Dashboard1&#47;1.png' /> <param name='animate_transition' value='yes' /><param name='display_static_image' value='yes' /><param name='display_spinner' value='yes' /><param name='display_overlay' value='yes' /><param name='display_count' value='yes' /><param name='showVizHome' value='no' /><param name='showTabs' value='y' /><param name='bootstrapWhenNotified' value='true' /></object></div>

# Appendix

## Machine learning sandbox (for reference only - please ignore)

In [None]:

from pyspark.mllib.tree import RandomForest, RandomForestModel


In [None]:
# Peek at the first few lines of the time/location aggregate.
sc.textFile("s3://testsetu/nyc/final/groupbydestn/singlefile/pa*").take(5)

In [None]:
def groupedParse(strRecord):
    """Convert one comma-split aggregate row to typed values.

    Columns of strRecord:
        0: pickupGeohash (kept as-is)
        1: dropOffGeohash (kept as-is)
        2: time_num -> float
        3: day_of_week -> int
        4: count -> int

    Returns (pickupGeohash, dropOffGeohash, time_num, day_of_week, count).
    """
    pickupGeohash, dropOffGeohash = strRecord[0], strRecord[1]
    timeNum = float(strRecord[2])
    dayOfWeek = int(strRecord[3])
    tripCount = int(strRecord[4])
    return (pickupGeohash, dropOffGeohash, timeNum, dayOfWeek, tripCount)

# Load the time/location aggregate, split each line on commas, and convert
# the fields to their typed form.
gpRDD = (
    sc.textFile("s3://testsetu/nyc/final/groupbydestn/singlefile/pa*")
    .map(lambda line: tuple(line.split(',')))
    .map(groupedParse)
)

In [None]:
# Cache the parsed aggregate and run one action over it before reusing it.
gpRDD.cache()
gpRDD.count()

# Give every distinct drop-off geohash an integer class label. Sorting first
# makes the label assignment reproducible instead of depending on the order
# in which collect() happens to return the hashes.
dropoffGeohashes = sorted(gpRDD.map(lambda x: x[1]).distinct().collect())
dictLength = len(dropoffGeohashes)
dropoffGeohashDict = {gh: label for label, gh in enumerate(dropoffGeohashes)}

# Broadcast the lookup table for use inside RDD transformations.
broadcastGH = sc.broadcast(dropoffGeohashDict)

In [None]:
from pyspark.mllib.regression import LabeledPoint
import math
def extractFeaturesforML(record):
    """Build a LabeledPoint from one parsed aggregate row.

    record layout:
        0: pickupGeohash
        1: dropOffGeohash
        2: time_num
        3: day_of_week
        4: count

    The label is the integer that ``broadcastGH`` maps the drop-off geohash
    to. The feature vector is
    [time_num, day_of_week, count, cos, sin, pickupLat, pickupLong], where
    cos/sin are taken of ``time_num * 2 * pi`` and the pickup lat/long come
    from decoding the pickup geohash.
    """
    tripCount = record[4]

    # Same expression for both trig terms, so evaluate it once.
    angle = record[2] * 2 * math.pi
    cosOfTime = math.cos(angle)
    sinOfTime = math.sin(angle)

    decodedPickup = geohash.decode(record[0])

    featureVector = np.array([
        record[2],
        record[3],
        tripCount,
        cosOfTime,
        sinOfTime,
        decodedPickup[0],
        decodedPickup[1],
    ])

    return LabeledPoint(broadcastGH.value[record[1]], featureVector)

# Smoke-test the feature extraction on two records before building the
# cached feature RDD.
gpRDD.map(lambda x: extractFeaturesforML(x)).take(2)
featuresLP = gpRDD.map(lambda x: extractFeaturesforML(x)).cache()
# Timed action over the cached feature RDD.
%time featuresLP.count()

In [None]:
# Sandbox code: not optimal
(trainingData, testData) = featuresLP.randomSplit([0.8, 0.2])

model = RandomForest.trainClassifier(trainingData, numClasses=dictLength, categoricalFeaturesInfo={},
                                     numTrees=10, featureSubsetStrategy="auto",
                                     impurity='gini', maxDepth=10, maxBins=32)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification forest model:')
#print(model.toDebugString())
