In [1]:
import numpy as np
import scipy as sp
import pandas as pd
from pyspark.sql import SQLContext
from pyspark.sql.types import *

import geohash

from datetime import *
from dateutil.parser import parse

pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)

sc

<pyspark.context.SparkContext at 0x7fca81b70490>

In [2]:
sc.addPyFile("geohash.py")

In [3]:
"""
    2: pickup dattime
    6: pickup long
    7: pickup lat
    10: dropoff long
    11: dropoff lat
"""
def yCabParse(strRecord):    
    return (parse(strRecord[2]), float(strRecord[6]), float(strRecord[7]), float(strRecord[10]), float(strRecord[11]))

yCabRDD = sc.textFile("s3://testsetu/nyc/final/yellow/consolidated/pa*").map(lambda line: tuple(line.split(',')))
yCabRDD = yCabRDD.map(lambda record: yCabParse(record))

In [4]:
"""
    2: pickup dattime
    6: pickup long
    7: pickup lat
    8: dropoff long
    9: dropoff lat
"""
def gCabParse(strRecord):    
    return (parse(strRecord[2]), float(strRecord[6]), float(strRecord[7]), float(strRecord[8]), float(strRecord[9]))

gCabRDD = sc.textFile("s3://testsetu/nyc/final/green/consolidated/pa*").map(lambda line: tuple(line.split(',')))
gCabRDD = gCabRDD.map(lambda record: gCabParse(record))

In [5]:
combinedRDD = yCabRDD.union(gCabRDD)

In [11]:
"""
    2: pickup dattime
    6: pickup long
    7: pickup lat
    8: dropoff long
    9: dropoff lat
"""
def prepData(record):
    
    geohashAccuracy = 6
    minsPerBin = 48
    
    pickupDatetime = record[0] 
    pickupLong = record[1]
    pickupLat = record[2]
    dropOffLong = record[3]
    dropOffLat = record[4]
    
    if pickupLat < 50 and pickupLat > 35 and pickupLong < -50 and pickupLong > -80:
        pickupGeohash = geohash.encode(pickupLat,pickupLong, geohashAccuracy)
    else:
        return None
    
    if dropOffLat < 50 and dropOffLat > 35 and dropOffLong < -50 and dropOffLong > -80:
        dropOffGeohash = geohash.encode(dropOffLat,dropOffLong, geohashAccuracy)
    else:
        return None
    
    
    #time_cat
    d = pickupDatetime
    
    totalMinsPerDay = 1440
    totalBins = totalMinsPerDay/minsPerBin
    
    elapsMins = (d.hour)*60 + d.minute
    #minsPerBin = totalMinsPerDay/totalBins
    currentBin = elapsMins/minsPerBin
    binnedHour = d.hour #elapsMins/60
    binnedMin = (currentBin * minsPerBin)- (binnedHour * 60)
    
    binStr = ""
    
    if (binnedHour/10>0):
        binStr = str(binnedHour)
    else:
        binStr = "0"+str(binnedHour)
    
    binStr = binStr + ":"
    
    if (binnedMin/10>0):
        binStr = binStr + str(binnedMin)
    else:
        binStr = binStr + "0"+str(binnedMin)
    
    time_num = (binnedHour*60 + binnedMin + minsPerBin / 2.0)/(60*24)  
    
    #day of week
    dayStr = {0: "Mon",
                  1: "Tue",
                  2: "Wed",
                  3: "Thu",
                  4: "Fri",
                  5: "Sat",
                  6: "Sun"}
    day_of_week = dayStr[d.weekday()]
    
    #weekend
    """if d.weekday() in [5,6]:
        weekend = 1
    else:
        weekend = 0"""
    
    return ((pickupGeohash, dropOffGeohash,time_num,d.weekday()),1)

In [12]:
combinedCleanRDD = combinedRDD.map(lambda record: prepData(record)).filter(lambda a: a is not None)

In [13]:
groupedRDD = combinedCleanRDD.reduceByKey(lambda a,b: a+b)

In [11]:
groupedRDD.cache()

PythonRDD[17] at RDD at PythonRDD.scala:43

In [None]:
%time groupedRDD.count()

In [None]:
groupedRDD.take(2)

In [14]:
def toCSVLine(record):
    data = [record[0][0], record[0][1], record[0][2], record[0][3], record[1]]
    return ','.join(str(d) for d in data)

csvRDD = groupedRDD.map(toCSVLine)
csvRDD.repartition(1).saveAsTextFile('s3://testsetu/nyc/final/groupbydestn/singlefile')

## Feature extraction

In [15]:
sc.textFile("s3://testsetu/nyc/final/groupbydestn/singlefile/pa*").map(lambda line: line).take(5)

[u'dr5rtk,dr5rsj,0.416666666667,4,2',
 u'dr5x0z,dr5xcf,0.683333333333,0,2',
 u'dr72rd,dr782h,0.65,3,2',
 u'dr5rvn,dr5rt5,0.0833333333333,2,11',
 u'dr72jd,dr72m3,0.05,0,3']

In [25]:
"""
pickupGeohash, dropOffGeohash,time_num,day_of_week, count
"""
def groupedParse(strRecord):    
    return (strRecord[0], strRecord[1], float(strRecord[2]), int(strRecord[3]), int(strRecord[4]))

gpRDD = sc.textFile("s3://testsetu/nyc/final/groupbydestn/singlefile/pa*").map(lambda line: tuple(line.split(','))).map(lambda x: groupedParse(x))

In [26]:
gpRDD.take(2)

[(u'dr5rtk', u'dr5rsj', 0.416666666667, 4, 2),
 (u'dr5x0z', u'dr5xcf', 0.683333333333, 0, 2)]

In [27]:
gpRDD.cache()
gpRDD.count()

15285988

In [22]:
dropoffGeohashes = gpRDD.map(lambda x: x[1]).distinct().collect()

In [23]:
dictLength = len(dropoffGeohashes)

In [28]:
dropoffGeohashDict = {}
i = 0
for gh in dropoffGeohashes:
    dropoffGeohashDict[gh] = i
    i = i +1

broadcastGH = sc.broadcast(dropoffGeohashDict)

In [35]:
from pyspark.mllib.regression import LabeledPoint
import math
#Create features as labeledpoint

"""
0:pickupGeohash
1:dropOffGeohash
2:time_num
3:day_of_week
4: count
"""
def extractFeaturesforML(record):
    #np.array([1.0, 0.0, 3.0])
    
    count = record[4]
    timeCos = math.cos(record[2] * 2 * math.pi)
    timeSin = math.sin(record[2] * 2 * math.pi)
    
    #dayCos = math.cos(key[3] * 2 * math.pi)
    #daySin = math.sin(key[3] * 2 * math.pi)
    
    pickupLatLong = geohash.decode(record[0])
    pickupLat=pickupLatLong[0]
    pickupLong=pickupLatLong[1]    
    
    features_ = np.array([record[2], record[3], count, timeCos, timeSin, pickupLat, pickupLong])
    
    return LabeledPoint(broadcastGH.value[record[1]], features_)

In [36]:
gpRDD.map(lambda x: extractFeaturesforML(x)).take(2)

[LabeledPoint(26847.0, [0.416666666667,4.0,2.0,-0.866025403785,0.499999999998,40.7180786133,-73.9434814453]),
 LabeledPoint(7859.0, [0.683333333333,0.0,2.0,-0.406736643078,-0.913545457642,40.6466674805,-73.7896728516])]

In [37]:
featuresLP = gpRDD.map(lambda x: extractFeaturesforML(x)).cache()
%time featuresLP.count()

CPU times: user 12 ms, sys: 0 ns, total: 12 ms
Wall time: 1min 31s


15285988

## Machine learning

In [38]:

from pyspark.mllib.tree import RandomForest, RandomForestModel


In [39]:
(trainingData, testData) = featuresLP.randomSplit([0.8, 0.2])

model = RandomForest.trainClassifier(trainingData, numClasses=dictLength, categoricalFeaturesInfo={},
                                     numTrees=10, featureSubsetStrategy="auto",
                                     impurity='gini', maxDepth=10, maxBins=32)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification forest model:')
#print(model.toDebugString())


Test Error = 0.98894831021
Learned classification forest model:
