In [2]:
import os
from collections import namedtuple
# Location of the input files, read from the YAHOO_DATA environment variable.
# A missing variable raises KeyError here, before any Spark work is started.
data_path = os.environ["YAHOO_DATA"]
# os.path.join supplies the path separator when YAHOO_DATA has no trailing
# one; plain string concatenation would silently produce a wrong file name.
rawEventsRdd = sc.textFile(os.path.join(data_path, "filteredEvents.txt"))
# One parsed event. The first eight fields follow the column order of a
# tab-separated input line; event_type is read from the line's last column.
EventDataRow = namedtuple(
    "EventDataRow",
    [
        "userId", "itemId", "ts",
        "latitude", "longitude", "city",
        "day_of_week", "time_of_day", "event_type",
    ],
)


def parseRawData(line):
    """
    Convert one tab-separated text line into an EventDataRow.

    Columns 0, 1 and 5 are kept as strings, columns 2, 6 and 7 are parsed
    as int, columns 3 and 4 as float, and the last column of the line is
    stored as event_type.

    Raises IndexError when the line has too few columns and ValueError
    when a numeric column cannot be parsed.
    """
    columns = line.split("\t")
    return EventDataRow(
        columns[0],
        columns[1],
        int(columns[2]),
        float(columns[3]),
        float(columns[4]),
        columns[5],
        int(columns[6]),
        int(columns[7]),
        columns[-1],
    )
# Parsed events; cached because the three dictionary builds and the id
# conversion below each read this RDD.
eventsRdd = rawEventsRdd.map(parseRawData).cache()


def _buildIndexDictionary(fieldName):
    """
    Collect the distinct values of one EventDataRow field from eventsRdd
    into a driver-side dict mapping value -> index given by zipWithIndex.
    """
    distinctValues = eventsRdd.map(lambda event: getattr(event, fieldName)).distinct()
    return distinctValues.zipWithIndex().collectAsMap()


# Each dictionary is also broadcast so the conversion below can look values
# up on the executors.
userIdConversionDictionary = _buildIndexDictionary("userId")
userIdConversionDictionaryBroadcast = sc.broadcast(userIdConversionDictionary)
itemIdConversionDictionary = _buildIndexDictionary("itemId")
itemIdConversionDictionaryBroadcast = sc.broadcast(itemIdConversionDictionary)
cityConversionDictionary = _buildIndexDictionary("city")
cityConversionDictionaryBroadcast = sc.broadcast(cityConversionDictionary)


def _convertEventIds(event):
    """
    Return an EventDataRow equal to `event` except that userId, itemId and
    city are replaced by their indices from the broadcast dictionaries.
    """
    userIndex = userIdConversionDictionaryBroadcast.value[event.userId]
    itemIndex = itemIdConversionDictionaryBroadcast.value[event.itemId]
    cityIndex = cityConversionDictionaryBroadcast.value[event.city]
    return EventDataRow(
        userId=userIndex,
        itemId=itemIndex,
        ts=event.ts,
        latitude=event.latitude,
        longitude=event.longitude,
        city=cityIndex,
        day_of_week=event.day_of_week,
        time_of_day=event.time_of_day,
        event_type=event.event_type,
    )


eventsConvertedRdd = eventsRdd.map(_convertEventIds)
eventsConvertedRdd.take(2)

[EventDataRow(userId=1, itemId=41, ts=1421521691, latitude=47.23505, longitude=-122.534698, city=9, day_of_week=5, time_of_day=11, event_type=u'App_Opened'),
 EventDataRow(userId=1, itemId=46, ts=1421558502, latitude=47.23505, longitude=-122.534698, city=9, day_of_week=5, time_of_day=21, event_type=u'App_Opened')]

In [3]:
# Re-key every event by user: [userId, (itemId, ts, latitude, longitude)].
finalRDD = eventsConvertedRdd.map(
    lambda event: [
        event.userId,
        (event.itemId, event.ts, event.latitude, event.longitude),
    ]
)


def _sortUserEvents(userAndEvents):
    """Return (userId, events) with the events ordered by element 1, the timestamp."""
    userId, userEvents = userAndEvents
    return (userId, sorted(userEvents, key=lambda event: event[1]))


# Bring the per-user groups to the driver ordered by userId, each with its
# events in timestamp order.
groupData = list(map(_sortUserEvents, sorted(finalRDD.groupByKey().collect())))

In [5]:
from math import radians, cos, sin, asin, sqrt

def haversine(lon1, lat1, lon2, lat2, radius=6371):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees).

    lon1, lat1 -- longitude and latitude of the first point, in degrees
    lon2, lat2 -- longitude and latitude of the second point, in degrees
    radius     -- radius of the sphere; the result is expressed in the same
                  unit. Defaults to 6371 (kilometers); use 3956 for miles.

    Returns the distance as a float in the unit of `radius`.
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # Rounding can push sqrt(a) marginally above 1 for (near-)antipodal
    # points, which would make asin raise ValueError; clamp to its domain.
    c = 2 * asin(min(1.0, sqrt(a)))
    return c * radius
def detectMovement(x, maxGapSeconds=300, standingMaxSpeed=1, walkingMaxSpeed=2.4):
    """
    Label every event of one user with a coarse movement class.

    x                -- (userId, events) pair; events is an indexable
                        sequence of (itemId, ts, latitude, longitude) tuples
                        with ts in seconds. Each event is compared with the
                        one directly before it in the sequence.
    maxGapSeconds    -- two consecutive events further apart than this are
                        not compared (movement not available)
    standingMaxSpeed -- highest speed, in meters per second, still labelled
                        as standing still
    walkingMaxSpeed  -- highest speed, in meters per second, still labelled
                        as walking

    Returns (userId, labelled) where each element of labelled is
    (itemId, ts, latitude, longitude, moving) and moving is
        0 -- not available (first event, gap above maxGapSeconds, or a
             time difference that is not positive)
        1 -- standing still
        2 -- walking speed
        3 -- faster
    An empty event sequence yields (userId, []).
    """
    userId, data = x[0], x[1]
    if len(data) == 0:
        # No first event to seed the result with.
        return (userId, [])

    first = data[0]
    newData = [(first[0], first[1], first[2], first[3], 0)]
    for i in range(1, len(data)):
        previous, event = data[i - 1], data[i]
        # haversine takes (lon, lat) pairs and returns kilometers.
        distance = haversine(event[3], event[2], previous[3], previous[2]) * 1000  # in meters
        time_difference = event[1] - previous[1]  # in seconds
        moving = 0  # not available
        if 0 < time_difference <= maxGapSeconds:
            velocity = distance / float(time_difference)  # meters per second
            if velocity < 0:
                moving = 0  # not available
            elif velocity <= standingMaxSpeed:
                moving = 1  # standing still
            elif velocity <= walkingMaxSpeed:
                moving = 2  # walking speed
            else:
                moving = 3  # faster
        newData.append((event[0], event[1], event[2], event[3], moving))
    return (userId, newData)
#print haversine(elem[0][1][2][1],elem[0][1][1][1],elem[6][1][2][1],elem[6][1][1][1])
# Turn the driver-side list of (userId, events) pairs back into an RDD and
# run detectMovement on every pair; .cache() marks the result for reuse.
final = sc.parallelize(groupData).map(detectMovement).cache()

# Display the first (userId, labelled events) pair as the cell output.
final.take(1)

[(0,
  [(9, 1419930330, 41.953468, -88.03688, 0),
   (26, 1419933628, 41.953377, -88.036888, 0),
   (26, 1419933925, 41.95377, -88.03685, 1),
   (7, 1419936235, 41.95089, -88.031929, 0),
   (32, 1419936281, 41.95089, -88.031929, 1),
   (7, 1419936795, 41.95372, -88.036819, 0),
   (7, 1419936826, 41.95372, -88.036819, 1),
   (25, 1419939212, 41.953773, -88.03685, 0),
   (36, 1419939665, 41.95377, -88.036842, 0),
   (32, 1419943830, 41.953518, -88.036728, 0),
   (25, 1419944425, 41.953758, -88.036797, 0),
   (18, 1419944495, 41.953781, -88.036812, 1),
   (32, 1419945131, 41.953392, -88.036758, 0),
   (7, 1419945747, 41.952774, -88.036896, 0),
   (32, 1419945766, 41.952774, -88.036896, 1),
   (29, 1419946786, 41.953533, -88.036667, 0),
   (41, 1419946886, 41.953434, -88.03688, 1),
   (25, 1419947132, 41.953484, -88.036652, 1),
   (25, 1419947331, 41.953777, -88.036812, 1),
   (41, 1419947368, 41.953491, -88.036858, 1),
   (2, 1419947404, 41.953674, -88.036758, 1),
   (32, 1419947827, 41.9

In [3]:
# execfile is a Python 2 builtin; called with only a file name it runs the
# script in the current (notebook) namespace.
# NOTE(review): the output shown for this cell holds 4-element tuples, not
# the 5-element tuples built by detectMovement, so `final` is presumably
# (re)defined by ../script/context.py -- TODO confirm against that script.
execfile("../script/context.py")
final.take(2)

[(0,
  [(9, 1, 1, 1),
   (26, 1, 3, 1),
   (26, 2, 3, 1),
   (7, 1, 3, 1),
   (32, 2, 3, 1),
   (7, 1, 3, 1),
   (7, 2, 3, 1),
   (25, 1, 3, 1),
   (36, 1, 3, 1),
   (32, 1, 3, 1),
   (25, 1, 3, 2),
   (18, 2, 3, 2),
   (32, 1, 3, 2),
   (7, 1, 3, 2),
   (32, 2, 3, 2),
   (29, 1, 3, 2),
   (41, 2, 1, 2),
   (25, 2, 3, 2),
   (25, 2, 3, 2),
   (41, 2, 1, 2),
   (2, 2, 3, 2),
   (32, 1, 3, 2),
   (44, 1, 3, 2),
   (32, 2, 3, 2),
   (44, 2, 3, 2),
   (20, 1, 2, 3),
   (26, 2, 2, 3),
   (32, 2, 1, 3),
   (25, 2, 3, 3),
   (29, 1, 1, 3),
   (32, 1, 1, 3),
   (25, 1, 3, 3),
   (32, 1, 3, 3),
   (32, 1, 3, 3),
   (22, 2, 3, 3),
   (26, 1, 2, 3),
   (41, 1, 3, 3),
   (4, 1, 3, 1),
   (32, 2, 3, 1),
   (25, 2, 3, 1),
   (32, 2, 3, 1),
   (25, 1, 3, 1),
   (32, 2, 3, 1),
   (32, 1, 3, 1),
   (7, 1, 3, 1),
   (13, 2, 3, 1),
   (32, 1, 3, 1),
   (7, 1, 3, 2),
   (9, 2, 3, 2),
   (32, 2, 3, 2),
   (9, 1, 1, 2),
   (25, 1, 3, 2),
   (25, 1, 3, 2),
   (32, 1, 3, 3),
   (15, 1, 1, 3),
   (32, 2, 1, 3)