In [ ]:
val data = sparkSession.read.csv("/home/maasg/Data/T-Drive/release/taxi_log_2008_by_id/1.txt").toDF("id", "ts","lon","lat")

data: org.apache.spark.sql.DataFrame = [id: string, ts: string ... 2 more fields]


In [ ]:
data.show(5)

+---+-------------------+---------+--------+
| id|                 ts|      lon|     lat|
+---+-------------------+---------+--------+
|  1|2008-02-02 15:36:08|116.51172|39.92123|
|  1|2008-02-02 15:46:08|116.51135|39.93883|
|  1|2008-02-02 15:46:08|116.51135|39.93883|
|  1|2008-02-02 15:56:08|116.51627|39.91034|
|  1|2008-02-02 16:06:08|116.47186|39.91248|
+---+-------------------+---------+--------+
only showing top 5 rows



In [ ]:
data.count

res4: Long = 588


In [ ]:
val formattedData = data.select($"id", $"ts".cast("Timestamp").as("timestamp"), 
                                struct($"lat".cast("Double").as("lat"),$"lon".cast("Double").as("lon")).as("location"))

formattedData: org.apache.spark.sql.DataFrame = [id: string, timestamp: timestamp ... 1 more field]


In [ ]:
formattedData.printSchema

root
 |-- id: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- location: struct (nullable = false)
 |    |-- lat: double (nullable = true)
 |    |-- lon: double (nullable = true)



In [ ]:
case class Coordinates(lat:Double, lon: Double) {
  val R = 6378.137 
  def distanceTo(other: Coordinates): Double = {
    def sqr(x: Double): Double = Math.pow(x, 2.0)
    def rad(x: Double): Double = x *  Math.PI / 180     
    val deltaLat = rad(other.lat) - rad(this.lat)
    val deltaLon = rad(other.lon) - rad(this.lon)
    val a = sqr(Math.sin(deltaLat/2)) + Math.cos(rad(this.lat)) * Math.cos(rad(other.lat)) *
      sqr(Math.sin(deltaLon/2))
    val c = 2 * Math.atan2(Math.sqrt(a), Math.sqrt(1-a))
    R * c * 1000// result in meters
  }
}

defined class Coordinates


In [ ]:
import java.sql.Timestamp
case class CarLocation(id:String, timestamp: Timestamp, location: Coordinates)

import java.sql.Timestamp
defined class CarLocation


In [ ]:
import java.sql.Timestamp
case class TimeLocation(time: Timestamp, location: Coordinates) {
  def speedTo(other: TimeLocation): Double = {
    if (other.time == this.time) 0.0 else {
      val distance = location.distanceTo(other.location) // mts
      val deltaT = Math.abs(time.getTime - other.time.getTime) / 1000 // seconds
      distance/deltaT * 3.6 // km/h
    }
  }
}

import java.sql.Timestamp
defined class TimeLocation


In [ ]:
val carLocationData = formattedData.as[CarLocation]

carLocationData: org.apache.spark.sql.Dataset[CarLocation] = [id: string, timestamp: timestamp ... 1 more field]


In [ ]:
val timeCarLocationData = carLocationData.orderBy("timestamp")

timeCarLocationData: org.apache.spark.sql.Dataset[CarLocation] = [id: string, timestamp: timestamp ... 1 more field]


In [ ]:
timeCarLocationData.show(5)

+---+-------------------+--------------------+
| id|          timestamp|            location|
+---+-------------------+--------------------+
|  1|2008-02-02 15:36:08|[39.92123,116.51172]|
|  1|2008-02-02 15:46:08|[39.93883,116.51135]|
|  1|2008-02-02 15:46:08|[39.93883,116.51135]|
|  1|2008-02-02 15:56:08|[39.91034,116.51627]|
|  1|2008-02-02 16:06:08|[39.91248,116.47186]|
+---+-------------------+--------------------+
only showing top 5 rows



In [ ]:
trait CarState

defined trait CarState


In [ ]:
object CarStates {
  case object Unknown extends CarState
  case object Driving extends CarState
  case object Stopped extends CarState
}

defined object CarStates


In [ ]:
case class TripState(id: String, 
                     start: TimeLocation, 
                     current: TimeLocation, 
                     state: CarState = CarStates.Unknown,
                     end: Option[TimeLocation] = None
                    )

defined class TripState


In [ ]:
val bxl = Coordinates(50.8503, 4.3517)
val antwerp = Coordinates(51.2194,4.4025)

bxl: Coordinates = Coordinates(50.8503,4.3517)
antwerp: Coordinates = Coordinates(51.2194,4.4025)


In [ ]:
bxl.distanceTo(antwerp)

res21: Double = 41241.62572838361


In [ ]:
val now = System.currentTimeMillis
val bxlPickup = TimeLocation(new Timestamp(now), bxl)
val waitInbxlPickup = TimeLocation(new Timestamp(now+11*60*1000), bxl)
val antwerpDrop = TimeLocation(new Timestamp(now + 30*60*1000), antwerp)

now: Long = 1530833439311
bxlPickup: TimeLocation = TimeLocation(2018-07-06 01:30:39.311,Coordinates(50.8503,4.3517))
waitInbxlPickup: TimeLocation = TimeLocation(2018-07-06 01:41:39.311,Coordinates(50.8503,4.3517))
antwerpDrop: TimeLocation = TimeLocation(2018-07-06 02:00:39.311,Coordinates(51.2194,4.4025))


In [ ]:
val adrop = antwerpDrop.speedTo(bxlPickup) 

adrop: Double = 82.48325145676722


In [ ]:
// We define that a trip ended when the car stops for longer than 10 minutes 
def stoppedCriteria(tl1: TimeLocation, tl2: TimeLocation): Boolean = {
  val tenMinutesInMs = 10 * 60 * 1000
  val isNotMoving = tl1.speedTo(tl2) < 1  // We cannot compare exact locations to allow for GPS fluctuations 
  val elapsedTime = Math.abs(tl2.time.getTime - tl1.time.getTime) 
  isNotMoving && elapsedTime > tenMinutesInMs
}

stoppedCriteria: (tl1: TimeLocation, tl2: TimeLocation)Boolean


In [ ]:
stoppedCriteria(bxlPickup, waitInbxlPickup)

res28: Boolean = true


In [ ]:
def updateTripStateWithCarLocation(state: TripState, event: CarLocation): TripState = {
  import CarStates._
    state.current.time.compareTo(event.timestamp) match {
    case 0 =>  // same timestamp => probably duplicated record. Do nothing: return same state
      state
    case -1 => // state.currentTime < event.timestamp => check update of start time
      if (state.start.time.before(event.timestamp)) {
        state.copy(start = TimeLocation(event.timestamp, event.location))
      } else {
        state
      }
    case 1 => // state.current.time > event.timestamp => update state
      val currentTimeLocation = TimeLocation(event.timestamp, event.location)
      val isMoving = state.current.speedTo(currentTimeLocation) > 1 // km/h
      (isMoving, state.state) match {
        case (true, Unknown) => state.copy(state= Driving, current = currentTimeLocation)
        case (false, Unknown) => state.copy(state= Stopped, current = currentTimeLocation)
        case (true, Driving )=> state.copy(current= currentTimeLocation)
        case (false, Driving) => state.copy(state= Stopped)
        case (true, Stopped) => state.copy(state= Driving, current = currentTimeLocation)
        case (false, Stopped) => state.copy(end = Some(state.current))
      }
  }
}

It would fail on the following inputs: (false, _), (true, _)
             (isMoving, state.state) match {
             ^
updateTripStateWithCarLocation: (state: TripState, event: CarLocation)TripState


In [ ]:
import org.apache.spark.sql.streaming.{GroupStateTimeout, OutputMode, GroupState}

def processTripsByCarId(id: String, inputs: Iterator[CarLocation], prevState: GroupState[TripState]): Iterator[TripState] = {
  val updateFunction: (TripState, CarLocation) => TripState = {case (tripState, carLocation) => updateTripStateWithCarLocation(tripState, carLocation)}
  val state = if (prevState.exists) {
    inputs.foldLeft(prevState.get)(updateFunction)
  } else {
    val head = inputs.next
    val initialTimeLocation = TimeLocation(head.timestamp, head.location)
    val initialState = TripState(id, initialTimeLocation, initialTimeLocation)
    inputs.foldLeft(initialState)(updateFunction)
  }
  
  if (state.end.isDefined) { 
    Seq(state).toIterator
  } else {
    Seq.empty[TripState].toIterator
  }
  
}

import org.apache.spark.sql.streaming.{GroupStateTimeout, OutputMode, GroupState}
processTripsByCarId: (id: String, inputs: Iterator[CarLocation], prevState: org.apache.spark.sql.streaming.GroupState[TripState])Iterator[TripState]


## Create a Stream using the loaded Car Data

In [ ]:
val indexedData = timeCarLocationData.withColumn("index", monotonically_increasing_id())

indexedData: org.apache.spark.sql.DataFrame = [id: string, timestamp: timestamp ... 2 more fields]


In [ ]:
indexedData.show(10)

+---+-------------------+--------------------+-----+
| id|          timestamp|            location|index|
+---+-------------------+--------------------+-----+
|  1|2008-02-02 15:36:08|[39.92123,116.51172]|    0|
|  1|2008-02-02 15:46:08|[39.93883,116.51135]|    1|
|  1|2008-02-02 15:46:08|[39.93883,116.51135]|    2|
|  1|2008-02-02 15:56:08|[39.91034,116.51627]|    3|
|  1|2008-02-02 16:06:08|[39.91248,116.47186]|    4|
|  1|2008-02-02 16:16:08|[39.92498,116.47217]|    5|
|  1|2008-02-02 16:26:08|[39.90718,116.47179]|    6|
|  1|2008-02-02 16:36:08|[39.90531,116.45617]|    7|
|  1|2008-02-02 17:00:24|[39.90577,116.47191]|    8|
|  1|2008-02-02 17:10:24| [39.9145,116.50661]|    9|
+---+-------------------+--------------------+-----+
only showing top 10 rows



In [ ]:
// generate a rate stream of 1 row per second. Keep only the 'value' field
val stream = sparkSession.readStream.format("rate").option("rowsPerSecond", 10).load().select($"value") 

stream: org.apache.spark.sql.DataFrame = [value: bigint]


In [ ]:
// We create the stream by joining a portion of the static dataset with the `Rate` Stream
val carDataStream = stream.join(indexedData, $"value" === $"index").select($"id", $"timestamp", $"location").as[CarLocation] 


carDataStream: org.apache.spark.sql.Dataset[CarLocation] = [id: string, timestamp: timestamp ... 1 more field]


In [ ]:
carDataStream.printSchema

root
 |-- id: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- location: struct (nullable = false)
 |    |-- lat: double (nullable = true)
 |    |-- lon: double (nullable = true)



In [ ]:
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.streaming.{GroupStateTimeout, OutputMode, GroupState}
val tripStream = carDataStream.groupByKey(carLocation => carLocation.id)
.flatMapGroupsWithState(OutputMode.Append,GroupStateTimeout.ProcessingTimeTimeout)(processTripsByCarId)

import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.streaming.{GroupStateTimeout, OutputMode, GroupState}
tripStream: org.apache.spark.sql.Dataset[TripState] = [id: string, start: struct<time: timestamp, location: struct<lat: double, lon: double>> ... 2 more fields]


In [ ]:
val query = tripStream.writeStream.format("memory").queryName("trips").start

query: org.apache.spark.sql.streaming.StreamingQuery = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@6d5ef901


In [ ]:
val table = sparkSession.sql("select * from trips")


table: org.apache.spark.sql.DataFrame = [id: string, start: struct<time: timestamp, location: struct<lat: double, lon: double>> ... 2 more fields]


In [ ]:
table.show()

+---+-----+-------+---+
| id|start|current|end|
+---+-----+-------+---+
+---+-----+-------+---+



In [ ]:
query.stop()

In [ ]:
query.lastProgress


res58: org.apache.spark.sql.streaming.StreamingQueryProgress =
{
  "id" : "54f78dc4-f77f-4c28-a863-7bbef3073f7f",
  "runId" : "5d9b3307-f895-46da-ab86-89cb6ced628d",
  "name" : "trips",
  "timestamp" : "2018-07-05T19:37:28.066Z",
  "numInputRows" : 10,
  "inputRowsPerSecond" : 10.40582726326743,
  "processedRowsPerSecond" : 10.40582726326743,
  "durationMs" : {
    "addBatch" : 898,
    "getBatch" : 4,
    "getOffset" : 0,
    "queryPlanning" : 47,
    "triggerExecution" : 961,
    "walCommit" : 9
  },
  "stateOperators" : [ {
    "numRowsTotal" : 0,
    "numRowsUpdated" : 0
  } ],
  "sources" : [ {
    "description" : "RateSource[rowsPerSecond=10, rampUpTimeSeconds=0, numPartitions=8]",
    "startOffset" : 52,
    "endOffset" : 53,
    "numInputRows" : 10,
    "inputRowsPerSecond" : 10...