### This project parses 3 TB nested Json file into csv using pyspark along with sparksql for further analysis

# Attention !! All the data used is test data for only partial data from Dec 30,2015. For complete version, please check other files

# Read Json, parse

## Import dependencies

In [1]:
import pyspark
from pyspark.sql import SQLContext
from scipy.interpolate import interp1d # import dependency

In [2]:
sc
sqlContext = SQLContext(sc)

## Read JSON File

In [61]:
bus_file='test.jsons'
bus = sqlContext.read.json(bus_file)
bus.registerTempTable("bus")

## Show Schema

In [None]:
bus.printSchema()

## load and apply SQL Query

In [63]:
with open("spark_extract.sql") as fr:
     query = fr.read()
output = sqlContext.sql(query)

## Flatten the list

### Method A

In [None]:
import itertools
def extract(parts):
    for p in parts:
        for o in itertools.izip(p.Line,p.Latitude,p.Longitude,p.RecordedAtTime,p.vehicleID,p.Trip,p.TripDate):
            yield o

### Method B

In [64]:
def parse_list(p):
    if p.ROUTE_ID!=None:
        return zip(p.ROUTE_ID,p.latitude,p.longitude,p.recorded_time\
                   ,p.vehicle_id,p.TRIP_ID,p.tripdate,p.SHAPE_ID\
                   ,p.STOP_ID,p.distance_stop,p.distance_shape,p.status,p.destination)
    else:
        return []

## Tranfer time to Unix time for interpolatation

In [80]:
import time
import dateutil.parser
def unix_time(x):
    dt = dateutil.parser.parse(x)
    return time.mktime(dt.timetuple())

## Interpolate function

## Method A

In [116]:
def findIncreasingList(parts):
    prev = 0
    for record in parts:
        if record[-1]<prev:
            return
        prev = record[-1]
        yield record

In [238]:
from scipy.interpolate import interp1d # import dependency
def predict(x):
    pre_x = [p[-1] for p in x if p[-1]!=None]
    if len(pre_x) >= 2:
        pre_y = [unix_time(p[1]) for p in x if p[-1]!=None]
        f = interp1d(pre_x, pre_y)
    else:
        return []
    return findIncreasingList([(p[0],p[2],p[3],f(p[-1]+p[-2]))\
                               for p in x if p[-1]!=None and (p[-1]+p[-2]) <= pre_x[-1]])

## method b

In [None]:
from scipy.interpolate import interp1d
def predict_map(x):
    train_y = [unix_time(p[3]) for p in x if p[-3]!=None ]
    if len(train_y) >= 2:
        train_x = [p[-3] for p in x if p[-3]!=None]
        f = interp1d(train_x, train_y)
        distance = [(p[-3]+p[-4]) for p in x \
                    if p[-3]!=None and (p[-3]+p[-4]) <= train_x[-1]]
        stoptimes = f(distance)
        stops = [p[-5] for p in x if p[-3]!=None]
    else:
        return[]
    return map(lambda a,b: (a,b), stops,stoptimes)
    #return [(p[-4],f(p[-2]+p[-3])) for p in x if (p[-2]!=None and p[-3]!=None) and (p[-2]+p[-3]) <= pre_x[-1]] 

## Groupby Date and Line & Apply Interpolation

## Simple Extraction

In [None]:
output.flatMap(parse_list)\
      .take(2)

## Group By TRIP_ID and Date

In [105]:
output.flatMap(parse_list)\
      .map(lambda x:((x[5],x[6]),x)).groupByKey()\
      .map(lambda x: x[1])\
      .take(10)

[<pyspark.resultiterable.ResultIterable at 0x112c96d10>,
 <pyspark.resultiterable.ResultIterable at 0x112c96c90>,
 <pyspark.resultiterable.ResultIterable at 0x112c96150>,
 <pyspark.resultiterable.ResultIterable at 0x112c96a10>,
 <pyspark.resultiterable.ResultIterable at 0x112c96f90>,
 <pyspark.resultiterable.ResultIterable at 0x112c96e10>,
 <pyspark.resultiterable.ResultIterable at 0x112c95a10>,
 <pyspark.resultiterable.ResultIterable at 0x112c959d0>,
 <pyspark.resultiterable.ResultIterable at 0x112c95c90>,
 <pyspark.resultiterable.ResultIterable at 0x112c95050>]

## Groupbykey and Apply Interpolation

In [237]:
output.flatMap(parse_list)\
      .map(lambda x:((x[5],x[6]),(x[0],x[3],x[5],x[8],x[-4],x[-3])))\
      .groupByKey()\
      .flatMap(lambda x: predict(x[1]))\
      .take(10)

[(u'MTA NYCT_Q46',
  u'MTA NYCT_QV_W5-Weekday-110900_Q46_6',
  u'MTA_502327',
  array(1451520016.346892)),
 (u'MTA NYCT_Q46',
  u'MTA NYCT_QV_W5-Weekday-110900_Q46_6',
  u'MTA_502331',
  array(1451520119.0)),
 (u'MTA NYCT_Q20A',
  u'MTA NYCT_CS_W5-Weekday-113900_MISC_769',
  u'MTA_505023',
  array(1451520122.033537)),
 (u'MTA NYCT_Q20A',
  u'MTA NYCT_CS_W5-Weekday-113900_MISC_769',
  u'MTA_505024',
  array(1451520122.0558827)),
 (u'MTA NYCT_Q20A',
  u'MTA NYCT_CS_W5-Weekday-113900_MISC_769',
  u'MTA_505026',
  array(1451520338.3900702)),
 (u'MTA NYCT_BX8',
  u'MTA NYCT_WF_W5-Weekday-110800_BX8_13',
  u'MTA_100947',
  array(1451520014.054865)),
 (u'MTA NYCT_BX8',
  u'MTA NYCT_WF_W5-Weekday-110800_BX8_13',
  u'MTA_103181',
  array(1451520125.8062081)),
 (u'MTA NYCT_BX8',
  u'MTA NYCT_WF_W5-Weekday-110800_BX8_13',
  u'MTA_103183',
  array(1451520377.7902856)),
 (u'MTA NYCT_BX10',
  u'MTA NYCT_KB_W5-Weekday-111000_BX10_32',
  u'MTA_103960',
  array(1451519992.9190516)),
 (u'MTA NYCT_BX10',

## Remove the prefix, timezones and save as CSV

In [None]:
output.flatMap(parse_list)\
      .map(lambda x:((x[5],x[6]),(x[0],)).groupByKey() \
      .flatMap(lambda x: predict(x[1]))\
      .map(lambda x: ",".join(map(str, x)))\
      .map(lambda x: x.replace('MTA NYCT_', '').replace('MTABC_','').replace('MTA_','').replace('-05:00',''))\
      .saveAsTextFile('stoptimes')

# Read From CSV and SQL Manupilation

## Import dependencies

In [3]:
from pyspark.sql.types import *

## Reset Schemas and Indexing

In [272]:
customSchema = StructType([StructField("ROUTE_ID", StringType(), True),\
                           StructField("latitude", DoubleType(), True),\
                           StructField("longitude", DoubleType(), True),\
                           StructField("recorded_time", StringType(), True),\
                           StructField("vehicle_id", StringType(), True),\
                           StructField("TRIP_ID", StringType(), True),\
                           StructField("tripdate", DateType(), True),\
                           StructField("SHAPE_ID", StringType(), True),\
                           StructField("STOP_ID", StringType(), True),\
                           StructField("distance_stop", StringType(), True),\
                           StructField("distance_shape", StringType(), True),\
                           StructField("status", StringType(), True),\
                           StructField("destination", StringType(), True)])             

In [221]:
stop_times_schema = StructType([StructField("trip_id", StringType(), True),\
                           StructField("arrival_time", StringType(), True),\
                           StructField("departure_time", StringType(), True),\
                           StructField("stop_id", StringType(), True),\
                           StructField("stop_sequence", IntegerType(), True),\
                           StructField("pickup_type", IntegerType(), True),
                           StructField("drop_off_type", IntegerType(), True)])

In [222]:
real_stoptimes_schema = StructType([StructField("ROUTE_ID", StringType(), True),\
                           StructField("TRIP_ID", StringType(), True),\
                           StructField("STOP_ID", StringType(), True),\
                           StructField("time",IntegerType(), True)])

## Use CSV=>DF tool to read saved csv

In [223]:
real_stoptimes = sqlContext.read.format('com.databricks.spark.csv').options(header='true').load('stops.csv', schema = real_stoptimes_schema)

In [224]:
stoptimes = sqlContext.read.format('com.databricks.spark.csv').options(header='true').load('stop_times.txt',schema = stop_times_schema)

In [None]:
stoptimes.take(1)

In [273]:
record = sqlContext.read.format('com.databricks.spark.csv').options(header='true').load('all.csv', schema = customSchema)

In [274]:
record.show(1)

+--------+---------+----------+--------------------+----------+--------------------+----------+--------+-------+-------------+--------------+-------+-----------+
|ROUTE_ID| latitude| longitude|       recorded_time|vehicle_id|             TRIP_ID|  tripdate|SHAPE_ID|STOP_ID|distance_stop|distance_shape| status|destination|
+--------+---------+----------+--------------------+----------+--------------------+----------+--------+-------+-------------+--------------+-------+-----------+
|     B67|40.664322|-73.983724|2015-12-30T18:59:...|       406|JG_W5-Weekday-109...|2015-12-30| B670106| 305679|        12.05|        8002.2|at stop|     801044|
+--------+---------+----------+--------------------+----------+--------------------+----------+--------+-------+-------------+--------------+-------+-----------+
only showing top 1 row



In [None]:
record.registerTempTable('record')

In [212]:
from pyspark.sql.functions import split

In [239]:
new_time = real_stoptimes.withColumn('realtime',split(pyspark.sql.functions.from_unixtime(real_stoptimes.time), ' ')[1])\
                         .withColumn('date',split(pyspark.sql.functions.from_unixtime(real_stoptimes.time), ' ')[0])

In [240]:
new_time.show(2)

+--------+--------------------+-------+----------+--------+----------+
|ROUTE_ID|             TRIP_ID|STOP_ID|      time|realtime|      date|
+--------+--------------------+-------+----------+--------+----------+
|     Q46|QV_W5-Weekday-110...| 502327|1451520016|19:00:16|2015-12-30|
|     Q46|QV_W5-Weekday-110...| 502331|1451520119|19:01:59|2015-12-30|
+--------+--------------------+-------+----------+--------+----------+
only showing top 2 rows



In [243]:
new_time.registerTempTable('new_time')

In [216]:
stoptimes.registerTempTable('stoptimes')

In [217]:
from pyspark.sql.functions import udf
def get_sec(s):
    l = s.split(':')
    return int(l[0]) * 3600 + int(l[1]) * 60 + int(l[2])
sqlContext.registerFunction("getsec", lambda x: get_sec(x), IntegerType())

## Apply getsec function to sec and calculate the delays

In [250]:
join = sqlContext.sql('SELECT ROUTE_ID,TRIP_ID,STOP_ID,realtime,date,(getsec(realtime)-getsec(arrival_time)) as delay\
                       FROM new_time\
                       INNNER JOIN stoptimes\
                       ON (TRIP_ID = trip_id AND STOP_ID = stop_id)')

In [251]:
join.show(2)

+--------+--------------------+-------+--------+----------+-----+
|ROUTE_ID|             TRIP_ID|STOP_ID|realtime|      date|delay|
+--------+--------------------+-------+--------+----------+-----+
|     B41|FB_W5-Weekday-109...| 303241|19:03:19|2015-12-30|  754|
|     B49|FB_W5-Weekday-110...| 303985|19:06:26|2015-12-30| 1046|
+--------+--------------------+-------+--------+----------+-----+
only showing top 2 rows



In [262]:
join.registerTempTable('new_join')

## Calculate the performance on 3 ways:

1. On Time performance: if the bus arrives 1min ahead of the schedule or 5mins after the schedule. It is ontime
2. Peakhour wait assesment:if the bus arrives 3min ahead or 3min after the scheduled time on 6-9 or 16-19. It is ontime
3. off-peak hour wait assesement: if the bus arrives within 5mins of the schedule except peak hours. It is ontime

In [270]:
new = sqlContext.sql('SELECT ROUTE_ID,STOP_ID, date,\
                      COUNT(IF((delay BETWEEN -60 AND 300),1,null))/COUNT(delay) as ontime_ratio,\
                      COUNT(IF((HOUR(realtime) BETWEEN 6 AND 9) OR (HOUR(realtime) BETWEEN 16 AND 19) AND (delay BETWEEN -300 AND 300),1,null))/COUNT(IF((HOUR(realtime) BETWEEN 6 AND 9) OR (HOUR(realtime) BETWEEN 16 AND 19),1,null)) as peak_wait,\
                      COUNT(IF((HOUR(realtime) NOT BETWEEN 6 AND 9) OR (HOUR(realtime) NOT BETWEEN 16 AND 19) AND (delay BETWEEN -300 AND 300),1,null))/COUNT(IF((HOUR(realtime) NOT BETWEEN 6 AND 9) OR (HOUR(realtime) NOT BETWEEN 16 AND 19),1,null)) as peak_wait\
                      FROM new_join\
                      GROUP BY ROUTE_ID, STOP_ID, date').show()

+--------+-------+----------+------------+---------+---------+
|ROUTE_ID|STOP_ID|      date|ontime_ratio|peak_wait|peak_wait|
+--------+-------+----------+------------+---------+---------+
|     B47| 306158|2015-12-30|         0.0|      0.0|      1.0|
|      B9| 300956|2015-12-30|         0.0|      0.0|      1.0|
|     B63| 306945|2015-12-30|         0.0|      0.0|      1.0|
|      B8| 300692|2015-12-30|         0.0|      0.0|      1.0|
|     B68| 300883|2015-12-30|         0.0|      0.0|      1.0|
|     B83| 306257|2015-12-30|         0.0|      0.0|      1.0|
|     B44| 303459|2015-12-30|         1.0|      1.0|      1.0|
|     B82| 300476|2015-12-30|         1.0|      1.0|      1.0|
|    B44+| 801163|2015-12-30|         1.0|      1.0|      1.0|
|     B35| 302750|2015-12-30|         0.0|      0.0|      1.0|
|     B49| 304005|2015-12-30|         0.0|      0.0|      1.0|
|     B82| 307535|2015-12-30|         1.0|      1.0|      1.0|
|     B69| 305811|2015-12-30|         1.0|      1.0|   

## Time Tansfer to UnixTimeStamp

## Calculate the trips of each line of everyday to test the data intergrety

In [None]:
gaps = sqlContext.sql('SELECT Route_Id, tripdate, count(recorded_time) AS trips\
                       FROM record\
                       GROUP BY Route_Id, tripdate\
                       ORDER BY tripdate DESC') #apply sql Query