### This project parses a 3 TB nested JSON file into CSV using PySpark and Spark SQL for further analysis

# Read Json, parse

## Import dependencies

In [1]:
import pyspark
from pyspark.sql import HiveContext

## Read JSON File

In [2]:
# `sc` is not defined anywhere in this notebook; presumably it is the SparkContext
# injected by the PySpark kernel/shell -- TODO confirm. Evaluating it here fails
# fast with a NameError if it is missing.
sc
# Wrap the context in a HiveContext; every read and SQL call below goes through it.
sqlContext = HiveContext(sc)
# Path of the JSON input loaded on the next line.
bus_file='test.jsons'
# Load the JSON input into a DataFrame.
bus = sqlContext.read.json(bus_file)
# Register the DataFrame under the name "bus" so SQL text can reference it.
bus.registerTempTable("bus")

## Show Schema

In [3]:
# Print the schema of the `bus` DataFrame as an indented tree.
bus.printSchema()

root
 |-- Siri: struct (nullable = true)
 |    |-- ServiceDelivery: struct (nullable = true)
 |    |    |-- ResponseTimestamp: string (nullable = true)
 |    |    |-- SituationExchangeDelivery: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- Situations: struct (nullable = true)
 |    |    |    |    |    |-- PtSituationElement: array (nullable = true)
 |    |    |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |    |    |-- Affects: struct (nullable = true)
 |    |    |    |    |    |    |    |    |-- VehicleJourneys: struct (nullable = true)
 |    |    |    |    |    |    |    |    |    |-- AffectedVehicleJourney: array (nullable = true)
 |    |    |    |    |    |    |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |    |    |    |    |    |    |-- DirectionRef: string (nullable = true)
 |    |    |    |    |    |    |    |    |    |    |    |-- Line

## load and apply SQL Query

In [4]:
# Read the SQL text from spark_extract.sql (the file's contents are not shown in
# this notebook); the `with` block closes the file as soon as it has been read.
with open("spark_extract.sql") as fr:
     query = fr.read()
# Run the query through the HiveContext; `output` is the DataFrame that the
# flattening and interpolation cells below consume.
output = sqlContext.sql(query)

## Flatten the list

### Method A

In [5]:
import itertools
def extract(parts):
    """Yield one flat tuple per record from an iterable of rows.

    Each row ``p`` must expose the attributes ``Line``, ``Latitude``,
    ``Longitude``, ``RecordedAtTime``, ``vehicleID``, ``Trip`` and ``TripDate``,
    each holding a sequence. The sequences are zipped position by position, so
    the i-th output tuple of a row is
    ``(Line[i], Latitude[i], Longitude[i], RecordedAtTime[i], vehicleID[i],
    Trip[i], TripDate[i])``.

    Rows whose ``Line`` is None are skipped, mirroring the guard in
    ``parse_list``.

    The name ``parts`` suggests this is meant for ``mapPartitions`` -- TODO
    confirm against the caller.
    """
    # itertools.izip only exists on Python 2; the built-in zip is the lazy
    # equivalent on Python 3, so pick whichever is available.
    lazy_zip = getattr(itertools, "izip", zip)
    for p in parts:
        if p.Line is None:
            # Nothing to flatten for this row; zipping None would raise TypeError.
            continue
        for o in lazy_zip(p.Line, p.Latitude, p.Longitude, p.RecordedAtTime,
                          p.vehicleID, p.Trip, p.TripDate):
            yield o

### Method B

In [110]:
def parse_list(p):
    """Flatten one row of parallel arrays into per-record tuples.

    The row ``p`` must expose the attributes zipped below, each holding a
    sequence. The result pairs them position by position into 13-field tuples
    ordered as::

        (ROUTE_ID, latitude, longitude, recorded_time, vehicle_id, TRIP_ID,
         tripdate, SHAPE_ID, STOP_ID, distance_stop, distance_shape, status,
         destination)

    Returns an empty list when the row carries no data, which makes the
    function safe to pass to ``flatMap``.
    """
    route_ids = p.ROUTE_ID
    # The guard must cover the column that is actually zipped (ROUTE_ID).
    # The original test on `Line` is still honoured when the row has that field,
    # but a row without it no longer raises AttributeError.
    if route_ids is None or getattr(p, "Line", route_ids) is None:
        return []
    return zip(route_ids, p.latitude, p.longitude, p.recorded_time,
               p.vehicle_id, p.TRIP_ID, p.tripdate, p.SHAPE_ID,
               p.STOP_ID, p.distance_stop, p.distance_shape, p.status,
               p.destination)

## Convert time to Unix time for interpolation

In [7]:
import calendar
import time
import dateutil.parser
def unix_time(x):
    """Convert a timestamp string to seconds since the Unix epoch (float).

    Parameters
    ----------
    x : str
        Any timestamp ``dateutil.parser.parse`` accepts, e.g.
        ``'2015-12-30T18:59:41.621-05:00'``.

    Returns
    -------
    float
        Epoch seconds, including the fractional part of the timestamp.

    Notes
    -----
    When the string carries a UTC offset, that offset is honoured, so the result
    does not depend on the timezone of the machine running the code. Passing
    ``dt.timetuple()`` straight to ``time.mktime`` would reinterpret the
    wall-clock fields in the *local* zone and silently shift every value on a
    host that is not in the timestamp's own zone. Strings without an offset are
    still interpreted as local time.
    """
    dt = dateutil.parser.parse(x)
    # timetuple()/utctimetuple() carry whole seconds only; add the fraction back.
    fraction = dt.microsecond / 1e6
    if dt.utcoffset() is not None:
        # Aware datetime: normalise to UTC and convert without consulting local time.
        return calendar.timegm(dt.utctimetuple()) + fraction
    # Naive datetime: no offset available, fall back to local-time interpretation.
    return time.mktime(dt.timetuple()) + fraction

## Interpolate function

## Method A

In [93]:
from scipy.interpolate import interp1d # import dependency
def predict(x):
    """Interpolate, for one group of records, the time at each stop position.

    Parameters
    ----------
    x : iterable of tuples
        Records laid out as ``parse_list`` emits them: index 0 ROUTE_ID,
        3 recorded_time, 5 TRIP_ID, 8 STOP_ID, 9 distance_stop,
        10 distance_shape. Trailing columns after index 10 are ignored.

    Returns
    -------
    list of (ROUTE_ID, TRIP_ID, STOP_ID, time)
        ``time`` is the value of the linear interpolant (epoch seconds, as
        returned by ``interp1d``) evaluated at ``distance_shape + distance_stop``.
        An empty list is returned when fewer than two records have a
        ``distance_shape``, since a line cannot be fitted.

    Notes
    -----
    Columns are addressed by positive index. Counting from the end of the tuple
    only works for the 12-field layout and picks the wrong columns once
    ``parse_list`` appends ``destination``.
    """
    ROUTE, RECORDED, TRIP, STOP, DIST_STOP, DIST_SHAPE = 0, 3, 5, 8, 9, 10

    # Materialise once: the group may be a one-shot iterable.
    rows = [p for p in x if p[DIST_SHAPE] is not None]
    if len(rows) < 2:
        return []

    pre_x = [p[DIST_SHAPE] for p in rows]           # distance is the training x
    pre_y = [unix_time(p[RECORDED]) for p in rows]  # time is the training y
    f = interp1d(pre_x, pre_y)

    # interp1d raises ValueError outside the fitted range. Grouped records are
    # not guaranteed to arrive ordered, so use the real min/max rather than the
    # last element.
    lo, hi = min(pre_x), max(pre_x)

    result = []
    for p in rows:
        if p[DIST_STOP] is None:
            continue  # no target position can be computed for this record
        target = p[DIST_SHAPE] + p[DIST_STOP]
        if lo <= target <= hi:
            result.append((p[ROUTE], p[TRIP], p[STOP], f(target)))
    return result

## Method B

In [87]:
from scipy.interpolate import interp1d
def predict_map(x):
    """Vectorised variant of ``predict``: returns ``(STOP_ID, time)`` pairs.

    Parameters
    ----------
    x : iterable of tuples
        Records laid out as ``parse_list`` emits them: index 3 recorded_time,
        8 STOP_ID, 9 distance_stop, 10 distance_shape. Trailing columns after
        index 10 are ignored.

    Returns
    -------
    list of (STOP_ID, time)
        ``time`` is the linear interpolant (epoch seconds) evaluated at
        ``distance_shape + distance_stop``. Empty when fewer than two records
        have a ``distance_shape`` or when no target falls inside the fitted
        range.

    Notes
    -----
    Stop IDs and target distances are collected in the same pass under the same
    filter. Filtering them separately lets the two lists differ in length, which
    pairs stop IDs with times that belong to other records.
    """
    RECORDED, STOP, DIST_STOP, DIST_SHAPE = 3, 8, 9, 10

    # Materialise once: the group may be a one-shot iterable.
    rows = [p for p in x if p[DIST_SHAPE] is not None]
    if len(rows) < 2:
        return []

    train_x = [p[DIST_SHAPE] for p in rows]
    train_y = [unix_time(p[RECORDED]) for p in rows]
    f = interp1d(train_x, train_y)

    # interp1d raises ValueError outside the fitted range; use the true bounds
    # because grouped records are not guaranteed to arrive ordered.
    lo, hi = min(train_x), max(train_x)

    stops, distance = [], []
    for p in rows:
        if p[DIST_STOP] is None:
            continue
        target = p[DIST_SHAPE] + p[DIST_STOP]
        if lo <= target <= hi:
            stops.append(p[STOP])
            distance.append(target)

    if not distance:
        return []

    stoptimes = f(distance)  # one vectorised call for the whole group
    return list(zip(stops, stoptimes))

## Groupby Date and Line & Apply Interpolation

## Simple Extraction

In [105]:
# Flatten every row of `output` into per-record tuples with parse_list and
# fetch only the first two to the driver as a quick sanity check.
output.flatMap(parse_list)\
      .take(2)

[(u'MTA NYCT_Q76',
  40.753476,
  -73.780631,
  u'2015-12-30T18:59:41.621-05:00',
  u'MTA NYCT_7424',
  u'MTA NYCT_CS_W5-Weekday-111000_MISC_843',
  u'2015-12-30',
  u'MTA_Q760118',
  u'MTA_502697',
  16.14,
  7903.95,
  u'at stop'),
 (u'MTA NYCT_B67',
  40.664322,
  -73.983724,
  u'2015-12-30T18:59:52.000-05:00',
  u'MTA NYCT_406',
  u'MTA NYCT_JG_W5-Weekday-109900_B6769_24',
  u'2015-12-30',
  u'MTA_B670106',
  u'MTA_305679',
  12.05,
  8002.2,
  u'at stop')]

## Group By TRIP_ID and Date

In [109]:
# Key each flattened record by (x[5], x[6]) -- TRIP_ID and tripdate in the
# parse_list layout -- then group by that key, turn each group into a list,
# flatten the groups back into individual records and fetch the first one.
output.flatMap(parse_list)\
      .map(lambda x:((x[5],x[6]),x)).groupByKey().mapValues(list) \
      .flatMap(lambda x: x[1])\
      .take(1)

[(u'MTA NYCT_Q46',
  40.734032,
  -73.756463,
  u'2015-12-30T18:59:49.000-05:00',
  u'MTA NYCT_9564',
  u'MTA NYCT_QV_W5-Weekday-110900_Q46_6',
  u'2015-12-30',
  u'MTA_Q460137',
  u'MTA_502327',
  147.08,
  7379.36,
  u'approaching')]

## Groupbykey and Apply Interpolation

In [107]:
# Same keying as the previous cell (TRIP_ID, tripdate), but each group of
# records is handed to predict(); flatMap concatenates the lists predict()
# returns, and the first 100 results are fetched to the driver.
output.flatMap(parse_list)\
      .map(lambda x:((x[5],x[6]),x)).groupByKey() \
      .flatMap(lambda x: predict(x[1]))\
       .take(100)

[(u'MTA NYCT_Q46',
  u'MTA NYCT_QV_W5-Weekday-110900_Q46_6',
  u'MTA_502327',
  array(1451520016.346892)),
 (u'MTA NYCT_Q46',
  u'MTA NYCT_QV_W5-Weekday-110900_Q46_6',
  u'MTA_502331',
  array(1451520119.0)),
 (u'MTA NYCT_Q20A',
  u'MTA NYCT_CS_W5-Weekday-113900_MISC_769',
  u'MTA_505023',
  array(1451520122.033537)),
 (u'MTA NYCT_Q20A',
  u'MTA NYCT_CS_W5-Weekday-113900_MISC_769',
  u'MTA_505024',
  array(1451520122.0558827)),
 (u'MTA NYCT_Q20A',
  u'MTA NYCT_CS_W5-Weekday-113900_MISC_769',
  u'MTA_505026',
  array(1451520338.3900702)),
 (u'MTA NYCT_Q20A',
  u'MTA NYCT_CS_W5-Weekday-113900_MISC_769',
  u'MTA_505027',
  array(1451520330.5580943)),
 (u'MTA NYCT_BX8',
  u'MTA NYCT_WF_W5-Weekday-110800_BX8_13',
  u'MTA_100947',
  array(1451520014.054865)),
 (u'MTA NYCT_BX8',
  u'MTA NYCT_WF_W5-Weekday-110800_BX8_13',
  u'MTA_103181',
  array(1451520125.8062081)),
 (u'MTA NYCT_BX8',
  u'MTA NYCT_WF_W5-Weekday-110800_BX8_13',
  u'MTA_103183',
  array(1451520377.7902856)),
 (u'MTA NYCT_BX8',

## Remove the prefix, timezones and save as CSV

In [None]:
import sys  # required for sys.argv below; no other visible cell imports it

# Flatten the rows, render each record as one comma-joined line, strip the
# agency prefixes ('MTA NYCT_', 'MTABC_', 'MTA_') and the '-05:00' offset
# suffix, then write the lines as text files under the path given as the
# last command-line argument.
# NOTE(review): sys.argv[-1] only makes sense when this runs as a submitted
# script; inside a notebook kernel it is not a user-supplied output path --
# confirm how this cell is meant to be executed.
output.flatMap(parse_list)\
      .map(lambda x: ",".join(map(str, x)))\
      .map(lambda x: x.replace('MTA NYCT_', '').replace('MTABC_','').replace('MTA_','').replace('-05:00',''))\
      .saveAsTextFile(sys.argv[-1])

# Read From CSV and SQL Manipulation

## Import dependencies

In [112]:
from pyspark.sql.types import *

## Reset Schemas and Indexing

In [113]:
# Columns of the saved CSV in file order, paired with the Spark type each one
# is parsed as. Every column is declared nullable.
_csv_columns = (
    ("ROUTE_ID", StringType),
    ("latitude", DoubleType),
    ("longitude", DoubleType),
    ("recorded_time", StringType),
    ("vehicle_id", StringType),
    ("TRIP_ID", StringType),
    ("tripdate", DateType),
    ("SHAPE_ID", StringType),
    ("STOP_ID", StringType),
    ("distance_stop", DoubleType),
    ("distance_shape", DoubleType),
    ("status", StringType),
    ("destination", StringType),
)

# Explicit schema handed to the CSV reader instead of letting it infer types.
customSchema = StructType(
    list(StructField(col_name, col_type(), True) for col_name, col_type in _csv_columns)
)

## Use CSV=>DF tool to read saved csv

In [115]:
# Load final.csv through the spark-csv data source, applying customSchema
# rather than inferring column types. header='true' makes the reader treat the
# first line as column names.
# NOTE(review): the save cell earlier in this notebook writes plain lines with
# no header row -- confirm final.csv really starts with a header, otherwise the
# first record is dropped.
record = sqlContext.read.format('com.databricks.spark.csv').options(header='true').load('final.csv', schema = customSchema)

In [116]:
# Expose the DataFrame to SQL under the table name 'record'.
record.registerTempTable('record')

In [117]:
import pyspark.sql.functions as f

## Time Transfer to Unix Timestamp

## Calculate the trips of each line for every day to test the data integrity

In [136]:
# Per (route, trip date), count the rows that have a recorded_time, newest
# dates first. Note the alias `trips` is a count of records in that group, not
# of distinct TRIP_ID values.
# The backslashes continue the string literal itself, so the query is sent as
# a single line.
gaps = sqlContext.sql('SELECT Route_Id, tripdate, count(recorded_time) AS trips\
                       FROM record\
                       GROUP BY Route_Id, tripdate\
                       ORDER BY tripdate DESC') #apply sql Query

In [137]:
# Display the first three rows of the per-route daily counts.
gaps.show(3)

+--------+----------+-----+
|Route_Id|  tripdate|trips|
+--------+----------+-----+
|     M79|2015-12-30|   40|
|     S98|2015-12-30|   14|
|      Q3|2015-12-30|   42|
+--------+----------+-----+
only showing top 3 rows



## Next Step: Merge Data from Stop Times with Interpolated Times

In [None]:
# Join the loaded records with the schedule on (stop, trip).
# The column names follow the schema the `record` table was loaded with
# (TRIP_ID, STOP_ID, recorded_time); `TripRef` and `RecordedAtTime` do not
# exist in that table and would make the query fail analysis.
# NOTE(review): no visible cell registers a `stop_times` table -- it must be
# registered (with `stop_id` and `trip_id` columns) before this runs.
combine = sqlContext.sql(
    'SELECT record.TRIP_ID, record.STOP_ID, record.recorded_time \
    FROM record \
    JOIN stop_times \
    on (record.STOP_ID = stop_times.stop_id AND record.TRIP_ID = stop_times.trip_id)')

In [None]:
#output_df = output1.toDF(['Line','Lat','Lon','Recordtime','ID','Trip','TripDate'])

In [None]:
#output_df.show()