### This project parses a 3 TB nested JSON file into CSV using PySpark and Spark SQL for further analysis

# Read JSON and parse

## Import dependencies

In [1]:
import pyspark
from pyspark.sql import HiveContext

## Read JSON File

In [2]:
# `sc` is not created anywhere in this notebook; evaluating it first makes the
# cell fail immediately if the session did not supply one.
# NOTE(review): presumably the SparkContext injected by the pyspark kernel -- confirm.
sc
# SQL entry point used by every query and reader call below.
sqlContext = HiveContext(sc)
# Path handed to the JSON reader.
bus_file='test.jsons'
# Load the file as a DataFrame (no schema is passed to the reader).
bus = sqlContext.read.json(bus_file)
# Make the DataFrame addressable from SQL under the table name "bus".
bus.registerTempTable("bus")

In [3]:
# Print the schema tree of the JSON DataFrame.
bus.printSchema()

root
 |-- Siri: struct (nullable = true)
 |    |-- ServiceDelivery: struct (nullable = true)
 |    |    |-- ResponseTimestamp: string (nullable = true)
 |    |    |-- SituationExchangeDelivery: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- Situations: struct (nullable = true)
 |    |    |    |    |    |-- PtSituationElement: array (nullable = true)
 |    |    |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |    |    |-- Affects: struct (nullable = true)
 |    |    |    |    |    |    |    |    |-- VehicleJourneys: struct (nullable = true)
 |    |    |    |    |    |    |    |    |    |-- AffectedVehicleJourney: array (nullable = true)
 |    |    |    |    |    |    |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |    |    |    |    |    |    |-- DirectionRef: string (nullable = true)
 |    |    |    |    |    |    |    |    |    |    |    |-- Line

## Load and apply SQL query

In [4]:
# Read the whole extraction query from spark_extract.sql; the file is closed
# as soon as the `with` block ends.
with open("spark_extract.sql") as fr:
     query = fr.read()
# Run the query through the SQL context; `output` is the DataFrame that the
# flattening steps below consume.
output = sqlContext.sql(query)

## Flatten the list

### Method A

In [5]:
import itertools
def extract(parts):
    """Flatten rows of parallel arrays into one tuple per array position.

    Every row in ``parts`` exposes ``Line``, ``Latitude``, ``Longitude``,
    ``RecordedAtTime``, ``vehicleID``, ``Trip`` and ``TripDate`` as sequences
    that are read in lockstep: position *i* of each sequence becomes one
    ``(Line, Latitude, Longitude, RecordedAtTime, vehicleID, Trip, TripDate)``
    tuple.

    Args:
        parts: iterable of row-like objects with the attributes listed above.
            NOTE(review): the signature suggests use with
            ``RDD.mapPartitions``; it is not called in this notebook.

    Yields:
        One 7-tuple per array position of every row, in row order.
    """
    for p in parts:
        # A row whose Line is None has nothing to zip (parse_list applies
        # the same guard); skipping it avoids a TypeError from zip().
        if p.Line is None:
            continue
        # The builtin zip is available on both Python 2 and 3, whereas
        # itertools.izip only exists on Python 2.
        for o in zip(p.Line, p.Latitude, p.Longitude, p.RecordedAtTime,
                     p.vehicleID, p.Trip, p.TripDate):
            yield o

### Method B

In [6]:
def parse_list(p):
    """Turn one row of parallel arrays into a list of per-position tuples.

    Position *i* of each array attribute is combined into a 12-tuple ordered
    as ``(Line, Latitude, Longitude, RecordedAtTime, vehicleID, Trip,
    TripDate, TripPattern, MonitoredCallRef, DistFromCall,
    CallDistAlongRoute, PresentableDistance)``.  Code further down indexes
    these tuples by position, so the order must not change.

    Args:
        p: row-like object exposing the twelve attributes above.

    Returns:
        A list of 12-tuples, or an empty list when ``p.Line`` is None.
    """
    # Identity test: None is a singleton, and `is` cannot be hijacked by a
    # custom __ne__.
    if p.Line is None:
        return []
    # list(...) keeps the return type a list on Python 3 as well, where zip()
    # is lazy; both branches now return the same type.
    return list(zip(
        p.Line,
        p.Latitude,
        p.Longitude,
        p.RecordedAtTime,
        p.vehicleID,
        p.Trip,
        p.TripDate,
        p.TripPattern,
        p.MonitoredCallRef,
        p.DistFromCall,
        p.CallDistAlongRoute,
        p.PresentableDistance,
    ))

## Convert time to Unix time for interpolation

In [7]:
import time
import dateutil.parser
def unix_time(x):
    """Convert a timestamp string to Unix time in whole seconds.

    Args:
        x: timestamp text understood by ``dateutil.parser.parse``, e.g.
            ``'2015-12-30T18:59:41.621-05:00'``.

    Returns:
        Seconds since the epoch as a float.  Fractional seconds of the input
        are dropped, because the value is built from a time tuple.
    """
    import calendar  # local import: only this function needs it

    dt = dateutil.parser.parse(x)
    if dt.utcoffset() is not None:
        # The string carries its own UTC offset, so honour it.  Going through
        # time.mktime would reinterpret the wall-clock fields in the worker's
        # local timezone and shift the result on any machine whose zone
        # differs from the one in the data.
        return float(calendar.timegm(dt.utctimetuple()))
    # No offset in the string: keep interpreting it as local time.
    return time.mktime(dt.timetuple())

In [17]:
# Scratch check of interp1d on a small sample.
# NOTE(review): interp1d is only imported in a later cell of this notebook
# (the cells were executed out of order).
a= [1,2,3,5,6]
b =[3,4,6,7,8]
f = interp1d(a,b)

In [20]:
# Evaluate the interpolant at several points at once; 4 is not one of the
# sample x values, so it is interpolated between 3 and 5.
f([1,2,4,4])

array([ 3. ,  4. ,  6.5,  6.5])

In [30]:
# Rebind the scratch name `a` to a two-element list.
a = [1,2]

## Interpolate function

## Method A

In [93]:
from scipy.interpolate import interp1d
def predict(x):
    """Interpolate a time for each usable record of one group.

    Records are the positional tuples built by ``parse_list``.  Positions
    used here: ``p[0]`` Line, ``p[3]`` RecordedAtTime, ``p[5]`` Trip,
    ``p[-4]`` MonitoredCallRef, ``p[-3]`` DistFromCall and ``p[-2]``
    CallDistAlongRoute.

    An interpolant mapping ``p[-2]`` to ``unix_time(p[3])`` is fitted on every
    record whose ``p[-2]`` is set, then evaluated at ``p[-2] + p[-3]``.

    Args:
        x: iterable of record tuples belonging to one group.  It is consumed
            once, so a one-shot iterator is fine.

    Returns:
        A list of ``(p[0], p[5], p[-4], interpolated_time)`` tuples, where the
        time is whatever the interpolant returns for a scalar.  The list is
        empty when fewer than two records have ``p[-2]`` set.
    """
    # Materialise the usable records once instead of re-filtering ``x`` for
    # every derived list.
    known = [p for p in x if p[-2] is not None]
    if len(known) < 2:
        # A linear interpolant needs at least two points.
        return []

    pre_x = [p[-2] for p in known]
    pre_y = [unix_time(p[3]) for p in known]
    f = interp1d(pre_x, pre_y)

    # Records inside a group arrive in no guaranteed order, so take the real
    # range of the fitted x values rather than the last element.  Evaluating
    # the interpolant outside that range would raise.
    lo, hi = min(pre_x), max(pre_x)

    result = []
    for p in known:
        if p[-3] is None:
            # Adding None would raise a TypeError; predict_map skips these
            # records too.
            continue
        target = p[-2] + p[-3]
        if lo <= target <= hi:
            result.append((p[0], p[5], p[-4], f(target)))
    return result

## Method B

In [87]:
from scipy.interpolate import interp1d
def predict_map(x):
    """Interpolate stop times for one group in a single vectorised call.

    Records are the positional tuples built by ``parse_list``.  Positions
    used here: ``p[3]`` RecordedAtTime, ``p[-4]`` MonitoredCallRef,
    ``p[-3]`` DistFromCall and ``p[-2]`` CallDistAlongRoute.

    An interpolant mapping ``p[-2]`` to ``unix_time(p[3])`` is fitted on every
    record whose ``p[-2]`` is set, then evaluated at ``p[-2] + p[-3]`` for all
    qualifying records at once.

    Args:
        x: iterable of record tuples belonging to one group.  It is consumed
            once, so a one-shot iterator is fine.

    Returns:
        A list of ``(p[-4], interpolated_time)`` pairs.  The list is empty
        when fewer than two records have ``p[-2]`` set, or when no record
        falls inside the fitted range.
    """
    known = [p for p in x if p[-2] is not None]
    if len(known) < 2:
        # A linear interpolant needs at least two points.
        return []

    train_x = [p[-2] for p in known]
    train_y = [unix_time(p[3]) for p in known]
    f = interp1d(train_x, train_y)

    # Records inside a group arrive in no guaranteed order, so take the real
    # range of the fitted x values rather than the last element.
    lo, hi = min(train_x), max(train_x)

    # Collect stop and distance through the SAME filter.  Filtering them
    # separately lets the two lists drift apart, which pairs stops with the
    # wrong interpolated time.
    stops = []
    distance = []
    for p in known:
        if p[-3] is None:
            continue
        target = p[-2] + p[-3]
        if lo <= target <= hi:
            stops.append(p[-4])
            distance.append(target)

    if not distance:
        return []
    stoptimes = f(distance)
    return list(zip(stops, stoptimes))

## Groupby Date and Line & Apply Interpolation

## Simple Extraction

In [105]:
# Flatten every query row into per-position tuples and look at the first two.
output.flatMap(parse_list)\
      .take(2)

[(u'MTA NYCT_Q76',
  40.753476,
  -73.780631,
  u'2015-12-30T18:59:41.621-05:00',
  u'MTA NYCT_7424',
  u'MTA NYCT_CS_W5-Weekday-111000_MISC_843',
  u'2015-12-30',
  u'MTA_Q760118',
  u'MTA_502697',
  16.14,
  7903.95,
  u'at stop'),
 (u'MTA NYCT_B67',
  40.664322,
  -73.983724,
  u'2015-12-30T18:59:52.000-05:00',
  u'MTA NYCT_406',
  u'MTA NYCT_JG_W5-Weekday-109900_B6769_24',
  u'2015-12-30',
  u'MTA_B670106',
  u'MTA_305679',
  12.05,
  8002.2,
  u'at stop')]

## Apply Interpolation

In [106]:
# Key each flattened tuple by positions 5 and 6 (Trip and TripDate in
# parse_list's ordering), group by that key, interpolate within each group
# and look at the first two results.
output.flatMap(parse_list)\
      .map(lambda x:((x[5],x[6]),x)).groupByKey() \
      .flatMap(lambda x: predict(x[1]))\
      .take(2)

[(u'MTA NYCT_Q46',
  u'MTA NYCT_QV_W5-Weekday-110900_Q46_6',
  u'MTA_502327',
  array(1451520016.346892)),
 (u'MTA NYCT_Q46',
  u'MTA NYCT_QV_W5-Weekday-110900_Q46_6',
  u'MTA_502331',
  array(1451520119.0))]

## Remove the prefix, timezones and save as CSV

In [None]:
import sys  # sys.argv is used below, but sys is not imported anywhere earlier in this notebook

# Write one comma-joined text line per flattened tuple, with the agency
# prefixes and the '-05:00' UTC-offset suffix stripped, to the path given as
# the last command-line argument.
# NOTE(review): fields are joined with a plain "," and no quoting, and a None
# field is written as the literal text "None".
output.flatMap(parse_list)\
      .map(lambda x: ",".join(map(str, x)))\
      .map(lambda x: x.replace('MTA NYCT_', '').replace('MTABC_','').replace('MTA_','').replace('-05:00',''))\
      .saveAsTextFile(sys.argv[-1])

# Read from CSV and SQL Manipulation

## Import dependencies

In [None]:
from pyspark.sql.types import *

## Reset Schemas and Indexing

In [None]:
# Explicit schema for the saved CSV.  The column order mirrors the 12-tuple
# produced by parse_list, and every column is nullable.
customSchema = StructType([
    StructField("Line", StringType(), nullable=True),
    StructField("Latitude", DoubleType(), nullable=True),
    StructField("Longitude", DoubleType(), nullable=True),
    StructField("RecordedAtTime", StringType(), nullable=True),
    StructField("vehicleID", StringType(), nullable=True),
    StructField("TripRef", StringType(), nullable=True),
    StructField("TripDate", DateType(), nullable=True),
    StructField("TripPattern", StringType(), nullable=True),
    StructField("Stop_ID", StringType(), nullable=True),
    StructField("DistFromCall", DoubleType(), nullable=True),
    StructField("CallDistAlongRoute", DoubleType(), nullable=True),
    StructField("PresentableDistance", StringType(), nullable=True),
])

## Use CSV=>DF tool to read saved csv

In [None]:
# Load final.csv through the spark-csv data source using the explicit schema.
# NOTE(review): header='true' is set, but the saveAsTextFile export in this
# notebook writes no header row -- confirm final.csv really starts with one,
# otherwise the first record is presumably consumed as the header.
record = sqlContext.read.format('com.databricks.spark.csv').options(header='true').load('final.csv', schema = customSchema)

In [None]:
# Make the loaded DataFrame addressable from SQL under the table name 'record'.
record.registerTempTable('record')

In [None]:
# Fetch the first row of the loaded DataFrame.
record.take(1)

In [None]:
import pyspark.sql.functions as f

## Convert time to Unix timestamp

In [None]:
# Add a Record_time_unix column parsed from RecordedAtTime.  The pattern has no
# zone designator, which fits timestamps whose '-05:00' suffix was stripped
# before saving.
record_new = record.withColumn("Record_time_unix", f.unix_timestamp('RecordedAtTime', format="yyyy-MM-dd'T'HH:mm:ss.SSS"))
# Re-register under the same table name so later SQL on 'record' sees the new column.
record_new.registerTempTable('record')

In [None]:
# Fetch the first row, now including the Record_time_unix column.
record_new.take(1)

## Calculate the trips of each line for every day to test data integrity

In [None]:
# Select the trip, time and distance columns from 'record', newest record first.
# The trailing backslashes sit inside the string literal, so the three source
# lines form a single SQL string.
gaps = sqlContext.sql('select TripRef, Record_time_unix, DistFromCall,CallDistAlongRoute,Stop_ID\
                       from record\
                       ORDER BY Record_time_unix DESC')

In [None]:
# Fetch the ten most recent rows of the ordered result.
gaps.take(10)

## Merge Data from Stoptimes to interpolated times

In [None]:
# Inner-join 'record' to 'stop_times' on both stop id and trip id, keeping the
# trip, stop and recorded time of every matching record.
# NOTE(review): no 'stop_times' table is registered in the visible cells; it
# has to be registered elsewhere before this query can run.
combine = sqlContext.sql(
    'SELECT record.TripRef, record.Stop_ID, record.RecordedAtTime \
    FROM record \
    JOIN stop_times \
    on (record.Stop_ID = stop_times.stop_id AND record.TripRef = stop_times.trip_id)')

In [None]:
#output_df = output1.toDF(['Line','Lat','Lon','Recordtime','ID','Trip','TripDate'])

In [None]:
#output_df.show()