In [51]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import date, datetime

In [27]:
# Start (or reuse) a Spark session named "NYC Taxi" on the local master,
# with one worker thread per available core ("local[*]").
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("NYC Taxi")
    .getOrCreate()
)

In [28]:
# Display the SparkContext that backs the session created above.
spark.sparkContext

In [71]:
# Trip records: every parquet file under ../data, keeping only rows with no
# null in any column (dropna() with its default arguments).
df = (
    spark.read
    .parquet("../data/*.parquet")
    .dropna()
)

# Zone lookup table: every CSV under ../data, first line used as the header.
# No schema is supplied or inferred here, so every column is loaded as a string.
taxi_zone = spark.read.option("header", True).csv("../data/*.csv")

In [53]:
# Cast the pickup and dropoff columns to Spark's TimestampType, one at a time.
for ts_column in ('tpep_pickup_datetime', 'tpep_dropoff_datetime'):
    df = df.withColumn(ts_column, col(ts_column).cast(TimestampType()))

In [54]:
# Print the column names, types and nullability of the trip DataFrame.
df.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- Airport_fee: double (nullable = true)



In [78]:
# Expose both DataFrames as RDDs of Row objects for the RDD-API cells below.
rdd, taxi_zone_rdd = df.rdd, taxi_zone.rdd

In [20]:
# Number of trip rows in the RDD (triggers a full pass over the data).
rdd.count()

                                                                                

33736408

In [30]:
# List the column names of the trip data.
# `rdd` is built from `df`, so its Row field names are exactly `df.columns`.
# Reading them from the DataFrame schema avoids launching a Spark job to
# fetch one row and avoids relying on the dunder attribute `Row.__fields__`.
df.columns

['VendorID',
 'tpep_pickup_datetime',
 'tpep_dropoff_datetime',
 'passenger_count',
 'trip_distance',
 'RatecodeID',
 'store_and_fwd_flag',
 'PULocationID',
 'DOLocationID',
 'payment_type',
 'fare_amount',
 'extra',
 'mta_tax',
 'tip_amount',
 'tolls_amount',
 'improvement_surcharge',
 'total_amount',
 'congestion_surcharge',
 'Airport_fee']

In [None]:
# Keep only trips whose fare is strictly positive.
rdd_filtered = rdd.filter(lambda trip: 0 < trip.fare_amount)

In [43]:
# Number of trips left after the positive-fare filter.
rdd_filtered.count()

                                                                                

33189829

In [41]:
# Total revenue: add up total_amount over the positive-fare trips.
# RDD.sum() is the built-in for this and returns 0 on an empty RDD, whereas
# reduce() raises ValueError when there is nothing to reduce.
# Summation order differs from a hand-written reduce, so the last few
# floating-point digits may not match earlier runs.
rdd_total_revenue = rdd_filtered.map(lambda row: row.total_amount)
total_revenue = rdd_total_revenue.sum()
print(total_revenue)



965252940.2286084


                                                                                

In [42]:
# Total number of positive-fare trips.
# Mapping every row to 1 and reducing with + is a hand-rolled count that
# raises ValueError on an empty RDD; RDD.count() gives the same number and
# returns 0 when there are no rows.
rdd_total_trips = rdd_filtered.map(lambda row: 1)  # kept in case other cells reference it (lazy, costs nothing)
total_trips = rdd_filtered.count()
print(total_trips)



33189829


                                                                                

In [57]:
def _fold_trip(partial, row):
    """Add one trip's distance to the running sum and bump the trip counter."""
    distance_sum, trip_count = partial
    return (distance_sum + row.trip_distance, trip_count + 1)


def _merge_partials(left, right):
    """Combine two (distance_sum, trip_count) partial results."""
    return (left[0] + right[0], left[1] + right[1])


# Single pass producing (sum of trip_distance, number of trips).
acc_res = rdd_filtered.aggregate((0, 0), _fold_trip, _merge_partials)

                                                                                

In [60]:
# acc_res is (sum of distances, trip count); their ratio is the mean distance.
total_distance, trip_count = acc_res
avg_trip_distance = total_distance / trip_count
print(avg_trip_distance)

3.52196988059172


In [None]:
# Trips per calendar day, keyed by the (month, day) of the pickup timestamp.
# NOTE(review): the key has no year component, so the same month/day from
# different years would be counted together — confirm the data covers a
# single year.
rdd_group_counts = (
    rdd_filtered
    .map(lambda trip: ((trip.tpep_pickup_datetime.month, trip.tpep_pickup_datetime.day), 1))
    .reduceByKey(lambda left, right: left + right)
)

In [67]:
# Order the per-day counts by their (month, day) key, ascending, then bring
# every (key, count) pair back to the driver for display.
rdd_group_counts = rdd_group_counts.sortByKey(ascending=True)
rdd_group_counts.collect()

[((1, 1), 69289),
 ((1, 2), 72252),
 ((1, 3), 79313),
 ((1, 4), 98985),
 ((1, 5), 98770),
 ((1, 6), 90686),
 ((1, 7), 63992),
 ((1, 8), 76873),
 ((1, 9), 85572),
 ((1, 10), 90981),
 ((1, 11), 100480),
 ((1, 12), 98049),
 ((1, 13), 98795),
 ((1, 14), 87778),
 ((1, 15), 73255),
 ((1, 16), 86611),
 ((1, 17), 101139),
 ((1, 18), 103271),
 ((1, 19), 90687),
 ((1, 20), 99037),
 ((1, 21), 78079),
 ((1, 22), 81578),
 ((1, 23), 94544),
 ((1, 24), 100310),
 ((1, 25), 103885),
 ((1, 26), 99990),
 ((1, 27), 104555),
 ((1, 28), 84227),
 ((1, 29), 80926),
 ((1, 30), 96888),
 ((1, 31), 97450),
 ((2, 1), 104373),
 ((2, 2), 100402),
 ((2, 3), 103326),
 ((2, 4), 82961),
 ((2, 5), 84954),
 ((2, 6), 97508),
 ((2, 7), 98189),
 ((2, 8), 107761),
 ((2, 9), 103558),
 ((2, 10), 105087),
 ((2, 11), 87643),
 ((2, 12), 87618),
 ((2, 13), 65904),
 ((2, 14), 111229),
 ((2, 15), 109632),
 ((2, 16), 97121),
 ((2, 17), 93355),
 ((2, 18), 82931),
 ((2, 19), 72074),
 ((2, 20), 92434),
 ((2, 21), 98936),
 ((2, 22), 10797

In [68]:
def _pickup_month_day(trip):
    """Return the (month, day) of a trip's pickup timestamp."""
    pickup = trip.tpep_pickup_datetime
    return (pickup.month, pickup.day)


# Revenue per calendar day: sum total_amount for each (month, day) key,
# then order the result by key.
rdd_group_revenue = (
    rdd_filtered
    .map(lambda trip: (_pickup_month_day(trip), trip.total_amount))
    .reduceByKey(lambda left, right: left + right)
    .sortByKey()
)

rdd_group_revenue.collect()

                                                                                

[((1, 1), 2176241.2999998657),
 ((1, 2), 2248171.0299998727),
 ((1, 3), 2330065.8099999223),
 ((1, 4), 2760258.059999921),
 ((1, 5), 2676444.3199999374),
 ((1, 6), 2343107.289999937),
 ((1, 7), 1861641.5399998736),
 ((1, 8), 2184744.189999933),
 ((1, 9), 2218069.3599999845),
 ((1, 10), 2493877.8299999824),
 ((1, 11), 2840286.3699999033),
 ((1, 12), 2772979.2499999013),
 ((1, 13), 2559243.6499999384),
 ((1, 14), 2360793.559999905),
 ((1, 15), 2111478.32999989),
 ((1, 16), 2515050.2899999493),
 ((1, 17), 2797943.939999905),
 ((1, 18), 2820290.6799999108),
 ((1, 19), 2401037.1399999666),
 ((1, 20), 2448585.259999922),
 ((1, 21), 2165884.749999908),
 ((1, 22), 2308896.4599999175),
 ((1, 23), 2558888.85999995),
 ((1, 24), 2724916.999999934),
 ((1, 25), 2898065.4499998884),
 ((1, 26), 2729972.779999961),
 ((1, 27), 2632053.2199999923),
 ((1, 28), 2288069.329999916),
 ((1, 29), 2296023.4199999226),
 ((1, 30), 2585249.539999969),
 ((1, 31), 2622121.0999999945),
 ((2, 1), 2896187.1399999196),
 