In [87]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import json

In [27]:
# Start (or reuse, via getOrCreate) a Spark session named "NYC Taxi" that runs
# locally; "local[*]" asks Spark for as many worker threads as logical cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("NYC Taxi")
    .getOrCreate()
)

In [28]:
# Bare expression: evaluates to the SparkContext behind the session so the
# notebook can show it as the cell's result.
spark.sparkContext

In [71]:
# Trip records: read every parquet file matching ../data/*.parquet, then drop
# each row that has a null in any column (dropna() defaults to how="any").
trip_records_glob = "../data/*.parquet"
df = spark.read.parquet(trip_records_glob).dropna()

# Presumably the taxi-zone lookup table (verify against the files in ../data):
# read every CSV matching ../data/*.csv, taking column names from the header row.
# No schema is supplied and schema inference is left off, so all columns are strings.
taxi_zone = spark.read.option("header", True).csv("../data/*.csv")

In [53]:
# Cast both trip timestamp columns to TimestampType, replacing each column in
# place (same name) so later code can read .month / .day from the Row values.
for ts_column in ('tpep_pickup_datetime', 'tpep_dropoff_datetime'):
    df = df.withColumn(ts_column, col(ts_column).cast(TimestampType()))

In [54]:
# Print the column names, types and nullability of the trip DataFrame as a tree.
df.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- Airport_fee: double (nullable = true)



In [78]:
# Expose both DataFrames as RDDs of Row objects for the RDD-API steps that follow.
# NOTE(review): taxi_zone_rdd is not used in the cells visible here — confirm it is
# needed further down before keeping it.
rdd, taxi_zone_rdd = df.rdd, taxi_zone.rdd

In [20]:
# Count every row in the trip RDD. count() is an action, so this runs a Spark
# job over the data (the recorded output below is from one such run).
rdd.count()

                                                                                

33736408

In [30]:
# Fetch the first Row and show its field names, i.e. the attribute names the
# row-level lambdas in later cells can access (row.fare_amount, row.total_amount, ...).
rdd.first().__fields__

['VendorID',
 'tpep_pickup_datetime',
 'tpep_dropoff_datetime',
 'passenger_count',
 'trip_distance',
 'RatecodeID',
 'store_and_fwd_flag',
 'PULocationID',
 'DOLocationID',
 'payment_type',
 'fare_amount',
 'extra',
 'mta_tax',
 'tip_amount',
 'tolls_amount',
 'improvement_surcharge',
 'total_amount',
 'congestion_surcharge',
 'Airport_fee']

In [None]:
def _has_positive_fare(trip):
    """Return True when the trip Row's fare_amount is strictly greater than zero."""
    return trip.fare_amount > 0


# Keep only trips with a positive fare; rows with a zero or negative
# fare_amount are excluded from every metric computed from rdd_filtered.
rdd_filtered = rdd.filter(_has_positive_fare)

In [43]:
# Count the trips that survived the positive-fare filter (an action: runs a Spark job).
rdd_filtered.count()

                                                                                

33189829

In [41]:
# Total revenue: the sum of total_amount over the positive-fare trips.
rdd_total_revenue = rdd_filtered.map(lambda row: row.total_amount)

# fold() with a 0.0 start value is used instead of reduce(): reduce() raises
# ValueError when the RDD is empty, while fold() just returns the start value.
# Adding 0.0 leaves a non-empty floating-point sum unchanged.
# (The builtin sum() is deliberately avoided: the wildcard import of
# pyspark.sql.functions at the top of the notebook shadows it.)
total_revenue = rdd_total_revenue.fold(0.0, lambda x, y: x + y)
print(total_revenue)



965252940.2286084


                                                                                

In [42]:
# One marker per trip. Kept so that any other cell referring to rdd_total_trips
# still works; the map is lazy, so defining it does not trigger a job.
rdd_total_trips = rdd_filtered.map(lambda row: 1)

# Total number of positive-fare trips. count() is the built-in action for this:
# it gives the same integer as summing the 1-markers with reduce(), but returns
# 0 for an empty RDD instead of raising ValueError.
total_trips = rdd_filtered.count()
print(total_trips)



33189829


                                                                                

In [57]:
def _add_trip(partial, trip):
    """Fold one trip Row into a (distance_sum, trip_count) pair."""
    return (partial[0] + trip.trip_distance, partial[1] + 1)


def _merge_partials(left, right):
    """Combine two (distance_sum, trip_count) pairs from different partitions."""
    return (left[0] + right[0], left[1] + right[1])


# Single pass over the positive-fare trips that accumulates both the summed
# trip_distance and the number of trips, starting from (0, 0).
acc_res = rdd_filtered.aggregate((0, 0), _add_trip, _merge_partials)

                                                                                

In [60]:
# Mean trip distance = summed distance / trip count, both taken from the
# (distance_sum, trip_count) pair in acc_res.
# When no trips passed the filter the mean is undefined: use None (serialised
# as JSON null) rather than letting the division raise ZeroDivisionError.
avg_trip_distance = acc_res[0] / acc_res[1] if acc_res[1] else None
print(avg_trip_distance)

3.52196988059172


In [None]:
def _pickup_month_day(trip):
    """Return the (month, day) of the trip Row's pickup timestamp."""
    picked_up = trip.tpep_pickup_datetime
    return (picked_up.month, picked_up.day)


# Number of trips per pickup (month, day): emit a 1 for each trip under its key
# and add the 1s up per key.
# NOTE(review): the key has no year, so trips from different years that share a
# month/day land in the same bucket — confirm the input covers a single year.
rdd_group_counts = (
    rdd_filtered
    .map(lambda trip: (_pickup_month_day(trip), 1))
    .reduceByKey(lambda left, right: left + right)
)

In [105]:
# Order the per-day counts by their (month, day) key, i.e. chronologically
# within a year, before pulling them to the driver.
rdd_group_counts = rdd_group_counts.sortByKey()

# Collect the (key, count) pairs into a dict; insertion order keeps the sort.
daily_trip = {day_key: trips for day_key, trips in rdd_group_counts.collect()}

# JSON object keys cannot be tuples, so render each (month, day) key as its
# string form, e.g. "(1, 1)".
daily_trip_str_keys = {
    str(day_key): trips
    for day_key, trips in daily_trip.items()
}

In [None]:
def _dated_amount(trip):
    """Pair the trip Row's pickup (month, day) with its total_amount."""
    picked_up = trip.tpep_pickup_datetime
    return ((picked_up.month, picked_up.day), trip.total_amount)


# Revenue per pickup (month, day): key each trip's total_amount by its pickup
# day and add the amounts up per key.
# NOTE(review): the key has no year, so amounts from different years that share
# a month/day are summed together — confirm the input covers a single year.
rdd_group_revenue = (
    rdd_filtered
    .map(_dated_amount)
    .reduceByKey(lambda left, right: left + right)
)

In [107]:
# Order the per-day revenue by its (month, day) key before collecting.
rdd_group_revenue = rdd_group_revenue.sortByKey()

# Bring the sorted (key, amount) pairs to the driver as a dict (order preserved).
group_revenue = {day_key: amount for day_key, amount in rdd_group_revenue.collect()}

# Tuple keys are not valid JSON object keys, so convert each (month, day) key
# to its string form, e.g. "(1, 1)".
daily_revenue_str_keys = {
    str(day_key): amount
    for day_key, amount in group_revenue.items()
}

In [108]:
# Gather every metric computed above into one JSON-serialisable summary.
# Key order here is the order the entries appear in the output file.
res = {}
res["total_number_of_trips"] = total_trips
res["total_revenue"] = total_revenue
res["average_trip_distance"] = avg_trip_distance
res["number_of_trips_per_day"] = daily_trip_str_keys
res["total_revenue_per_day"] = daily_revenue_str_keys

# Write the summary to res.json in the current working directory, pretty-printed
# with a 4-space indent; an existing file is overwritten.
with open('res.json', 'w') as summary_file:
    summary_file.write(json.dumps(res, indent=4))