In [0]:

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import *
import datetime
# Reuse the active Spark session if one exists, otherwise start a new one.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [0]:
# Load the January 2021 yellow-taxi trips and expose them as the temp view
# 'yellow_taxi_data'.
#
# createOrReplaceTempView() returns None, so it must not terminate the chain
# that is assigned to tax_data_df (the name would end up bound to None).
# The DataFrame is kept first and the view is registered in a separate call.
# The 'inferSchema' option was dropped: it is a CSV/JSON reader option, while
# Parquet files carry their own schema.
tax_data_df = spark.read \
    .parquet("s3a://taxi-data-databb/taxi-data/yellow_tripdata_2021-01.parquet")
tax_data_df.createOrReplaceTempView('yellow_taxi_data')

In [0]:
# Fetch the trips back through the registered temp view, then show a
# 10-row sample (the row cap is applied in Spark before converting to pandas).
taxi_data = spark.table('yellow_taxi_data')
(
    taxi_data
    .limit(10)
    .toPandas()
)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2021-01-01 00:30:10,2021-01-01 00:36:12,1.0,2.1,1.0,N,142,43,2,8.0,3.0,0.5,0.0,0.0,0.3,11.8,2.5,
1,1,2021-01-01 00:51:20,2021-01-01 00:52:19,1.0,0.2,1.0,N,238,151,2,3.0,0.5,0.5,0.0,0.0,0.3,4.3,0.0,
2,1,2021-01-01 00:43:30,2021-01-01 01:11:06,1.0,14.7,1.0,N,132,165,1,42.0,0.5,0.5,8.65,0.0,0.3,51.95,0.0,
3,1,2021-01-01 00:15:48,2021-01-01 00:31:01,0.0,10.6,1.0,N,138,132,1,29.0,0.5,0.5,6.05,0.0,0.3,36.35,0.0,
4,2,2021-01-01 00:31:49,2021-01-01 00:48:21,1.0,4.94,1.0,N,68,33,1,16.5,0.5,0.5,4.06,0.0,0.3,24.36,2.5,
5,1,2021-01-01 00:16:29,2021-01-01 00:24:30,1.0,1.6,1.0,N,224,68,1,8.0,3.0,0.5,2.35,0.0,0.3,14.15,2.5,
6,1,2021-01-01 00:00:28,2021-01-01 00:17:28,1.0,4.1,1.0,N,95,157,2,16.0,0.5,0.5,0.0,0.0,0.3,17.3,0.0,
7,1,2021-01-01 00:12:29,2021-01-01 00:30:34,1.0,5.7,1.0,N,90,40,2,18.0,3.0,0.5,0.0,0.0,0.3,21.8,2.5,
8,1,2021-01-01 00:39:16,2021-01-01 01:00:13,1.0,9.1,1.0,N,97,129,4,27.5,0.5,0.5,0.0,0.0,0.3,28.8,0.0,
9,1,2021-01-01 00:26:12,2021-01-01 00:39:46,2.0,2.7,1.0,N,263,142,1,12.0,3.0,0.5,3.15,0.0,0.3,18.95,2.5,


In [0]:
# Add the trip duration in seconds: both timestamps are cast to "long" and the
# pickup value is subtracted from the drop-off value
# (e.g. 00:30:10 -> 00:36:12 gives 362).
taxi_data = taxi_data.withColumn(
    'seconds_trip_time',
    F.col('tpep_dropoff_datetime').cast("long")
    - F.col('tpep_pickup_datetime').cast("long"),
)







In [0]:
# Load the taxi-zone lookup CSV (first row is the header, column types are
# inferred) and expose it as the temp view 'lookup_data'.
#
# createOrReplaceTempView() returns None, so the DataFrame is assigned first
# and the view is registered in a separate call; otherwise lookup_data_df
# would be bound to None.
lookup_data_df = spark.read \
    .option('header', True) \
    .option('inferSchema', True) \
    .csv("s3a://taxi-data-databb/lookup/taxi+_zone_lookup.csv")
lookup_data_df.createOrReplaceTempView('lookup_data')

In [0]:
# Fetch the zone lookup through its temp view and preview 5 rows.
# The row cap is applied in Spark (limit) so only those rows are collected to
# the driver, matching how the trip data is previewed.
lookup_data = spark.table('lookup_data')
lookup_data.limit(5).toPandas()

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone


In [0]:
# Attach pickup-zone attributes to every trip: inner join of the trips with the
# zone lookup on PULocationID == LocationID (trips without a match are dropped).
result = taxi_data.join(
    lookup_data,
    on=taxi_data.PULocationID == lookup_data.LocationID,
    how='inner',
)


In [0]:
# Keep only the columns needed downstream and rename the pickup zone name
# ('Zone' from the lookup join) to 'zone_from'.
result = result.select(
    F.col('passenger_count'),
    F.col('trip_distance'),
    F.col('PULocationID'),
    F.col('DOLocationID'),
    F.col('seconds_trip_time'),
    F.col('total_amount'),
    F.col('Zone').alias('zone_from'))

# Preview 5 rows. limit() runs in Spark, so only those rows reach the driver;
# toPandas() on the full frame would collect every joined trip into driver
# memory just to display a handful of them.
result.limit(5).toPandas()

Unnamed: 0,passenger_count,trip_distance,PULocationID,DOLocationID,seconds_trip_time,total_amount,zone_from
0,1.0,2.1,142,43,362,11.8,Lincoln Square East
1,1.0,0.2,238,151,59,4.3,Upper West Side North
2,1.0,14.7,132,165,1656,51.95,JFK Airport
3,0.0,10.6,138,132,913,36.35,LaGuardia Airport
4,1.0,4.94,68,33,992,24.36,East Chelsea


In [0]:
# Attach drop-off-zone attributes: inner join with the same zone lookup, this
# time on DOLocationID == LocationID.
# NOTE: this cell rebinds `result`, so it is meant to run once after the
# pickup-zone select above.
result = result.join(
    lookup_data,
    on=result.DOLocationID == lookup_data.LocationID,
    how='inner',
)

In [0]:
# Preview 5 rows of the doubly-joined frame; limit() keeps the collection to
# the driver at 5 rows instead of materialising the whole data set in pandas.
result.limit(5).toPandas()

Unnamed: 0,passenger_count,trip_distance,PULocationID,DOLocationID,seconds_trip_time,total_amount,zone_from,LocationID,Borough,Zone,service_zone
0,1.0,2.1,142,43,362,11.8,Lincoln Square East,43,Manhattan,Central Park,Yellow Zone
1,1.0,0.2,238,151,59,4.3,Upper West Side North,151,Manhattan,Manhattan Valley,Yellow Zone
2,1.0,14.7,132,165,1656,51.95,JFK Airport,165,Brooklyn,Midwood,Boro Zone
3,0.0,10.6,138,132,913,36.35,LaGuardia Airport,132,Queens,JFK Airport,Airports
4,1.0,4.94,68,33,992,24.36,East Chelsea,33,Brooklyn,Brooklyn Heights,Boro Zone


In [0]:
# Narrow the joined frame to distance, duration, amount and the two zone names
# (the drop-off 'Zone' becomes 'zone_to'), then drop trips whose duration is
# not strictly positive.
result_df = (
    result
    .select(
        'trip_distance',
        'seconds_trip_time',
        'total_amount',
        'zone_from',
        F.col('Zone').alias('zone_to'),
    )
    .filter(F.col('seconds_trip_time') > 0)
)

In [0]:
# Preview 5 rows; limit() is applied in Spark so the full trip-level frame is
# not collected to the driver.
result_df.limit(5).toPandas()

Unnamed: 0,trip_distance,seconds_trip_time,total_amount,zone_from,zone_to
0,2.1,362,11.8,Lincoln Square East,Central Park
1,0.2,59,4.3,Upper West Side North,Manhattan Valley
2,14.7,1656,51.95,JFK Airport,Midwood
3,10.6,913,36.35,LaGuardia Airport,JFK Airport
4,4.94,992,24.36,East Chelsea,Brooklyn Heights


In [0]:
# Aggregate per (zone_from, zone_to) route:
#   avg_sec_trip         - mean trip duration in seconds
#   avg_trip_distance_km - mean of trip_distance * 1.609
#                          (assumes trip_distance is in miles, 1.609 km per
#                          mile - TODO confirm against the data dictionary)
# Only routes whose mean distance exceeds 10 km are kept.
#
# F.avg is used explicitly instead of the bare `avg` pulled in by the star
# import, consistent with the F.col(...) style used in the rest of the notebook.
# NOTE: this cell rebinds `result_df`; it expects the trip-level frame produced
# by the select/where cell and is not re-runnable on its own output.
result_df = (
    result_df
    .groupBy(F.col('zone_from'), F.col('zone_to'))
    .agg(
        F.avg(F.col("seconds_trip_time")).alias("avg_sec_trip"),
        F.avg(F.col("trip_distance") * 1.609).alias("avg_trip_distance_km"),
    )
    .filter(F.col('avg_trip_distance_km') > 10)
)


In [0]:
# Preview 5 aggregated routes; limit() caps what is collected to the driver.
result_df.limit(5).toPandas()

Unnamed: 0,zone_from,zone_to,avg_sec_trip,avg_trip_distance_km
0,West Chelsea/Hudson Yards,Erasmus,1914.0,14.99588
1,UN/Turtle Bay South,Park Slope,1615.407407,13.876731
2,JFK Airport,Parkchester,1691.941176,28.953482
3,Little Italy/NoLiTa,Manhattan Valley,1405.454545,10.87684
4,Central Harlem,Brownsville,2921.4,29.756846


In [0]:
# Derive the route speed in km/h: mean distance (km) divided by the mean
# duration converted from seconds to hours. Only the two zone names and the
# computed speed are kept.
result_df = result_df.select(
    'zone_from',
    'zone_to',
    (
        F.col('avg_trip_distance_km') / (F.col('avg_sec_trip') / 3600)
    ).alias('avg_speed_km_h'),
)

In [0]:
# Preview up to 30 routes; limit() caps what is collected to the driver
# instead of converting every route to pandas first.
result_df.limit(30).toPandas()

Unnamed: 0,zone_from,zone_to,avg_speed_km_h
0,West Chelsea/Hudson Yards,Erasmus,28.205417
1,UN/Turtle Bay South,Park Slope,30.92485
2,JFK Airport,Parkchester,61.605294
3,Little Italy/NoLiTa,Manhattan Valley,27.86047
4,Central Harlem,Brownsville,36.668941
5,Sutton Place/Turtle Bay North,Coney Island,23.892417
6,Upper East Side North,Elmhurst,23.440358
7,East Village,JFK Airport,53.107858
8,Midtown Center,Inwood,24.332052
9,Upper East Side North,South Jamaica,29.127839


In [0]:
# Persist the per-route average speeds as CSV with a header row, replacing any
# previous output at the target location.
result_df.write.csv(
    "s3a://taxi-data-databb/clean/taxi_data.csv",
    mode="OVERWRITE",
    header="true",
)