In [0]:
%pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, concat, lit, to_timestamp, avg, month, when, count, current_timestamp
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType

# Get the Spark session for this notebook; getOrCreate() reuses a session that
# is already running instead of starting a second one.
spark = (
    SparkSession.builder
    .appName('fhvhv_cleaned')
    .master('local')
    .getOrCreate()
)

from pyspark.sql.types import DoubleType

# Explicit schema for the FHVHV trip CSV files, so the reader does not have to
# infer column types. Every column is nullable.
#
# * The four *_datetime columns are read as strings and converted to
#   timestamps after loading.
# * Distance and money columns are declared as DoubleType. They are cast to
#   double right after loading anyway, and parsing them as 32-bit floats first
#   would bake float rounding error into the widened values
#   (e.g. 12.34 -> 12.34000015258789), which then leaks into sums and
#   group-by keys.
schema = StructType([
    StructField("hvfhs_license_num", StringType(), True),
    StructField("dispatching_base_num", StringType(), True),
    StructField("originating_base_num", StringType(), True),
    StructField("request_datetime", StringType(), True),
    StructField("on_scene_datetime", StringType(), True),
    StructField("pickup_datetime", StringType(), True),
    StructField("dropoff_datetime", StringType(), True),
    StructField("PULocationID", IntegerType(), True),
    StructField("DOLocationID", IntegerType(), True),
    StructField("trip_miles", DoubleType(), True),
    StructField("trip_time", IntegerType(), True),
    StructField("base_passenger_fare", DoubleType(), True),
    StructField("tolls", DoubleType(), True),
    StructField("bcf", DoubleType(), True),
    StructField("sales_tax", DoubleType(), True),
    StructField("congestion_surcharge", DoubleType(), True),
    StructField("airport_fee", DoubleType(), True),
    StructField("tips", DoubleType(), True),
    StructField("driver_pay", DoubleType(), True),
    StructField("shared_request_flag", StringType(), True),
    StructField("shared_match_flag", StringType(), True),
    StructField("access_a_ride_flag", StringType(), True),
    StructField("wav_request_flag", StringType(), True),
    StructField("wav_match_flag", StringType(), True)
])

# Columns that arrive as strings and must become timestamps.
timestamp_columns = [
    "request_datetime",
    "on_scene_datetime",
    "pickup_datetime",
    "dropoff_datetime",
]

# Numeric columns that are normalised to double for the analyses below.
double_columns = [
    "PULocationID",
    "DOLocationID",
    "trip_miles",
    "trip_time",
    "base_passenger_fare",
    "tolls",
    "bcf",
    "sales_tax",
    "congestion_surcharge",
    "airport_fee",
    "tips",
    "driver_pay",
]

# Load every CSV in the FHVHV dataset directory using the explicit schema.
df = spark.read.csv("hdfs://nodemastertah:9000/NYCtaxi/dataset/fhvhv/fhvhv_csv/*.csv", header=True, schema=schema)

# Replace each column in place with its converted version; withColumn on an
# existing name keeps the column's position in the frame.
for column_name in timestamp_columns:
    df = df.withColumn(column_name, col(column_name).cast("timestamp"))
for column_name in double_columns:
    df = df.withColumn(column_name, col(column_name).cast("double"))

# Quick sanity check of the resulting types and a few rows.
df.printSchema()
df.show(5)

# on_scene_datetime: null stays null, anything else is (re)cast to timestamp.
on_scene_expr = when(col("on_scene_datetime").isNull(), lit(None)) \
    .otherwise(col("on_scene_datetime").cast("timestamp"))

# wav_match_flag: fall back to wav_request_flag when the match flag is missing.
wav_match_expr = when(col("wav_match_flag").isNull(), col("wav_request_flag")) \
    .otherwise(col("wav_match_flag"))

# Replacement values for nulls in the listed columns.
null_defaults = {
    "originating_base_num": "N/A",
    "airport_fee": 0
}

df_filled = df.withColumn("on_scene_datetime", on_scene_expr)
df_filled = df_filled.fillna(null_defaults)
df_filled = df_filled.withColumn("wav_match_flag", wav_match_expr)

# Total_Amt is the left-to-right sum of these columns. A null in any of them
# makes the total null for that row (only airport_fee is null-filled above).
amount_columns = [
    "base_passenger_fare",
    "tolls",
    "tips",
    "congestion_surcharge",
    "airport_fee",
    "driver_pay",
]
total_amount = col(amount_columns[0])
for amount_column in amount_columns[1:]:
    total_amount = total_amount + col(amount_column)

df_add = df_filled.withColumn("Total_Amt", total_amount)


# Keep only trips with a strictly positive distance; rows where trip_miles is
# null are dropped as well because the comparison evaluates to null.
has_positive_distance = col("trip_miles") > 0
df_cleaned_fhvhv = df_add.where(has_positive_distance)

# Count the surviving rows up front (this triggers a full pass over the data).
legal_rows_count = df_cleaned_fhvhv.count()

print("clean data:")

df_cleaned_fhvhv.show(5)
print(f"Number of legal rows: {legal_rows_count}")



In [1]:
%pyspark
from pyspark.sql.functions import col, count

# Trip volume for every (pickup zone, drop-off zone) pair, busiest pairs first.
# Reads df_cleaned_fhvhv, the cleaned frame built in the first paragraph; the
# name df_cleaned used here before is never defined in this notebook, so the
# paragraph failed with NameError on a fresh interpreter.
location_heatmap = (
    df_cleaned_fhvhv
    .groupBy("PULocationID", "DOLocationID")
    .agg(count("*").alias("trip_count"))
    .orderBy(col("trip_count").desc())
)

z.show(location_heatmap)


In [2]:
%pyspark
# Sum of base passenger fares for each distinct congestion surcharge value,
# ordered by surcharge. The dict form of agg() names the result column
# "sum(base_passenger_fare)".
# Reads df_cleaned_fhvhv (built in the first paragraph); df_cleaned, the name
# used here before, is never defined in this notebook.
congestion_effect = (
    df_cleaned_fhvhv
    .groupBy("congestion_surcharge")
    .agg({"base_passenger_fare": "sum"})
    .orderBy("congestion_surcharge")
)

z.show(congestion_effect)


In [3]:
%pyspark
from pyspark.sql.functions import hour

# Number of trips per (pickup hour, pickup zone).
# Reads df_cleaned_fhvhv (built in the first paragraph); df_cleaned, the name
# used here before, is never defined in this notebook. The intermediate frame
# gets its own name so the notebook-wide `df` from the loading paragraph is not
# silently replaced by a different frame.
trips_with_hour = df_cleaned_fhvhv.withColumn("hour", hour("pickup_datetime"))

demand_supply = trips_with_hour.groupBy("hour", "PULocationID").count()

z.show(demand_supply)


In [4]:
%pyspark
# Spark's aggregate sum is imported under an alias: the bare name `sum` is the
# Python builtin, which raises TypeError when handed a column name, and
# importing it unaliased would shadow the builtin for the rest of the notebook.
from pyspark.sql.functions import col, sum as spark_sum

# Total base passenger fare per (pickup zone, drop-off zone) route.
# Reads df_cleaned_fhvhv (built in the first paragraph); df_cleaned, the name
# used here before, is never defined in this notebook.
low_profit_routes = (
    df_cleaned_fhvhv
    .groupBy("PULocationID", "DOLocationID")
    .agg(spark_sum("base_passenger_fare").alias("total_revenue"))
)

# Show only the routes whose total revenue is below 10.
z.show(low_profit_routes.filter(col("total_revenue") < 10))


In [5]:
%pyspark
# `hour` is imported here as well so this paragraph does not depend on an
# earlier paragraph having been run first.
from pyspark.sql.functions import hour, unix_timestamp

# Trip duration in minutes (drop-off minus pickup, seconds / 60) and the hour
# of day at which the trip started.
# Reads df_cleaned_fhvhv (built in the first paragraph); df_cleaned, the name
# used here before, is never defined in this notebook. The intermediate frame
# gets its own name so the notebook-wide `df` is not overwritten.
trips_with_duration = (
    df_cleaned_fhvhv
    .withColumn(
        "trip_duration",
        (unix_timestamp("dropoff_datetime") - unix_timestamp("pickup_datetime")) / 60,
    )
    .withColumn("pickup_hour", hour("pickup_datetime"))
)

# Average duration per pickup hour; the dict form of agg() names the result
# column "avg(trip_duration)".
hourly_duration = (
    trips_with_duration
    .groupBy("pickup_hour")
    .agg({"trip_duration": "avg"})
    .orderBy("pickup_hour")
)

z.show(hourly_duration)


In [6]:
%pyspark
from pyspark.sql.functions import dayofweek, count

# Number of trips per day of the week of the pickup time, ordered by day.
# Spark's dayofweek() numbers days 1 (Sunday) through 7 (Saturday).
pickup_day = dayofweek("pickup_datetime").alias("Day_of_Week")
trips_per_day = count("*").alias("Trip_Count")

trip_demand = (
    df_cleaned_fhvhv
    .groupBy(pickup_day)
    .agg(trips_per_day)
    .orderBy("Day_of_Week")
)

z.show(trip_demand)


In [7]:
%pyspark
from pyspark.sql.functions import (col, avg)

# Tip as a percentage of the base passenger fare, computed per trip.
tip_percent_expr = (col("tips") / col("base_passenger_fare")) * 100
trips_with_tip_percent = df_cleaned_fhvhv.withColumn("tip_percent", tip_percent_expr)

# Average tip percentage for each shared_request_flag value, highest first.
tip_percentage = (
    trips_with_tip_percent
    .groupBy("shared_request_flag")
    .agg(avg("tip_percent").alias("Avg_Tip_Percentage"))
    .orderBy(col("Avg_Tip_Percentage").desc())
)

z.show(tip_percentage)


In [8]:
%pyspark
from pyspark.sql.functions import count

# Location IDs treated as airports by this analysis.
# NOTE(review): presumably the TLC zone IDs for JFK and LaGuardia - confirm
# against the taxi zone lookup table.
airport_zone_ids = [132, 138]

# A trip qualifies when it starts or ends at one of those locations.
touches_airport = (
    col("PULocationID").isin(airport_zone_ids)
    | col("DOLocationID").isin(airport_zone_ids)
)

# Trip count per (pickup, drop-off) pair among qualifying trips, busiest first.
airport_trips = (
    df_cleaned_fhvhv
    .filter(touches_airport)
    .groupBy("PULocationID", "DOLocationID")
    .agg(count("*").alias("Trip_Count"))
    .orderBy(col("Trip_Count").desc())
)

z.show(airport_trips)


In [9]:
%pyspark
from pyspark.sql.functions import avg

# Average driver pay for each distinct trip_miles value, ordered by distance.
# The grouping key is the exact trip_miles value; no binning is applied.
average_driver_pay = avg("driver_pay").alias("Avg_Driver_Pay")

earnings_vs_distance = (
    df_cleaned_fhvhv
    .groupBy("trip_miles")
    .agg(average_driver_pay)
    .orderBy("trip_miles")
)

z.show(earnings_vs_distance)
