In [0]:
%pyspark
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import col, concat, lit, to_timestamp, year, avg, month, count, weekofyear, rand, when
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, LongType, BooleanType, FloatType, \
    DoubleType, TimestampType

# Get (or create) the SparkSession used by every cell in this notebook.
# getOrCreate() hands back an already-running session when one exists.
spark = (
    SparkSession.builder
    .appName("fhv_data_processing")
    .master("local")
    .getOrCreate()
)

# Explicit schema for the FHV trip CSVs, declared as (column name, Spark type)
# pairs. Every field is nullable so malformed or missing values load as null
# and can be handled by the cleaning step.
_fhv_columns = [
    ("dispatching_base_num", StringType()),
    ("pickup_datetime", TimestampType()),
    ("dropOff_datetime", TimestampType()),
    ("PUlocationID", DoubleType()),
    ("DOlocationID", DoubleType()),
    ("SR_Flag", IntegerType()),
    ("Affiliated_base_number", StringType()),
]

schema_fhv = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in _fhv_columns]
)


# Load every FHV CSV matched by the glob, applying the explicit schema and
# treating the first line of each file as a header.
fhv_csv_glob = "hdfs://nodemastertah:9000/NYCtaxi/dataset/fhv/fhv_csv/*.csv"
df_fhv = spark.read.csv(path=fhv_csv_glob, schema=schema_fhv, header=True)

# Quick look at the raw rows.
df_fhv.show(5)

# Missing SR_Flag values become 0.
df_fill = df_fhv.fillna({"SR_Flag": 0})

# Keep only rows that have both timestamps and both location IDs present,
# and whose pickup and drop-off location IDs are strictly positive.
required_cols = ["pickup_datetime", "dropOff_datetime", "PUlocationID", "DOlocationID"]
has_positive_locations = (col("PUlocationID") > 0) & (col("DOlocationID") > 0)
df_cleaned = df_fill.dropna(subset=required_cols).filter(has_positive_locations)

# Quick look at the cleaned rows.
df_cleaned.show(5)



In [1]:
%pyspark
from pyspark.sql.functions import hour, count

# Trip count for each pickup hour of the day, listed in hour order.
pickups_by_hour = df_cleaned.withColumn("hour", hour("pickup_datetime")).groupBy("hour")
peak_hours = (
    pickups_by_hour
    .agg(count("*").alias("trip_count"))
    .orderBy("hour")
)

z.show(peak_hours)


In [2]:
%pyspark
from pyspark.sql.functions import col, count

# Trip count for every (pickup location, drop-off location) pair,
# busiest pairs first.
trips_per_route = (
    df_cleaned
    .groupBy("PUlocationID", "DOlocationID")
    .agg(count("*").alias("trip_count"))
)
location_heatmap = trips_per_route.orderBy(col("trip_count").desc())

z.show(location_heatmap)


In [3]:
%pyspark
from pyspark.sql.functions import unix_timestamp, avg

# Average trip duration (in minutes) per dispatching base, longest first.
#
# Duration is the difference between the drop-off and pickup timestamps in
# seconds, divided by 60. The earlier cleaning step checks for nulls and
# positive location IDs but not for timestamp ordering, so rows whose
# drop-off precedes the pickup would yield negative durations and pull the
# per-base averages down. Those rows are excluded before aggregating.
duration_minutes = (
    unix_timestamp("dropOff_datetime") - unix_timestamp("pickup_datetime")
) / 60

trip_duration = (
    df_cleaned
    .withColumn("duration", duration_minutes)
    .filter(col("duration") >= 0)  # drop rows where drop-off is before pickup
    .groupBy("dispatching_base_num")
    .agg(avg("duration").alias("avg_trip_duration"))
    .orderBy(col("avg_trip_duration").desc())
)

z.show(trip_duration)


In [4]:
%pyspark
from pyspark.sql.functions import hour

# Number of trips per dispatching base for each pickup hour of the day,
# sorted by base and then by hour. The tally lands in a column named "count".
base_hour_groups = (
    df_cleaned
    .withColumn("hour", hour("pickup_datetime"))
    .groupBy("dispatching_base_num", "hour")
)
hourly_trips = base_hour_groups.count().orderBy("dispatching_base_num", "hour")

z.show(hourly_trips)


In [5]:
%pyspark
from pyspark.sql.functions import hour, count

# Most frequent pickup/drop-off pairs among trips whose pickup hour is
# 22 or later, or 4 or earlier.
late_night_window = "hour >= 22 OR hour <= 4"

night_pickups = (
    df_cleaned
    .withColumn("hour", hour("pickup_datetime"))
    .filter(late_night_window)
)
late_night_trips = (
    night_pickups
    .groupBy("PUlocationID", "DOlocationID")
    .agg(count("*").alias("trip_count"))
    .orderBy(col("trip_count").desc())
)

z.show(late_night_trips)


In [6]:
%pyspark
from pyspark.sql.functions import dayofweek

# Total trips split into "Weekend" vs "Weekday" by pickup date.
# Spark's dayofweek() numbers days 1 (Sunday) through 7 (Saturday),
# so values 1 and 7 are the weekend.
is_weekend = col("day_of_week").isin([1, 7])
day_type_label = when(is_weekend, "Weekend").otherwise("Weekday")

weekday_vs_weekend = (
    df_cleaned
    .withColumn("day_of_week", dayofweek("pickup_datetime"))
    .withColumn("day_type", day_type_label)
    .groupBy("day_type")
    .agg(count("*").alias("trip_count"))
)

z.show(weekday_vs_weekend)


In [7]:
%pyspark
from pyspark.sql.functions import count

# Dispatching bases that appear in fewer than 100 cleaned trips,
# listed from the least active upward.
trips_per_base = (
    df_cleaned
    .groupBy("dispatching_base_num")
    .agg(count("*").alias("trip_count"))
)
ghost_bases = trips_per_base.filter("trip_count < 100").orderBy("trip_count")

z.show(ghost_bases)
