In [0]:
%pyspark

from pyspark.sql.session import SparkSession
from pyspark.sql.functions import col, concat, lit, to_timestamp, year, avg, month, count, weekofyear, rand, when
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, LongType, BooleanType, FloatType,DoubleType

# Obtain the Spark session for this notebook: getOrCreate() hands back the
# already-active session if there is one, otherwise it builds a new one with
# the app name and master configured below.
spark = (
    SparkSession.builder
    .appName('green_taxi_filter')
    .master('local')
    .getOrCreate()
)

# Column names of the green-taxi CSV files, in file order.
GREEN_TAXI_COLUMNS = [
    "VendorID",
    "lpep_pickup_datetime",
    "lpep_dropoff_datetime",
    "store_and_fwd_flag",
    "RatecodeID",
    "PULocationID",
    "DOLocationID",
    "passenger_count",
    "trip_distance",
    "fare_amount",
    "extra",
    "mta_tax",
    "tip_amount",
    "tolls_amount",
    "ehail_fee",
    "improvement_surcharge",
    "total_amount",
    "payment_type",
    "trip_type",
    "congestion_surcharge",
]

# Every column is ingested as a nullable string; the real types are applied
# by explicit casts after the read.
schema = StructType(
    [StructField(column_name, StringType(), True) for column_name in GREEN_TAXI_COLUMNS]
)


# Read all green-taxi CSV files with the all-string schema (first line of each
# file is a header).
df = spark.read.schema(schema).csv(
    "hdfs://nodemastertah:9000/NYCtaxi/dataset/green_taxi/green_csv/*.csv",
    header=True,
)

# Target Spark SQL type for each column, applied in this order.
# NOTE(review): the location IDs are cast to double rather than integer --
# confirm this is intentional.
column_casts = [
    ("VendorID", "integer"),
    ("lpep_pickup_datetime", "timestamp"),
    ("lpep_dropoff_datetime", "timestamp"),
    ("store_and_fwd_flag", "string"),
    ("RatecodeID", "integer"),
    ("PULocationID", "double"),
    ("DOLocationID", "double"),
    ("passenger_count", "integer"),
    ("trip_distance", "double"),
    ("fare_amount", "double"),
    ("extra", "double"),
    ("mta_tax", "double"),
    ("tip_amount", "double"),
    ("tolls_amount", "double"),
    ("ehail_fee", "double"),
    ("improvement_surcharge", "double"),
    ("total_amount", "double"),
    ("payment_type", "integer"),
    ("trip_type", "integer"),
    ("congestion_surcharge", "double"),
]

# Replace each string column in place with its typed version.
for column_name, target_type in column_casts:
    df = df.withColumn(column_name, col(column_name).cast(target_type))

df.printSchema()

# Keep only trips with 1 to 10 passengers (inclusive) and a non-null payment
# type. Rows whose passenger_count is null fail the range test and are dropped.
df_cleaned = df.filter(
    col("passenger_count").between(1, 10)
    & col("payment_type").isNotNull()
)

# Overwrite ehail_fee with a constant zero. lit(0.0) (a double) is used rather
# than lit(0) so the column keeps the double type it was cast to above instead
# of silently becoming an integer column.
df_cleaned = df_cleaned.withColumn("ehail_fee", lit(0.0))

# df_cleaned is the input of every analysis cell below; cache it so the CSV
# files are not re-read and re-cast for each action. The count() that follows
# materialises the cache.
df_cleaned = df_cleaned.cache()

legal_rows_count = df_cleaned.count()

print("clean data:")

df_cleaned.show(5)
print(f"Number of legal rows: {legal_rows_count}")

In [1]:
%pyspark
from pyspark.sql.functions import col

# Correlation coefficient between trip distance and total fare, computed over
# the cleaned trips.
df = df_cleaned.select('trip_distance', 'total_amount')

correlation = df.stat.corr('trip_distance', 'total_amount')
print(f"Correlation between Trip Distance and Total Fare: {correlation}")



In [2]:
%pyspark

# Average total fare for each passenger count.
df = df_cleaned.select('passenger_count', 'total_amount')

fares_grouped = df.groupBy('passenger_count')
fare_by_passenger_count = fares_grouped.agg({'total_amount': 'avg'})

z.show(fare_by_passenger_count)


In [3]:
%pyspark

# Number of trips per payment type.
df = df_cleaned.select('payment_type')

trips_by_payment = df.groupBy('payment_type')
payment_distribution = trips_by_payment.count()

z.show(payment_distribution)


In [4]:
%pyspark

# The 20 pickup locations with the most trips, busiest first.
df = df_cleaned.select('PULocationID')

trips_per_pickup = df.groupBy('PULocationID').count()
pickup_counts = trips_per_pickup.orderBy(col('count').desc()).limit(20)

z.show(pickup_counts)


In [5]:
%pyspark

# The 20 most frequent (pickup, drop-off) location pairs, busiest first.
df = df_cleaned.select('PULocationID', 'DOLocationID')

trips_per_route = df.groupBy('PULocationID', 'DOLocationID').count()
route_counts = trips_per_route.orderBy(col('count').desc()).limit(20)

z.show(route_counts)


In [6]:
%pyspark
from pyspark.sql.functions import unix_timestamp

# Per-trip duration (in minutes) alongside the total fare.
df = df_cleaned.select('lpep_pickup_datetime', 'lpep_dropoff_datetime', 'total_amount')

# Epoch seconds of each end of the trip; their difference / 60 is the duration.
pickup_seconds = unix_timestamp('lpep_pickup_datetime')
dropoff_seconds = unix_timestamp('lpep_dropoff_datetime')
df = df.withColumn('duration_minutes', (dropoff_seconds - pickup_seconds) / 60)

duration_fare = df.select('duration_minutes', 'total_amount')

z.show(duration_fare)


In [7]:
%pyspark

# Trip count for every (pickup, drop-off) location pair, in no particular order.
df = df_cleaned.select('PULocationID', 'DOLocationID')

trips_by_location_pair = df.groupBy('PULocationID', 'DOLocationID')
location_counts = trips_by_location_pair.count()

z.show(location_counts)


In [8]:
%pyspark
from pyspark.sql.functions import date_format, col, desc
# Import Spark's sum under an alias: a bare `import sum` would replace Python's
# builtin sum() in the notebook namespace for every cell that runs afterwards.
from pyspark.sql.functions import sum as spark_sum

# Label each trip with its pickup month formatted as "yyyy-MM".
df = df_cleaned.withColumn("month", date_format(col("lpep_pickup_datetime"), "yyyy-MM"))

# Total revenue (sum of total_amount) per pickup month.
monthly_revenue = df.groupBy("month").agg(spark_sum("total_amount").alias("total_revenue"))

# Most recent month first.
monthly_revenue = monthly_revenue.orderBy(desc("month"))

z.show(monthly_revenue)

In [9]:
%pyspark
from pyspark.sql.functions import count

# Trip count for each (pickup, drop-off) location pair, busiest pairs first.
location_heatmap = (
    df_cleaned
    .groupBy("PULocationID", "DOLocationID")
    .agg(count("*").alias("trip_count"))
    .orderBy("trip_count", ascending=False)
)

z.show(location_heatmap)


In [10]:
%pyspark
from pyspark.sql.functions import hour, unix_timestamp

# Average trip duration (minutes) for each pickup hour of the day.
pickup_epoch = unix_timestamp("lpep_pickup_datetime")
dropoff_epoch = unix_timestamp("lpep_dropoff_datetime")

df = df_cleaned.withColumn("trip_duration", (dropoff_epoch - pickup_epoch) / 60)
df = df.withColumn("pickup_hour", hour("lpep_pickup_datetime"))

trips_by_hour = df.groupBy("pickup_hour")
hourly_duration = trips_by_hour.avg("trip_duration").orderBy("pickup_hour")

z.show(hourly_duration)


In [11]:
%pyspark
from pyspark.sql.functions import when

# Rule-based anomaly labels. Rules are evaluated in order and the first match
# wins; rows matching none (including rows where a compared value is null)
# are labelled "Normal".

# Almost no distance travelled but a large charge.
is_overcharge = (col("trip_distance") < 0.1) & (col("total_amount") > 20)

# Very long trip but a tiny charge.
is_undercharge = (col("trip_distance") > 100) & (col("total_amount") < 10)

# Tip exceeds half of the total. The total must be positive: without that
# guard every row with a negative total_amount and a zero tip satisfies
# `tip > total * 0.5` and would be mislabelled as an excessive tip.
is_excessive_tip = (col("total_amount") > 0) & (col("tip_amount") > col("total_amount") * 0.5)

df_fraud = df_cleaned.withColumn(
    "fraud_flag",
    when(is_overcharge, "Overcharge")
    .when(is_undercharge, "Undercharge")
    .when(is_excessive_tip, "Excessive Tip")
    .otherwise("Normal"),
)

z.show(df_fraud.groupBy("fraud_flag").count())


In [12]:
%pyspark
from pyspark.sql.functions import col, to_date, unix_timestamp
from pyspark.sql.functions import sum as spark_sum  # alias avoids shadowing builtin sum()
from pyspark.ml.feature import VectorAssembler

# Build the daily revenue table here so the cell runs on a fresh kernel;
# no earlier cell defines `daily_revenue`.
#   date          - calendar date of the pickup
#   total_revenue - sum of total_amount for trips picked up on that date
#   date_numeric  - the date as a Unix timestamp (integer seconds)
# Rows with a null pickup time or null total are dropped first because
# VectorAssembler raises on null inputs by default.
daily_revenue = (
    df_cleaned
    .filter(col("lpep_pickup_datetime").isNotNull() & col("total_amount").isNotNull())
    .groupBy(to_date(col("lpep_pickup_datetime")).alias("date"))
    .agg(spark_sum("total_amount").alias("total_revenue"))
    .withColumn("date_numeric", unix_timestamp("date"))
    .orderBy("date")
)

# Assemble features
vec_assembler = VectorAssembler(inputCols=["date_numeric", "total_revenue"], outputCol="features")
df_features = vec_assembler.transform(daily_revenue)

z.show(df_features)


In [13]:
%pyspark
from pyspark.sql.functions import hour

# Number of trips starting in each hour of the day, ordered by hour.
df = df_cleaned.withColumn("hour_of_day", hour("lpep_pickup_datetime"))

trips_by_hour_of_day = df.groupBy("hour_of_day")
hourly_demand = trips_by_hour_of_day.count().sort("hour_of_day")

z.show(hourly_demand)


In [14]:
%pyspark

# Average price per mile for each distinct trip distance, ordered by distance.
# Only trips with a strictly positive distance are used: a zero distance makes
# the division undefined (null in legacy mode, a DIVIDE_BY_ZERO error when
# ANSI mode is enabled), and a negative or null distance gives no meaningful
# price.
df = (
    df_cleaned
    .filter(col("trip_distance") > 0)
    .withColumn("price_per_mile", col("total_amount") / col("trip_distance"))
)

distance_pricing = df.groupBy("trip_distance").agg({"price_per_mile": "avg"}).orderBy("trip_distance")

z.show(distance_pricing)


In [15]:
%pyspark
from pyspark.sql.functions import dayofweek

# Number of trips per pickup day of the week, ordered by day number
# (Spark's dayofweek numbers days 1 = Sunday through 7 = Saturday).
df = df_cleaned.withColumn("day_of_week", dayofweek("lpep_pickup_datetime"))

trips_by_weekday = df.groupBy("day_of_week")
weekly_demand = trips_by_weekday.count().sort("day_of_week")

z.show(weekly_demand)
