In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, round, mean

# Initialize Spark session

In [3]:
spark = SparkSession.builder \
    .appName("Taxi Data Analysis") \
    .getOrCreate()

# Load data into DataFrame

In [6]:
df = spark.read.csv("data/data_reports_monthly.csv", header=True, inferSchema=True)

# Show schema and sample data

In [7]:
df.printSchema()
df.show(5)

root
 |-- Month/Year: timestamp (nullable = true)
 |-- License Class: string (nullable = true)
 |-- Trips Per Day: string (nullable = true)
 |-- Farebox Per Day: string (nullable = true)
 |-- Unique Drivers: string (nullable = true)
 |-- Unique Vehicles: string (nullable = true)
 |-- Vehicles Per Day: string (nullable = true)
 |-- Avg Days Vehicles on Road: double (nullable = true)
 |-- Avg Hours Per Day Per Vehicle: double (nullable = true)
 |-- Avg Days Drivers on Road: double (nullable = true)
 |-- Avg Hours Per Day Per Driver: double (nullable = true)
 |-- Avg Minutes Per Trip: double (nullable = true)
 |-- Percent of Trips Paid with Credit Card: string (nullable = true)
 |-- Trips Per Day Shared: string (nullable = true)

+-------------------+-----------------+-------------+---------------+--------------+---------------+----------------+-------------------------+-----------------------------+------------------------+----------------------------+--------------------+---------------

# Data Cleaning: Handle missing values and correct data types

In [8]:
df = df.na.drop()
df = df.withColumn("Trips Per Day", col("Trips Per Day").cast("float"))

# Exploratory Data Analysis

In [9]:
df.describe().show()

+-------+---------------+------------------+---------------+------------------+------------------+------------------+-------------------------+-----------------------------+------------------------+----------------------------+--------------------+--------------------------------------+--------------------+
|summary|  License Class|     Trips Per Day|Farebox Per Day|    Unique Drivers|   Unique Vehicles|  Vehicles Per Day|Avg Days Vehicles on Road|Avg Hours Per Day Per Vehicle|Avg Days Drivers on Road|Avg Hours Per Day Per Driver|Avg Minutes Per Trip|Percent of Trips Paid with Credit Card|Trips Per Day Shared|
+-------+---------------+------------------+---------------+------------------+------------------+------------------+-------------------------+-----------------------------+------------------------+----------------------------+--------------------+--------------------------------------+--------------------+
|  count|            761|                22|            761|             

# Analysis: Calculate average trips and fare per license class

In [10]:
avg_trips = df.groupBy("License Class").agg(avg("Trips Per Day").alias("Average Trips"))
avg_trips.show()

+-----------------+-----------------+
|    License Class|    Average Trips|
+-----------------+-----------------+
|  FHV - Black Car|             NULL|
|FHV - High Volume|             NULL|
|            Green|            283.0|
|     FHV - Livery|             NULL|
|           Yellow|             NULL|
|   FHV - Lux Limo|623.8095238095239|
+-----------------+-----------------+

