In [8]:
from pyspark.sql import SparkSession
# NOTE: `round` imported here shadows Python's built-in round() for the rest of the notebook.
from pyspark.sql.functions import col, avg, round, mean, when

# Initialize Spark session

In [9]:
# Build the notebook-wide Spark session, or reuse one that is already running.
# Starting a session launches a Java gateway process, so a working Java
# runtime must be available to the kernel.
spark = (
    SparkSession.builder
    .appName("Taxi Data Analysis")
    .getOrCreate()
)

PySparkRuntimeError: [JAVA_GATEWAY_EXITED] Java gateway process exited before sending its port number.

# Load data into DataFrame

In [5]:
# Read the monthly report CSV: the first row supplies the column names and
# Spark infers each column's type from the file contents.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("data/data_reports_monthly.csv")
)

NameError: name 'spark' is not defined

# Show schema and sample data

In [None]:
# Print the inferred column names and types, then preview the first five rows.
df.printSchema()
df.show(n=5)

# Data Cleaning: Handle missing values and correct data types

In [None]:
# Drop every row that contains a null in any column, then cast the trip count
# column to Spark's 32-bit float type.
# NOTE(review): the null drop runs before the cast, so any nulls the cast
# itself produces would remain in the frame — confirm that is intended.
df = (
    df.dropna()
    .withColumn("Trips Per Day", col("Trips Per Day").cast("float"))
)

# Exploratory Data Analysis

In [None]:
# Summary statistics (count, mean, stddev, min, max) for the cleaned frame.
summary_stats = df.describe()
summary_stats.show()

# Analysis: Calculate average trips and fare per license class

In [None]:
# Mean daily trip count for each license class; kept in `avg_trips` because
# the visualization cell plots it.
avg_trips = (
    df.groupBy("License Class")
    .agg(avg("Trips Per Day").alias("Average Trips"))
)
avg_trips.show()

In [None]:
# Fare earned per trip for each record, then its mean per license class.
# The division is guarded: a record with zero trips gets a null fare (which
# mean() ignores) instead of depending on the session's divide-by-zero
# behaviour, which raises an error when Spark's ANSI mode is enabled.
# Despite its name, `avg_fare_per_trip` holds the per-record frame with the
# added "Fare Per Trip" column; the per-class averages are only displayed.
avg_fare_per_trip = df.withColumn(
    "Fare Per Trip",
    when(
        col("Trips Per Day") != 0,
        col("Farebox Per Day") / col("Trips Per Day"),
    ),
)
(
    avg_fare_per_trip
    .groupBy("License Class")
    .agg(mean("Fare Per Trip").alias("Average Fare Per Trip"))
    .show()
)

# Data Visualization (using Pandas and Matplotlib)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Bring the small per-class aggregate to the driver as a pandas DataFrame.
# Row order after a Spark groupBy is not guaranteed, so sort by class to keep
# the bar order stable between runs.
pandas_df = avg_trips.toPandas().sort_values('License Class')

# Draw on an explicit Axes so the title and axis labels are set on this figure
# rather than through pyplot's implicit "current figure" state.
fig, ax = plt.subplots()
pandas_df.plot(kind='bar', x='License Class', y='Average Trips', ax=ax, legend=False)
ax.set_title('Average Trips Per Day by License Class')
ax.set_xlabel('License Class')
ax.set_ylabel('Average Trips Per Day')
plt.show()