In [3]:

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, TimestampType
import numpy as np
from faker import Faker
from datetime import timedelta
spark = SparkSession.builder.appName("TripDataGeneration").getOrCreate()
fake = Faker()


In [5]:
schema = StructType([
    StructField("TripID", IntegerType(), False),
    StructField("StartTime", TimestampType(), False),
    StructField("EndTime", TimestampType(), False),
    StructField("Distance", FloatType(), False),
    StructField("Fare", FloatType(), False)
])

In [8]:

num_records = 1_000_000
rdd = spark.sparkContext.parallelize(range(num_records)).map(
    lambda i: (
        i,
        fake.date_time_between(start_date="-2d", end_date="now"),  # StartTime
        random.randint(5, 120),  # Duration (generated inside map)
        round(random.uniform(1, 50), 2),  # Distance
    )
)

rdd = rdd.map(lambda x: (
    x[0],
    x[1],
    x[1] + timedelta(minutes=x[2]),
    x[3],
    round(x[3] * random.uniform(5, 15), 2)
))

df = spark.createDataFrame(rdd, schema=schema)
df.show(5)


+------+--------------------+--------------------+--------+------+
|TripID|           StartTime|             EndTime|Distance|  Fare|
+------+--------------------+--------------------+--------+------+
|     0|2025-03-05 00:11:...|2025-03-05 00:45:...|    31.3|287.33|
|     1|2025-03-05 11:23:...|2025-03-05 12:07:...|   39.91|290.61|
|     2|2025-03-04 10:32:...|2025-03-04 12:14:...|    8.26|113.93|
|     3|2025-03-06 02:43:...|2025-03-06 04:33:...|   28.77|352.48|
|     4|2025-03-04 15:32:...|2025-03-04 16:48:...|     9.6| 76.87|
+------+--------------------+--------------------+--------+------+
only showing top 5 rows



In [9]:
from pyspark.sql import functions as F
df = df.withColumn("TripDuration", (F.unix_timestamp(F.col("EndTime")) - F.unix_timestamp(F.col("StartTime"))) / 60)
df.show(5)


+------+--------------------+--------------------+--------+------+------------+
|TripID|           StartTime|             EndTime|Distance|  Fare|TripDuration|
+------+--------------------+--------------------+--------+------+------------+
|     0|2025-03-05 00:17:...|2025-03-05 02:10:...|   13.65| 75.46|       113.0|
|     1|2025-03-05 11:30:...|2025-03-05 12:17:...|   11.42|114.66|        47.0|
|     2|2025-03-04 10:38:...|2025-03-04 12:38:...|   48.54|313.38|       120.0|
|     3|2025-03-06 02:50:...|2025-03-06 04:26:...|   14.66|114.37|        96.0|
|     4|2025-03-04 15:38:...|2025-03-04 15:49:...|    7.09|  60.5|        11.0|
+------+--------------------+--------------------+--------+------+------------+
only showing top 5 rows



In [11]:
df_p = df.select((F.col("Fare") / F.col("Distance")).alias("FarePerMile"))
df_p.show(5)



+------------------+
|       FarePerMile|
+------------------+
|11.383312959918419|
|12.235293558931966|
| 6.003118574532474|
|11.995461318299618|
|14.341986977286082|
+------------------+
only showing top 5 rows



In [14]:
Longest=df.select('TripDuration').orderBy(F.col('TripDuration').desc()).limit(3)
Longest.show()

+------------+
|TripDuration|
+------------+
|       120.0|
|       120.0|
|       120.0|
+------------+



In [15]:
LongestByDistance = df.select('TripID', 'Distance', 'TripDuration').orderBy(F.col('Distance').desc()).limit(3)
LongestByDistance.show()


+------+--------+------------+
|TripID|Distance|TripDuration|
+------+--------+------------+
|525912|    50.0|       101.0|
| 25374|    50.0|        17.0|
|529100|    50.0|        53.0|
+------+--------+------------+



In [18]:
# Extract hour from StartTime and group by hour
trips_per_hour = df.withColumn("HourOfDay", F.hour(F.col("StartTime"))) .groupBy("HourOfDay") .agg(F.count("TripID").alias("TotalTrips"))
trips_per_hour.show(5)


+---------+----------+
|HourOfDay|TotalTrips|
+---------+----------+
|       12|     41432|
|       22|     41600|
|        1|     41629|
|       13|     41291|
|       16|     41899|
+---------+----------+
only showing top 5 rows



In [26]:
import plotly.express as px

# Create a line chart for Hours vs. Total Trips
fig = px.line(trips_per_hour,
              x="HourOfDay",
              y="TotalTrips",
              title="Trips Per Hour of the Day",
              labels={"HourOfDay": "Hour of Day", "TotalTrips": "Total Number of Trips"},
              markers=True)

# Show the line chart
fig.show()
