# Create a Sample Dataset

In [0]:
from pyspark.sql import Row
from pyspark.sql import functions as F
from datetime import datetime, timedelta
import random

data = []

for i in range(1, 501):
    data.append(Row(
        order_id=i,
        user_id=random.randint(1, 50),
        price=random.randint(100, 2000),
        quantity=random.randint(1, 5),
        event_ts=datetime.now() - timedelta(days=random.randint(0, 30))
    ))

events = spark.createDataFrame(data)
events.show(5)

+--------+-------+-----+--------+--------------------+
|order_id|user_id|price|quantity|            event_ts|
+--------+-------+-----+--------+--------------------+
|       1|      8|  383|       1|2026-01-17 14:25:...|
|       2|     36|  956|       1|2026-01-09 14:25:...|
|       3|     41|  631|       4|2026-01-08 14:25:...|
|       4|     38|  346|       2|2026-01-08 14:25:...|
|       5|     49|  727|       5|2026-01-07 14:25:...|
+--------+-------+-----+--------+--------------------+
only showing top 5 rows


# Descriptive Statistics

In [0]:
events.describe(["price", "quantity"]).show()

+-------+-----------------+------------------+
|summary|            price|          quantity|
+-------+-----------------+------------------+
|  count|              500|               500|
|   mean|          1066.59|             3.012|
| stddev|553.0466626987677|1.3429307226553808|
|    min|              109|                 1|
|    max|             1998|                 5|
+-------+-----------------+------------------+



# Hypothesis Testing (Weekday vs Weekend)

In [0]:
events = events.withColumn(
    "is_weekend",
    F.dayofweek("event_ts").isin([1, 7])
)

events.groupBy("is_weekend") \
      .agg(
          F.count("*").alias("total_orders"),
          F.avg("price").alias("avg_price")
      ).show()


+----------+------------+------------------+
|is_weekend|total_orders|         avg_price|
+----------+------------+------------------+
|      true|         167|1059.3473053892214|
|     false|         333|1070.2222222222222|
+----------+------------+------------------+



In [0]:
events.stat.corr("price", "quantity")


0.03391029599092305

In [0]:
events = events.withColumn("hour", F.hour("event_ts")) \
               .withColumn("day_of_week", F.dayofweek("event_ts"))


In [0]:
events.describe(["price", "quantity"]).show()

+-------+-----------------+------------------+
|summary|            price|          quantity|
+-------+-----------------+------------------+
|  count|              500|               500|
|   mean|          1066.59|             3.012|
| stddev|553.0466626987677|1.3429307226553808|
|    min|              109|                 1|
|    max|             1998|                 5|
+-------+-----------------+------------------+



In [0]:
events = events.withColumn(
    "order_value",
    F.col("price") * F.col("quantity")
)


# Advanced Descriptive Analysis

In [0]:
events.show(5)


+--------+-------+-----+--------+--------------------+----------+----+-----------+-----------+
|order_id|user_id|price|quantity|            event_ts|is_weekend|hour|day_of_week|order_value|
+--------+-------+-----+--------+--------------------+----------+----+-----------+-----------+
|       1|      8|  383|       1|2026-01-17 14:25:...|      true|  14|          7|        383|
|       2|     36|  956|       1|2026-01-09 14:25:...|     false|  14|          6|        956|
|       3|     41|  631|       4|2026-01-08 14:25:...|     false|  14|          5|       2524|
|       4|     38|  346|       2|2026-01-08 14:25:...|     false|  14|          5|        692|
|       5|     49|  727|       5|2026-01-07 14:25:...|     false|  14|          4|       3635|
+--------+-------+-----+--------+--------------------+----------+----+-----------+-----------+
only showing top 5 rows


In [0]:
events.groupBy(
    F.when(F.col("price") < 500, "Low")
     .when(F.col("price") < 1200, "Medium")
     .otherwise("High")
     .alias("price_bucket")
).count().show()


+------------+-----+
|price_bucket|count|
+------------+-----+
|         Low|  104|
|      Medium|  178|
|        High|  218|
+------------+-----+



# Daily Trend Analysis

In [0]:
daily_trend = events.groupBy(
    F.to_date("event_ts").alias("order_date")
).agg(
    F.count("*").alias("orders"),
    F.sum("order_value").alias("daily_revenue")
)

daily_trend.orderBy("order_date").show()


+----------+------+-------------+
|order_date|orders|daily_revenue|
+----------+------+-------------+
|2025-12-21|    18|        46767|
|2025-12-22|    10|        29545|
|2025-12-23|    18|        54876|
|2025-12-24|    11|        42333|
|2025-12-25|    16|        39536|
|2025-12-26|    13|        46383|
|2025-12-27|    26|        68112|
|2025-12-28|    18|        72753|
|2025-12-29|    14|        45054|
|2025-12-30|     7|        20633|
|2025-12-31|    22|        74028|
|2026-01-01|    14|        33378|
|2026-01-02|    13|        52618|
|2026-01-03|    18|        44631|
|2026-01-04|    14|        46953|
|2026-01-05|    20|        67986|
|2026-01-06|    15|        45992|
|2026-01-07|    19|        64636|
|2026-01-08|    19|        64465|
|2026-01-09|    20|        71921|
+----------+------+-------------+
only showing top 20 rows


# Hypothesis Testing

In [0]:
events.groupBy("is_weekend") \
      .agg(
          F.avg("order_value").alias("avg_order_value"),
          F.max("order_value").alias("max_order_value")
      ).show()


+----------+-----------------+---------------+
|is_weekend|  avg_order_value|max_order_value|
+----------+-----------------+---------------+
|      true|3151.365269461078|           9975|
|     false|3281.003003003003|           9870|
+----------+-----------------+---------------+



# User Behavior Analysis

In [0]:
user_orders = events.groupBy("user_id") \
    .agg(
        F.count("*").alias("total_orders"),
        F.sum("order_value").alias("lifetime_value")
    )

user_orders.show(5)


+-------+------------+--------------+
|user_id|total_orders|lifetime_value|
+-------+------------+--------------+
|      8|          12|         46728|
|     36|           9|         29191|
|     41|           8|         27276|
|     38|           5|         10240|
|     49|          13|         40160|
+-------+------------+--------------+
only showing top 5 rows


# Time-Based Behavioral Features

In [0]:
from pyspark.sql.window import Window

w = Window.partitionBy("user_id").orderBy("event_ts")

events = events.withColumn(
    "time_since_last_order",
    F.unix_timestamp("event_ts") -
    F.unix_timestamp(F.lag("event_ts").over(w))
)


# Outlier Detection

In [0]:
q1, q3 = events.approxQuantile("order_value", [0.25, 0.75], 0.05)
iqr = q3 - q1

events.filter(
    (F.col("order_value") < q1 - 1.5 * iqr) |
    (F.col("order_value") > q3 + 1.5 * iqr)
).show()


+--------+-------+-----+--------+--------------------+----------+----+-----------+-----------+---------------------+
|order_id|user_id|price|quantity|            event_ts|is_weekend|hour|day_of_week|order_value|time_since_last_order|
+--------+-------+-----+--------+--------------------+----------+----+-----------+-----------+---------------------+
|     202|      1| 1974|       5|2026-01-09 14:25:...|     false|  14|          6|       9870|               172800|
|     163|      4| 1946|       5|2026-01-09 14:25:...|     false|  14|          6|       9730|                86400|
|     239|      5| 1761|       5|2026-01-14 14:25:...|     false|  14|          4|       8805|               691200|
|     186|      8| 1856|       5|2026-01-17 14:25:...|      true|  14|          7|       9280|                    0|
|     246|      9| 1978|       5|2025-12-27 14:25:...|      true|  14|          7|       9890|               172800|
|     252|     13| 1995|       5|2026-01-18 14:25:...|      true