In [1]:
# âœ… Colab Spark Setup (Run this once)
!pip -q install pyspark

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Obtain the notebook's Spark session; getOrCreate() hands back an already
# running session when there is one instead of starting a second.
spark = (
    SparkSession.builder
    .appName("Databricks-Day11-Task")
    .getOrCreate()
)

# Echo the runtime version so the recorded outputs can be tied to a release.
print(" Spark Version:", spark.version)

 Spark Version: 4.0.1


In [2]:
# ================================
# Day 11 - Statistical Analysis & ML Prep (PySpark)
# ================================

# Reuse an `events` DataFrame that was defined before this cell ran. When the
# name is unbound, fall back to a five-row sample so the rest of the cell can
# be exercised end to end.
try:
    events
except NameError:
    print("'events' DataFrame not found. Creating a sample dataset...")
    # Column names, in the same order as the fields of each sample tuple.
    cols = ["user_id", "event_date", "event_time", "event_type", "price", "conversion_rate"]
    sample_data = [
        (1, "2026-01-10", "2026-01-10 10:30:00", "view", 500.0, 0.02),
        (1, "2026-01-10", "2026-01-10 10:35:00", "click", 500.0, 0.03),
        (2, "2026-01-11", "2026-01-11 12:00:00", "purchase", 1000.0, 0.08),
        (3, "2026-01-12", "2026-01-12 09:00:00", "view", 700.0, 0.01),
        (3, "2026-01-13", "2026-01-13 09:05:00", "click", 700.0, 0.02),
    ]
    events = spark.createDataFrame(sample_data, schema=cols)
else:
    print("'events' DataFrame found.")

# Normalize event_date / event_time to DATE / TIMESTAMP columns (they are
# plain strings in the sample dataset built above).
events = (
    events
    .withColumn("event_date", F.to_date("event_date"))
    .withColumn("event_time", F.to_timestamp("event_time"))
)

# Show the resulting schema, then the first rows without truncating values.
print("\n Schema:")
events.printSchema()

print("\n Sample data:")
events.show(n=5, truncate=False)

# --------------------------------
# 1)  Descriptive Statistics
# --------------------------------
print(
    "\n====================",
    "1)  Descriptive Stats for price",
    "====================",
    sep="\n",
)

# Built-in summary table for the price column only.
price_only = events.select("price")
price_only.describe().show()

# The same kind of figures through explicit aggregate expressions, which
# lets each output column carry a descriptive alias.
print("\n More stats (mean, stddev, min, max):")
price_aggregates = [
    F.mean("price").alias("mean_price"),
    F.stddev("price").alias("std_price"),
    F.min("price").alias("min_price"),
    F.max("price").alias("max_price"),
]
events.agg(*price_aggregates).show()

# --------------------------------
# 2)  Hypothesis test prep: Weekday vs Weekend
# --------------------------------
print(
    "\n====================",
    "2)  Weekday vs Weekend (group counts)",
    "====================",
    sep="\n",
)

# dayofweek() numbers days from 1 (Sunday) to 7 (Saturday), so 1 and 7 mark
# the weekend; every other row gets 0.
weekend_flag = (
    F.when(F.dayofweek("event_date").isin([1, 7]), F.lit(1))
    .otherwise(F.lit(0))
)
events2 = events.withColumn("is_weekend", weekend_flag)

# Event counts per (is_weekend, event_type) pair, in a stable order.
events2.groupBy("is_weekend", "event_type").count().orderBy("is_weekend", "event_type").show()

# conversion_rate is optional: compare its weekday/weekend average only when
# the column is actually present.
if "conversion_rate" not in events2.columns:
    print("\n 'conversion_rate' column not found. Skipping avg comparison.")
else:
    print("\n Avg conversion_rate: Weekday vs Weekend")
    conversion_by_day_type = events2.groupBy("is_weekend").agg(
        F.count("*").alias("rows"),
        F.avg("conversion_rate").alias("avg_conversion_rate"),
    )
    conversion_by_day_type.show()

# --------------------------------
# 3) Correlations
# --------------------------------
print("\n====================")
print("3) Correlation")
print("====================")

# Correlation between price and conversion_rate, computed only when the
# optional conversion_rate column is present.
if "conversion_rate" in events2.columns:
    corr_val = events2.stat.corr("price", "conversion_rate")
    print(f" Corr(price, conversion_rate) = {corr_val}")
else:
    print("'conversion_rate' column not found. Can't compute Corr(price, conversion_rate).")

# List the numeric columns (the candidates for a pairwise correlation matrix;
# the matrix itself is not computed here). DataFrame.dtypes reports Spark SQL
# type names, so every integral and fractional type has to be matched:
# tinyint/smallint/int/bigint, float/double, and the parameterized
# decimal(p,s), which can only be recognized by its prefix.
numeric_cols = [
    c for c, t in events2.dtypes
    if t in ("tinyint", "smallint", "int", "bigint", "float", "double")
    or t.startswith("decimal")
]
print("\n Numeric columns found:", numeric_cols)

# --------------------------------
# 4) Feature Engineering for ML
# --------------------------------
print("\n====================")
print("4) Feature Engineering for ML")
print("====================")

# Per-user window ordered by event time, used to locate each user's first
# event (of any type) for time_since_first_event_sec.
w = Window.partitionBy("user_id").orderBy("event_time")

# Earliest non-NULL event_time per user. Ascending order sorts NULL
# timestamps first, so without ignorenulls=True a single NULL event_time
# would become the "first" value and blank the feature for every row of that
# user. Kept as an expression rather than a temporary column so an input
# column that happens to be called first_event_time is never overwritten
# and dropped.
first_event_time = F.first("event_time", ignorenulls=True).over(w)

features = (
    events2
    .withColumn("hour", F.hour("event_time"))
    .withColumn("day_of_week", F.dayofweek("event_date"))
    # log1p(x) is log(1 + x) but stays accurate for prices close to zero.
    .withColumn("price_log", F.log1p(F.col("price")))
    # Seconds elapsed since the user's first event (0 on the first event).
    .withColumn(
        "time_since_first_event_sec",
        (F.unix_timestamp("event_time") - F.unix_timestamp(first_event_time)).cast("long"),
    )
)

print("\n Engineered features preview:")
features.select(
    "user_id", "event_date", "event_time", "event_type",
    "is_weekend", "hour", "day_of_week", "price", "price_log", "time_since_first_event_sec"
).show(10, truncate=False)

print("\n Done Task 11 Successfully")


'events' DataFrame not found. Creating a sample dataset...

 Schema:
root
 |-- user_id: long (nullable = true)
 |-- event_date: date (nullable = true)
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- price: double (nullable = true)
 |-- conversion_rate: double (nullable = true)


 Sample data:
+-------+----------+-------------------+----------+------+---------------+
|user_id|event_date|event_time         |event_type|price |conversion_rate|
+-------+----------+-------------------+----------+------+---------------+
|1      |2026-01-10|2026-01-10 10:30:00|view      |500.0 |0.02           |
|1      |2026-01-10|2026-01-10 10:35:00|click     |500.0 |0.03           |
|2      |2026-01-11|2026-01-11 12:00:00|purchase  |1000.0|0.08           |
|3      |2026-01-12|2026-01-12 09:00:00|view      |700.0 |0.01           |
|3      |2026-01-13|2026-01-13 09:05:00|click     |700.0 |0.02           |
+-------+----------+-------------------+----------+------+----