In [0]:
from pyspark.sql import SparkSession
# Get (or create) the Spark session used by this notebook.
spark = SparkSession.builder.appName("sales").getOrCreate()

# Customer master data: one (customer_id, join_date, region) tuple per customer.
# All three fields are supplied as plain strings.
schema = ["customer_id", "join_date", "region"]
data = [
    ("C1001", "2023-01-05", "East"),
    ("C1002", "2023-03-12", "West"),
    ("C1003", "2023-07-25", "East"),
    ("C1004", "2024-02-01", "West"),
    ("C1005", "2024-05-30", "Central"),
]

cust = spark.createDataFrame(data=data, schema=schema)
cust.show()

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType
from pyspark.sql import functions as F

# Session handle: getOrCreate() hands back the running session when one exists.
spark = SparkSession.builder.appName("MockDataCreation").getOrCreate()

# Mock transactions as (transaction_id, customer_id, amount, timestamp) tuples.
# Every field, including amount and timestamp, is a string at this stage.
raw_sales_data = [
    ("T100", "C1001", "50.00", "2024-01-10 10:00:00"),
    ("T101", "C1002", "125.50", "2024-01-15 14:30:00"),
    ("T102", "C1001", "20.00", "2024-02-01 09:15:00"),
    ("T103", "C1003", "300.00", "2024-03-20 18:45:00"),
    ("T104", "C1002", "75.00", "2024-04-05 11:20:00"),
    ("T105", "C1001", "15.00", "2024-09-01 16:00:00"),
    ("T106", "C1004", "5.50", "2024-09-05 08:30:00"),
    ("T107", "C1003", "450.00", "2024-09-10 12:00:00"),
    ("T108", "C1005", "1000.00", "2024-10-01 13:10:00"),
    ("T109", "C1001", "10.00", "2024-10-10 17:00:00"),
    ("T110", "C1002", "20.00", "2024-10-14 09:00:00"),
]

# Description of the intended final column types.
# NOTE: this schema object is not passed to createDataFrame below; the type
# conversion is done column by column with cast / to_timestamp instead.
sales_schema = StructType(
    [
        StructField("transaction_id", StringType(), False),
        StructField("customer_id", StringType(), False),
        StructField("amount", DoubleType(), True),
        StructField("timestamp", TimestampType(), True),
    ]
)

# Load the tuples under temporary string column names, then project the final
# columns, converting amount to double and the timestamp string to a timestamp.
sales_df = (
    spark.createDataFrame(
        raw_sales_data,
        ["transaction_id", "customer_id", "amount_str", "timestamp_str"],
    )
    .select(
        "transaction_id",
        "customer_id",
        F.col("amount_str").cast(DoubleType()).alias("amount"),
        F.to_timestamp(F.col("timestamp_str")).alias("timestamp"),
    )
)

# Inspect the rows and the resulting column types.
print("Mock Sales DataFrame:")
sales_df.show()
print("Schema:")
sales_df.printSchema()

In [0]:
from pyspark.sql import functions as f
# Fixed reference date (a date-typed literal column) that stands in for "today";
# it is the end date of the datediff used to compute customer recency.
Today = f.to_date(f.lit("2025-10-14"))

In [0]:
# Per-customer totals: summed transaction amount and number of distinct transactions.
customer_metrics_df = (
    sales_df
    .groupBy("customer_id")
    .agg(
        f.sum("amount").alias("Total_sales"),
        f.countDistinct("transaction_id").alias("Total_orders"),
    )
)
customer_metrics_df.display()

In [0]:
# Average ticket value = total sales divided by number of orders, per customer.
customer_metrics_df = customer_metrics_df.withColumn(
    "avg_tkt_value",
    f.col("Total_sales") / f.col("Total_orders"),
)

In [0]:
# Show the per-customer metrics again, now including the avg_tkt_value column.
customer_metrics_df.display()

In [0]:
# Latest purchase timestamp for each customer.
recency_df = (
    sales_df
    .groupBy("customer_id")
    .agg(f.max("timestamp").alias("last_purchase"))
)
recency_df.display()

# Recency = number of days from the last purchase to the reference date `Today`.
recency_df = recency_df.withColumn(
    "recency",
    f.datediff(Today, f.col("last_purchase")),
)
recency_df.display()

In [0]:
# Combine the sales metrics with the recency metrics; only customers present
# in both frames are kept (inner join on customer_id).
final_metrics_df = customer_metrics_df.join(
    recency_df,
    on="customer_id",
    how="inner",
)
final_metrics_df.display()

In [0]:
# Attach customer attributes (join_date, region) to the metrics. The left join
# keeps every metrics row even when no matching customer record exists.
customer_final_df = final_metrics_df.join(
    cust,
    on="customer_id",
    how="left",
)
customer_final_df.display()

In [0]:
from pyspark.sql import Window
# Rank customers inside each region, highest Total_sales first (rank 1 = best).
window_spec = (
    Window
    .partitionBy("region")
    .orderBy(f.col("Total_sales").desc())
)
ranked_df = customer_final_df.withColumn(
    "rank",
    f.rank().over(window_spec),
)
ranked_df.display()

In [0]:
from pyspark.sql.functions import col
# Flag the top 10% of customers (by rank on Total_sales) within each region.
#
# The rank cutoff is ceil(region_size / 10), i.e. 10% of the region rounded UP.
# Comparing against the raw fraction 0.1 * region_size is wrong for small
# regions: a region of 2 customers gives a cutoff of 0.2, and since ranks start
# at 1 no customer could ever be flagged. Rounding up guarantees at least one
# top customer per region. Dividing by 10 (rather than multiplying by 0.1) also
# avoids floating-point artefacts such as 0.1 * 30 == 3.0000000000000004.
region_size = f.count("*").over(Window.partitionBy("region"))
top_rank_cutoff = f.ceil(region_size / 10)

segmented_df = ranked_df.withColumn(
    "topCustomer",
    f.when(f.col("rank") <= top_rank_cutoff, "Yes").otherwise("No"),
)
segmented_df.display()


In [0]:
# Add the "is_top_clv_customer" flag: "Yes" for customers whose per-region rank
# falls inside the top 10% of their region, "No" otherwise.
#
# The cutoff is ceil(region_size / 10), i.e. 10% of the region rounded UP.
# Using the raw fraction 0.1 * region_size never matches in regions with fewer
# than 10 customers (cutoff < 1 while ranks start at 1), so every customer
# would be "No". Rounding up guarantees at least one top customer per region,
# and integer division by 10 avoids float artefacts from multiplying by 0.1.
region_size = F.count("*").over(Window.partitionBy("region"))
top_rank_cutoff = F.ceil(region_size / 10)

segmented_df = segmented_df.withColumn(
    "is_top_clv_customer",
    F.when(F.col("rank") <= top_rank_cutoff, "Yes").otherwise("No"),
)
segmented_df.display()