# Conversion rate for each brand from Cart & View

In [0]:
from pyspark.sql.types import StructType, StructField, TimestampType, StringType, LongType, DoubleType

# Explicit schema used when reading the 2019-Oct / 2019-Nov CSV files.
# Every column is declared nullable (third StructField argument is True).
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("event_time", TimestampType()),
        ("event_type", StringType()),
        ("product_id", LongType()),
        ("category_id", LongType()),
        ("category_code", StringType()),
        ("brand", StringType()),
        ("price", DoubleType()),
        ("user_id", LongType()),
        ("user_session", StringType()),
    )
])

In [0]:
# Load each monthly CSV with the explicit schema; the first row of each file
# is treated as a header.
df_october = (
    spark.read
    .schema(schema)
    .option("header", "true")
    .csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv")
)

df_november = (
    spark.read
    .schema(schema)
    .option("header", "true")
    .csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv")
)

In [0]:
# Register both DataFrames as temp views so SQL cells and spark.sql() can
# refer to them by the same names as the Python variables.
df_october.createOrReplaceTempView(name="df_october")
df_november.createOrReplaceTempView(name="df_november")

In [0]:
%sql
-- Quick check of what Spark SQL can currently see.
SHOW TABLES

In [0]:
%sql

-- Persist the October temp view as a table, replacing any existing one.
CREATE OR REPLACE TABLE workspace.ecommerce.df_october
AS
SELECT * FROM df_october;

-- Same for the November temp view.
CREATE OR REPLACE TABLE workspace.ecommerce.df_november
AS
SELECT * FROM df_november;

In [0]:
# Purchase events from both monthly temp views, restricted to rows that have a
# brand. UNION ALL concatenates the two result sets without de-duplicating.
df_purchase = spark.sql("""
    SELECT * FROM df_november WHERE event_type = 'purchase' AND brand IS NOT NULL
    UNION ALL
    SELECT * FROM df_october WHERE event_type = 'purchase' AND brand IS NOT NULL
""")

    

In [0]:
# View events from both monthly temp views, restricted to rows that have a
# brand. UNION ALL concatenates the two result sets without de-duplicating.
df_view = spark.sql("""
    SELECT * FROM df_november WHERE event_type = 'view' AND brand IS NOT NULL
    UNION ALL
    SELECT * FROM df_october WHERE event_type = 'view' AND brand IS NOT NULL
""")

In [0]:
# Cart events from both monthly temp views, restricted to rows that have a
# brand. UNION ALL concatenates the two result sets without de-duplicating.
df_cart = spark.sql("""
    SELECT * FROM df_november WHERE event_type = 'cart' AND brand IS NOT NULL
    UNION ALL
    SELECT * FROM df_october WHERE event_type = 'cart' AND brand IS NOT NULL
""")

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window

# Per-brand price totals for each event type, each dense-ranked from the
# highest total downwards. `sum` here is the Spark aggregate brought in by the
# wildcard import from pyspark.sql.functions, not the Python builtin.
# The windows have no partitionBy, so each ranking is global across all brands.

# Purchases: total purchase price per brand and its rank.
window_spec1 = Window.orderBy(col("P_total_price").desc())
TopPurchase = (
    df_purchase
    .groupBy("brand")
    .agg(sum("price").alias("P_total_price"))
    .withColumn("P_rank", dense_rank().over(window_spec1))
)

# Cart additions: total carted price per brand and its rank.
window_spec2 = Window.orderBy(col("C_total_price").desc())
TopCart = (
    df_cart
    .groupBy("brand")
    .agg(sum("price").alias("C_total_price"))
    .withColumn("C_rank", dense_rank().over(window_spec2))
)

# Views: total viewed price per brand and its rank.
window_spec3 = Window.orderBy(col("V_total_price").desc())
TopView = (
    df_view
    .groupBy("brand")
    .agg(sum("price").alias("V_total_price"))
    .withColumn("V_rank", dense_rank().over(window_spec3))
)


In [0]:
# Combine the three per-brand summaries on "brand". The left joins keep every
# brand present in TopPurchase; a brand with no matching cart or view row gets
# nulls in the C_* / V_* columns.
Top = (
    TopPurchase
    .join(TopCart, on="brand", how="left")
    .join(TopView, on="brand", how="left")
)
# printSchema is a method and must be called; the bare attribute reference only
# evaluates to the bound method and never prints the schema.
Top.printSchema()

In [0]:
# Price-total-based conversion rates in percent: purchase total relative to the
# cart total and to the view total. Rows are then sorted by purchase rank and
# only the first 10 are kept.
# NOTE(review): a brand with no cart/view row has a null C_/V_ total (left
# joins upstream), so its rate is presumably null — confirm that is acceptable.
Top_final = (
    Top
    .withColumn("Cart_conversionRate", col("P_total_price") / col("C_total_price") * 100)
    .withColumn("View_conversionRate", col("P_total_price") / col("V_total_price") * 100)
    .orderBy("P_rank")
    .limit(10)
)

Top 10 brands by total purchase revenue, with their cart-to-purchase and view-to-purchase conversion rates

In [0]:
# Purchase-side columns for the brands in Top_final, listed by purchase rank.
# NOTE(review): Top_final is already sorted by P_rank and cut to 10 rows, so the
# orderBy/limit below look redundant — confirm before removing.
Top10purchaser = (
    Top_final
    .select("brand", "P_total_price", "P_rank", "Cart_conversionRate", "View_conversionRate")
    .orderBy("P_rank")
    .limit(10)
)
display(Top10purchaser)

Top 10 brands with the highest cart-to-purchase conversion rate

In [0]:
# Cart-side columns for the brands in Top_final, highest cart-to-purchase rate first.
# NOTE(review): Top_final has already been cut to the 10 brands with the best
# purchase rank, so this re-sorts those same brands rather than ranking all
# brands by conversion rate — confirm that this is the intended population.
Top10CartConverser = (
    Top_final
    .select("brand", "C_total_price", "C_rank", "Cart_conversionRate", "View_conversionRate")
    .orderBy(desc("Cart_conversionRate"))
    .limit(10)
)
display(Top10CartConverser)

Top 10 brands with the highest view-to-purchase conversion rate

In [0]:
# View-side columns for the brands in Top_final, highest view-to-purchase rate first.
# NOTE(review): Top_final has already been cut to the 10 brands with the best
# purchase rank, so this re-sorts those same brands rather than ranking all
# brands by conversion rate — confirm that this is the intended population.
Top10ViewConverser = (
    Top_final
    .select("brand", "V_total_price", "V_rank", "Cart_conversionRate", "View_conversionRate")
    .orderBy(desc("View_conversionRate"))
    .limit(10)
)
display(Top10ViewConverser)

Flagging brands with a UDF

In [0]:
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf, col

def conversion_flag(p_rank, cart_rate, view_rate):
    """Classify a brand from its purchase rank and conversion rates.

    Rules are checked in order and the first match wins:

    1. "Top Performer": ``p_rank <= 3`` and ``cart_rate >= 40`` and ``view_rate >= 2``
    2. "Strong Cart Converter": ``cart_rate >= 60``
    3. "Strong View Converter": ``view_rate >= 2``
    4. "Needs Improvement": everything else

    Args:
        p_rank: Purchase rank of the brand (lower is better), or None.
        cart_rate: Cart-to-purchase rate in percent, or None.
        view_rate: View-to-purchase rate in percent, or None.

    Returns:
        One of the four label strings above.

    A None input is treated as failing any threshold that involves it. Spark
    passes SQL nulls to a Python UDF as None, and comparing None with a number
    raises TypeError, so the checks must be null-safe.
    """
    def _at_least(value, threshold):
        # Null-safe ">=": a missing value never satisfies the threshold.
        return value is not None and value >= threshold

    in_top_three = p_rank is not None and p_rank <= 3

    if in_top_three and _at_least(cart_rate, 40) and _at_least(view_rate, 2):
        return "Top Performer"
    if _at_least(cart_rate, 60):
        return "Strong Cart Converter"
    if _at_least(view_rate, 2):
        return "Strong View Converter"
    return "Needs Improvement"

# Wrap the plain-Python classifier as a Spark UDF that returns a string label.
flag_udf = udf(conversion_flag, returnType=StringType())

# Add the Performance_Flag column, derived from the purchase rank and the two
# conversion-rate columns.
Top_final_flagged = Top_final.withColumn(
    "Performance_Flag",
    flag_udf(
        *[col(name) for name in ("P_rank", "Cart_conversionRate", "View_conversionRate")]
    ),
)

# Sort at the DataFrame level so the displayed rows come out by purchase rank.
Top_final_ordered = Top_final_flagged.orderBy("P_rank")

display(
    Top_final_ordered.select(
        [
            "brand",
            "P_total_price",
            "P_rank",
            "Cart_conversionRate",
            "View_conversionRate",
            "Performance_Flag",
        ]
    )
)