In [0]:
# Load the 2019-Oct CSV: the first row supplies column names and Spark
# infers each column's type from the data.
df1 = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv")
)



In [0]:
# Preview the first five rows of the October data.
df1.show(n=5)

In [0]:
 # Size of the October DataFrame: row count (runs a Spark job) and column count.
 rows, columns = df1.count(), len(df1.columns)

 print("There are {} rows and {} columns in the dataset.".format(rows, columns))

In [0]:
# Load the 2019-Nov CSV: the first row supplies column names and Spark
# infers each column's type from the data.
df2 = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv")
)

In [0]:
# Preview the first five rows of the November data.
df2.show(n=5)

In [0]:
 # Size of the November DataFrame. Both figures are read from df2 so the
 # reported column count always describes the same DataFrame as the row count.
 rows = df2.count()
 columns = len(df2.columns)

 print("There are {} rows and {} columns in the dataset.".format(rows, columns))

### Repeat customers: October buyers who purchased again in November

In [0]:
from pyspark.sql.functions import col

# Distinct user_ids with at least one 'purchase' event, per input DataFrame.
is_purchase = col("event_type") == "purchase"
df1_u = df1.where(is_purchase).select("user_id").distinct()
df2_u = df2.where(is_purchase).select("user_id").distinct()

# The inner join keeps only user_ids that appear in both sets.
repeat_customers = df1_u.join(df2_u, on="user_id", how="inner")

print("There are {} repeat customers.".format(repeat_customers.count()))



In [0]:
## Concatenate DataFrames ROW-WISE
# unionByName lines columns up by name rather than by position.
df = df1.unionByName(df2)

rows, columns = df.count(), len(df.columns)

print("There are {} rows and {} columns in the dataset.".format(rows, columns))


In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Running event count per user in event_time order.
# The explicit ROWS frame matters: with only orderBy, Spark defaults to a
# RANGE frame, so events sharing the same event_time all receive the same
# (larger) count, and a user whose earliest events tie would have no row with
# cumulative_events == 1. With a ROWS frame exactly one row per user gets 1
# (which of several tied rows comes first is arbitrary).
window = (
    Window.partitionBy("user_id")
    .orderBy("event_time")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Keep the row whose running count is 1 for each user and preview five of them.
(
    df.withColumn("cumulative_events", F.count("*").over(window))
    .filter(F.col("cumulative_events") == 1)
    .show(5)
)

### Create a product category column from category_code

In [0]:
def get_final_category(category_code):
    """Return the part of ``category_code`` after its last '.'.

    A value containing no '.' is returned unchanged, and ``None`` is passed
    through as ``None``.
    """
    return None if category_code is None else category_code.rsplit(".", 1)[-1]


In [0]:
# Import every name this cell uses so it does not depend on names preloaded
# by the runtime or left behind by other cells.
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType

# Wrap the Python helper as a Spark UDF that returns a string column.
final_category_udf = udf(get_final_category, StringType())

# Derive the column on the combined DataFrame `df` (the unionByName result)
# rather than on df1; starting from df1 would silently drop df2's rows from
# everything computed from `df` afterwards.
df = df.withColumn(
    "product_category",
    final_category_udf(col("category_code")),
)


### Top 10 product categories by conversion rate

In [0]:
from pyspark.sql.functions import col, count, round
# NOTE(review): the import above rebinds Python's built-in `round` to the
# Spark column function for the rest of the notebook; the code below uses the
# `F.` prefix so it does not depend on that rebinding.
# Conversion rate by category
# One row per product_category with one column per event_type holding its count.
events_by_category = (
    df.groupBy("product_category", "event_type").count()
    .groupBy("product_category").pivot("event_type").sum("count")
)

# conversion_rate = purchases per 100 views, rounded to two decimals;
# show the ten highest.
(
    events_by_category
    .withColumn(
        "conversion_rate",
        F.round(F.col("purchase") / F.col("view") * 100, 2),
    )
    .orderBy(F.col("conversion_rate").desc())
    .show(10)
)
