In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.fpm import FPGrowth
from pyspark.sql.functions import array_contains

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
# Reuse the already-running session if there is one; otherwise start a new
# one registered under the application name "recsys".
spark = (
    SparkSession.builder
    .appName("recsys")
    .getOrCreate()
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# Location of the transaction matrix CSV files on S3.
bucket = "recsys-aws"
key_transaction_matrix_prefix = "silver_data/transaction_matrix/"

# Read every CSV under the prefix; the header row supplies the column names
# and Spark infers the column types.
transactions_df = spark.read.csv(
    f"s3://{bucket}/{key_transaction_matrix_prefix}",
    header=True,
    inferSchema=True,
)

# Turn each matrix row into a "basket": the list of column names whose cell
# equals 1 in that row.
# NOTE(review): every column is treated as a candidate item. If the matrix
# carries a non-item column (e.g. a row/user id) whose value can equal 1, its
# name would be emitted as an item -- confirm against the file layout.
transactions_rdd = transactions_df.rdd.map(
    lambda row: [
        column_name
        for column_name, value in row.asDict().items()
        if value == 1
    ]
)

# Declare the schema explicitly instead of passing only a column name.
# With a bare name, Spark infers the array element type from sampled rows, and
# rows that produced an empty list carry no element type to infer from, which
# can make the conversion fail. FPGrowth is later pointed at the "items" column.
transactions_list_df = transactions_rdd.map(lambda items: (items,)).toDF(
    "items array<string>"
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
# Mining thresholds handed to FPGrowth.
MIN_SUPPORT = 0.01
MIN_CONFIDENCE = 0.1

fp_growth = FPGrowth(
    itemsCol="items",
    minSupport=MIN_SUPPORT,
    minConfidence=MIN_CONFIDENCE,
)

# Fit on the basket DataFrame and preview the mined frequent itemsets
# (first 20 rows, long cells truncated).
model = fp_growth.fit(transactions_list_df)
frequent_itemsets = model.freqItemsets
frequent_itemsets.show(n=20, truncate=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+----+
|               items|freq|
+--------------------+----+
|[1a47c3ae-0788-44...|   1|
|[1a47c3ae-0788-44...|   1|
|[4884cea3-1b2a-4c...|   1|
|[a333ee96-16b4-44...|   1|
|[a333ee96-16b4-44...|   1|
|[c43a2d97-4ff4-49...|   1|
|[5966a084-dca0-4c...|   2|
|[5966a084-dca0-4c...|   1|
|[552bfdf5-f621-4f...|   1|
|[552bfdf5-f621-4f...|   1|
|[8ede340e-4918-46...|   1|
|[8ede340e-4918-46...|   1|
|[be315b90-6d33-48...|   2|
|[64dead02-32bd-44...|   1|
|[508d1adf-7724-44...|   2|
|[a745943c-7a7b-4d...|   1|
|[b17562a6-0fbd-47...|   1|
|[f5861e5d-db24-4b...|   2|
|[f5861e5d-db24-4b...|   1|
|[6fd47814-93f5-47...|   1|
+--------------------+----+
only showing top 20 rows

In [6]:
# Rules derived by the fitted model; preview the first 20 rows with long
# cells truncated.
association_rules = model.associationRules
association_rules.show(n=20, truncate=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+--------------------+------------------+------------------+-------+
|          antecedent|          consequent|        confidence|              lift|support|
+--------------------+--------------------+------------------+------------------+-------+
|[e1a31fff-bb12-4c...|[a16f04da-911c-41...|               0.5|              50.0|   0.01|
|[e1a31fff-bb12-4c...|[61a75039-9bb8-4b...|               0.5|              50.0|   0.01|
|[d5f4d7ef-ac83-48...|[12f62edf-6466-49...|               1.0|              50.0|   0.01|
|[6fd47814-93f5-47...|[6c823347-b87a-42...|               1.0|              50.0|   0.01|
|[2fe641d4-3a0a-48...|[8ede340e-4918-46...|               1.0|             100.0|   0.01|
|[3e7b8023-e74d-47...|[5773d5ba-f4ed-48...|               1.0|             100.0|   0.01|
|[6eebd0f0-a308-45...|[08dd6cc5-bcfe-4d...|               0.5|              50.0|   0.01|
|[5687c65d-18e5-43...|[5304773f-c233-43...|               0.5|16.666666666666668|   0.01|
|[5687c65d

In [7]:
def recommend_top10_ads(adId, rules=None):
    """Return up to 10 distinct consequents recommended for ``adId``.

    A rule is considered when its ``antecedent`` array contains ``adId``.
    Several such rules can share the same consequent, so the rules are first
    collapsed to one row per consequent (keeping the highest confidence and
    lift seen for it). Without that step the top 10 could contain the same
    consequent more than once.

    Parameters
    ----------
    adId : str
        Identifier to look up inside the rules' ``antecedent`` arrays.
    rules : DataFrame, optional
        Rules DataFrame with ``antecedent``, ``consequent``, ``confidence``
        and ``lift`` columns. Defaults to the notebook-level
        ``association_rules``.

    Returns
    -------
    DataFrame
        A single ``consequent`` column with at most 10 rows, ordered by best
        confidence (descending), then best lift (descending), then the
        consequent itself (ascending) so that ties resolve the same way on
        every run.
    """
    if rules is None:
        # Fall back to the rules mined earlier in the notebook.
        rules = association_rules

    matching_rules = rules.filter(array_contains(rules.antecedent, adId))

    # One row per consequent. The dict form of agg() names its outputs
    # "max(<column>)", so rename them to plain identifiers before sorting.
    best_per_consequent = (
        matching_rules
        .groupBy("consequent")
        .agg({"confidence": "max", "lift": "max"})
        .withColumnRenamed("max(confidence)", "best_confidence")
        .withColumnRenamed("max(lift)", "best_lift")
    )

    top_consequents = best_per_consequent.orderBy(
        ["best_confidence", "best_lift", "consequent"],
        ascending=[False, False, True],
    ).limit(10)

    return top_consequents.select("consequent")

# Example lookup: fetch the recommendations for one identifier and print them
# (first 20 rows, long cells truncated).
adId = "f48ed980-d1ed-4c41-b676-c951dccc3e50"
top_10_recommendations = recommend_top10_ads(adId=adId)
top_10_recommendations.show(n=20, truncate=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+
|          consequent|
+--------------------+
|[552bfdf5-f621-4f...|
|[286c2cf4-aeda-42...|
+--------------------+