In [2]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import HashingTF, IDF, StringIndexer, Tokenizer
from pyspark.ml.fpm import FPGrowth
from pyspark.ml.linalg import Vectors
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    array,
    array_contains,
    col,
    desc,
    explode,
    greatest,
    lit,
    max as max_fn,
    min as min_fn,
    row_number,
    struct,
    udf,
)
from pyspark.sql.types import DoubleType, StringType
from pyspark.sql.window import Window

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
# Obtain the Spark session used by every cell below; getOrCreate() reuses an
# already-running session instead of starting a second one.
spark = SparkSession.builder.appName("recsys").getOrCreate()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Content-Based Filtering

In [4]:
# S3 locations of the two silver-layer CSV datasets read by this notebook.
bucket = "recsys-aws"
key_ads_content_prefix = "silver_data/ads_content_transformed/"
key_user_ad_matrix_prefix = "silver_data/user_ad_matrix/"

# Ad text: multiline parsing with '"' as both quote and escape character.
ads_content_reader = spark.read.options(
    multiline="true",
    header=True,
    inferSchema=True,
    quote='"',
    escape='"',
)
ads_content_df = ads_content_reader.csv(f"s3://{bucket}/{key_ads_content_prefix}")

# User x ad interaction matrix, read with a header row and inferred types.
user_ad_reader = spark.read.options(header=True, inferSchema=True)
user_ad_interactions_df = user_ad_reader.csv(f"s3://{bucket}/{key_user_ad_matrix_prefix}")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
# Build TF-IDF features for every ad from its "content_combined" text.
tokenizer = Tokenizer(inputCol="content_combined", outputCol="words")
# Two guards before tokenizing:
#   * drop("words") keeps this cell re-runnable. ads_content_df is overwritten
#     with the tokenized frame, so a second run would otherwise fail because the
#     output column already exists (drop is a no-op on the first run).
#   * Tokenizer fails on null input, so missing text is treated as empty.
ads_content_df = tokenizer.transform(
    ads_content_df.drop("words").na.fill({"content_combined": ""})
)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
tf = hashingTF.transform(ads_content_df)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(tf)
tfidf_matrix = idfModel.transform(tf)
# Cached because the recommendation functions below scan it several times.
tfidf_matrix.cache()

def get_user_profile(user_id, interactions_df, tfidf_matrix):
    """Build a user's content profile as the mean TF-IDF vector of their ads.

    Args:
        user_id: Value matched against the ``userId`` column of ``interactions_df``.
        interactions_df: Wide DataFrame with a ``userId`` column; every other
            column is treated as an ad id whose cell holds the interaction score.
        tfidf_matrix: DataFrame with ``adId`` and ``features`` (vector) columns.

    Returns:
        A dense vector holding the mean of the ``features`` vectors of all ads
        the user has a positive score for, or ``None`` when the user is not in
        ``interactions_df``, has no positive interaction, or none of the
        interacted ads appear in ``tfidf_matrix``.
    """
    user_rows = interactions_df.filter(col("userId") == user_id).limit(1).collect()
    if not user_rows:
        # Unknown user: the caller treats None as "no profile available".
        return None

    user_interactions = user_rows[0].asDict()
    del user_interactions["userId"]
    interacted_ads = [
        ad for ad, score in user_interactions.items()
        if score is not None and float(score) > 0
    ]
    if not interacted_ads:
        return None

    feature_rows = (tfidf_matrix
                    .filter(col("adId").isin(interacted_ads))
                    .select("features")
                    .collect())
    if not feature_rows:
        return None

    # Sum as plain arrays; toArray() works for both sparse and dense vectors.
    feature_sum = None
    for row in feature_rows:
        dense_features = row.features.toArray()
        feature_sum = dense_features if feature_sum is None else feature_sum + dense_features

    # Divide by the number of vectors actually summed so the result is a true
    # mean even when some interacted ads have no entry in tfidf_matrix.
    return Vectors.dense(feature_sum / len(feature_rows))

def content_based_recommendation(user_id, user_interactions_df, tfidf_matrix, top_n=10):
    """Rank ads by similarity between their TF-IDF vector and the user's profile.

    The similarity is the dot product of each ad's ``features`` vector with the
    profile returned by ``get_user_profile``, min-max normalized over all ads.

    Args:
        user_id: User to recommend for.
        user_interactions_df: Interaction matrix passed to ``get_user_profile``.
        tfidf_matrix: DataFrame with ``adId`` and ``features`` columns.
        top_n: Number of ads to return.

    Returns:
        DataFrame with columns ``adId`` and ``normalized_similarity`` holding the
        ``top_n`` highest-scoring ads, or ``None`` when no user profile exists.
    """
    user_profile = get_user_profile(user_id, user_interactions_df, tfidf_matrix)
    if user_profile is None:
        return None

    dot_product = udf(lambda features: float(features.dot(user_profile)), DoubleType())
    similarity_scores = tfidf_matrix.withColumn("similarity_score", dot_product(col("features")))

    # One aggregation for both bounds: each extra action would re-run the
    # Python dot-product UDF over every ad.
    score_bounds = similarity_scores.agg(
        min_fn("similarity_score").alias("min_score"),
        max_fn("similarity_score").alias("max_score"),
    ).collect()[0]
    min_score = score_bounds["min_score"]
    max_score = score_bounds["max_score"]

    # Min-max normalization as a native column expression (no second Python UDF).
    # When all scores are equal (or there are no ads) every ad gets 0.5.
    if max_score == min_score:
        normalized_similarity = lit(0.5)
    else:
        normalized_similarity = (col("similarity_score") - lit(min_score)) / lit(max_score - min_score)

    similarity_scores = similarity_scores.withColumn("normalized_similarity", normalized_similarity)
    top_ads = (similarity_scores
               .orderBy(desc("normalized_similarity"))
               .limit(top_n)
               .select("adId", "normalized_similarity"))
    return top_ads

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
# Content-based recommendations for one example user.
user_id_to_recommend = "4f3aecdc-f7d8-4718-925c-96d81c3765f3"
recommended_ads_df = content_based_recommendation(
    user_id=user_id_to_recommend,
    user_interactions_df=user_ad_interactions_df,
    tfidf_matrix=tfidf_matrix,
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Collaborative Filtering

In [7]:
# Read the user x ad interaction matrix for the collaborative-filtering section.
bucket = "recsys-aws"
key_user_ad_matrix_prefix = "silver_data/user_ad_matrix/"
user_ad_interactions_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(f"s3://{bucket}/{key_user_ad_matrix_prefix}")
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [8]:
# Every column except the user key is treated as one ad.
cols = [name for name in user_ad_interactions_df.columns if name != "userId"]

# Unpivot the wide matrix: one (rating, adId) struct per ad column, exploded
# into one row per (userId, adId) pair.
rating_structs = [struct(col(c).alias("rating"), lit(c).alias("adId")) for c in cols]
long_format = user_ad_interactions_df.withColumn("adId_rating", explode(array(rating_structs)))
long_format = long_format.select("userId", "adId_rating.adId", "adId_rating.rating")

# ALS needs numeric ids, so index the string user and ad identifiers.
user_indexer = StringIndexer(inputCol="userId", outputCol="userId_indexed")
ad_indexer = StringIndexer(inputCol="adId", outputCol="adId_indexed")
user_model = user_indexer.fit(long_format)
adId_model = ad_indexer.fit(long_format)
long_format = adId_model.transform(user_model.transform(long_format))

# Fixed seed for both the split and the ALS factor initialisation; without it
# each run yields a different model and different recommendations.
SEED = 42

(training, test) = long_format.randomSplit([0.8, 0.2], seed=SEED)

als = ALS(
    maxIter=10,
    regParam=0.01,
    userCol="userId_indexed",
    itemCol="adId_indexed",
    ratingCol="rating",
    # Drop predictions for users/items unseen during training instead of
    # emitting NaN.
    coldStartStrategy="drop",
    seed=SEED,
)

model = als.fit(training)

user_id_to_recommend = "4f3aecdc-f7d8-4718-925c-96d81c3765f3"
n_recommendations = 10

# Map the raw user id to the index ALS was trained on, then ask for its top ads.
user_indexed = user_model.transform(spark.createDataFrame([(user_id_to_recommend,)], ["userId"]))
recs = model.recommendForUserSubset(user_indexed, n_recommendations)

# Labels of the ad indexer: position i holds the original adId for index i.
ad_id_labels = adId_model.labels


def index_to_id(index):
    """Return the original adId string for an indexed ad id."""
    return ad_id_labels[int(index)]


# Kept as a helper for ad-hoc lookups; the pipeline below maps ids with a join.
index_to_id_udf = udf(index_to_id, StringType())

# Explode the recommendations array exactly once: one row per recommended ad.
# (Exploding it twice yields n x n rows that then have to be de-duplicated.)
exploded_recommendations = recs.select("userId_indexed", explode("recommendations").alias("recommendation"))
exploded_adId_indexed = exploded_recommendations.select(
    "userId_indexed",
    "recommendation.adId_indexed",
    "recommendation.rating",
)

# Translate indexed ad ids back to the original adId strings.
labels_df = spark.createDataFrame([(i, label) for i, label in enumerate(ad_id_labels)], ["index", "label"])
final_recommended_adIds = exploded_adId_indexed.join(labels_df, exploded_adId_indexed.adId_indexed == labels_df.index)

# Keep the predicted rating so the ALS ranking is not lost: row order alone is
# not preserved through the join.
final_result = (final_recommended_adIds
                .select("userId_indexed", col("label").alias("recommended_adId"), "rating")
                .orderBy(desc("rating")))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Market Basket Analysis

In [9]:
# Load the transaction matrix and turn each row into the list of column names
# whose value equals 1 (the basket of that transaction).
bucket = "recsys-aws"
key_transaction_matrix_prefix = "silver_data/transaction_matrix/"
transactions_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(f"s3://{bucket}/{key_transaction_matrix_prefix}")
)
transactions_rdd = transactions_df.rdd.map(
    lambda record: [name for name, flag in record.asDict().items() if flag == 1]
)
# FPGrowth expects a DataFrame with an array column, so wrap each basket in a
# one-element tuple before converting.
transactions_list_df = transactions_rdd.map(lambda basket: (basket, )).toDF(["items"])

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [10]:
# Mine frequent itemsets and association rules from the baskets.
fp_growth = FPGrowth(
    itemsCol="items",
    minSupport=0.01,
    minConfidence=0.1,
)
model = fp_growth.fit(transactions_list_df)

# Both are DataFrames exposed by the fitted model; use .show() to inspect them.
frequent_itemsets, association_rules = model.freqItemsets, model.associationRules

def recommend_top10_ads(adId):
    """Return the consequents of the 10 most confident rules involving ``adId``.

    Uses the notebook-level ``association_rules`` DataFrame and keeps only rules
    whose antecedent contains ``adId``.

    Returns:
        DataFrame with a single ``consequent`` column (at most 10 rows).
    """
    matching_rules = association_rules.filter(
        array_contains(association_rules["antecedent"], adId)
    )
    most_confident = matching_rules.orderBy(desc("confidence")).limit(10)
    return most_confident.select("consequent")

# Example ad used to look up market-basket recommendations.
adId = "f48ed980-d1ed-4c41-b676-c951dccc3e50"
top_10_recommendations = recommend_top10_ads(adId=adId)
# Inspect with: top_10_recommendations.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Hybrid Recommendation

In [11]:
def hybrid_recommendations(content_df, collaborative_df, mba_df, content_weight=0.5, collaborative_weight=0.3, mba_weight=0.2):
    """Blend content-based, collaborative and market-basket recommendations.

    Each source contributes a per-ad score; scores for the same ad are summed.

    Args:
        content_df: DataFrame with ``adId`` and ``normalized_similarity``;
            contributes ``normalized_similarity * content_weight``.
        collaborative_df: DataFrame with ``recommended_adId`` and, optionally,
            ``rating``. Rank r contributes
            ``max(0, 1.1 - 0.1 * r) * collaborative_weight``.
        mba_df: DataFrame with an array column ``consequent``; its first element
            contributes ``mba_weight``.
        content_weight: Weight of the content-based score.
        collaborative_weight: Weight of the collaborative rank score.
        mba_weight: Flat score given to every market-basket recommendation.

    Any of the three DataFrames may be ``None`` (e.g. no profile could be built
    for the user); that source is then skipped.

    Returns:
        DataFrame with columns ``adId`` and ``final_score``, sorted by
        ``final_score`` descending.

    Raises:
        ValueError: If all three inputs are ``None``.
    """
    scored_parts = []

    if content_df is not None:
        scored_parts.append(
            content_df.select(
                col("adId"),
                (col("normalized_similarity") * lit(content_weight)).alias("score"),
            )
        )

    if collaborative_df is not None:
        # Rank by predicted rating when available. A constant ordering key
        # numbers rows in arbitrary order, so it is only a fallback for inputs
        # that carry no rating column.
        if "rating" in collaborative_df.columns:
            rank_window = Window.orderBy(desc("rating"))
        else:
            rank_window = Window.orderBy(lit(1))
        # Linear decay 1.0, 0.9, ... by rank; clamped so that ranks beyond 11
        # contribute nothing instead of a negative score.
        rank_score = greatest(lit(0.0), lit(1.1) - col("rank") * 0.1) * collaborative_weight
        scored_parts.append(
            collaborative_df
            .withColumn("rank", row_number().over(rank_window))
            .select(col("recommended_adId").alias("adId"), rank_score.alias("score"))
        )

    if mba_df is not None:
        # NOTE(review): an ad that is the consequent of several rules receives
        # mba_weight once per rule - confirm this accumulation is intended.
        scored_parts.append(
            mba_df
            .withColumn("mba_score", lit(mba_weight))
            .selectExpr("consequent[0] as adId", "mba_score as score")
        )

    if not scored_parts:
        raise ValueError("hybrid_recommendations needs at least one non-None input DataFrame")

    # union() is positional; every part is projected to (adId, score) above.
    hybrid_df = scored_parts[0]
    for part in scored_parts[1:]:
        hybrid_df = hybrid_df.union(part)

    hybrid_df = (hybrid_df.groupBy("adId")
                 .agg({"score": "sum"})
                 .withColumnRenamed("sum(score)", "final_score"))
    return hybrid_df.orderBy("final_score", ascending=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [12]:
# Combine the three recommenders with the default weights and display the result.
final_hybrid_recommendations = hybrid_recommendations(
    content_df=recommended_ads_df,
    collaborative_df=final_result,
    mba_df=top_10_recommendations,
)
final_hybrid_recommendations.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+--------------------+
|                adId|         final_score|
+--------------------+--------------------+
|bc4d3e39-7e06-4c3...|  0.5958478797705753|
|a4596d16-e59d-40c...|                 0.5|
|023d7aba-d5df-459...| 0.46063288566578653|
|552bfdf5-f621-4ff...|                0.44|
|d83f913c-87e3-48c...|                 0.3|
|1fc15d7d-cd72-41a...|                0.27|
|1a47c3ae-0788-442...| 0.21000000000000002|
|286c2cf4-aeda-42c...|                 0.2|
|0fd2dc23-5c71-464...| 0.18000000000000002|
|191a9db3-cce6-4bb...|                0.12|
|fc78a2e5-8846-46d...| 0.09000000000000001|
|d7f30c52-c3c5-410...| 0.07781005122490021|
|e123eaef-e554-46a...| 0.07090979722530041|
|60e6c683-e7da-4c5...| 0.06615404225423652|
|1601cd49-e527-4df...| 0.06584521596863807|
|f8931240-4505-4d3...| 0.06363419799324793|
|9b8b83b2-36da-4ce...|  0.0634219260068277|
|c26bd44f-40fa-4f5...| 0.06000000000000002|
|9f15ae6f-2c64-4d9...| 0.05861617166906294|
|87e843ce-a174-446...|0.03000000