In [1]:
# findspark locates the Spark installation so that the pyspark import below resolves.
# NOTE(review): later cells call col, when, lit, expr, log10, to_timestamp and the
# Spark versions of max/sum, but no `pyspark.sql.functions` import is visible in
# this part of the notebook -- confirm it exists so Restart & Run All works.
import findspark
findspark.init()
# The return value of find() is discarded here.
findspark.find()
from pyspark.sql import SparkSession

In [2]:
# Initialize Spark session.
# Note: `sc` holds a SparkSession (not a SparkContext); later cells use it
# for reading data and creating DataFrames.
sc = (
    SparkSession.builder
    .appName("Product_Recommendation")
    .getOrCreate()
)

# Keep the notebook output readable: only log errors.
sc.sparkContext.setLogLevel("ERROR")

In [3]:
# Load the November 2019 event log; the first CSV row holds the column names.
# No schema is given, so every column is read as a string.
df = (
    sc.read
    .option('header', True)
    .csv('/kaggle/input/ecommerce-behavior-data-from-multi-category-store/2019-Nov.csv')
)

# Uncomment to iterate quickly on a 0.1% sample of the events.
#df = df.sample(0.001, seed=321)

In [4]:
# Inspect the column names and types of the raw events.
df.printSchema()

root

 |-- event_time: string (nullable = true)

 |-- event_type: string (nullable = true)

 |-- product_id: string (nullable = true)

 |-- category_id: string (nullable = true)

 |-- category_code: string (nullable = true)

 |-- brand: string (nullable = true)

 |-- price: string (nullable = true)

 |-- user_id: string (nullable = true)

 |-- user_session: string (nullable = true)




In [5]:
# Preview the first 5 rows without truncating long values.
df.show(5, truncate=False)

+-----------------------+----------+----------+-------------------+-------------------------+------+------+---------+------------------------------------+

|event_time             |event_type|product_id|category_id        |category_code            |brand |price |user_id  |user_session                        |

+-----------------------+----------+----------+-------------------+-------------------------+------+------+---------+------------------------------------+

|2019-11-01 00:00:00 UTC|view      |1003461   |2053013555631882655|electronics.smartphone   |xiaomi|489.07|520088904|4d3b30da-a5e4-49df-b1a8-ba5943f1dd33|

|2019-11-01 00:00:00 UTC|view      |5000088   |2053013566100866035|appliances.sewing_machine|janome|293.65|530496790|8e5f4f83-366c-4f70-860e-ca7417414283|

|2019-11-01 00:00:01 UTC|view      |17302664  |2053013553853497655|null                     |creed |28.31 |561587266|755422e7-9040-477b-9bd2-6a6e8fd97387|

|2019-11-01 00:00:01 UTC|view      |3601530   |20530135638107759

## Preprocess

In [36]:
def preprocess(df):
    """Cast the raw event columns to proper types and de-duplicate cart events.

    Args:
        df: Spark DataFrame of raw events with at least the columns
            ``event_time``, ``event_type``, ``user_id``, ``product_id``,
            ``category_id`` and ``user_session``.

    Returns:
        A DataFrame with the same columns where ``event_time`` is a timestamp,
        ``user_id`` / ``product_id`` are integers, ``category_id`` is a long,
        and at most one ``cart`` event is kept per
        (product_id, user_id, user_session) combination.

    Note:
        Rows whose ``event_type`` is null match neither filter below and are
        therefore not part of the result.
    """
    # Change data types
    typed = (
        df.withColumn('event_time', to_timestamp('event_time'))
          .withColumn('user_id', col('user_id').cast('integer'))
          .withColumn('product_id', col('product_id').cast('integer'))
          .withColumn('category_id', col('category_id').cast('long'))
    )

    # Limit the number of carts to 1 per session for each user-product pair
    other_events = typed.filter(col('event_type') != 'cart')
    unique_carts = (
        typed.filter(col('event_type') == 'cart')
             .dropDuplicates(subset=['product_id', 'user_id', 'user_session'])
    )

    # Both halves share one schema, so a positional union is safe here
    return other_events.union(unique_carts)

In [38]:
# Replace the raw string-typed events with the typed, cart-de-duplicated version.
df = preprocess(df)

In [None]:
# Timestamp of the most recent event in the df (collected to the driver)
last_date = df.agg(max('event_time')).collect()[0][0]

# Age of each event relative to that most recent event, in seconds;
# casting a timestamp to double gives seconds since the epoch
seconds_before_last = lit(last_date).cast('double') - col('event_time').cast('double')

df = (
    df
    # Recency of each event in days (86400 seconds per day)
    .withColumn('recency', seconds_before_last / 86400)
    # Half-life decay function, the value of an event is halved after 20 days
    .withColumn('recency_coef', expr('exp(ln(0.5)*recency/20)'))
)

In [22]:
# Recency-weighted totals of views, carts and purchases for each user-product pair
event_columns = [('view', 'views'), ('cart', 'carts'), ('purchase', 'purchases')]

# when() without otherwise() is null for non-matching events, so each sum only
# accumulates the decay coefficients of its own event type
decayed_totals = [
    sum(when(df['event_type'] == event_type, 1) * df['recency_coef']).alias(column_name)
    for event_type, column_name in event_columns
]

# A pair with no events of a given type yields null for that column; report it as 0
interactions = df.groupby(['user_id', 'product_id']).agg(*decayed_totals).na.fill(0)

<font size=3>The interaction matrix is built by assigning a weight to each type of user interaction (by default 0.1 for views, 0.3 for carts and 1.0 for purchases) and summing the weighted values into a single interaction score per user-product pair. Because the view, cart and purchase totals have already been passed through a half-life decay function, recent interactions carry more weight. Finally, the score is transformed with log10(1 + score) and capped at 2 (a raw score of roughly 100) to keep the values on a consistent scale and reduce the effect of outliers. The resulting matrix provides a measure of the strength of the relationship between users and products.</font>

In [24]:
def calculate_interaction_matrix(df, view_weight=0.1, cart_weight=0.3, purchase_weight=1.0):
    """Add an ``interaction`` score column to a per-pair interaction table.

    Args:
        df: Spark DataFrame with numeric ``views``, ``carts`` and ``purchases``
            columns.
        view_weight: Multiplier applied to ``views``.
        cart_weight: Multiplier applied to ``carts``.
        purchase_weight: Multiplier applied to ``purchases``.

    Returns:
        ``df`` with an ``interaction`` column equal to
        ``log10(1 + weighted sum)``, capped at 2.
    """
    # Weighted combination of the three interaction types
    weighted_sum = (
        view_weight * col('views')
        + cart_weight * col('carts')
        + purchase_weight * col('purchases')
    )

    # Dampen large totals; the +1 keeps a zero total at a score of zero
    log_score = log10(weighted_sum + 1)

    # Cap the score at 2, i.e. a weighted total of about 100 (log10(100) = 2)
    capped_score = when(log_score > 2, 2).otherwise(log_score)

    return df.withColumn('interaction', capped_score)

In [25]:
# Score every user-product pair using the default view/cart/purchase weights.
interaction_matrix = calculate_interaction_matrix(interactions)

In [None]:
# User-product pairs with the highest interaction scores (show() prints the top 20 rows)
interaction_matrix.orderBy('interaction', ascending=False).show()

<font size=3>After creating an interaction matrix that quantifies user-product interactions using weights, we can feed this matrix into the ALS (Alternating Least Squares) algorithm. ALS uses the matrix to learn latent factors for both users and products, which can then be used to make personalized recommendations for users.</font>
    
<font size=3>To optimize the performance of the ALS algorithm, we use cross-validation over a grid of hyperparameters (rank, regParam and alpha), which allows us to test different sets of parameters and find the best configuration for our recommendation system. Ultimately, the goal is to create a model that can make accurate and relevant product recommendations to users based on their previous interactions with the system.</font>

In [27]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline

def cross_validate_als(interaction_matrix, seed=None):
    """Grid-search implicit-feedback ALS hyperparameters with 5-fold cross-validation.

    Args:
        interaction_matrix: Spark DataFrame with ``user_id``, ``product_id``
            and ``interaction`` columns.
        seed: Optional integer seed applied to both the ALS estimator and the
            cross-validator's fold split so a run can be reproduced. ``None``
            (the default) leaves Spark's default seeding untouched.

    Returns:
        A fitted ``PipelineModel`` whose single stage is the fitted
        cross-validator; the winning ALS model is ``model.stages[0].bestModel``.
    """
    # Define the ALS model
    als = ALS(userCol='user_id', itemCol='product_id', ratingCol='interaction',
              nonnegative=True, coldStartStrategy='drop', implicitPrefs=True)

    # Define the parameter grid for hyperparameter tuning (4 x 4 x 3 = 48 candidates).
    # The chain is wrapped in parentheses instead of backslash continuations:
    # a backslash followed by trailing whitespace is a SyntaxError.
    # NOTE(review): alpha=0 gives observed interactions no extra confidence in
    # the implicit-feedback formulation -- confirm it belongs in the grid.
    param_grid = (
        ParamGridBuilder()
        .addGrid(als.rank, [5, 10, 15, 20])
        .addGrid(als.regParam, [0.005, 0.01, 0.05, 0.1])
        .addGrid(als.alpha, [0, 1.0, 5.0])
        .build()
    )

    # Define the evaluator for computing the evaluation metrics.
    # NOTE(review): with implicitPrefs=True the predictions are preference
    # scores, so RMSE against `interaction` may favour near-zero predictions
    # -- consider a ranking metric.
    evaluator = RegressionEvaluator(metricName='rmse', labelCol='interaction', predictionCol='prediction')

    # Define the cross-validator for performing 5-fold cross-validation
    cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=5, collectSubModels=False)

    # Make both the factor initialisation and the fold assignment reproducible
    if seed is not None:
        als.setSeed(seed)
        cv.setSeed(seed)

    # The cross-validator is wrapped in a one-stage pipeline because callers
    # read the result through model.stages[0]
    pipeline = Pipeline(stages=[cv])

    # Fit the pipeline on the data and evaluate the model
    model = pipeline.fit(interaction_matrix)

    return model

In [29]:
# Run the grid search; the pipeline's only stage is the fitted cross-validator
model = cross_validate_als(interaction_matrix)

# Model with the lowest RMSE value
bestModel = model.stages[0].bestModel

# Make predictions on the same data the search was run on,
# so the metrics printed below are in-sample
predictions = bestModel.transform(interaction_matrix)

# Calculate RMSE and MAE metrics (one evaluator, switched between the two metrics)
evaluator = RegressionEvaluator(metricName='rmse', labelCol='interaction', predictionCol='prediction')
rmse = evaluator.setMetricName('rmse').evaluate(predictions)
mae = evaluator.setMetricName('mae').evaluate(predictions)
print(' rmse:{} mae:{}'.format(rmse, mae))

# Print the parameters of the model with the lowest RMSE value.
# They are read from the JVM-side estimator that produced the model.
best_estimator = bestModel._java_obj.parent()
regParam = best_estimator.getRegParam()
rank = best_estimator.getRank()
alpha = best_estimator.getAlpha()
print('rank:{} regParam:{} alpha:{}'.format(rank, regParam, alpha))



 rmse:0.014108922052560617 mae:0.00670138082548715

rank:15 regParam:0.005 alpha:0.0



                                                                                

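<font size=3>After identifying the best set of parameters, we can build an ALS model directly, instead of running the cross-validation function repeatedly. This approach saves time and resources, making it more efficient to generate product recommendations. Note that the model below keeps the rank and regParam selected by the search but uses alpha=1 rather than the selected alpha=0.</font>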

In [26]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

def simple_als(interaction_matrix, rank=15, reg_param=0.005, alpha=1, seed=None):
    """Train one implicit-feedback ALS model and report hold-out error.

    The data is split 80/20 into train and test sets; the model is fitted on
    the train part only and RMSE / MAE are printed for the test part.

    Args:
        interaction_matrix: Spark DataFrame with ``user_id``, ``product_id``
            and ``interaction`` columns.
        rank: Number of latent factors.
        reg_param: ALS regularisation parameter (``regParam``).
        alpha: Confidence scaling for implicit feedback.
        seed: Optional integer seed for the train/test split and the ALS
            initialisation. ``None`` (the default) keeps the unseeded
            behaviour, so results vary between runs.

    Returns:
        The fitted ``ALSModel`` (trained on the 80% train split).
    """
    # Train-test split; a seed makes the split, and so the metrics, repeatable
    (train, test) = interaction_matrix.randomSplit([0.8, 0.2], seed=seed)

    # Initialize the model with the chosen hyperparameters
    als = ALS(userCol='user_id', itemCol='product_id', ratingCol='interaction',
              alpha=alpha, regParam=reg_param, rank=rank, implicitPrefs=True,
              nonnegative=True, coldStartStrategy='drop')
    if seed is not None:
        als.setSeed(seed)

    # Fit the ALS model on the ratings data
    model = als.fit(train)

    # Make predictions; coldStartStrategy='drop' removes test rows whose user
    # or product was not seen during training
    predictions = model.transform(test)

    # Calculate the RMSE and MAE metrics
    evaluator = RegressionEvaluator(metricName='rmse', labelCol='interaction', predictionCol='prediction')
    rmse = evaluator.evaluate(predictions)
    mae = evaluator.setMetricName('mae').evaluate(predictions)
    print('test rmse:' + str(rmse) + ' mae:' + str(mae))

    return model

In [27]:
# Train the final ALS model; simple_als also prints its test RMSE and MAE.
als_model = simple_als(interaction_matrix)



test rmse:0.09362263416331866 mae:0.05455085317688715



                                                                                

## Recommend Products for Users

In [28]:
# 3 random users
user_subset = [565606905, 570112140, 564068124]

# Wrap the user ids in a DataFrame; every row gets product_id 0 as a constant second column
user_rows = [(user, 0) for user in user_subset]
user_subset_df = sc.createDataFrame(user_rows, ['user_id', 'product_id'])

# Recommend top 500 products for the users
recommendations = als_model.recommendForUserSubset(user_subset_df, 500)

In [29]:
# One row per user; `recommendations` holds that user's recommended products with scores.
recommendations.show()



+---------+--------------------+

|  user_id|     recommendations|

+---------+--------------------+

|570112140|[{1004833, 0.8508...|

|564068124|[{1004870, 1.1288...|

|565606905|[{1004767, 0.9786...|

+---------+--------------------+





                                                                                

In [30]:
# Select user 564068124 by id rather than by row position:
# the order of rows returned by collect() is not guaranteed.
target_user_id = 564068124
target_row = recommendations.filter(recommendations['user_id'] == target_user_id).first()
if target_row is None:
    raise ValueError('No recommendations found for user ' + str(target_user_id))

# Expand the user's array of recommendations into its own DataFrame
recs_for_user_1 = sc.createDataFrame(target_row['recommendations'])

                                                                                

In [21]:
# The user's products ordered by recency-weighted purchases, highest first
(
    interactions
    .where(col('user_id') == 564068124)
    .orderBy('purchases', ascending=False)
    .show()
)



+---------+----------+------------------+-------------------+------------------+

|  user_id|product_id|             views|              carts|         purchases|

+---------+----------+------------------+-------------------+------------------+

|564068124|   1004833|139.54991783915662|  33.93168190423818| 99.65625067729746|

|564068124|   1004767| 99.10566057156984|  21.35346968009205| 68.77746116880749|

|564068124|   1004856|22.286414097085476|  5.485494261870614|12.006362990183272|

|564068124|   1004870|12.360455924522274| 2.8240715924850974|  8.56032828206366|

|564068124|   1004873|10.745808997156987| 2.9390547109247525|  8.01436540346296|

|564068124|   1005115| 9.649142656908744| 1.1602375014526323| 6.924999866007589|

|564068124|   1003525| 5.740135679629443|                0.0| 4.576104202695865|

|564068124|   1004875| 5.359923757611487|   0.89632110532636| 4.463581838693649|

|564068124|   1307310| 3.429727471133744|  1.795425924579977|2.6126704924519952|

|564068124|   10


                                                                                

In [3]:
# Shut down the Spark session; cells above cannot be re-run until a new session is created.
sc.stop()