## Get a user's rating (time spent on website) on a certain item

In [3]:
from pyspark.sql import SparkSession, functions as F, Window, Row
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import avg

import pandas as pd
import math

### define helper functions

In [4]:
# Initialize Spark session
def init_spark():
    return SparkSession \
        .builder \
        .appName("GA360RECOMMENDER") \
        .getOrCreate()

In [5]:
# finds the average of an item row
def calculate_avg_item(row):
    # Extract values from the row
    item_id = row.itemID
    item_ratings = [row[key] for key in row.__fields__ if key != "itemID" and row[key] is not None]

    total = 0
    if len(item_ratings) > 0:
        for r in item_ratings:
            total = total + float(r)
        avg = total/len(item_ratings)
    else:
        avg = None
        
    # Create a new Row with itemId and userAverage
    return (item_id, avg)

In [6]:
def total_item_average(item_averages):

    values_rdd = item_averages.values()
    
    total_sum = values_rdd.sum()
    count = values_rdd.count()
    average = total_sum / count

    return average

In [7]:
# get user rating averages
def calculate_avg_user(item_df):
    averages = item_df.agg(*(avg(col).alias(col) for col in item_df.columns)).rdd
    return averages

Pearson similarity:
![image.png](attachment:53b7e242-7f99-4cfa-81c8-ba9d37f19dc0.png)![image.png](attachment:aba15e63-4625-43cc-99d7-827ead833244.png)

In [24]:
# finds pearson similarity between the target item and a given item
def pearson_correlation(row):
    global item_row
    rowx = item_row
    rowy = row

    sumProdDiffXY = 0
    sumDiffXSquared = 0
    sumDiffYSquared = 0
    
    global average_ratings_items_dict
    global item_average
    item_x_average = item_average
    item_y_average = average_ratings_items_dict[str(rowy['itemID'])]
    
    # go through each user rating
    for key in rowx.__fields__:
        # only include the user rating if they rated both items
        if rowx[key] is not None and rowy[key] is not None and key != 'itemID':
            sumProdDiffXY += (float(rowx[key]) - float(item_x_average)) * (float(rowy[key]) - float(item_y_average))
            sumDiffXSquared += (float(rowx[key]) - float(item_x_average)) ** 2
            sumDiffYSquared += (float(rowy[key]) - float(item_y_average)) ** 2

    denominator = math.sqrt(sumDiffXSquared) * math.sqrt(sumDiffYSquared)
    if denominator != 0 and str(rowx['itemID']) != str(rowy['itemID']):
        similarity = sumProdDiffXY / denominator
    else:
        similarity = None

    return similarity

rating estimation:
![image.png](attachment:d0123a18-701b-4cbc-8e31-d830a176aa28.png)

In [25]:
# calculate the rating of an item given its similar items
def calculate_rating(userID, itemID, similar_items, item_rdd, user_averages, item_averages, overall_average):

    bx = user_averages.collect()[0][userID] - overall_average
    bi = item_averages.lookup(itemID)[0] - overall_average
    bxi = overall_average + bx + bi
    
    sumSR = 0   # (similarity of items i and j) * (user's rating on item j)
    sumS = 0    # similarity of items i and j
    
    for item in similar_items:
        item_row = item_rdd.filter(lambda row: row.itemID == item[0]).collect()[0]
        S = item[1]
        R = item_row[userID]
        bj = item_averages.lookup(item[0])[0] - overall_average
        print(bj)
        sumSR += float(S)*float(R)
        sumS += float(S)
    
    rating = bxi + sumSR/sumS

    return rating

## filtering function

In [28]:
def itemItem_filter(item_df, userID, itemID):

    # initialize some global variables (to be used for pearson similarity)
    global average_ratings_dict
    global item_average
    global item_row

    item_rdd = item_df.rdd

    # get the average rating for each item
    average_ratings_items = item_rdd.map(calculate_avg_item)
    average_ratings_items_dict = dict(average_ratings_items.collect())

    # get the average rating for each user
    average_ratings_users = calculate_avg_user(item_df)

    # get the total item average
    overall_average = total_item_average(average_ratings_items)

    # target item info
    item_average = average_ratings_items.lookup(itemID)[0]
    item_row = item_rdd.filter(lambda x: x['itemID'] == itemID).collect()[0]

    # get items by their similarity to the target item
    # format (itemId, similarity)
    all_similarities = item_rdd.map(lambda x: (x.itemID, pearson_correlation(x))).filter(lambda x: x[1] is not None)

    # get a list of items the user has rated
    rated_items = item_rdd.filter(lambda x: x[userID] is not None).map(lambda row: row.itemID).collect()

    # get the N closest items that the user has rated
    N = 3
    filtered_similar_items = all_similarities.filter(lambda x: x[0] in rated_items) \
        .sortBy(lambda x: x[1], ascending=False).take(N)

    rating = calculate_rating(userID, itemID, filtered_similar_items, item_rdd, average_ratings_users, average_ratings_items, overall_average)
    
    return rating


In [29]:
# get the utility matrix
spark = init_spark()
item_rdd = spark.read.csv("../../data/transposed_utility.csv", header=True)

rating = itemItem_filter(item_rdd, '1589021726696497303', 'GGOEGBRA037499')
print(rating)

{'GGOEA0CH077599': -0.15220056249771882, 'GGOEACCQ017299': -0.18318763678019928, 'GGOEADHH055999': 0.04092047388905796, 'GGOEADHH073999': -0.07471663448399984, 'GGOEADWQ015699': -0.16338988814673905, 'GGOEAFKQ020499': -0.16773925325385858, 'GGOEAFKQ020599': -0.13017396321512692, 'GGOEAHPA004110': -0.19546682126366466, 'GGOEAHPJ074410': -0.16957333461876983, 'GGOEAKDH019899': -0.16247057354123093, 'GGOEAOCB077499': -0.1649269518481746, 'GGOEAXXX0808': 0.41492802852482025, 'GGOEAXXX0810': -0.1955252935707288, 'GGOEAXXX0812': -0.16295946699751707, 'GGOEGAAX0037': 0.007569128718516085, 'GGOEGAAX0074': -0.1046544557008297, 'GGOEGAAX0081': -0.07038805953050313, 'GGOEGAAX0098': -0.184162175231268, 'GGOEGAAX0104': 0.1620775304720439, 'GGOEGAAX0105': -0.16334603391644098, 'GGOEGAAX0106': -0.13585917690404886, 'GGOEGAAX0107': -0.19553503895523947, 'GGOEGAAX0168': -0.1778666568373643, 'GGOEGAAX0278': -0.17392627303354327, 'GGOEGAAX0280': -0.1955252935707288, 'GGOEGAAX0282': -0.012779843226329876,

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 53.0 failed 1 times, most recent failure: Lost task 0.0 in stage 53.0 (TID 53) (LAPTOP-CLUDENQ3 executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "C:\Users\melis\AppData\Local\Programs\Python\Python311\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\worker.py", line 1247, in main
  File "C:\Users\melis\AppData\Local\Programs\Python\Python311\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\worker.py", line 1237, in process
  File "C:\Users\melis\AppData\Local\Programs\Python\Python311\Lib\site-packages\pyspark\rdd.py", line 5434, in pipeline_func
    return func(split, prev_func(split, iterator))
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\melis\AppData\Local\Programs\Python\Python311\Lib\site-packages\pyspark\rdd.py", line 5434, in pipeline_func
    return func(split, prev_func(split, iterator))
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\melis\AppData\Local\Programs\Python\Python311\Lib\site-packages\pyspark\rdd.py", line 5434, in pipeline_func
    return func(split, prev_func(split, iterator))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\melis\AppData\Local\Programs\Python\Python311\Lib\site-packages\pyspark\rdd.py", line 840, in func
    return f(iterator)
           ^^^^^^^^^^^
  File "C:\Users\melis\AppData\Local\Programs\Python\Python311\Lib\site-packages\pyspark\rdd.py", line 1499, in sortPartition
    return iter(sort(iterator, key=lambda kv: keyfunc(kv[0]), reverse=(not ascending)))
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\melis\AppData\Local\Programs\Python\Python311\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\shuffle.py", line 494, in sorted
    chunk = list(itertools.islice(iterator, batch))
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\melis\AppData\Local\Programs\Python\Python311\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\util.py", line 83, in wrapper
    return f(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^
  File "C:\Users\melis\AppData\Local\Temp\ipykernel_14516\1259716849.py", line 26, in <lambda>
  File "C:\Users\melis\AppData\Local\Temp\ipykernel_14516\2328167361.py", line 14, in pearson_correlation
NameError: name 'average_ratings_items_dict' is not defined

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:572)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:784)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:766)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$runJob$1(PythonRDD.scala:181)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2438)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1130)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:630)
	at java.base/java.lang.Thread.run(Thread.java:832)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2844)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2780)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2779)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2779)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1242)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1242)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1242)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3048)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2982)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2971)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:984)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2398)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2419)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2438)
	at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:181)
	at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:64)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:564)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:832)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "C:\Users\melis\AppData\Local\Programs\Python\Python311\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\worker.py", line 1247, in main
  File "C:\Users\melis\AppData\Local\Programs\Python\Python311\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\worker.py", line 1237, in process
  File "C:\Users\melis\AppData\Local\Programs\Python\Python311\Lib\site-packages\pyspark\rdd.py", line 5434, in pipeline_func
    return func(split, prev_func(split, iterator))
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\melis\AppData\Local\Programs\Python\Python311\Lib\site-packages\pyspark\rdd.py", line 5434, in pipeline_func
    return func(split, prev_func(split, iterator))
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\melis\AppData\Local\Programs\Python\Python311\Lib\site-packages\pyspark\rdd.py", line 5434, in pipeline_func
    return func(split, prev_func(split, iterator))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\melis\AppData\Local\Programs\Python\Python311\Lib\site-packages\pyspark\rdd.py", line 840, in func
    return f(iterator)
           ^^^^^^^^^^^
  File "C:\Users\melis\AppData\Local\Programs\Python\Python311\Lib\site-packages\pyspark\rdd.py", line 1499, in sortPartition
    return iter(sort(iterator, key=lambda kv: keyfunc(kv[0]), reverse=(not ascending)))
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\melis\AppData\Local\Programs\Python\Python311\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\shuffle.py", line 494, in sorted
    chunk = list(itertools.islice(iterator, batch))
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\melis\AppData\Local\Programs\Python\Python311\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\util.py", line 83, in wrapper
    return f(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^
  File "C:\Users\melis\AppData\Local\Temp\ipykernel_14516\1259716849.py", line 26, in <lambda>
  File "C:\Users\melis\AppData\Local\Temp\ipykernel_14516\2328167361.py", line 14, in pearson_correlation
NameError: name 'average_ratings_items_dict' is not defined

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:572)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:784)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:766)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$runJob$1(PythonRDD.scala:181)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2438)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1130)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:630)
	... 1 more
