## Get a user's rating (time spent on website) on a certain item

In [51]:
from pyspark.sql import SparkSession, functions as F, Window
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
import pandas as pd
import math

### define helper functions

In [52]:
# Initialize Spark session
def init_spark():
    return SparkSession \
        .builder \
        .appName("GA360RECOMMENDER") \
        .getOrCreate()

In [53]:
# finds the average of a row
def calculate_avg(row):
    # Extract values from the row
    item_id = row.itemID
    item_ratings = [row[key] for key in row.__fields__ if key != "itemID" and row[key] is not None]

    total = 0
    if len(item_ratings) > 0:
        for r in item_ratings:
            total = total + float(r)
        avg = total/len(item_ratings)
    else:
        avg = None
        
    # Create a new Row with itemId and userAverage
    return (item_id, avg)

In [57]:
# finds pearson similarity between the target item and a given item
def pearson_correlation(row):
    global item_row
    rowx = item_row
    rowy = row

    sumProdDiffXY = 0
    sumDiffXSquared = 0
    sumDiffYSquared = 0
    
    global average_ratings_dict
    global item_average
    item_x_average = item_average
    item_y_average = average_ratings_dict[str(rowy['itemID'])]
    
    # go through each user rating
    for key in rowx.__fields__:
        # only include the user rating if they rated both items
        if rowx[key] is not None and rowy[key] is not None and key != 'itemID':
            sumProdDiffXY += (float(rowx[key]) - float(item_x_average)) * (float(rowy[key]) - float(item_y_average))
            sumDiffXSquared += (float(rowx[key]) - float(item_x_average)) ** 2
            sumDiffYSquared += (float(rowy[key]) - float(item_y_average)) ** 2

    denominator = math.sqrt(sumDiffXSquared) * math.sqrt(sumDiffYSquared)
    if denominator != 0 and str(rowx['itemID']) != str(rowy['itemID']):
        similarity = sumProdDiffXY / denominator
    else:
        similarity = None

    return similarity

In [92]:
# calculate the rating of an item given its similar items
def calculate_rating(userID, similar_items, item_rdd):
    
    sumSR = 0   # (similarity of items i and j) * (user's rating on item j)
    sumS = 0    # similarity of items i and j
    
    for item in similar_items:
        item_row = item_rdd.filter(lambda row: row.itemID == item[0]).collect()[0]
        S = item[1]
        R = item_row[userID]
        sumSR += float(S)*float(R)
        sumS += float(S)
    
    rating = sumSR/sumS

    return rating

## filtering function

In [93]:
def itemItem_filter(userID, itemID):

    # initialize some global variables (to be used for pearson similarity)
    global average_ratings_dict
    global item_average
    global item_row

    # get the utility matrix
    spark = init_spark()
    item_rdd = spark.read.csv("../../data/transposed_utility.csv", header=True).rdd

    # get the average rating for each item
    average_ratings = item_rdd.map(calculate_avg)
    average_ratings_dict = dict(average_ratings.collect())

    # target item info
    item_average = average_ratings.lookup(itemID)[0]
    item_row = item_rdd.filter(lambda x: x['itemID'] == itemID).collect()[0]

    # get items by their similarity to the target item
    # format (itemId, similarity)
    all_similarities = item_rdd.map(lambda x: (x.itemID, pearson_correlation(x))).filter(lambda x: x[1] is not None)

    # get a list of items the user has rated
    rated_items = item_rdd.filter(lambda x: x[userID] is not None).map(lambda row: row.itemID).collect()

    # get the N closest items that the user has rated
    N = 3
    filtered_similar_items = all_similarities.filter(lambda x: x[0] in rated_items) \
        .sortBy(lambda x: x[1], ascending=False).take(N)

    rating = calculate_rating(userID, filtered_similar_items, item_rdd)
    
    return rating


In [94]:
rating = itemItem_filter('1589021726696497303', 'GGOEGBRA037499')
print(rating)

-0.17154096031163024
