## Get a user's rating (time spent on website) on a certain item

In [2]:
from pyspark.sql import SparkSession, functions as F, Window
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

### define helper functions

In [3]:
# Initialize Spark session
def init_spark():
    return SparkSession \
        .builder \
        .appName("GA360RECOMMENDER") \
        .getOrCreate()

In [4]:
# finds the average of a row
def calculate_avg(row):
    # Extract values from the row
    item_id = row.itemID
    item_ratings = [row[key] for key in row.__fields__ if key != "itemID" and row[key] is not None]

    total = 0
    if len(item_ratings) > 0:
        for r in item_ratings:
            total = total + float(r)
        avg = total/len(item_ratings)
    else:
        avg = None
        
    # Create a new Row with itemId and userAverage
    return (item_id, avg)

In [42]:
# finds pearson similarity between the target item and a given item
def pearson_correlation(row):
    global item_row
    rowx = item_row
    rowy = row

    sumProdDiffXY = 0
    sumDiffXSquared = 0
    sumDiffYSquared = 0
    
    global average_ratings_dict
    global item_average
    item_x_average = item_average
    item_y_average = average_ratings_dict[str(rowy['itemID'])]
    
    # go through each user rating
    for key in rowx.__fields__:
        # only include the user rating if they rated both items
        if rowx[key] is not None and rowy[key] is not None and key != 'itemID':
            sumProdDiffXY += (float(rowx[key]) - float(item_x_average)) * (float(rowy[key]) - float(item_y_average))
            sumDiffXSquared += (float(rowx[key]) - float(item_x_average)) ** 2
            sumDiffYSquared += (float(rowy[key]) - float(item_y_average)) ** 2

    # denominator = math.sqrt(sumDiffXSquared) * math.sqrt(sumDiffYSquared)
    # if denominator != 0 and str(rowx['itemID']) != str(rowy['itemID']):
    #     similarity = sumProdDiffXY / denominator
    # else:
    #     similarity = None

    similarity = 5
    return similarity

## filtering function

In [43]:
def itemItem_filter(userID, itemID):

    # initialize some global variables (to be used for pearson similarity)
    global average_ratings_dict
    global item_average
    global item_row

    # get the utility matrix
    spark = init_spark()
    item_rdd = spark.read.csv("../../data/transposed_utility.csv", header=True).rdd

    # get the average rating for each item
    average_ratings = item_rdd.map(calculate_avg)
    average_ratings_dict = dict(average_ratings.collect())

    # target item info
    item_average = average_ratings.lookup(itemID)[0]
    item_row = item_rdd.filter(lambda x: x['itemID'] == itemID).collect()[0]

    # get items by their similarity to the target item
    # format (itemId, similarity)
    all_similarities = item_rdd.map(lambda x: (x.itemID, pearson_correlation(x))).filter(lambda x: x[1] is not None)


    for i in all_similarities.collect():
        print(i)  

In [44]:
itemItem_filter('1589021726696497303', 'GGOEGBRA037499')

('GGOEA0CH077599', 5)
('GGOEACCQ017299', 5)
('GGOEADHH055999', 5)
('GGOEADHH073999', 5)
('GGOEADWQ015699', 5)
('GGOEAFKQ020499', 5)
('GGOEAFKQ020599', 5)
('GGOEAHPA004110', 5)
('GGOEAHPJ074410', 5)
('GGOEAKDH019899', 5)
('GGOEAOCB077499', 5)
('GGOEAXXX0808', 5)
('GGOEAXXX0810', 5)
('GGOEAXXX0812', 5)
('GGOEGAAX0037', 5)
('GGOEGAAX0074', 5)
('GGOEGAAX0081', 5)
('GGOEGAAX0098', 5)
('GGOEGAAX0104', 5)
('GGOEGAAX0105', 5)
('GGOEGAAX0106', 5)
('GGOEGAAX0107', 5)
('GGOEGAAX0168', 5)
('GGOEGAAX0278', 5)
('GGOEGAAX0280', 5)
('GGOEGAAX0282', 5)
('GGOEGAAX0290', 5)
('GGOEGAAX0296', 5)
('GGOEGAAX0299', 5)
('GGOEGAAX0304', 5)
('GGOEGAAX0306', 5)
('GGOEGAAX0308', 5)
('GGOEGAAX0309', 5)
('GGOEGAAX0313', 5)
('GGOEGAAX0318', 5)
('GGOEGAAX0320', 5)
('GGOEGAAX0323', 5)
('GGOEGAAX0325', 5)
('GGOEGAAX0326', 5)
('GGOEGAAX0329', 5)
('GGOEGAAX0330', 5)
('GGOEGAAX0334', 5)
('GGOEGAAX0338', 5)
('GGOEGAAX0341', 5)
('GGOEGAAX0342', 5)
('GGOEGAAX0351', 5)
('GGOEGAAX0352', 5)
('GGOEGAAX0353', 5)
('GGOEGAAX0356', 5