In [146]:
from pyspark.sql import SparkSession, functions as F, Window, Row
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import avg
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import FloatType
from itertools import combinations
from pyspark.sql import Row

import pandas as pd
import math


In [147]:
# Initialize Spark session
def init_spark():
    return SparkSession \
        .builder \
        .appName("GA360RECOMMENDER") \
        .getOrCreate()

In [148]:
# finds the average of an item row
def calculate_avg_user(row):
    # Extract values from the row
    user_id = row.fullVisitorId
    user_ratings = [row[key] for key in row.__fields__ if key != "fullVisitorId" and row[key] is not None]

    total = 0
    if len(user_ratings) > 0:
        for r in user_ratings:
            total = total + float(r)
        avg = total/len(user_ratings)
    else:
        avg = None
        
    # Create a new Row with itemId and userAverage
    return (user_id, avg)

In [149]:
# get user rating averages
def calculate_avg_item(user_df):
    averages = user_df.agg(*(avg(col).alias(col) for col in user_df.columns)).rdd
    return averages

In [150]:
def total_user_average(user_averages):

    values_rdd = user_averages.values()
    
    total_sum = values_rdd.sum()
    count = values_rdd.count()
    average = total_sum / count

    return average

In [151]:
def pearson_correlation(row):
    
    global user_row
    global average_ratings_users_dict
    global user_average
    
    rowx = user_row
    rowy = row

    sumProdDiffXY = 0
    sumDiffXSquared = 0
    sumDiffYSquared = 0
    
    user_x_average = user_average
    user_y_average = average_ratings_users_dict[str(rowy['fullVisitorId'])]
    
    # go through each user rating
    for key in rowx.__fields__:
        # only include the user rating if they rated both items
        if rowx[key] is not None and rowy[key] is not None and key != 'fullVisitorId':
            sumProdDiffXY += (float(rowx[key]) - float(user_x_average)) * (float(rowy[key]) - float(user_y_average))
            sumDiffXSquared += (float(rowx[key]) - float(user_x_average)) ** 2
            sumDiffYSquared += (float(rowy[key]) - float(user_y_average)) ** 2

    denominator = math.sqrt(sumDiffXSquared) * math.sqrt(sumDiffYSquared)
    if denominator != 0 and str(rowx['fullVisitorId']) != str(rowy['fullVisitorId']):
        similarity = sumProdDiffXY / denominator
    else:
        similarity = None

    return similarity

In [152]:
# calculate the rating of an item given its similar items 

def calculate_rating(userID, itemID, filtered_similar_users, user_rdd, average_ratings_items, average_ratings_users, overall_average):

    bx = float(average_ratings_items.collect()[0][itemID]) - overall_average
    bi = float(average_ratings_users.lookup(userID)[0]) - overall_average
    bxi = overall_average + bx + bi
    
    sumSR = 0   # (similarity of users i and j) * (user's rating on item j)
    sumS = 0    # similarity of items i and j
    
    
    for user in filtered_similar_users:
        user_row = user_rdd.filter(lambda row: row['fullVisitorId'] == user[0]).collect()[0]

        S = user[1]
        R = user_row[itemID]
      
        bj = float(average_ratings_users.lookup(user[0])[0]) - overall_average
        
        if R is not None: 
            R = float(R)
            bxy = overall_average + bx + bj
            sumSR += float(S)*float(R-bj)
            sumS += float(S)              
            
    rating = bxi + sumSR/sumS

    return rating

In [156]:
def user_user_filter(user_df,userID, itemID):
    
    global average_ratings_users_dict
    global user_average
    global user_row
    
    user_rdd = user_df.rdd
    
     # get the average rating for each user
    average_ratings_users = user_rdd.map(calculate_avg_user)
    average_ratings_users_dict = dict(average_ratings_users.collect())
   
    
     # get the average rating for each item
    average_ratings_items = calculate_avg_item(user_df)

    
    # get the total user average
    overall_average = total_user_average(average_ratings_users)         
    
     # target user info
    user_average = average_ratings_users.lookup(userID)[0]
    user_row = user_rdd.filter(lambda x: x['fullVisitorId'] == userID).collect()[0]


    all_similarities = user_rdd.map(lambda x: (x['fullVisitorId'], pearson_correlation(x))).filter(lambda x: x[1] is not None)

    
    # get a list of users that rated the item 
    rated_users = user_rdd.filter(lambda x: any(val is not None for val in x if val != 'fullVisitorId'))


    rated_users= rated_users.map(lambda row: row['fullVisitorId']).collect()

    N = 5
    filtered_similar_users = all_similarities.filter(lambda x: x[0] in rated_users).sortBy(lambda x: x[1], ascending=False).take(N)
    
    rating = calculate_rating(userID, itemID, filtered_similar_users, user_rdd, average_ratings_items, average_ratings_users, overall_average)

    return rating

In [160]:
# get the utility matrix
spark = init_spark()
user_df = spark.read.csv("../../data/utility.csv", header=True)

rating = user_user_filter(user_df, '1589021726696497303', 'GGOEGBRA037499')
print(rating)

-0.24903050493580953
