In [319]:
from pyspark.sql import SparkSession, functions as F, Window, Row
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import avg
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import FloatType
from itertools import combinations
from pyspark.sql import Row

import pandas as pd
import math


In [320]:
# Initialize Spark session
def init_spark():
    return SparkSession \
        .builder \
        .appName("GA360RECOMMENDER") \
        .getOrCreate()

In [321]:
# finds the average of an item row
def calculate_avg_user(row):
    # Extract values from the row
    user_id = row.fullVisitorId
    user_ratings = [row[key] for key in row.__fields__ if key != "fullVisitorId" and row[key] is not None]

    total = 0
    if len(user_ratings) > 0:
        for r in user_ratings:
            total = total + float(r)
        avg = total/len(user_ratings)
    else:
        avg = None
        
    # Create a new Row with itemId and userAverage
    return (user_id, avg)

In [322]:
# get user rating averages
def calculate_avg_item(user_df):
    averages = user_df.agg(*(avg(col).alias(col) for col in user_df.columns)).rdd
    return averages

In [323]:
def total_user_average(user_averages):

    values_rdd = user_averages.values()
    
    total_sum = values_rdd.sum()
    count = values_rdd.count()
    average = total_sum / count

    return average

In [324]:
# # Pearson correlation coefficient
# #  (similarity) Sxy = items rated by both users x and y
# def pearson_correlation(row):
    
#      global user_row
# #     global average_ratings_user_dict
#      global user_average

# # Checking for common items between selected users
#     common_items = set(user_x) & set(user_y)
#     n_common_items = len(common_items)
    
#     if n_common_items == 0:
#         similarity = 0.0
    
#     mean_user_x = sum(user_x[item] for item in common_items) / n_common_items
#     mean_user_y = sum(user_y[item] for item in common_items) / n_common_items
     
#     numerator = sum((user_x[item] - mean_user_x) * (user_y[item] - mean_user_y) for item in common_items)
#     denominator_user_x = sum((user_x[item] - mean_user_x)**2 for item in common_items)
#     denominator_user_y = sum((user_y[item] - mean_user_y)**2 for item in common_items)
    
#     if denominator_user_x == 0 or denominator_user_y == 0:
#         similarity = 0.0
#     else:
#         similarity = numerator / (denominator_user_x**0.5 * denominator_user_y**0.5)


#     return similarity
    

In [372]:
def pearson_correlation(row):
    
    global user_row
    global average_ratings_users_dict
    global user_average
    
    rowx = user_row
    rowy = row

    sumProdDiffXY = 0
    sumDiffXSquared = 0
    sumDiffYSquared = 0
    
    user_x_average = user_average
    user_y_average = average_ratings_users_dict[str(rowy['fullVisitorId'])]
    
    # go through each user rating
    for key in rowx.__fields__:
        # only include the user rating if they rated both items
        if rowx[key] is not None and rowy[key] is not None and key != 'fullVisitorId':
            sumProdDiffXY += (float(rowx[key]) - float(user_x_average)) * (float(rowy[key]) - float(user_y_average))
            sumDiffXSquared += (float(rowx[key]) - float(user_x_average)) ** 2
            sumDiffYSquared += (float(rowy[key]) - float(user_y_average)) ** 2

    denominator = math.sqrt(sumDiffXSquared) * math.sqrt(sumDiffYSquared)
    if denominator != 0 and str(rowx['fullVisitorId']) != str(rowy['fullVisitorId']):
        similarity = sumProdDiffXY / denominator
    else:
        similarity = None

    return similarity

In [None]:
# calculate the rating of an item given its similar items 

def calculate_rating(userID, itemID, similar_items, item_rdd, user_averages, item_averages, overall_average):

    bx = float(user_averages.collect()[0][userID]) - overall_average
    bi = float(item_averages.lookup(itemID)[0]) - overall_average
    bxi = overall_average + bx + bi
    
    sumSR = 0   # (similarity of users i and j) * (user's rating on item j)
    sumS = 0    # similarity of items i and j
    
    for item in similar_items:
        item_row = item_rdd.filter(lambda row: row.itemID == item[0]).collect()[0]
        S = item[1]
        R = item_row[userID]
        bj = float(item_averages.lookup(item[0])[0]) - overall_average
        bxy = overall_average + bx + bj
        sumSR += float(S)*float(R-bj)
        sumS += float(S)
    
    rating = bxi + sumSR/sumS

    return rating

In [446]:
def user_user_filter(user_df,userID, itemID):
    
    global average_ratings_users_dict
    global user_average
    global user_row
    
    user_rdd = user_df.rdd
    
     # get the average rating for each user
    average_ratings_users = user_rdd.map(calculate_avg_user)
    average_ratings_users_dict = dict(average_ratings_users.collect())
   
    
     # get the average rating for each item
    average_ratings_items = calculate_avg_item(user_df)
    
    # get the total user average
    overall_average = total_user_average(average_ratings_users)
    
    
     # target user info
    user_average = average_ratings_users.lookup(userID)[0]
    user_row = user_rdd.filter(lambda x: x['fullVisitorId'] == userID).collect()[0]
#     print(user_row)

#     random_row = user_rdd.takeSample(False, 1000)
#     for row in random_row:
#         print("ID is:", str(row['fullVisitorId']))
#         print(average_ratings_users_dict[str(row['fullVisitorId'])])
#         corr = pearson_correlation(row)
#         print(corr)
# #     print(row)
#     print(random_row)
     # get users by their similarity to the target user
     # format (fullVisitorId, similarity)
#     corr = pearson_correlation(random_row)
#     print(corr)
    all_similarities = user_rdd.map(lambda x: (x['fullVisitorId'] == userID, pearson_correlation(x))).filter(lambda x: x[1] is not None)
    print(all_similarities.collect())
    
    # get a list of users that rated the item 
#     rated_users = user_rdd.filter(lambda x:x['itemID'] == itemID is not None)
    rated_users = user_rdd.filter(lambda x: any(val is not None for val in x if val != 'fullVisitorId'))
    print(rated_users.collect())
    rated_users= rated_users.map(lambda row: row['fullVisitorId'] == userID).collect()
#     print(rated_users)
    
    N = 3
    filtered_similar_items = all_similarities.filter(lambda x: x[0] in rated_users).sortBy(lambda x: x[1], ascending=False).take(N)
    for f in filtered_similar_items:
        print(f)


In [447]:
# # Creating the correlation df, for similar users

# def correlation(user_df,target_user):
    
#     # Convert DataFrame to RDD of dictionaries (each dictionary representing a user's ratings)
#     user_rdd = user_df.rdd
    
# #     user_rdd = user_rdd.map(lambda x: (x.fullVisitorId, pearson_correlation(x,y))).filter(lambda x: x[1] is not None)
    
#     user_row = user_rdd.filter(lambda x: x['fullVisitorId'] == target_user).collect()[
    
#     user_combinations = user_rdd.cartesian(user_rdd)
#     correlation_rdd = user_combinations.map(lambda pair: ((pair[0]['user'], pair[1]['user']), pearson_correlation(pair[0], pair[1])))
#     correlation_rdd = correlation_rdd.filter(lambda x: x[0][0] != x[0][1] and x[1] > 0)
    
#     correlation_rdd = correlation_rdd.filter(lambda x: x[0][0] != x[0][1] and x[1] > 0)
    
#     # Convert RDD to DataFrame
#     correlation_df = spark.createDataFrame(correlation_rdd, schema=StructType([
#         StructField("user1", StringType(), True),
#         StructField("user2", StringType(), True),
#         StructField("correlation", FloatType(), True)]))
    
# #     correlation_df = correlation_rdd.map(lambda x: (x[0][0], x[0][1], x[1])).toDF(["user1", "user2", "correlation"])

#     return user_row
    

In [448]:
# get the utility matrix
spark = init_spark()
user_df = spark.read.csv("../../data/utility.csv", header=True)

rating = user_user_filter(user_df, '36445151010876646', 'GGOEADHH055999')
print(rating)

                                                                                

[(False, 1.0), (False, -1.0), (False, -1.0), (False, -1.0), (False, -1.0), (False, 0.206385016795438), (False, -0.13598843790753096), (False, -1.0), (False, 1.0), (False, 0.9937062014139494), (False, 0.9050246477156102), (False, -0.4314972385511188), (False, 1.0), (False, -0.9590292116736348), (False, -1.0), (False, 1.0), (False, 1.0), (False, -1.0), (False, 1.0), (False, -1.0), (False, 1.0), (False, 0.5854249763404723), (False, -1.0), (False, -1.0), (False, 0.11733717703469859), (False, 1.0), (False, -1.0)]
[Row(fullVisitorId='1311003026247678016', GGOEA0CH077599=None, GGOEACCQ017299=None, GGOEADHH055999=None, GGOEADHH073999=None, GGOEADWQ015699=None, GGOEAFKQ020499=None, GGOEAFKQ020599=None, GGOEAHPA004110=None, GGOEAHPJ074410=None, GGOEAKDH019899=None, GGOEAOCB077499=None, GGOEAXXX0808=None, GGOEAXXX0810=None, GGOEAXXX0812=None, GGOEGAAX0037=None, GGOEGAAX0074=None, GGOEGAAX0081=None, GGOEGAAX0098=None, GGOEGAAX0104=None, GGOEGAAX0105=None, GGOEGAAX0106=None, GGOEGAAX0107=None, GGOE

(False, 1.0)
(False, 1.0)
(False, 1.0)
None




In [None]:
# -0.07907282136027685
# 1.0
# GGOEGBRA037499='-0.12373104588049912'