In [21]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [226]:
import pandas as pd
import math

In [33]:
# flip the rows and columns in the csv (since I need item rows for item-item)

transposed_csv = pd.read_csv('../data/utility.csv', header=None).T
# Rename the top-left header cell with ONE positional write. The chained form
# `transposed_csv[0][0] = ...` may assign into a temporary copy and silently
# does nothing when pandas Copy-on-Write is active.
transposed_csv.iloc[0, 0] = "itemID"
transposed_csv.to_csv('../data/transposed_utility.csv', header=False, index=False)

In [34]:
from pyspark.sql import SparkSession, functions as F, Window
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Initialize Spark session
def init_spark():
    """Return the SparkSession for this notebook via ``getOrCreate()``.

    The session is registered under the application name "GA360RECOMMENDER".
    """
    builder = SparkSession.builder.appName("GA360RECOMMENDER")
    return builder.getOrCreate()

# Session handle used below to read the transposed utility matrix.
spark = init_spark()

In [191]:
# Load the transposed utility matrix; the first CSV line supplies the column
# names (header=True). No schema is given here, which is presumably why the
# rating values are converted with float() further down — TODO confirm.
df = spark.read.csv("../data/transposed_utility.csv", header=True)

In [192]:
# Both IDs are later compared for exact equality against the matrix, so strip
# stray leading/trailing whitespace (easy to introduce when pasting an ID).
userID = input("Enter the user ID: ").strip()
itemID = input("Enter the item ID: ").strip()

Enter the user ID:  8807566194592989592
Enter the item ID:  GGOEA0CH077599


In [193]:
# Switch the DataFrame into an RDD of Row objects (one element per item) so
# that plain Python functions can be mapped over the items.
item_rdd = df.rdd

In [194]:
# this function will execute for each item to find its average
def calculate_avg(row):
    """Compute the mean rating of a single item row.

    Every field of ``row`` except ``itemID`` is treated as a rating; fields
    whose value is ``None`` are ignored and the rest are converted with
    ``float()``.

    Returns:
        A ``(itemID, average)`` tuple; ``average`` is ``None`` when the item
        has no non-null ratings.
    """
    item_key = row.itemID

    # Gather the ratings that are actually present for this item.
    present = []
    for field in row.__fields__:
        if field == "itemID":
            continue
        value = row[field]
        if value is not None:
            present.append(value)

    if not present:
        return (item_key, None)

    # Accumulate left to right so the floating-point result is reproducible.
    running_total = 0
    for rating in present:
        running_total = running_total + float(rating)

    return (item_key, running_total / len(present))

In [196]:
# Get the average rating for each item as (itemID, average) pairs; the pair
# shape is what allows `average_ratings.lookup(<itemID>)` to be used later.
average_ratings = item_rdd.map(calculate_avg)

In [210]:
# Get info on the target item. The ID comes from user input, so check that it
# exists first instead of failing later with "list index out of range".
matching_rows = item_rdd.filter(lambda x: x['itemID'] == itemID).take(1)
if not matching_rows:
    raise ValueError(f"Item ID {itemID!r} was not found in the utility matrix")
item_row = matching_rows[0]
item_average = average_ratings.lookup(itemID)[0]

In [227]:
# use Pearson correlation coefficient to define similarity between two items
def pearson_correlation(row, average_ratings, target_row=None):
    """Pearson correlation between the target item and another item.

    Only users who rated BOTH items contribute. Each rating is centred on the
    average of the item it belongs to, taken from ``average_ratings``.

    Args:
        row: Row of the item to compare against the target item. Must support
            ``row[key]`` access and contain an ``itemID`` field.
        average_ratings: Object with a ``lookup(item_id)`` method returning a
            list whose first element is that item's average rating (or
            ``None`` if the item has no ratings).
        target_row: Row of the target item. Defaults to the notebook-level
            ``item_row`` when omitted.

    Returns:
        The similarity as a float in roughly [-1, 1]. Returns ``0.0`` when the
        correlation is undefined: no co-rating users, an item without any
        ratings, or zero variance on either side.

    Raises:
        IndexError: if either item ID is missing from ``average_ratings``.
    """
    rowx = item_row if target_row is None else target_row
    rowy = row

    avg_x = average_ratings.lookup(rowx['itemID'])[0]
    avg_y = average_ratings.lookup(rowy['itemID'])[0]
    # An item with no ratings has no average and cannot share raters.
    if avg_x is None or avg_y is None:
        return 0.0
    avg_x = float(avg_x)
    avg_y = float(avg_y)

    sum_prod_diff_xy = 0.0
    sum_diff_x_squared = 0.0
    sum_diff_y_squared = 0.0

    # go through each user rating
    for key in rowx.__fields__:
        # only include the user rating if they rated both items
        if key == 'itemID' or rowx[key] is None or rowy[key] is None:
            continue
        diff_x = float(rowx[key]) - avg_x
        diff_y = float(rowy[key]) - avg_y  # centre Y on its OWN average
        sum_prod_diff_xy += diff_x * diff_y
        sum_diff_x_squared += diff_x ** 2
        sum_diff_y_squared += diff_y ** 2

    # The denominator needs both spreads: sqrt(sum dx^2) * sqrt(sum dy^2).
    denominator = math.sqrt(sum_diff_x_squared) * math.sqrt(sum_diff_y_squared)
    if denominator == 0:
        return 0.0

    return sum_prod_diff_xy / denominator
    
# Sanity check: compare the target item with itself; the result should be
# 1.0 up to floating-point rounding.
similarity = pearson_correlation(item_row, average_ratings)
print(similarity)

1.0000000000000002
