In [21]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [226]:
import pandas as pd
import math

In [33]:
# flip the rows and columns in the csv (since I need item rows for item-item)

transposed_csv = pd.read_csv('../data/utility.csv', header=None).T
transposed_csv[0][0] = "itemID"
transposed_csv.to_csv('../data/transposed_utility.csv', header=False, index=False)

In [309]:
from pyspark.sql import SparkSession, functions as F, Window
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Initialize Spark session
def init_spark():
    return SparkSession \
        .builder \
        .appName("GA360RECOMMENDER") \
        .getOrCreate()

spark = init_spark()

In [310]:
# import the utility matrix
df = spark.read.csv("../data/transposed_utility.csv", header=True)

In [434]:
# choose which rating to predict
userID = input("Enter the user ID: ") 
itemID = input("Enter the item ID: ") 

Enter the user ID:  1059770176518504832
Enter the item ID:  GGOEGAAX0037


In [467]:
# switch the df into an rdd where each element is an item
item_rdd = df.rdd

In [468]:
# finds the average of a row
def calculate_avg(row):
    # Extract values from the row
    item_id = row.itemID
    item_ratings = [row[key] for key in row.__fields__ if key != "itemID" and row[key] is not None]

    total = 0
    if len(item_ratings) > 0:
        for r in item_ratings:
            total = total + float(r)
        avg = total/len(item_ratings)
    else:
        avg = None
        
    # Create a new Row with itemId and userAverage
    return (item_id, avg)

In [437]:
# get the average rating for each item (convert to a dict because you can't map inside a map (for the similarity function))
average_ratings = item_rdd.map(calculate_avg)
average_ratings_dict = dict(average_ratings.collect())

In [469]:
# target item info
item_average = average_ratings.lookup(itemID)[0]
item_row = item_rdd.filter(lambda x: x['itemID'] == itemID).collect()[0]

In [470]:
# finds pearson similarity between the target item and a given item
def pearson_correlation(row):
    global item_row
    rowx = item_row
    rowy = row

    sumProdDiffXY = 0
    sumDiffXSquared = 0
    sumDiffYSquared = 0
    
    global average_ratings_dict
    global item_average
    item_x_average = item_average
    item_y_average = average_ratings_dict[str(rowy['itemID'])]
    
    # go through each user rating
    for key in rowx.__fields__:
        # only include the user rating if they rated both items
        if rowx[key] is not None and rowy[key] is not None and key != 'itemID':
            sumProdDiffXY += (float(rowx[key]) - float(item_x_average)) * (float(rowy[key]) - float(item_y_average))
            sumDiffXSquared += (float(rowx[key]) - float(item_x_average)) ** 2
            sumDiffYSquared += (float(rowy[key]) - float(item_y_average)) ** 2

    denominator = math.sqrt(sumDiffXSquared) * math.sqrt(sumDiffXSquared)
    if denominator != 0 and str(rowx['itemID']) != str(rowy['itemID']):
        similarity = sumProdDiffXY / denominator
    else:
        similarity = None
        
    return similarity

In [440]:
# get all items by their similarity to the target item
# format (itemId, similarity)
all_similarities = item_rdd.map(lambda x: (x.itemID, pearson_correlation(x)))

for i in all_similarities.collect():
    print(i)   

('GGOEA0CH077599', None)
('GGOEACCQ017299', None)
('GGOEADHH055999', None)
('GGOEADHH073999', None)
('GGOEADWQ015699', None)
('GGOEAFKQ020499', None)
('GGOEAFKQ020599', None)
('GGOEAHPA004110', None)
('GGOEAHPJ074410', None)
('GGOEAKDH019899', -0.5050991176807608)
('GGOEAOCB077499', None)
('GGOEAXXX0808', None)
('GGOEAXXX0810', None)
('GGOEAXXX0812', None)
('GGOEGAAX0037', None)
('GGOEGAAX0074', None)
('GGOEGAAX0081', None)
('GGOEGAAX0098', 0.0)
('GGOEGAAX0104', 14.956170884422002)
('GGOEGAAX0105', None)
('GGOEGAAX0106', None)
('GGOEGAAX0107', None)
('GGOEGAAX0168', None)
('GGOEGAAX0278', None)
('GGOEGAAX0280', None)
('GGOEGAAX0282', None)
('GGOEGAAX0290', None)
('GGOEGAAX0296', None)
('GGOEGAAX0299', None)
('GGOEGAAX0304', None)
('GGOEGAAX0306', None)
('GGOEGAAX0308', None)
('GGOEGAAX0309', None)
('GGOEGAAX0313', None)
('GGOEGAAX0318', None)
('GGOEGAAX0320', None)
('GGOEGAAX0323', None)
('GGOEGAAX0325', None)
('GGOEGAAX0326', None)
('GGOEGAAX0329', None)
('GGOEGAAX0330', None)
('GGOEG

In [451]:
# get a list of items the user has rated
rated_items = item_rdd.filter(lambda x: x[userID] is not None).map(lambda row: row.itemID).collect()

print(rated_items)

['GGOEAHPA004110', 'GGOEGAAX0104', 'GGOEGAAX0596', 'GGOEGBRA037499']


In [454]:
# get the N closest values that the user has rated
N = 3
filtered_similar_items = all_similarities.filter(lambda x: x[1] is not None) \
    .filter(lambda x: x[0] in rated_items) \
    .sortBy(lambda x: x[1], ascending=False)

for i in filtered_similar_items.collect():
    print(i)

('GGOEGAAX0104', 14.956170884422002)


In [461]:
# calculate the similarity using the N closest items

sumSR = 0   # similarity of items i and j * user's rating on item j
sumS = 0    # similarity of items i and j

for item in filtered_similar_items.collect():
    item_row = item_rdd.filter(lambda row: row.itemID == item[0]).collect()[0]
    S = item[1]
    R = item_row[userID]
    sumSR += float(S)*float(R)
    sumS += float(S)

rating = sumSR/sumS

print(rating)

-0.19552529357072876
