In [2]:
pip install pandas




In [3]:
import pandas as pd
import math

In [9]:
# flip the rows and columns in the csv (since I need item rows for item-item)

transposed_csv = pd.read_csv('../../data/utility.csv', header=None).T
transposed_csv[0][0] = "itemID"
transposed_csv.to_csv('../../data/transposed_utility.csv', header=False, index=False)

In [10]:
from pyspark.sql import SparkSession, functions as F, Window
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Initialize Spark session
def init_spark():
    return SparkSession \
        .builder \
        .appName("GA360RECOMMENDER") \
        .getOrCreate()

spark = init_spark()

In [12]:
# import the utility matrix
df = spark.read.csv("../../data/transposed_utility.csv", header=True)

In [13]:
# choose which rating to predict
userID = input("Enter the user ID: ") 
itemID = input("Enter the item ID: ") 

Enter the user ID:  1589021726696497303
Enter the item ID:  GGOEGBRA037499


In [14]:
# switch the df into an rdd where each element is an item
item_rdd = df.rdd

In [15]:
# finds the average of a row
def calculate_avg(row):
    # Extract values from the row
    item_id = row.itemID
    item_ratings = [row[key] for key in row.__fields__ if key != "itemID" and row[key] is not None]

    total = 0
    if len(item_ratings) > 0:
        for r in item_ratings:
            total = total + float(r)
        avg = total/len(item_ratings)
    else:
        avg = None
        
    # Create a new Row with itemId and userAverage
    return (item_id, avg)

In [16]:
# get the average rating for each item (convert to a dict because you can't map inside a map (for the similarity function))
average_ratings = item_rdd.map(calculate_avg)
average_ratings_dict = dict(average_ratings.collect())

In [17]:
# target item info
item_average = average_ratings.lookup(itemID)[0]
item_row = item_rdd.filter(lambda x: x['itemID'] == itemID).collect()[0]

In [18]:
# finds pearson similarity between the target item and a given item
def pearson_correlation(row):
    global item_row
    rowx = item_row
    rowy = row

    sumProdDiffXY = 0
    sumDiffXSquared = 0
    sumDiffYSquared = 0
    
    global average_ratings_dict
    global item_average
    item_x_average = item_average
    item_y_average = average_ratings_dict[str(rowy['itemID'])]
    
    # go through each user rating
    for key in rowx.__fields__:
        # only include the user rating if they rated both items
        if rowx[key] is not None and rowy[key] is not None and key != 'itemID':
            sumProdDiffXY += (float(rowx[key]) - float(item_x_average)) * (float(rowy[key]) - float(item_y_average))
            sumDiffXSquared += (float(rowx[key]) - float(item_x_average)) ** 2
            sumDiffYSquared += (float(rowy[key]) - float(item_y_average)) ** 2

    denominator = math.sqrt(sumDiffXSquared) * math.sqrt(sumDiffYSquared)
    if denominator != 0 and str(rowx['itemID']) != str(rowy['itemID']):
        similarity = sumProdDiffXY / denominator
    else:
        similarity = None
        
    return similarity

In [19]:
# get items by their similarity to the target item
# format (itemId, similarity)
all_similarities = item_rdd.map(lambda x: (x.itemID, pearson_correlation(x))).filter(lambda x: x[1] is not None)

for i in all_similarities.collect():
    print(i)   

('GGOEAFKQ020499', -1.0)
('GGOEGAAX0074', -1.0)
('GGOEGAAX0081', -1.0)
('GGOEGAAX0104', 1.0)
('GGOEGAAX0299', 1.0)
('GGOEGAAX0341', 1.0)
('GGOEGAAX0361', 1.0)
('GGOEGAAX0596', -0.17743815304295385)
('GGOEGAAX0606', 1.0)
('GGOEGAAX0682', -1.0)
('GGOEGAAX0731', 1.0)
('GGOEGAAX0795', 0.9972633154442545)
('GGOEGBCR024399', -0.44381009451758613)
('GGOEGBJL013999', 1.0)
('GGOEGBMJ013399', 1.0)
('GGOEGBPB021199', 1.0)
('GGOEGBPB082099', -1.0)
('GGOEGBRB013899', -0.0195354103507071)
('GGOEGBRJ037299', 0.8869544197512934)
('GGOEGBRJ037399', 0.9765578249552678)
('GGOEGCMB020932', 0.9997404500048256)
('GGOEGDHC074099', -1.0)
('GGOEGDHQ015399', 0.999919180800602)
('GGOEGEVA022399', -1.0)
('GGOEGEVB070599', -1.0)
('GGOEGFKQ020399', 1.0)
('GGOEGFKQ020799', 0.49490736781167505)
('GGOEGHPJ080310', -1.0)
('GGOEGOAQ018099', 1.0)
('GGOEGOAR013099', 1.0)
('GGOEGOCC077299', 1.0)
('GGOEYDHJ056099', 1.0)
('GGOEYFKQ020699', 1.0)
('GGOEYOCR077799', 1.0)


In [20]:
# get a list of items the user has rated
rated_items = item_rdd.filter(lambda x: x[userID] is not None).map(lambda row: row.itemID).collect()

print(rated_items)

['GGOEACCQ017299', 'GGOEADHH055999', 'GGOEAFKQ020499', 'GGOEAFKQ020599', 'GGOEGAAX0074', 'GGOEGAAX0081', 'GGOEGAAX0304', 'GGOEGAAX0323', 'GGOEGAAX0326', 'GGOEGAAX0338', 'GGOEGAAX0341', 'GGOEGAAX0366', 'GGOEGEVA022399', 'GGOEGFKA022299', 'GGOEGFKQ020399', 'GGOEGFKQ020799', 'GGOEGODR017799']


In [21]:
# get the N closest values that the user has rated
N = 3
filtered_similar_items = all_similarities.filter(lambda x: x[0] in rated_items) \
    .sortBy(lambda x: x[1], ascending=False)

for i in filtered_similar_items.take(N):
    print(i)

('GGOEGAAX0341', 1.0)
('GGOEGFKQ020399', 1.0)
('GGOEGFKQ020799', 0.49490736781167505)


In [22]:
# calculate the similarity using the N closest items

sumSR = 0   # similarity of items i and j * user's rating on item j
sumS = 0    # similarity of items i and j

for item in filtered_similar_items.collect():
    item_row = item_rdd.filter(lambda row: row.itemID == item[0]).collect()[0]
    S = item[1]
    R = item_row[userID]
    sumSR += float(S)*float(R)
    sumS += float(S)

rating = sumSR/sumS

print(rating)

0.11218840771019728
