In [21]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [226]:
import pandas as pd
import math

In [33]:
# flip the rows and columns in the csv (since I need item rows for item-item)

transposed_csv = pd.read_csv('../data/utility.csv', header=None).T
transposed_csv[0][0] = "itemID"
transposed_csv.to_csv('../data/transposed_utility.csv', header=False, index=False)

In [309]:
from pyspark.sql import SparkSession, functions as F, Window
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Initialize Spark session
def init_spark():
    return SparkSession \
        .builder \
        .appName("GA360RECOMMENDER") \
        .getOrCreate()

spark = init_spark()

In [310]:
# import the utility matrix=
df = spark.read.csv("../data/transposed_utility.csv", header=True)

In [400]:
userID = input("Enter the user ID: ") 
itemID = input("Enter the item ID: ") 

Enter the user ID:  6649661547402538462
Enter the item ID:  GGOEGAAX0338


In [402]:
# switch the df into an rdd where each element is an item
item_rdd = df.rdd

In [403]:
# this function will execute for each item to find its average
def calculate_avg(row):
    # Extract values from the row
    item_id = row.itemID
    item_ratings = [row[key] for key in row.__fields__ if key != "itemID" and row[key] is not None]

    total = 0
    if len(item_ratings) > 0:
        for r in item_ratings:
            total = total + float(r)
        avg = total/len(item_ratings)
    else:
        avg = None
        
    # Create a new Row with itemId and userAverage
    return (item_id, avg)

In [404]:
# get the average rating for each item. Convert to a dictionary because you can't map inside a map (for the similarity function)
average_ratings = item_rdd.map(calculate_avg)
average_ratings_dict = dict(average_ratings.collect())

In [405]:
# get info on the target item
item_average = average_ratings.lookup(itemID)[0]
item_row = item_rdd.filter(lambda x: x['itemID'] == itemID).collect()[0]

In [406]:
# use Pearson correlation coefficient to define similarity between two items
def pearson_correlation(row):
    global item_row
    rowx = item_row
    rowy = row

    sumProdDiffXY = 0
    sumDiffXSquared = 0
    sumDiffYSquared = 0
    
    global average_ratings_dict
    global item_average
    item_x_average = item_average
    item_y_average = average_ratings_dict[str(rowy['itemID'])]
    
    # go through each user rating
    for key in rowx.__fields__:
        # only include the user rating if they rated both items
        if rowx[key] is not None and rowy[key] is not None and key != 'itemID':
            sumProdDiffXY += (float(rowx[key]) - float(item_x_average)) * (float(rowy[key]) - float(item_y_average))
            sumDiffXSquared += (float(rowx[key]) - float(item_x_average)) ** 2
            sumDiffYSquared += (float(rowy[key]) - float(item_y_average)) ** 2

    denominator = math.sqrt(sumDiffXSquared) * math.sqrt(sumDiffXSquared)
    if denominator != 0 and str(rowx['itemID']) != str(rowy['itemID']):
        similarity = sumProdDiffXY / denominator
    else:
        similarity = None
        
    return similarity

In [407]:
# get all items by their similarity to the target item
# format (itemId, similarity)
all_similarities = item_rdd.map(lambda x: (x.itemID, pearson_correlation(x)))

for i in all_similarities.collect():
    print(i)   

In [433]:
# get a list of items the user has rated
rated_items = item_rdd.filter(lambda x: x[userID is not None]) 

for i in rated_items.collect():
    print(i)

Row(itemID='GGOEGOLC013299', 1311003026247678016='-0.12517336278808078', 1059770176518504832=None, 2608714709593599798=None, 1633146081774573328=None, 051121380870457542=None, 0750846065342433129=None, 5768779985718104713=None, 9644838183145418265=None, 102703605492733588=None, 0826150033557882055=None, 5074660303190613667=None, 9846359776693659031=None, 5705874741137472356=None, 3653223981913568754=None, 0716845898585666271='-0.1345289319183401', 1868979439923677894=None, 4193206519109888601=None, 8882654778626829762=None, 1627674454679945564=None, 9064503463693769960=None, 454845221896711463=None, 811215767847921598=None, 5334519922113386772=None, 431781159932899381=None, 7083651462602256215=None, 8885422469790736188=None, 140515510840599351=None, 1904044281872619820=None, 2510170651126670568=None, 5918721989053447180=None, 0241048761582607706=None, 5187966439034093084=None, 9386651167038033508=None, 3944421154392364485=None, 5883164011639311823=None, 1487642479039977302=None, 698297

In [430]:
# get the N closest values that the user has rated
N = 3
filtered_similar_items = all_similarities.filter(lambda x: x[1] is not None) \
    .sortBy(lambda x: x[1], ascending=False)

for i in filtered_similar_items.collect():
    print(i)



('GGOEGFKA022299', 1.9443905394340126)
('GGOEGAAX0081', 1.504100067683657)
('GGOEGAAX0338', 1.0001171447909616)
('GGOEAFKQ020599', 0.7856754698807724)
('GGOEGAAX0304', 0.6933800177018793)
('GGOEGFKQ020399', 0.40051804029780813)
('GGOEGDHC015299', 0.3133548941265277)
('GGOEGFKQ020799', 0.2597190126996969)
('GGOEACCQ017299', 0.14842245014838343)
('GGOEGAAX0366', 0.11948768678086108)
('GGOEGAAX0323', 0.0)
('GGOEGAAX0326', 0.0)
('GGOEGAAX0341', -0.008043942312698405)
('GGOEGODR017799', -0.20427708647888793)
('GGOEAFKQ020499', -0.8448248034570728)
('GGOEGEVA022399', -1.4334641992502728)
('GGOEGAAX0074', -1.828454469724579)


In [426]:
# calculate the similarity using the N closest items

sumSR = 0   # similarity of items i and j * user's rating on item j
sumS = 0    # similarity of items i and j

for item in filtered_similarities:
    S = item[1]
    print("S: " + str(S))
    item_row = item_rdd.filter(lambda row: row.itemID == item[0]).collect()[0]
    R = item_row[userID]
    print(item_row)

# for i in item_rdd.collect():
#     print(i)

S: 14.3601520100993
Row(itemID='GGOEGAAX0356', 1311003026247678016=None, 1059770176518504832=None, 2608714709593599798=None, 1633146081774573328=None, 051121380870457542=None, 0750846065342433129=None, 5768779985718104713=None, 9644838183145418265=None, 102703605492733588=None, 0826150033557882055=None, 5074660303190613667=None, 9846359776693659031=None, 5705874741137472356=None, 3653223981913568754=None, 0716845898585666271=None, 1868979439923677894=None, 4193206519109888601=None, 8882654778626829762=None, 1627674454679945564=None, 9064503463693769960=None, 454845221896711463=None, 811215767847921598=None, 5334519922113386772=None, 431781159932899381=None, 7083651462602256215=None, 8885422469790736188=None, 140515510840599351=None, 1904044281872619820=None, 2510170651126670568=None, 5918721989053447180=None, 0241048761582607706=None, 5187966439034093084=None, 9386651167038033508=None, 3944421154392364485=None, 5883164011639311823=None, 1487642479039977302=None, 698297406423033664=None