In [1]:
from pyspark import SparkContext
import numpy as np
# Get the active SparkContext, or create one if none exists yet; every RDD
# in this notebook is built from `sc`.
sc = SparkContext.getOrCreate()

In [10]:
# Read the training CSV and turn every data row into a tuple of three ints
# (fields 0, 1 and 2 of the comma-separated line).
trainSet = (
    sc.textFile("../train.csv")
    .map(lambda line: line.split(','))
    # Drop the header row: it is the only one containing the 'userId' token.
    .filter(lambda fields: 'userId' not in fields)
    .map(lambda fields: (int(fields[0]), int(fields[1]), int(fields[2])))
)
trainSet.take(5)

[(2738, 1, 1), (4716, 1, 4), (13298, 1, 8), (15122, 1, 4), (11326, 2, 5)]

In [15]:
# Global bias: the mean of the third field of every training record.
globalBias = (
    trainSet
    .map(lambda record: record[2])
    .mean()
)
print("Global bias: ", globalBias)

Global bias:  6.8188763965700625


In [17]:
# First residual: subtract the global bias from the third field of each record,
# leaving the first two fields as they are.
trainSetR2 = trainSet.map(
    lambda record: (
        int(record[0]),
        int(record[1]),
        int(record[2]) - globalBias,
    )
)
trainSetR2.take(5)

[(2738, 1, -5.8188763965700625),
 (4716, 1, -2.8188763965700625),
 (13298, 1, 1.1811236034299375),
 (15122, 1, -2.8188763965700625),
 (11326, 2, -1.8188763965700625)]

In [19]:
# Re-key the first-stage residuals by the second field (the item id used for
# the per-item bias below): (field 1, residual).
itemSet = trainSetR2.map(lambda record: (record[1], record[2]))
itemSet.take(5)

[(1, -5.8188763965700625),
 (1, -2.8188763965700625),
 (1, 1.1811236034299375),
 (1, -2.8188763965700625),
 (2, -1.8188763965700625)]

In [28]:
# Constant added to every count in the bias denominators, so keys with few
# records get a bias closer to zero.
cost = 7

# Number of records per item key.
itemCount = (
    itemSet
    .map(lambda pair: (pair[0], 1))
    .reduceByKey(lambda left, right: left + right)
)
itemCountDict = itemCount.collectAsMap()

# Item bias = sum of the item's residuals / (record count + cost).
itemSetBias = (
    itemSet
    .reduceByKey(lambda left, right: left + right)
    .map(lambda pair: (pair[0], pair[1] / (itemCountDict[pair[0]] + cost)))
)
itemSetBiasDict = itemSetBias.collectAsMap()

In [29]:
# Second residual: remove the per-item bias (looked up by the item id in
# field 1) from the first-stage residual.
trainSetR3 = trainSetR2.map(
    lambda record: (
        record[0],
        record[1],
        record[2] - itemSetBiasDict[record[1]],
    )
)
trainSetR3.take(5)

[(2738, 1, -4.88473952509004),
 (4716, 1, -1.8847395250900396),
 (13298, 1, 2.1152604749099604),
 (15122, 1, -1.8847395250900396),
 (11326, 2, -1.5915168469988048)]

In [37]:
# Re-key the second-stage residuals by the first field (the user id).
userSet = trainSetR3.map(lambda record: (record[0], record[2]))

# Number of records per user key.
userCount = (
    userSet
    .map(lambda pair: (pair[0], 1))
    .reduceByKey(lambda left, right: left + right)
)
userCountDict = userCount.collectAsMap()

# User bias = sum of the user's residuals / (record count + cost); `cost` is
# the same constant used for the item bias.
userSetBias = (
    userSet
    .reduceByKey(lambda left, right: left + right)
    .map(lambda pair: (pair[0], pair[1] / (userCountDict[pair[0]] + cost)))
)
userSetBiasDict = userSetBias.collectAsMap()
userSetBias.take(5)

[(2, -0.2454719099142516),
 (4, 0.31328785880246485),
 (8, 0.627468916804455),
 (10, -0.10493616143742794),
 (12, 0.4634040796051744)]

In [39]:
# How many items recommendUser keeps per user.
numberOfRecommendations = 5

# Every distinct item key, collected to the driver as a NumPy array.
itemSetArray = np.array(
    itemCount.map(lambda pair: pair[0]).collect()
)

# Per user, the list of item ids that appear with that user in the training
# set, built by concatenating single-element lists.
seenItems = (
    trainSet
    .map(lambda record: (record[0], [record[1]]))
    .reduceByKey(lambda left, right: left + right)
)
seenItemsDict = seenItems.collectAsMap()

def recommendUser(userId):
    """Pick the highest-ranked items that the given user has not seen yet.

    Every item in ``itemSetArray`` that is absent from
    ``seenItemsDict[userId]`` is ranked as
    ``globalBias + userSetBiasDict[userId] + itemSetBiasDict[item]``. The
    ``numberOfRecommendations`` best-ranked items are kept in
    ``recommendedItems`` as ``(item, rank)`` tuples; once the list is full, a
    better candidate evicts the current lowest-ranked entry.

    Args:
        userId: key into ``userSetBiasDict`` and ``seenItemsDict``.

    Raises:
        KeyError: if ``userId`` is missing from ``userSetBiasDict`` or
            ``seenItemsDict``.

    NOTE(review): no ``return`` statement is visible in this part of the
    notebook, so as written the selection is not handed back to the caller.
    """
    bias1 = globalBias + userSetBiasDict[userId]
    # Look the user's seen items up once and use a set, instead of scanning
    # the list again for every candidate item.
    seenByUser = set(seenItemsDict[userId])
    recommendedItems = []
    for item in itemSetArray:
        if item in seenByUser:
            continue
        rank = bias1 + itemSetBiasDict[item]
        if len(recommendedItems) < numberOfRecommendations:
            recommendedItems.append((item, rank))
            continue
        minRank = min(recommendedItems, key=lambda t: t[1])
        if rank > minRank[1]:
            # The original called filter() with its arguments swapped, which
            # raises TypeError; a list comprehension also keeps the result a
            # list so that append() below works on Python 3.
            recommendedItems = [
                entry for entry in recommendedItems if entry[0] != minRank[0]
            ]
            recommendedItems.append((item, rank))
