In [27]:
from pyspark import SparkContext , SparkConf
from pyspark.sql import SparkSession
import numpy as np
from pyspark.mllib.linalg.distributed import *
from pyspark.mllib.linalg import *
from collections import defaultdict
from scipy.sparse import *

In [2]:
conf = SparkConf().setAppName("App")
conf.set("spark.driver.memory", '20G')
#conf = (conf.setMaster('local[*]')
        #.set('spark.driver.memory', '4G')
sc = SparkContext(conf=conf)

In [3]:
trainSet = sc.textFile("train.csv")
trainSet = trainSet.map(lambda l: l.split(','))
trainSet = trainSet.filter(lambda line: 'userId' not in line)
trainSet = trainSet.map(lambda line: (int(line[0]), int(line[1]), float(line[2])))

In [4]:
relevantsPerUser = trainSet.filter(lambda line: line[2] >= 8)\
                            .map(lambda line: (line[0], [line[1]]))\
                            .reduceByKey(lambda x, y: x + y)

In [28]:
relevantsDict = defaultdict(int)
relevantsDict = relevantsPerUser.collectAsMap()

In [42]:
import numpy as np

def ap(RankedList, PositiveItems, at=None):
    """
    Calculates AP@_ 
    """
    if(len(PositiveItems)==0):
        return 0
    print("Recomended: ", RankedList)
    RankedList = RankedList[:at]
    is_relevant = np.in1d(RankedList, PositiveItems, assume_unique=True)
    print("Positives: ", PositiveItems)
    print("is_relevant: ", is_relevant)   
    p_at_k = is_relevant * np.cumsum(is_relevant, dtype=np.float32) / (1 + np.arange(len(is_relevant)))
    print("P at k :" , p_at_k)
    map_score = np.sum(p_at_k) / np.min([len(PositiveItems), len(RankedList)])
    print(map_score)
    assert 0 <= map_score <= 1, map_score
    
    return map_score

In [7]:
icm = sc.textFile("icm.csv")
icm = icm.map(lambda l: l.split(','))\
            .filter(lambda line: line[0] != 'itemId')\
            .map(lambda x: (int(x[0]), int(x[1]), 1))
#FOR THE TOP POP
itemSet = trainSet.map(lambda x: (x[1], x[2]))
itemsCount = trainSet.map(lambda x: (x[1],1)).reduceByKey(lambda x,y : x + y)
itemsCount_dict = itemsCount.collectAsMap()
#-----------------------------------------
featureFreq = icm.map(lambda x: (x[1],1)).reduceByKey(lambda x, y: x + y)
featureFreqDict = featureFreq.collectAsMap()
prodCount= icm.map(lambda x: x[0]).distinct().count()
featureIdf = featureFreq.map(lambda x: (x[0],np.log10(prodCount/x[1])))
featureIdfDict = featureIdf.collectAsMap()

targetUsers = sc.textFile("target_user.csv").filter(lambda x: "userId" not in x).map(lambda x: int(x))
targets=targetUsers.collect()

######
#TEST COMPUTING THE PREDICTION NORMALIZING BY THE N OF FEATURES AND NOT THE SQRT OF IT
norms = icm.map(lambda x: (x[0],1))\
                .reduceByKey(lambda x, y: x+y).mapValues(lambda x: np.sqrt(x))\
                .collectAsMap()

normalized = icm.map(lambda x: (x[0], x[1], x[2]/norms[x[0]]))

In [8]:
len(targets)

4196

In [9]:
#Just consider the row of the users to predict
#IF you0re asking, only the idf depends on all the training set, in fact it is computed before reducing the trainset
trainSet=trainSet.filter(lambda x: x[0] in targets)
data = trainSet.map(lambda x: x[2]).collect()
rows = trainSet.map(lambda x: x[0]).collect()
cols = trainSet.map(lambda x: x[1]).collect()
data.append(0)
rows.append(15364)
cols.append(37142)
userItem=csr_matrix((data,(rows,cols)))
data = normalized.map(lambda x: x[2]).collect()
rows = normalized.map(lambda x: x[0]).collect()
cols = normalized.map(lambda x: x[1]).collect()
data.append(0)
rows.append(37142)
cols.append(80)
itemFeature = csc_matrix((data,(rows,cols)))

In [10]:
userFeature = userItem.dot(itemFeature)
userFeature.shape

(15365, 19716)

In [11]:
data = []
rows = []
cols = []
for f in featureIdfDict.keys():
    data.append(featureIdfDict[f])
    cols.append(f)
    rows.append(f)
featureIdf = csr_matrix((data,(rows,cols)))
featureIdf.shape

(19716, 19716)

In [12]:
userProfile = userFeature.dot(featureIdf)
prediction = userProfile.dot(itemFeature.transpose())
prediction.shape

(15365, 37143)

In [50]:
numberOfRecommendations=5
#TOP POPULAR
cost=8
avgRatings=itemSet.reduceByKey(lambda x,y: x+y)
avgRatings=avgRatings.map(lambda x: (x[0],x[1]/(itemsCount_dict[x[0]]+cost)))
avgRatings.take(5)
itemOrderByPop=avgRatings.sortBy(lambda x: x[1], ascending=False)
itemPop = np.array(itemOrderByPop.map(lambda x: x[0]).collect())
seenItems= trainSet.map(lambda x: (x[0],[x[1]])).reduceByKey(lambda x,y: x + y)
seenItemsDict=seenItems.collectAsMap()
#--------------------------------------------------------------

def recommendTopPop(user_id, removeSeen=True):
    seenItems = np.array(seenItemsDict[user_id])
    recommendedList = itemPop
    if(removeSeen):
        unseen_mask = np.in1d(recommendedList, seenItems, invert=True)
        recommendedList = recommendedList[unseen_mask]       
    return recommendedList[:numberOfRecommendations]


def fillWithTopPop(recommended,user):
    TopPop=recommendTopPop(user)
    for i in range (numberOfRecommendations-len(recommended)):
        recommended.append(TopPop[i])
    return recommended


def getRecommended(user):
    recommended = []
    itemsPred = prediction.getrow(user).toarray()[0]

    for i in range(0,len(itemsPred)):
        if(itemsPred[i]!=0):
                #if i not in seenItemsDict[user]:
                recommended.append((i, itemsPred[i]))
                    
    recommended.sort(key = lambda x: -x[1])
    recommended=recommended[:numberOfRecommendations]
    recommendedItems = list(map(lambda x: x[0], recommended))
    if(len(recommendedItems)<numberOfRecommendations):
        recommendedItems=fillWithTopPop(recommendedItems, user)
    return recommendedItems    

In [51]:
recs = list(map(lambda x: (x,getRecommended(x)), targets))

In [52]:
print(len(recs))
print(recs[:10])

4196
[(4, [23217, 25656, 32578, 35061, 98]), (5, [20252, 2164, 17815, 3100, 2762]), (8, [10028, 10766, 14716, 23431, 27648]), (9, [31444, 3674, 30287, 26335, 30445]), (13, [9855, 5861, 7130, 15469, 25365]), (18, [17782, 17107, 19907, 18741, 33215]), (19, [16397, 24063, 19188, 10820, 12750]), (23, [36204, 4735, 33562, 22389, 24488]), (26, [9607, 13069, 23833, 24256, 28592]), (29, [14322, 24217, 21284, 1041, 31496])]


In [53]:
recsRDD = sc.parallelize(recs)

In [54]:
recsRDD.collect()

[(4, [23217, 25656, 32578, 35061, 98]),
 (5, [20252, 2164, 17815, 3100, 2762]),
 (8, [10028, 10766, 14716, 23431, 27648]),
 (9, [31444, 3674, 30287, 26335, 30445]),
 (13, [9855, 5861, 7130, 15469, 25365]),
 (18, [17782, 17107, 19907, 18741, 33215]),
 (19, [16397, 24063, 19188, 10820, 12750]),
 (23, [36204, 4735, 33562, 22389, 24488]),
 (26, [9607, 13069, 23833, 24256, 28592]),
 (29, [14322, 24217, 21284, 1041, 31496]),
 (31, [9123, 17126, 19798, 10963, 14820]),
 (32, [30653, 29308, 2404, 25987, 17924]),
 (33, [19370, 28346, 6539, 10620, 26098]),
 (35, [26585, 19987, 5366, 34141, 36350]),
 (37, [33282, 36409, 26550, 11465, 34168]),
 (51, [8216, 33266, 15973, 20941, 17207]),
 (52, [34713, 11012, 7859, 7164, 11557]),
 (53, [4359, 23160, 10701, 29833, 36872]),
 (54, [9535, 11645, 12586, 15614, 24183]),
 (61, [11717, 3680, 18081, 7083, 13782]),
 (64, [15973, 20941, 9894, 15086, 8216]),
 (69, [34851, 19028, 10245, 18105, 18186]),
 (76, [21368, 19020, 14583, 36163, 6942]),
 (78, [28199, 35300

In [55]:
MAP = recsRDD.map(lambda x: ap(x[1], relevantsDict.get(x[0],[]), at=5))

In [57]:
MapV = MAP.sum() / MAP.count()
print("MAP@5: {}".format(MapV))

MAP@5: 0.6386739354941214


In [56]:
ap(getRecommended(100), relevantsDict.get(100,[]), 5)

Recomended:  [8013, 30796, 12535, 4223, 11554]
Positives:  [8013, 30796]
is_relevant:  [ True  True False False False]
P at k : [ 1.  1.  0.  0.  0.]
1.0


1.0

0