In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
import numpy as np
from pyspark.mllib.linalg.distributed import *
from pyspark.mllib.linalg import *
from scipy.sparse import *
from collections import defaultdict
from itertools import *
# Reuse the running SparkContext if there is one, otherwise create it.
sc = SparkContext.getOrCreate()
# Wrap the context in a SparkSession; `spark` is not referenced again in the visible cells.
spark = SparkSession(sc)

In [2]:
# ---- Item-content matrix: one (itemId, featureId, 1) triple per CSV row ----
icm = (
    sc.textFile("icm.csv")
    .map(lambda row: row.split(','))
    .filter(lambda fields: fields[0] != 'itemId')  # drop the header row
    .map(lambda fields: (int(fields[0]), int(fields[1]), 1))
)

# ---- Ratings: (userId, itemId, rating) triples ----
trainSet = (
    sc.textFile("train.csv")
    .map(lambda row: row.split(','))
    .filter(lambda fields: 'userId' not in fields)  # drop the header row
    .map(lambda fields: (int(fields[0]), int(fields[1]), int(fields[2])))
)

# ---- Inputs for the top-popular recommender ----
# (itemId, rating) pairs and the number of ratings received by each item.
itemSet = trainSet.map(lambda rating: (rating[1], rating[2]))
itemsCount = trainSet.map(lambda rating: (rating[1], 1)).reduceByKey(lambda a, b: a + b)
itemsCount_dict = itemsCount.collectAsMap()

# ---- Feature statistics ----
# Number of items carrying each feature, and IDF = log10(#items / #items with the feature).
featureFreq = icm.map(lambda entry: (entry[1], 1)).reduceByKey(lambda a, b: a + b)
featureFreqDict = featureFreq.collectAsMap()
prodCount = icm.map(lambda entry: entry[0]).distinct().count()
featureIdf = featureFreq.map(lambda kv: (kv[0], np.log10(prodCount / kv[1])))
featureIdfDict = featureIdf.collectAsMap()

# ---- Users we must produce recommendations for ----
targetUsers = (
    sc.textFile("target_user.csv")
    .filter(lambda row: "userId" not in row)  # drop the header row
    .map(lambda row: int(row))
)
targets = targetUsers.collect()

# ---- Per-item normalisation of the ICM ----
# Each item's norm is the square root of its number of ICM rows (all values are 1).
# NOTE(review): the original comment described normalising by the number of
# features rather than its square root; the code uses the square root - confirm intent.
norms = (
    icm.map(lambda entry: (entry[0], 1))
    .reduceByKey(lambda a, b: a + b)
    .mapValues(lambda count: np.sqrt(count))
    .collectAsMap()
)

normalized = icm.map(lambda entry: (entry[0], entry[1], entry[2] / norms[entry[0]]))

In [3]:
# Keep only the interactions of the users we have to predict for.
# itemSet and itemsCount_dict were derived in the previous cell from the
# unfiltered RDD, so they are not affected by this filter.
targetSet = set(targets)  # set membership instead of scanning the list for every record
trainSet = trainSet.filter(lambda x: x[0] in targetSet)

# Collect once: rows/cols/data are then guaranteed to be aligned and the Spark
# lineage is evaluated a single time instead of once per column (plus a count).
userItemTriples = trainSet.collect()
print(len(userItemTriples))
data = [t[2] for t in userItemTriples]
rows = [t[0] for t in userItemTriples]
cols = [t[1] for t in userItemTriples]
# An explicit zero at (15364, 37142) forces the matrix to be at least 15365 x 37143.
data.append(0)
rows.append(15364)
cols.append(37142)
userItem = csr_matrix((data, (rows, cols)))
print("userItem shape:", userItem.shape)

itemFeatureTriples = normalized.collect()
data = [t[2] for t in itemFeatureTriples]
rows = [t[0] for t in itemFeatureTriples]
cols = [t[1] for t in itemFeatureTriples]
# An explicit zero at (37142, 80) forces at least 37143 rows, matching userItem's columns.
data.append(0)
rows.append(37142)
cols.append(80)
itemFeature = csc_matrix((data, (rows, cols)))
print("itemFeat shape:", itemFeature.shape)

46750
userItem shape: (15365, 37143)
itemFeat shape: (37143, 19716)


In [4]:
# (users x items) times (items x features): accumulate each user's ratings per feature.
userFeature = userItem @ itemFeature
userFeature.shape

(15365, 19716)

In [5]:
# Diagonal matrix holding each feature's IDF weight at position (f, f).
# Note: this rebinds `featureIdf`, which until now named the Spark RDD of IDF pairs.
rows = list(featureIdfDict.keys())
cols = list(rows)
data = [featureIdfDict[feature] for feature in rows]
featureIdf = csr_matrix((data, (rows, cols)))
featureIdf.shape

(19716, 19716)

In [6]:
# Weight every feature of the user profile by its IDF, then score each item by
# the product of the weighted profile with the item's feature vector.
userProfile = userFeature @ featureIdf
prediction = userProfile @ itemFeature.T
prediction.shape

(15365, 37143)

In [7]:
numberOfRecommendations = 5

# ---- TOP POPULAR ----
# `cost` is added to each item's rating count, so the score is
# sum(ratings) / (count + cost) rather than a plain mean.
cost = 8
avgRatings = (
    itemSet.reduceByKey(lambda x, y: x + y)
    .map(lambda x: (x[0], x[1] / (itemsCount_dict[x[0]] + cost)))
)
# (The former mid-cell `avgRatings.take(5)` was removed: its result was
# discarded, so it only triggered an extra Spark job.)
itemOrderByPop = avgRatings.sortBy(lambda x: x[1], ascending=False)
# Item ids ordered from highest to lowest damped score.
itemPop = np.array(itemOrderByPop.map(lambda x: x[0]).collect())
# user id -> list of item ids that user has interacted with in trainSet.
seenItems = trainSet.map(lambda x: (x[0], [x[1]])).reduceByKey(lambda x, y: x + y)
seenItemsDict = seenItems.collectAsMap()
#--------------------------------------------------------------

def recommendTopPop(user_id, removeSeen=True):
    """Return the most popular items for a user.

    Reads the module-level `itemPop` (item ids ordered by popularity),
    `seenItemsDict` (user id -> list of seen item ids) and
    `numberOfRecommendations`.

    user_id    -- id of the user to recommend for.
    removeSeen -- when true, items listed for the user in `seenItemsDict`
                  are excluded.

    Returns the first `numberOfRecommendations` entries of `itemPop` that
    survive the filter (fewer if not enough remain).
    """
    recommendedList = itemPop
    if removeSeen:
        # A user with no recorded interactions has nothing to exclude; using
        # .get avoids a KeyError for such users.
        seenItems = np.array(seenItemsDict.get(user_id, []))
        unseen_mask = np.isin(recommendedList, seenItems, invert=True)
        recommendedList = recommendedList[unseen_mask]
    return recommendedList[:numberOfRecommendations]


def fillWithTopPop(recommended,user):
    """Pad `recommended` with top-popular items up to `numberOfRecommendations`.

    recommended -- list of item ids; it is extended in place.
    user        -- user id passed to `recommendTopPop`.

    Items already present in `recommended` are skipped so the final list has
    no duplicates. If the top-popular list runs out, the list is returned
    shorter than `numberOfRecommendations` instead of raising IndexError.

    Returns the same list object that was passed in.
    """
    missing = numberOfRecommendations - len(recommended)
    if missing <= 0:
        return recommended
    alreadyChosen = set(recommended)
    for item in recommendTopPop(user):
        if missing == 0:
            break
        if item in alreadyChosen:
            continue
        recommended.append(item)
        alreadyChosen.add(item)
        missing -= 1
    return recommended


def getRecommended(user):
    """Return the content-based recommendations for `user`.

    Reads the module-level sparse matrix `prediction` (row = user) and
    `seenItemsDict`. Every item with a non-zero score that the user has not
    seen is kept, sorted by decreasing score, and the scores are rescaled so
    that the best one equals 9.

    Returns a list of (item index, rescaled score) tuples; empty when the
    user has no non-zero unseen score.

    Note: a later cell of this notebook defines another function with the
    same name, which replaces this one once that cell has run.
    """
    itemsPred = prediction.getrow(user).toarray()[0]
    # Users absent from seenItemsDict have seen nothing; a set makes the
    # membership test constant-time.
    seen = set(seenItemsDict.get(user, ()))
    # flatnonzero yields indices in ascending order, so ties keep the same
    # relative order as a plain index loop after the stable sort below.
    recommended = [
        (int(i), itemsPred[i])
        for i in np.flatnonzero(itemsPred)
        if int(i) not in seen
    ]
    recommended.sort(key=lambda x: -x[1])
    if recommended:
        maxUser = recommended[0][1]
        recommended = [(item, score / maxUser * 9) for item, score in recommended]
    return recommended

In [8]:
#lines = sc.parallelize([(1,1,7),(1,2,6),(1,3,7),(1,4,4),(1,5,5),(1,6,4),(2,1,6),(2,2,7),(2,4,4),(2,5,3),(2,6,4)\
 #                      ,(3,2,3),(3,3,3),(3,4,1),(3,5,1),(4,1,1),(4,2,2),(4,3,2),(4,4,3),(4,5,3),(4,6,4),(5,1,1)\
#                    ,(5,3,1),(5,4,2),(5,5,3),(5,6,3)])

def parseVectorOnUser(line):
    '''
    Re-key an already split (user_id, item_id, rating) record by user.
    Returns (user_id, (item_id, rating)) with the rating converted to float.
    '''
    user_id = line[0]
    item_id = line[1]
    rating = float(line[2])
    return user_id, (item_id, rating)

def parseVectorOnItem(line):
    '''
    Re-key an already split (user_id, item_id, rating) record by item.
    Returns (item_id, (user_id, rating)) with the rating converted to float.
    '''
    user_id = line[0]
    item_id = line[1]
    rating = float(line[2])
    return item_id, (user_id, rating)

def sampleInteractions(item_id,users_with_rating,n):
    '''
    For items with # interactions > n, replace their interaction history
    with a random sample of n entries of users_with_rating.

    item_id           -- key returned unchanged.
    users_with_rating -- sized iterable of interactions.
    n                 -- maximum number of interactions to keep.

    Returns (item_id, interactions): a new list of n sampled entries when
    there are more than n, otherwise users_with_rating itself.
    '''
    # Imported locally under a private alias: this file never imports the
    # standard-library `random` module, and its star-import from scipy.sparse
    # may bind the bare name `random` to an unrelated function.
    import random as _random

    if len(users_with_rating) > n:
        # random.sample needs a sequence; the input may be any sized iterable.
        return item_id, _random.sample(list(users_with_rating), n)
    return item_id, users_with_rating
    
def findUserPairs(item_id,users_with_rating):
    '''
    Build every ordered pair of distinct entries of users_with_rating
    (i.e. users who rated the same item). Each entry is (user, rating);
    each result is ((user_a, user_b), (rating_a, rating_b)).
    item_id is not used.
    '''
    return [
        ((first[0], second[0]), (first[1], second[1]))
        for first, second in permutations(users_with_rating, 2)
    ]

def keyOnFirstUser(user_pair,item_sim_data):
    '''
    Turn ((user_a, user_b), data) into (user_a, (user_b, data)) so that
    records can be grouped by the first user of the pair.
    '''
    first_user, second_user = user_pair
    return first_user, (second_user, item_sim_data)

def calcSim(user_pair,rating_pairs, shrink):
    '''
    For each user-user pair, return the shrunk cosine similarity of their
    co-rated items, along with the number of co-rated items.

    user_pair    -- key returned unchanged.
    rating_pairs -- iterable of (rating of first user, rating of second user).
    shrink       -- shrink term forwarded to cosine().

    Returns (user_pair, (similarity, number of rating pairs)).
    '''
    sum_xx, sum_xy, sum_yy, n = 0.0, 0.0, 0.0, 0

    for rating_pair in rating_pairs:
        # Built-in float: the np.float alias was removed in NumPy 1.24.
        x = float(rating_pair[0])
        y = float(rating_pair[1])
        sum_xx += x * x
        sum_yy += y * y
        sum_xy += x * y
        n += 1

    # cosine() receives the two norms (square roots of the sums of squares).
    cos_sim = cosine(sum_xy, np.sqrt(sum_xx), np.sqrt(sum_yy), shrink)
    return user_pair, (cos_sim, n)

def cosine(dot_product,rating_norm_squared,rating2_norm_squared, shrink):
    '''
    Shrunk cosine between two vectors A, B:
       dot_product / (rating_norm_squared * rating2_norm_squared + shrink)
    Returns 0.0 when that denominator is zero.
    Despite the parameter names, calcSim passes the norms themselves
    (square roots of the sums of squares), not squared norms.
    '''
    denominator = rating_norm_squared * rating2_norm_squared + shrink
    if not denominator:
        return 0.0
    return dot_product / float(denominator)

def nearestNeighbors(user,users_and_sims,n):
    '''
    Order users_and_sims in place by decreasing similarity and return the
    n most similar entries. Each entry is (other_user, (similarity, count)).
    Returns (user, top-n list).
    '''
    def similarity(entry):
        return entry[1][0]

    users_and_sims.sort(key=similarity, reverse=True)
    closest = users_and_sims[:n]
    return user, closest

def topNRecommendations(user_id,user_sims,users_with_rating,seenDict,n, shrink):
    '''
    Score the items rated by a user's neighbours with the weighted-sums
    method.

    user_id           -- user being scored.
    user_sims         -- iterable of (neighbor, (similarity, count)).
    users_with_rating -- mapping neighbor -> iterable of (item, rating).
    seenDict          -- mapping user -> items already seen; those are skipped.
    n                 -- accepted but not used by this function.
    shrink            -- added to the similarity sum in the denominator.

    Returns (user_id, list of (score, item)) sorted by decreasing score.
    '''
    # An item can be rated by several neighbours, so accumulate per item.
    weighted_totals = defaultdict(int)
    similarity_totals = defaultdict(int)

    for neighbor, (sim, count) in user_sims:
        # Only neighbours with strictly positive similarity contribute.
        if not sim > 0:
            continue

        neighbor_items = users_with_rating.get(neighbor)
        if not neighbor_items:
            continue

        for item, rating in neighbor_items:
            if item in seenDict[user_id]:
                continue
            weighted_totals[item] += sim * rating
            similarity_totals[item] += sim

    # Normalise each total by its similarity sum plus the shrink term, then
    # order from the highest score to the lowest.
    scored_items = sorted(
        (
            (total / (similarity_totals[item] + shrink), item)
            for item, total in weighted_totals.items()
        ),
        reverse=True,
    )

    return user_id, scored_items

In [9]:
# Reload the full ratings file: `trainSet` was narrowed to the target users earlier.
trainSet = (
    sc.textFile("train.csv")
    .map(lambda row: row.split(','))
    .filter(lambda fields: 'userId' not in fields)  # drop the header row
    .map(lambda fields: (int(fields[0]), int(fields[1]), int(fields[2])))
)

# Mean rating of every user: accumulate (sum, count) per user, then divide.
meanVotePerUser = (
    trainSet.map(lambda rating: (rating[0], (rating[2], 1)))
    .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
    .map(lambda kv: (kv[0], kv[1][0] / kv[1][1]))
)
meanVotePerUserDict = meanVotePerUser.collectAsMap()

# Ratings centred on each user's own mean.
lines = trainSet.map(
    lambda rating: (rating[0], rating[1], rating[2] - meanVotePerUserDict[rating[0]])
)
lines.take(5)

[(2738, 1, -5.666666666666667),
 (4716, 1, -2.9283333333333337),
 (13298, 1, 0.9902439024390244),
 (15122, 1, -0.829787234042553),
 (11326, 2, -0.833333333333333)]

In [10]:
# item id -> iterable of (user id, centred rating); cached because it is reused below.
item_user_pairs = lines.map(parseVectorOnItem).groupByKey().cache()
# Fetch only the first record instead of collecting the whole RDD twice.
firstItem, firstItemUsers = item_user_pairs.first()
print("Item: ", firstItem, "- User Pairs: ", list(firstItemUsers))

Item:  2 - User Pairs:  [(11326, -0.833333333333333)]


In [11]:
# For every item rated by more than one user, emit all ordered user pairs with
# their two ratings, then gather the rating pairs of each user pair.
pairwise_users = (
    item_user_pairs
    .filter(lambda entry: len(entry[1]) > 1)
    .flatMap(lambda entry: findUserPairs(entry[0], entry[1]))
    .groupByKey()
)
pairwise_users.take(5)

[((5899, 11607), <pyspark.resultiterable.ResultIterable at 0x7f0c2092b400>),
 ((6827, 10197), <pyspark.resultiterable.ResultIterable at 0x7f0c2092b2e8>),
 ((10197, 6469), <pyspark.resultiterable.ResultIterable at 0x7f0c2092b390>),
 ((10199, 5899), <pyspark.resultiterable.ResultIterable at 0x7f0c2092b080>),
 ((10294, 8456), <pyspark.resultiterable.ResultIterable at 0x7f0c2092b208>)]

In [12]:
# Similarity of every user pair (shrink term 7), regrouped by the first user,
# keeping the 50 most similar neighbours of each user.
user_sims = (
    pairwise_users
    .map(lambda pair: calcSim(pair[0], pair[1], 7))
    .map(lambda scored: keyOnFirstUser(scored[0], scored[1]))
    .groupByKey()
    .mapValues(list)
    .map(lambda grouped: nearestNeighbors(grouped[0], grouped[1], 50))
)
user_sims.take(5)

[(8564,
  [(778, (0.8540284360189574, 1)),
   (9213, (0.8415377271058468, 4)),
   (11449, (0.7803250292933924, 1)),
   (10968, (0.7697565143101238, 1)),
   (5830, (0.7687370190303048, 3)),
   (4111, (0.7042925526185823, 2)),
   (4420, (0.687536231884058, 1)),
   (9849, (0.6874176987524407, 6)),
   (11470, (0.6874119652277632, 2)),
   (12734, (0.6852992991085537, 3)),
   (10443, (0.6755873691451172, 1)),
   (3849, (0.6755873691451172, 1)),
   (4540, (0.6718866517524235, 1)),
   (9645, (0.6633266533066132, 1)),
   (11071, (0.6613417526663838, 1)),
   (13386, (0.648101435690127, 2)),
   (10168, (0.6459444546624977, 1)),
   (12323, (0.6345068960116984, 5)),
   (14340, (0.6340365421537698, 4)),
   (2145, (0.6301821172620088, 8)),
   (8876, (0.6240918943341124, 2)),
   (10654, (0.6238276048267286, 4)),
   (10890, (0.6231174698795181, 2)),
   (10459, (0.6226811340567028, 1)),
   (13905, (0.6226811340567028, 1)),
   (13649, (0.6203982329214418, 1)),
   (2085, (0.619236673268118, 2)),
   (11126

In [13]:
# user id -> iterable of (item id, rating), shipped to the executors as a broadcast.
user_item_hist = trainSet.map(parseVectorOnUser).groupByKey().collect()
ui_dict = dict(user_item_hist)

uib = sc.broadcast(ui_dict)
# user id -> list of item ids the user has rated (rebuilt from the reloaded trainSet).
seenItemsDict = trainSet.map(lambda x: (x[0], [x[1]])).reduceByKey(lambda x,y: x+y).collectAsMap()

# Calculate the scored item recommendations for each target user:
#     user_id -> [(score, item), ...]
targetSet = set(targets)  # set membership instead of scanning the list for every record
userTarget = user_sims.filter(lambda x: x[0] in targetSet)

# Arguments 0 and 1 are `n` (not used by topNRecommendations) and `shrink`.
user_item_recs_collaborative = userTarget.map(
    lambda p: topNRecommendations(p[0], p[1], uib.value, seenItemsDict, 0, 1)
)

In [14]:
# Number of target users that have an entry in user_sims.
print(userTarget.count())

4143


In [15]:
# Diagonal matrix whose entry (u, u) is 9 / max(prediction[u, :]), or 0 when
# that maximum is 0. Multiplying by it rescales every user's best score to 9.
# The row maxima are taken on the sparse matrix directly instead of
# densifying each row in a Python loop.
rowMax = prediction.max(axis=1).toarray().ravel()
data = np.zeros(prediction.shape[0])
hasScore = rowMax != 0
data[hasScore] = 9 / rowMax[hasScore]
cols = np.arange(prediction.shape[0])
rows = np.arange(prediction.shape[0])
print(len(data))
diagonal = csr_matrix((data,(rows,cols)))
diagonal.shape

15365


(15365, 15365)

In [16]:
# Scale each user's predictions by that user's diagonal factor: multiply the
# transposed predictions by the diagonal on the right, then transpose back.
norm = prediction.T @ diagonal
matrixCont = norm.T
matrixCont.shape

(15365, 37143)

In [17]:
# Show the first five (user_id, [(score, item), ...]) records of the collaborative recommendations.
user_item_recs_collaborative.take(5)

[(8564,
  [(6.686132129595772, 26743),
   (6.553525773737418, 22029),
   (6.158912831246097, 10625),
   (5.9762185861658175, 5488),
   (5.96140419847386, 32684),
   (5.863958467426336, 3366),
   (5.699277108008699, 31069),
   (5.636091715548008, 33598),
   (5.635035207244133, 8205),
   (5.6280064238480065, 31992),
   (5.6137675749095335, 31383),
   (5.5963211032602835, 36872),
   (5.584382693876118, 28102),
   (5.584382693876118, 10672),
   (5.584382693876118, 8206),
   (5.528399843100614, 2041),
   (5.517823566058045, 3431),
   (5.502171157994491, 13707),
   (5.481834932323988, 7815),
   (5.4398257019211, 17708),
   (5.4398257019211, 344),
   (5.433252532038259, 13701),
   (5.38417802104183, 20745),
   (5.367459469228766, 21589),
   (5.329517927638338, 58),
   (5.326460749322294, 36745),
   (5.306118387576927, 21717),
   (5.294012285246709, 13069),
   (5.286698410962541, 15530),
   (5.286698410962541, 6592),
   (5.286173278150543, 21601),
   (5.285528151471545, 22003),
   (5.281556986

In [18]:
def forMatrix(tupl):
    """Flatten (user, [(rating, item), ...]) into a list of (user, item, rating) triples."""
    return [(tupl[0], item, rat) for (rat, item) in tupl[1]]
# One (user, item, score) triple per collaborative recommendation.
userColl = user_item_recs_collaborative.flatMap(forMatrix)
userColl.take(5)

[(8564, 26743, 6.686132129595772),
 (8564, 22029, 6.553525773737418),
 (8564, 10625, 6.158912831246097),
 (8564, 5488, 5.9762185861658175),
 (8564, 32684, 5.96140419847386)]

In [19]:
# Collect once so rows/cols/data stay aligned and the Spark job runs a single time.
collTriples = userColl.collect()
rows = [t[0] for t in collTriples]
cols = [t[1] for t in collTriples]
data = [t[2] for t in collTriples]
# Give the matrix the same shape as matrixCont explicitly: the two are added
# later, and a shape inferred from the largest indices present would only
# match by coincidence.
matrixColl = csr_matrix((data, (rows, cols)), shape=matrixCont.shape)
matrixColl.shape

(15365, 37143)

In [22]:
# Hybrid score: 30% collaborative, 70% content-based.
finalMatrix = matrixColl * 0.3 + matrixCont * 0.7
finalMatrix.shape

(15365, 37143)

In [23]:
def getRecommended(user):
    """Return the final list of recommended item ids for `user`.

    Reads the module-level sparse matrix `finalMatrix` (row = user),
    `seenItemsDict` and `numberOfRecommendations`. Items with a non-zero score
    that the user has not seen are ranked by decreasing score and the first
    `numberOfRecommendations` are kept; if fewer are available the list is
    completed through `fillWithTopPop`.

    Note: this definition replaces the earlier function of the same name
    defined in a previous cell.
    """
    itemsPred = finalMatrix.getrow(user).toarray()[0]
    # Users absent from seenItemsDict have seen nothing; a set makes the
    # membership test constant-time.
    seen = set(seenItemsDict.get(user, ()))
    # flatnonzero yields indices in ascending order, so ties keep the same
    # relative order as a plain index loop after the stable sort below.
    recommended = [
        (int(i), itemsPred[i])
        for i in np.flatnonzero(itemsPred)
        if int(i) not in seen
    ]
    recommended.sort(key=lambda x: -x[1])
    recommendedItems = [item for item, _ in recommended[:numberOfRecommendations]]
    if len(recommendedItems) < numberOfRecommendations:
        recommendedItems = fillWithTopPop(recommendedItems, user)
    return recommendedItems

In [24]:
# Write one line per target user: "<user>,<item> <item> ... " (each item id is
# followed by a space). The `with` block closes the file even if a
# recommendation fails part-way through.
with open("predictionsWeightedCont70UB30.csv", 'w') as f:
    for i, user in enumerate(targets, start=1):
        print(user)
        # Compute the recommendations before writing, so a failure cannot
        # leave a half-written line in the file.
        rec = getRecommended(user)
        print(rec)
        f.write("{},{}\n".format(user, ''.join(str(prod) + ' ' for prod in rec)))
        print(i, "of", len(targets), "written")

4
[32578, 35061, 98, 30408, 10129]
1 of 4196 written
5
[10, 2762, 11472, 36190, 6321]
2 of 4196 written
8
[30279, 31290, 8134, 11157, 33443]
3 of 4196 written
9
[30287, 26335, 30445, 19426, 1265]
4 of 4196 written
13
[7130, 15469, 25365, 15016, 23137]
5 of 4196 written
18
[1448, 18741, 17644, 8227, 17207]
6 of 4196 written
19
[3465, 33944, 23786, 20374, 8399]
7 of 4196 written
23
[4735, 33562, 22389, 24488, 6821]
8 of 4196 written
26
[13069, 23833, 24256, 28592, 27139]
9 of 4196 written
29
[24217, 21284, 1041, 31496, 19215]
10 of 4196 written
31
[17126, 19798, 10963, 14820, 26723]
11 of 4196 written
32
[25987, 28719, 17924, 20620, 34351]
12 of 4196 written
33
[28346, 6539, 10620, 26098, 28738]
13 of 4196 written
35
[12866, 35300, 36350, 28199, 2911]
14 of 4196 written
37
[16016, 14351, 35391, 24488, 15623]
15 of 4196 written
51
[20941, 15973, 23336, 24547, 11707]
16 of 4196 written
52
[11557, 28294, 18717, 11745, 37101]
17 of 4196 written
53
[23282, 4708, 29833, 17925, 2597]
18 of 4196