In [42]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
import numpy as np
from pyspark.mllib.linalg.distributed import *
from pyspark.mllib.linalg import *
from scipy.sparse import *
from collections import defaultdict
import pdb
from itertools import *
import operator
sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

In [2]:
icm = sc.textFile("icm.csv")
icm = icm.map(lambda l: l.split(','))\
            .filter(lambda line: line[0] != 'itemId')\
            .map(lambda x: (int(x[0]), int(x[1]), 1))
trainSet = sc.textFile("train.csv")
trainSet = trainSet.map(lambda l: l.split(','))
trainSet = trainSet.filter(lambda line: 'userId' not in line)
trainSet = trainSet.map(lambda line: (int(line[0]), int(line[1]), int(line[2])))
#FOR THE TOP POP
itemSet = trainSet.map(lambda x: (x[1], x[2]))
itemsCount = trainSet.map(lambda x: (x[1],1)).reduceByKey(lambda x,y : x + y)
itemsCount_dict = itemsCount.collectAsMap()
#-----------------------------------------
featureFreq = icm.map(lambda x: (x[1],1)).reduceByKey(lambda x, y: x + y)
featureFreqDict = featureFreq.collectAsMap()
prodCount= icm.map(lambda x: x[0]).distinct().count()
featureIdf = featureFreq.map(lambda x: (x[0],np.log10(prodCount/x[1])))
featureIdfDict = featureIdf.collectAsMap()

targetUsers = sc.textFile("target_user.csv").filter(lambda x: "userId" not in x).map(lambda x: int(x))
targets=targetUsers.collect()

######
#TEST COMPUTING THE PREDICTION NORMALIZING BY THE N OF FEATURES AND NOT THE SQRT OF IT
norms = icm.map(lambda x: (x[0],1))\
                .reduceByKey(lambda x, y: x+y).mapValues(lambda x: np.sqrt(x))\
                .collectAsMap()

normalized = icm.map(lambda x: (x[0], x[1], x[2]/norms[x[0]]))

In [3]:
#Just consider the row of the users to predict
#IF you0re asking, only the idf depends on all the training set, in fact it is computed before reducing the trainset
trainSet=trainSet.filter(lambda x: x[0] in targets)
print(trainSet.count())
data = trainSet.map(lambda x: x[2]).collect()
rows = trainSet.map(lambda x: x[0]).collect()
cols = trainSet.map(lambda x: x[1]).collect()
data.append(0)
rows.append(15364)
cols.append(37142)
userItem=csr_matrix((data,(rows,cols)))
print("userItem shape:",userItem.shape)
data = normalized.map(lambda x: x[2]).collect()
rows = normalized.map(lambda x: x[0]).collect()
cols = normalized.map(lambda x: x[1]).collect()
data.append(0)
rows.append(37142)
cols.append(80)
itemFeature = csc_matrix((data,(rows,cols)))
print("itemFeat shape:",itemFeature.shape)

46750
userItem shape: (15365, 37143)
itemFeat shape: (37143, 19716)


In [4]:
userFeature = userItem.dot(itemFeature)
userFeature.shape
seenItems= trainSet.map(lambda x: (x[0],[x[1]])).reduceByKey(lambda x,y: x + y)
seenItemsDict=seenItems.collectAsMap()

In [5]:
data = []
rows = []
cols = []
for f in featureIdfDict.keys():
    data.append(featureIdfDict[f])
    cols.append(f)
    rows.append(f)
featureIdf = csr_matrix((data,(rows,cols)))
featureIdf.shape

(19716, 19716)

In [6]:
userProfile = userFeature.dot(featureIdf)
prediction = userProfile.dot(itemFeature.transpose())
prediction.shape

(15365, 37143)

In [7]:
numberOfRecommendations=5
#TOP POPULAR
cost=8
avgRatings=itemSet.reduceByKey(lambda x,y: x+y)
avgRatings=avgRatings.map(lambda x: (x[0],x[1]/(itemsCount_dict[x[0]]+cost)))
avgRatings.take(5)
itemOrderByPop=avgRatings.sortBy(lambda x: x[1], ascending=False)
itemPop = np.array(itemOrderByPop.map(lambda x: x[0]).collect())
#--------------------------------------------------------------

def recommendTopPop(user_id, removeSeen=True):
    seenItems = np.array(seenItemsDict[user_id])
    recommendedList = itemPop
    if(removeSeen):
        unseen_mask = np.in1d(recommendedList, seenItems, invert=True)
        recommendedList = recommendedList[unseen_mask]       
    return recommendedList[:numberOfRecommendations]


def fillWithTopPop(recommended,user):
    TopPop=recommendTopPop(user)
    for i in range (numberOfRecommendations-len(recommended)):
        recommended.append(TopPop[i])
    return recommended


def getRecommended(user):
    recommended = []
    itemsPred = prediction.getrow(user).toarray()[0]

    for i in range(0,len(itemsPred)):
        if(itemsPred[i]!=0):
                if i not in seenItemsDict[user]:
                    recommended.append((i, itemsPred[i]))
    recommended.sort(key = lambda x: -x[1])
    recommended=recommended[:100]
    recommendedItems = list(map(lambda x: x[0], recommended))
    if(len(recommendedItems)<numberOfRecommendations):
        recommendedItems=fillWithTopPop(recommendedItems, user)
    return recommendedItems    



In [8]:
def parseVectorOnUser(line):
    '''
    Parse each line of the specified data file, assuming a "|" delimiter.
    Key is user_id, converts each rating to a float.
    '''
    return line[0],(line[1],float(line[2]))

def parseVectorOnItem(line):
    '''
    Parse each line of the specified data file, assuming a "|" delimiter.
    Key is item_id, converts each rating to a float.
    '''
    return line[1],(line[0],float(line[2]))

def sampleInteractions(item_id,users_with_rating,n):
    '''
    For items with # interactions > n, replace their interaction history
    with a sample of n users_with_rating
    '''
    if len(users_with_rating) > n:
        return item_id, random.sample(users_with_rating,n)
    else:
        return item_id, users_with_rating
    
def findUserPairs(item_id,users_with_rating):
    '''
    For each item, find all user-user pairs combos. (i.e. users with the same item) 
    '''
    l = []
    for user1,user2 in permutations(users_with_rating,2):
        l.append(((user1[0],user2[0]),(user1[1],user2[1])))
    return l

def keyOnFirstUser(user_pair,item_sim_data):
    '''
    For each user-user pair, make the first user's id the key
    '''
    (user1_id,user2_id) = user_pair
    return user1_id,(user2_id,item_sim_data)

def calcSim(user_pair,rating_pairs, shrink):
    ''' 
    For each user-user pair, return the specified similarity measure,
    along with co_raters_count.
    '''
    sum_xx, sum_xy, sum_yy, sum_x, sum_y, n = (0.0, 0.0, 0.0, 0.0, 0.0, 0)
    
    for rating_pair in rating_pairs:
        sum_xx += np.float(rating_pair[0]) * np.float(rating_pair[0])
        sum_yy += np.float(rating_pair[1]) * np.float(rating_pair[1])
        sum_xy += np.float(rating_pair[0]) * np.float(rating_pair[1])
        # sum_y += rt[1]
        # sum_x += rt[0]
        n += 1

    cos_sim = cosine(sum_xy,np.sqrt(sum_xx),np.sqrt(sum_yy), shrink)
    return user_pair, (cos_sim,n)

def cosine(dot_product,rating_norm_squared,rating2_norm_squared, shrink):
    '''
    The cosine between two vectors A, B
       dotProduct(A, B) / (norm(A) * norm(B))
    '''
    numerator = dot_product
    denominator = rating_norm_squared * rating2_norm_squared + shrink

    return (numerator / (float(denominator))) if denominator else 0.0

def nearestNeighbors(user,users_and_sims,n):
    '''
    Sort the predictions list by similarity and select the top-N neighbors
    '''
    
    users_and_sims.sort(key=lambda x: x[1][0],reverse=True)
    return user, users_and_sims[:n]

def topNRecommendations(user_id,user_sims,users_with_rating,seenDict,n, shrink):
    '''
    Calculate the top-N item recommendations for each user using the 
    weighted sums method
    '''

    # initialize dicts to store the score of each individual item,
    # since an item can exist in more than one item neighborhood
    totals = defaultdict(int)
    sim_sums = defaultdict(int)

    for (neighbor,(sim,count)) in user_sims:

        # lookup the item predictions for this neighbor
        unscored_items = users_with_rating.get(neighbor,None)

        if unscored_items:
            for (item,rating) in unscored_items:
                if item not in seenDict[user_id]:

                    # update totals and sim_sums with the rating data
                    totals[item] += sim * rating
                    sim_sums[item] += sim

    # create the normalized list of scored items 
    scored_items = [(total/(sim_sums[item]+shrink),item) for item,total in totals.items()]

    # sort the scored items in ascending order
    scored_items.sort(reverse=True)

    # take out the item score
    ranked_items = [x[1] for x in scored_items]

    return user_id,ranked_items[:n]

In [9]:
trainSet = sc.textFile("train.csv")
trainSet = trainSet.map(lambda l: l.split(','))
trainSet = trainSet.filter(lambda line: 'userId' not in line)
trainSet = trainSet.map(lambda line: (int(line[0]), int(line[1]), int(line[2])))
meanVotePerUser = trainSet.map(lambda x: (x[0], (x[2], 1)))\
                        .reduceByKey(lambda x,y: (x[0]+y[0], x[1]+y[1]))\
                        .map(lambda x: (x[0], x[1][0]/x[1][1]))
meanVotePerUserDict = meanVotePerUser.collectAsMap()
lines = trainSet.map(lambda x: (x[0], x[1], x[2] - meanVotePerUserDict[x[0]]))
lines.take(5)

[(2738, 1, -5.666666666666667),
 (4716, 1, -2.9283333333333337),
 (13298, 1, 0.9902439024390244),
 (15122, 1, -0.829787234042553),
 (11326, 2, -0.833333333333333)]

In [10]:
item_user_pairs = lines.map(parseVectorOnItem).groupByKey().cache()
print("Item: ",item_user_pairs.collect()[0][0], "- User Pairs: ", list(item_user_pairs.collect()[0][1]))

Item:  2 - User Pairs:  [(11326, -0.833333333333333)]


In [11]:
pairwise_users = item_user_pairs.filter(
        lambda p: len(p[1]) > 1).map(
        lambda p: findUserPairs(p[0],p[1])).flatMap(lambda x: x).groupByKey()
pairwise_users.take(5)

[((5577, 11743), <pyspark.resultiterable.ResultIterable at 0x7fbbf9878be0>),
 ((10954, 8456), <pyspark.resultiterable.ResultIterable at 0x7fbbf9878da0>),
 ((11607, 12099), <pyspark.resultiterable.ResultIterable at 0x7fbbf98784a8>),
 ((13446, 2418), <pyspark.resultiterable.ResultIterable at 0x7fbbf9878e48>),
 ((13446, 2936), <pyspark.resultiterable.ResultIterable at 0x7fbbf9878128>)]

In [12]:
user_sims = pairwise_users.map(lambda p: calcSim(p[0],p[1],7))\
                        .map(lambda p: keyOnFirstUser(p[0],p[1])).groupByKey()\
                        .map(lambda x : (x[0], list(x[1])))\
                        .map(lambda p: nearestNeighbors(p[0],p[1],50))
user_sims.take(5)

[(2418,
  [(443, (0.5139826422372227, 1)),
   (3729, (0.48764867337602924, 1)),
   (14236, (0.47240915208613726, 1)),
   (13942, (0.46869070208728647, 1)),
   (8148, (0.463013698630137, 1)),
   (3249, (0.40646528881823, 1)),
   (3910, (0.3987730061349693, 1)),
   (11465, (0.39167862266857967, 1)),
   (3123, (0.3858064516129032, 1)),
   (12162, (0.38174368313225593, 1)),
   (13309, (0.36136849607982896, 1)),
   (14823, (0.33659348310826426, 2)),
   (4134, (0.3170731707317073, 1)),
   (4147, (0.3149847094801223, 1)),
   (4702, (0.3040935672514621, 1)),
   (365, (0.29593810444874274, 1)),
   (2592, (0.28888888888888886, 1)),
   (7269, (0.2871583676525645, 1)),
   (12752, (0.2823998817792227, 1)),
   (8656, (0.2789699570815451, 1)),
   (12901, (0.278969957081545, 1)),
   (2732, (0.267538644470868, 1)),
   (820, (0.2631578947368421, 1)),
   (13665, (0.2582781456953642, 1)),
   (10885, (0.25600430822065284, 2)),
   (13270, (0.2502092050209205, 1)),
   (9060, (0.24773508278662917, 1)),
   (40

In [13]:
userSimsForTarget= user_sims.filter(lambda x: x[0] in targets)
userSimsForTarget.count()

l1=userSimsForTarget.map(lambda x: x[0]).collect()
l2=userSimsForTarget.filter(lambda x: x[1][0][1][0]==0.0).map(lambda x: x[0]).collect()
l3=[i for i in targets if i not in l1]
userForContent=l3+l2
print(len(userForContent))
userForCollaborative = userSimsForTarget.filter(lambda x: x[0] not in userForContent)
userForCollaborative.count()

1232


2964

In [14]:
user_item_hist = lines.map(parseVectorOnUser).groupByKey().collect()
ui_dict = {}
for (user,items) in user_item_hist: 
    ui_dict[user] = items

uib = sc.broadcast(ui_dict)
seenItemsDict = lines.map(lambda x: (x[0], [x[1]])).reduceByKey(lambda x,y: x+y).collectAsMap()
'''
Calculate the top-N item recommendations for each user
    user_id -> [item1,item2,item3,...]
'''


user_item_recs_collaborative = userForCollaborative.map(lambda p: topNRecommendations(p[0],p[1],uib.value,seenItemsDict,100,7)).collect()
user_item_recs_content = [] 

for us in userForContent:
    user_item_recs_content.append((us, getRecommended(us)))

userRecsFinal = user_item_recs_collaborative + user_item_recs_content
userRecsFinal.sort(key = lambda x: x[0])
userRecsFinal

[(4,
  [32578,
   35061,
   98,
   30408,
   10129,
   20424,
   26942,
   4709,
   9793,
   8482,
   37030,
   14572,
   29330,
   5547,
   36637,
   5176,
   30892,
   10577,
   692,
   2148,
   29307,
   17913,
   23332,
   27004,
   652,
   8077,
   8261,
   3505,
   7961,
   16141,
   34168,
   146,
   8289,
   9840,
   15354,
   32775,
   6734,
   33526,
   24342,
   1885,
   27294,
   2041,
   20955,
   23949,
   20790,
   2344,
   10933,
   11023,
   34289,
   36984,
   4382,
   4807,
   8567,
   14767,
   5926,
   25397,
   33282,
   11754,
   15847,
   32400,
   30602,
   20384,
   31701,
   26588,
   6086,
   9893,
   23488,
   25758,
   12398,
   28028,
   14636,
   7179,
   17841,
   1455,
   1967,
   22357,
   22680,
   29276,
   11124,
   18479,
   36547,
   5963,
   3669,
   16713,
   19001,
   72,
   5280,
   5457,
   9489,
   10465,
   11371,
   14148,
   14363,
   15179,
   16839,
   17139,
   17414,
   21534,
   21785,
   23476]),
 (5,
  [269,
   28083,
   35158,
  

In [15]:
reccomandationsContent = []
for user in targets:
    reccomandationsContent.append((user, getRecommended(user)))
reccomandationsContent

[(4,
  [32578,
   35061,
   98,
   30408,
   10129,
   20424,
   26942,
   4709,
   9793,
   8482,
   37030,
   14572,
   29330,
   5547,
   36637,
   5176,
   30892,
   10577,
   692,
   2148,
   29307,
   17913,
   23332,
   27004,
   652,
   8077,
   8261,
   3505,
   7961,
   16141,
   34168,
   146,
   8289,
   9840,
   15354,
   32775,
   6734,
   33526,
   24342,
   1885,
   27294,
   2041,
   20955,
   23949,
   20790,
   2344,
   10933,
   11023,
   34289,
   36984,
   4382,
   4807,
   8567,
   14767,
   5926,
   25397,
   33282,
   11754,
   15847,
   32400,
   30602,
   20384,
   31701,
   26588,
   6086,
   9893,
   23488,
   25758,
   12398,
   28028,
   14636,
   7179,
   17841,
   1455,
   1967,
   22357,
   22680,
   29276,
   11124,
   18479,
   36547,
   5963,
   3669,
   16713,
   19001,
   72,
   5280,
   5457,
   9489,
   10465,
   11371,
   14148,
   14363,
   15179,
   16839,
   17139,
   17414,
   21534,
   21785,
   23476]),
 (5,
  [2762,
   10,
   11472,
   6

In [17]:
userFromContentRdd = sc.parallelize(reccomandationsContent)
userFromUserBasedRdd = sc.parallelize(userRecsFinal)
print(userFromContentRdd.count())
print(userFromUserBasedRdd.count())

4196
4196


In [18]:
def getIndexes(user, recs):
    l = []
    k = 1
    for i in recs:
        l.append((i,k))
        k += 1
    return (user, l)
userFromContentRdd = userFromContentRdd.map(lambda x: getIndexes(x[0],x[1]))
userFromUserBasedRdd = userFromUserBasedRdd.map(lambda x: getIndexes(x[0], x[1]))
userFromUserBasedRdd.take(1)

[(4,
  [(32578, 1),
   (35061, 2),
   (98, 3),
   (30408, 4),
   (10129, 5),
   (20424, 6),
   (26942, 7),
   (4709, 8),
   (9793, 9),
   (8482, 10),
   (37030, 11),
   (14572, 12),
   (29330, 13),
   (5547, 14),
   (36637, 15),
   (5176, 16),
   (30892, 17),
   (10577, 18),
   (692, 19),
   (2148, 20),
   (29307, 21),
   (17913, 22),
   (23332, 23),
   (27004, 24),
   (652, 25),
   (8077, 26),
   (8261, 27),
   (3505, 28),
   (7961, 29),
   (16141, 30),
   (34168, 31),
   (146, 32),
   (8289, 33),
   (9840, 34),
   (15354, 35),
   (32775, 36),
   (6734, 37),
   (33526, 38),
   (24342, 39),
   (1885, 40),
   (27294, 41),
   (2041, 42),
   (20955, 43),
   (23949, 44),
   (20790, 45),
   (2344, 46),
   (10933, 47),
   (11023, 48),
   (34289, 49),
   (36984, 50),
   (4382, 51),
   (4807, 52),
   (8567, 53),
   (14767, 54),
   (5926, 55),
   (25397, 56),
   (33282, 57),
   (11754, 58),
   (15847, 59),
   (32400, 60),
   (30602, 61),
   (20384, 62),
   (31701, 63),
   (26588, 64),
   (6086,

In [43]:
def bordaFunction(items):
    d = defaultdict(int)
    for (item, ind) in items:
        d[item] += ind
    keys = list(d.keys())
    values = list(d.values())
    l = list(zip(keys, values))
    l.sort(key=operator.itemgetter(1))
    return l

final = userFromContentRdd.join(userFromUserBasedRdd)
bordaScore = final.map(lambda x: (x[0], x[1][0]+x[1][1]))
bordaScoreReordered = bordaScore.map(lambda x: (x[0], bordaFunction(x[1])))
bordaScoreReordered.take(1)


[(32,
  [(25987, 1),
   (6461, 1),
   (17924, 2),
   (9735, 2),
   (28719, 3),
   (33173, 3),
   (15112, 4),
   (10701, 4),
   (22814, 5),
   (13726, 5),
   (1316, 6),
   (17152, 6),
   (20398, 7),
   (2911, 7),
   (35515, 8),
   (33598, 8),
   (18641, 9),
   (32340, 9),
   (5436, 10),
   (31239, 10),
   (10978, 11),
   (21108, 11),
   (27506, 12),
   (19565, 12),
   (19578, 13),
   (17030, 13),
   (32092, 14),
   (26658, 14),
   (2270, 15),
   (2143, 15),
   (20477, 16),
   (29915, 16),
   (21892, 17),
   (22211, 17),
   (25113, 18),
   (20620, 18),
   (9373, 19),
   (14319, 19),
   (23709, 20),
   (24512, 20),
   (20083, 21),
   (30982, 21),
   (4899, 22),
   (34517, 22),
   (7420, 23),
   (27502, 23),
   (22189, 24),
   (36872, 24),
   (23672, 25),
   (22850, 25),
   (3371, 26),
   (26473, 26),
   (35399, 27),
   (21304, 27),
   (29850, 28),
   (19022, 28),
   (21732, 29),
   (4279, 29),
   (22086, 30),
   (6643, 30),
   (24823, 31),
   (36543, 31),
   (7284, 32),
   (32347, 32),
  

In [51]:
numberOfReccomandations = 5
def numberOfRecc(items):
    return items[:5]

userRecFinal = bordaScoreReordered.map(lambda x: (x[0], numberOfRecc(x[1])))
userRecFinal = userRecFinal.sortByKey(lambda x: x[0])
userRecFinalList = userRecFinal.collect()

In [53]:
f=open("predictionsHB1.csv",'w')
i=0
for (user, items) in userRecFinalList:
    f.write(str(user)+',')
    for prod in items:
        f.write(str(prod[0])+' ')
    f.write('\n')
    i=i+1
    print(i,"of", len(userRecFinalList), "written")
f.close()

1 of 4196 written
2 of 4196 written
3 of 4196 written
4 of 4196 written
5 of 4196 written
6 of 4196 written
7 of 4196 written
8 of 4196 written
9 of 4196 written
10 of 4196 written
11 of 4196 written
12 of 4196 written
13 of 4196 written
14 of 4196 written
15 of 4196 written
16 of 4196 written
17 of 4196 written
18 of 4196 written
19 of 4196 written
20 of 4196 written
21 of 4196 written
22 of 4196 written
23 of 4196 written
24 of 4196 written
25 of 4196 written
26 of 4196 written
27 of 4196 written
28 of 4196 written
29 of 4196 written
30 of 4196 written
31 of 4196 written
32 of 4196 written
33 of 4196 written
34 of 4196 written
35 of 4196 written
36 of 4196 written
37 of 4196 written
38 of 4196 written
39 of 4196 written
40 of 4196 written
41 of 4196 written
42 of 4196 written
43 of 4196 written
44 of 4196 written
45 of 4196 written
46 of 4196 written
47 of 4196 written
48 of 4196 written
49 of 4196 written
50 of 4196 written
51 of 4196 written
52 of 4196 written
53 of 4196 written
54