In [100]:
from pyspark import SparkContext
import numpy as np
sc = SparkContext.getOrCreate()

In [101]:
icm = sc.textFile("../icm.csv")
icm = icm.map(lambda l: l.split(','))\
            .filter(lambda line: line[0] != 'itemId')\
            .map(lambda x: (int(x[0]), int(x[1])))
icm.take(5)

[(2135, 1), (2303, 1), (6292, 1), (12395, 1), (24520, 1)]

In [102]:
itemFeature = icm.map(lambda x: (x[0],[x[1]])).reduceByKey(lambda x,y: x+y)
itemFeatureDict = itemFeature.collectAsMap()
itemFeature.take(5)

[(32768, [2405, 12362]),
 (2, [9004, 9857, 15230, 15634, 18904, 19606]),
 (32772, [8922, 12236, 12512, 16761]),
 (6, [5787, 11496, 12412, 13223, 13749, 15634, 18904]),
 (8, [1540, 4807, 5778, 12362, 15397, 15634, 18904])]

In [103]:
#Compute the feature-frequency
featureFreq = icm.map(lambda x: (x[1],1)).reduceByKey(lambda x, y: x + y)
featureFreqDict = featureFreq.collectAsMap()
prodCount= icm.map(lambda x: x[0]).distinct().count()
print("ProdCount: ",prodCount)

featureIdf = featureFreq.map(lambda x: (x[0],np.log10(prodCount/x[1])))
featureIdfDict = featureIdf.collectAsMap()
featureIdf.takeOrdered(10,lambda x: -x[1])


ProdCount:  36797


[(12691, 4.5658124127888637),
 (16384, 4.2647824171248825),
 (2, 4.2647824171248825),
 (19116, 4.2647824171248825),
 (12, 4.2647824171248825),
 (14, 4.2647824171248825),
 (16, 4.2647824171248825),
 (20, 4.2647824171248825),
 (24, 4.2647824171248825),
 (32, 4.2647824171248825)]

In [104]:
def computeTfForFeature(features):
    count = len(features)
    tfs = []
    for i in features:
        tfs.append((i, (1/np.sqrt(count))))
    return tfs
itemFeatureTf = itemFeature.map(lambda x: (x[0], computeTfForFeature(x[1])))
itemFeatureTfDict = itemFeatureTf.collectAsMap()
print("TF for each item")
itemFeatureTf.take(5)

TF for each item


[(32768, [(2405, 0.70710678118654746), (12362, 0.70710678118654746)]),
 (2,
  [(9004, 0.40824829046386307),
   (9857, 0.40824829046386307),
   (15230, 0.40824829046386307),
   (15634, 0.40824829046386307),
   (18904, 0.40824829046386307),
   (19606, 0.40824829046386307)]),
 (32772, [(8922, 0.5), (12236, 0.5), (12512, 0.5), (16761, 0.5)]),
 (6,
  [(5787, 0.3779644730092272),
   (11496, 0.3779644730092272),
   (12412, 0.3779644730092272),
   (13223, 0.3779644730092272),
   (13749, 0.3779644730092272),
   (15634, 0.3779644730092272),
   (18904, 0.3779644730092272)]),
 (8,
  [(1540, 0.3779644730092272),
   (4807, 0.3779644730092272),
   (5778, 0.3779644730092272),
   (12362, 0.3779644730092272),
   (15397, 0.3779644730092272),
   (15634, 0.3779644730092272),
   (18904, 0.3779644730092272)])]

In [105]:
trainSet = sc.textFile("../train.csv")
trainSet = trainSet.map(lambda l: l.split(','))
trainSet = trainSet.filter(lambda line: 'userId' not in line)
trainSet = trainSet.map(lambda line: (int(line[0]), int(line[1]), int(line[2])))
trainSet.take(5)

[(2738, 1, 1), (4716, 1, 4), (13298, 1, 8), (15122, 1, 4), (11326, 2, 5)]

In [106]:
def profileItem(item, rating):
    score = []
    if item in itemFeatureTfDict.keys():
        features = itemFeatureTfDict[item]
        for f in features:
            score.append((f[0], f[1]*rating))
    return score


In [107]:
userProfileItem = trainSet.map(lambda x: (x[0], profileItem(x[1], x[2])))
userProfileItem.cache() 
userProfileItem.take(10)

[(2738,
  [(876, 0.40824829046386307),
   (3523, 0.40824829046386307),
   (6396, 0.40824829046386307),
   (9004, 0.40824829046386307),
   (12361, 0.40824829046386307),
   (18904, 0.40824829046386307)]),
 (4716,
  [(876, 1.6329931618554523),
   (3523, 1.6329931618554523),
   (6396, 1.6329931618554523),
   (9004, 1.6329931618554523),
   (12361, 1.6329931618554523),
   (18904, 1.6329931618554523)]),
 (13298,
  [(876, 3.2659863237109046),
   (3523, 3.2659863237109046),
   (6396, 3.2659863237109046),
   (9004, 3.2659863237109046),
   (12361, 3.2659863237109046),
   (18904, 3.2659863237109046)]),
 (15122,
  [(876, 1.6329931618554523),
   (3523, 1.6329931618554523),
   (6396, 1.6329931618554523),
   (9004, 1.6329931618554523),
   (12361, 1.6329931618554523),
   (18904, 1.6329931618554523)]),
 (11326,
  [(9004, 2.0412414523193152),
   (9857, 2.0412414523193152),
   (15230, 2.0412414523193152),
   (15634, 2.0412414523193152),
   (18904, 2.0412414523193152),
   (19606, 2.0412414523193152)]),
 (3

In [108]:
def aggUserFeatures(featuresX, featuresY):
    finalList = []
    for elem in featuresX:
        notFound=True
        for elem2 in featuresY:
            if elem[0]==elem2[0]:
                notFound=False
                finalList.append((elem[0],elem[1]+elem2[1]))
                break
        if(notFound):
                finalList.append(elem)
    featureList = []
    for elem in finalList:
        featureList.append(elem[0])
    for elem in featuresY:
        if elem[0] not in featureList:
            finalList.append(elem)
    return finalList

userProfileItem=userProfileItem.reduceByKey(lambda x,y: aggUserFeatures(x,y))
userProfileItem.cache()
userProfileItem.take(5)

[(2,
  [(2405, 1.0606601717798212),
   (3903, 1.0606601717798212),
   (4117, 1.0606601717798212),
   (4850, 1.0606601717798212),
   (9003, 1.0606601717798212),
   (10653, 1.0606601717798212),
   (12362, 1.0606601717798212),
   (18096, 1.0606601717798212)]),
 (4,
  [(2482, 3.4016802570830449),
   (5527, 3.4016802570830449),
   (9849, 3.4016802570830449),
   (12899, 3.4016802570830449),
   (15634, 6.8033605141660898),
   (16783, 3.4016802570830449),
   (18904, 6.8033605141660898),
   (3626, 3.4016802570830449),
   (5787, 3.4016802570830449),
   (8152, 3.4016802570830449),
   (15058, 3.4016802570830449),
   (15623, 3.4016802570830449)]),
 (8,
  [(6154, 0.40824829046386307),
   (6514, 0.40824829046386307),
   (10250, 0.40824829046386307),
   (12362, 10.54096707485945),
   (15507, 0.40824829046386307),
   (18904, 38.008281456753281),
   (3828, 4.0824829046386304),
   (4346, 4.0824829046386304),
   (8922, 4.0824829046386304),
   (9122, 4.0824829046386304),
   (15200, 4.0824829046386304),
   

In [109]:
userProfileItem=userProfileItem.map(lambda x: (x[0],dict(x[1])))
userProfileItem.cache()

PythonRDD[2906] at RDD at PythonRDD.scala:48

In [110]:
seenItems= trainSet.map(lambda x: (x[0],[x[1]])).reduceByKey(lambda x,y: x + y)
seenItemsDict=seenItems.collectAsMap()
seenItemsDict[100]

users=trainSet.map(lambda x: x[0]).distinct().collect()
len(users)

15373

In [127]:
#FOR THE TOP POP
itemSet = trainSet.map(lambda x: (x[1], x[2]))
itemsCount = trainSet.map(lambda x: (x[1],1)).reduceByKey(lambda x,y : x + y)
itemsCount_dict = itemsCount.collectAsMap()

cost=7
avgRatings=itemSet.reduceByKey(lambda x,y: x+y)
avgRatings=avgRatings.map(lambda x: (x[0],x[1]/(itemsCount_dict[x[0]]+cost)))
avgRatings.take(5)
itemOrderByPop=avgRatings.sortBy(lambda x: x[1], ascending=False)
itemPop = np.array(itemOrderByPop.map(lambda x: x[0]).collect())


In [133]:
numberOfRecommendations=5

def recommendTopPop(user_id, removeSeen=True):
    seenItems = np.array(seenItemsDict[user_id])
    recommendedList = itemPop
    if(removeSeen):
        unseen_mask = np.in1d(recommendedList, seenItems, invert=True)
        recommendedList = recommendedList[unseen_mask]       
    return recommendedList[0:numberOfRecommendations]
def isUserNew(user):
    return user not in users

def fillWithTopPop(recommended,user):
    TopPop=recommendTopPop(user)
    for i in range (numberOfRecommendations-len(recommended)):
        recommended.append(TopPop[i])
    return recommended

def recommendedProduct(user):
    userAnalysis = userProfileItem.filter(lambda x: x[0] == user)
    userProfiles = userAnalysis.map(lambda x: x[1]).collect()[0]
    recommendedItems = []
    if isUserNew(user):
        return fillWithTopPop(recommendedItems, user)
    else:
        for item in itemFeatureDict:
            if item not in seenItemsDict[user]:
                itemFeatureTfDictNew = dict(itemFeatureTfDict[item])
                userFeatures = np.array(list(userProfiles.keys()))
                itemFeatures = itemFeatureDict[item]
                itemFeatures = np.array(itemFeatures)
                futuresCommon = np.in1d(userFeatures, itemFeatures, invert=False)
                commonFeatures = userFeatures[futuresCommon]
                if len(commonFeatures)>0:
                    rank = 0
                    for f in commonFeatures:
                        rank = rank + featureIdfDict[f] * itemFeatureTfDictNew[f] * userProfiles[f]
                    if len(recommendedItems) < numberOfRecommendations:
                        recommendedItems.append((item, rank))
                    else:
                        minRank = min(recommendedItems, key = lambda t: t[1])
                        if rank > minRank[1]:
                            recommendedItems = [t for t in recommendedItems if t[0] != minRank[0]]
                            recommendedItems.append((item, rank))
        #if less than 5 elements with common feature have been retrieved 
    recommendedItems = sorted(recommendedItems, key = lambda x: -x[1] )
    recommendedItems = list(map(lambda x: x[0], recommendedItems))
    if len(recommendedItems)<numberOfRecommendations:
            recommendedItems=fillWithTopPop(recommendedItems, user)
    return recommendedItems
                
            

In [134]:
recom=recommendedProduct(100)
print("Recc for user 100: ", recom)

Recc for user 100:  [12535, 4223, 11554, 20532, 19267]


In [135]:
toBeRecommend=sc.textFile('../target_user.csv').filter(lambda x: "userId" not in x).collect()
f=open("predictions.csv",'w')
i=0
for user in toBeRecommend:
    f.write(user+',')
    recommended=recommendedProduct(int(user))
    for prod in recommended:
        f.write(str(prod)+' ')
    f.write('\n')
    i=i+1
    print(i,"of", len(toBeRecommend), "written")
f.close()

1 of 4196 written
2 of 4196 written
3 of 4196 written
4 of 4196 written
5 of 4196 written
6 of 4196 written
7 of 4196 written
8 of 4196 written
9 of 4196 written
10 of 4196 written
11 of 4196 written
12 of 4196 written
13 of 4196 written
14 of 4196 written
15 of 4196 written
16 of 4196 written
17 of 4196 written
18 of 4196 written
19 of 4196 written
20 of 4196 written
21 of 4196 written
22 of 4196 written
23 of 4196 written
24 of 4196 written
25 of 4196 written
26 of 4196 written
27 of 4196 written
28 of 4196 written
29 of 4196 written
30 of 4196 written
31 of 4196 written
32 of 4196 written
33 of 4196 written
34 of 4196 written
35 of 4196 written
36 of 4196 written
37 of 4196 written
38 of 4196 written
39 of 4196 written
40 of 4196 written
41 of 4196 written
42 of 4196 written
43 of 4196 written
44 of 4196 written
45 of 4196 written
46 of 4196 written
47 of 4196 written
48 of 4196 written
49 of 4196 written
50 of 4196 written
51 of 4196 written
52 of 4196 written
53 of 4196 written
54

417 of 4196 written
418 of 4196 written
419 of 4196 written
420 of 4196 written
421 of 4196 written
422 of 4196 written
423 of 4196 written
424 of 4196 written
425 of 4196 written
426 of 4196 written
427 of 4196 written
428 of 4196 written
429 of 4196 written
430 of 4196 written
431 of 4196 written
432 of 4196 written
433 of 4196 written
434 of 4196 written
435 of 4196 written
436 of 4196 written
437 of 4196 written
438 of 4196 written
439 of 4196 written
440 of 4196 written
441 of 4196 written
442 of 4196 written
443 of 4196 written
444 of 4196 written
445 of 4196 written
446 of 4196 written
447 of 4196 written
448 of 4196 written
449 of 4196 written
450 of 4196 written
451 of 4196 written
452 of 4196 written
453 of 4196 written
454 of 4196 written
455 of 4196 written
456 of 4196 written
457 of 4196 written
458 of 4196 written
459 of 4196 written
460 of 4196 written
461 of 4196 written
462 of 4196 written
463 of 4196 written
464 of 4196 written
465 of 4196 written
466 of 4196 written


827 of 4196 written
828 of 4196 written
829 of 4196 written
830 of 4196 written
831 of 4196 written
832 of 4196 written
833 of 4196 written
834 of 4196 written
835 of 4196 written
836 of 4196 written
837 of 4196 written
838 of 4196 written
839 of 4196 written
840 of 4196 written
841 of 4196 written
842 of 4196 written
843 of 4196 written
844 of 4196 written
845 of 4196 written
846 of 4196 written
847 of 4196 written
848 of 4196 written
849 of 4196 written
850 of 4196 written
851 of 4196 written
852 of 4196 written
853 of 4196 written
854 of 4196 written
855 of 4196 written
856 of 4196 written
857 of 4196 written
858 of 4196 written
859 of 4196 written
860 of 4196 written
861 of 4196 written
862 of 4196 written
863 of 4196 written
864 of 4196 written
865 of 4196 written
866 of 4196 written
867 of 4196 written
868 of 4196 written
869 of 4196 written
870 of 4196 written
871 of 4196 written
872 of 4196 written
873 of 4196 written
874 of 4196 written
875 of 4196 written
876 of 4196 written


1226 of 4196 written
1227 of 4196 written
1228 of 4196 written
1229 of 4196 written
1230 of 4196 written
1231 of 4196 written
1232 of 4196 written
1233 of 4196 written
1234 of 4196 written
1235 of 4196 written
1236 of 4196 written
1237 of 4196 written
1238 of 4196 written
1239 of 4196 written
1240 of 4196 written
1241 of 4196 written
1242 of 4196 written
1243 of 4196 written
1244 of 4196 written
1245 of 4196 written
1246 of 4196 written
1247 of 4196 written
1248 of 4196 written
1249 of 4196 written
1250 of 4196 written
1251 of 4196 written
1252 of 4196 written
1253 of 4196 written
1254 of 4196 written
1255 of 4196 written
1256 of 4196 written
1257 of 4196 written
1258 of 4196 written
1259 of 4196 written
1260 of 4196 written
1261 of 4196 written
1262 of 4196 written
1263 of 4196 written
1264 of 4196 written
1265 of 4196 written
1266 of 4196 written
1267 of 4196 written
1268 of 4196 written
1269 of 4196 written
1270 of 4196 written
1271 of 4196 written
1272 of 4196 written
1273 of 4196 

1617 of 4196 written
1618 of 4196 written
1619 of 4196 written
1620 of 4196 written
1621 of 4196 written
1622 of 4196 written
1623 of 4196 written
1624 of 4196 written
1625 of 4196 written
1626 of 4196 written
1627 of 4196 written
1628 of 4196 written
1629 of 4196 written
1630 of 4196 written
1631 of 4196 written
1632 of 4196 written
1633 of 4196 written
1634 of 4196 written
1635 of 4196 written
1636 of 4196 written
1637 of 4196 written
1638 of 4196 written
1639 of 4196 written
1640 of 4196 written
1641 of 4196 written
1642 of 4196 written
1643 of 4196 written
1644 of 4196 written
1645 of 4196 written
1646 of 4196 written
1647 of 4196 written
1648 of 4196 written
1649 of 4196 written
1650 of 4196 written
1651 of 4196 written
1652 of 4196 written
1653 of 4196 written
1654 of 4196 written
1655 of 4196 written
1656 of 4196 written
1657 of 4196 written
1658 of 4196 written
1659 of 4196 written
1660 of 4196 written
1661 of 4196 written
1662 of 4196 written
1663 of 4196 written
1664 of 4196 

2008 of 4196 written
2009 of 4196 written
2010 of 4196 written
2011 of 4196 written
2012 of 4196 written
2013 of 4196 written
2014 of 4196 written
2015 of 4196 written
2016 of 4196 written
2017 of 4196 written
2018 of 4196 written
2019 of 4196 written
2020 of 4196 written
2021 of 4196 written
2022 of 4196 written
2023 of 4196 written
2024 of 4196 written
2025 of 4196 written
2026 of 4196 written
2027 of 4196 written
2028 of 4196 written
2029 of 4196 written
2030 of 4196 written
2031 of 4196 written
2032 of 4196 written
2033 of 4196 written
2034 of 4196 written
2035 of 4196 written
2036 of 4196 written
2037 of 4196 written
2038 of 4196 written
2039 of 4196 written
2040 of 4196 written
2041 of 4196 written
2042 of 4196 written
2043 of 4196 written
2044 of 4196 written
2045 of 4196 written
2046 of 4196 written
2047 of 4196 written
2048 of 4196 written
2049 of 4196 written
2050 of 4196 written
2051 of 4196 written
2052 of 4196 written
2053 of 4196 written
2054 of 4196 written
2055 of 4196 

2399 of 4196 written
2400 of 4196 written
2401 of 4196 written
2402 of 4196 written
2403 of 4196 written
2404 of 4196 written
2405 of 4196 written
2406 of 4196 written
2407 of 4196 written
2408 of 4196 written
2409 of 4196 written
2410 of 4196 written
2411 of 4196 written
2412 of 4196 written
2413 of 4196 written
2414 of 4196 written
2415 of 4196 written
2416 of 4196 written
2417 of 4196 written
2418 of 4196 written
2419 of 4196 written
2420 of 4196 written
2421 of 4196 written
2422 of 4196 written
2423 of 4196 written
2424 of 4196 written
2425 of 4196 written
2426 of 4196 written
2427 of 4196 written
2428 of 4196 written
2429 of 4196 written
2430 of 4196 written
2431 of 4196 written
2432 of 4196 written
2433 of 4196 written
2434 of 4196 written
2435 of 4196 written
2436 of 4196 written
2437 of 4196 written
2438 of 4196 written
2439 of 4196 written
2440 of 4196 written
2441 of 4196 written
2442 of 4196 written
2443 of 4196 written
2444 of 4196 written
2445 of 4196 written
2446 of 4196 

2790 of 4196 written
2791 of 4196 written
2792 of 4196 written
2793 of 4196 written
2794 of 4196 written
2795 of 4196 written
2796 of 4196 written
2797 of 4196 written
2798 of 4196 written
2799 of 4196 written
2800 of 4196 written
2801 of 4196 written
2802 of 4196 written
2803 of 4196 written
2804 of 4196 written
2805 of 4196 written
2806 of 4196 written
2807 of 4196 written
2808 of 4196 written
2809 of 4196 written
2810 of 4196 written
2811 of 4196 written
2812 of 4196 written
2813 of 4196 written
2814 of 4196 written
2815 of 4196 written
2816 of 4196 written
2817 of 4196 written
2818 of 4196 written
2819 of 4196 written
2820 of 4196 written
2821 of 4196 written
2822 of 4196 written
2823 of 4196 written
2824 of 4196 written
2825 of 4196 written
2826 of 4196 written
2827 of 4196 written
2828 of 4196 written
2829 of 4196 written
2830 of 4196 written
2831 of 4196 written
2832 of 4196 written
2833 of 4196 written
2834 of 4196 written
2835 of 4196 written
2836 of 4196 written
2837 of 4196 

3181 of 4196 written
3182 of 4196 written
3183 of 4196 written
3184 of 4196 written
3185 of 4196 written
3186 of 4196 written
3187 of 4196 written
3188 of 4196 written
3189 of 4196 written
3190 of 4196 written
3191 of 4196 written
3192 of 4196 written
3193 of 4196 written
3194 of 4196 written
3195 of 4196 written
3196 of 4196 written
3197 of 4196 written
3198 of 4196 written
3199 of 4196 written
3200 of 4196 written
3201 of 4196 written
3202 of 4196 written
3203 of 4196 written
3204 of 4196 written
3205 of 4196 written
3206 of 4196 written
3207 of 4196 written
3208 of 4196 written
3209 of 4196 written
3210 of 4196 written
3211 of 4196 written
3212 of 4196 written
3213 of 4196 written
3214 of 4196 written
3215 of 4196 written
3216 of 4196 written
3217 of 4196 written
3218 of 4196 written
3219 of 4196 written
3220 of 4196 written
3221 of 4196 written
3222 of 4196 written
3223 of 4196 written
3224 of 4196 written
3225 of 4196 written
3226 of 4196 written
3227 of 4196 written
3228 of 4196 

3572 of 4196 written
3573 of 4196 written
3574 of 4196 written
3575 of 4196 written
3576 of 4196 written
3577 of 4196 written
3578 of 4196 written
3579 of 4196 written
3580 of 4196 written
3581 of 4196 written
3582 of 4196 written
3583 of 4196 written
3584 of 4196 written
3585 of 4196 written
3586 of 4196 written
3587 of 4196 written
3588 of 4196 written
3589 of 4196 written
3590 of 4196 written
3591 of 4196 written
3592 of 4196 written
3593 of 4196 written
3594 of 4196 written
3595 of 4196 written
3596 of 4196 written
3597 of 4196 written
3598 of 4196 written
3599 of 4196 written
3600 of 4196 written
3601 of 4196 written
3602 of 4196 written
3603 of 4196 written
3604 of 4196 written
3605 of 4196 written
3606 of 4196 written
3607 of 4196 written
3608 of 4196 written
3609 of 4196 written
3610 of 4196 written
3611 of 4196 written
3612 of 4196 written
3613 of 4196 written
3614 of 4196 written
3615 of 4196 written
3616 of 4196 written
3617 of 4196 written
3618 of 4196 written
3619 of 4196 

3963 of 4196 written
3964 of 4196 written
3965 of 4196 written
3966 of 4196 written
3967 of 4196 written
3968 of 4196 written
3969 of 4196 written
3970 of 4196 written
3971 of 4196 written
3972 of 4196 written
3973 of 4196 written
3974 of 4196 written
3975 of 4196 written
3976 of 4196 written
3977 of 4196 written
3978 of 4196 written
3979 of 4196 written
3980 of 4196 written
3981 of 4196 written
3982 of 4196 written
3983 of 4196 written
3984 of 4196 written
3985 of 4196 written
3986 of 4196 written
3987 of 4196 written
3988 of 4196 written
3989 of 4196 written
3990 of 4196 written
3991 of 4196 written
3992 of 4196 written
3993 of 4196 written
3994 of 4196 written
3995 of 4196 written
3996 of 4196 written
3997 of 4196 written
3998 of 4196 written
3999 of 4196 written
4000 of 4196 written
4001 of 4196 written
4002 of 4196 written
4003 of 4196 written
4004 of 4196 written
4005 of 4196 written
4006 of 4196 written
4007 of 4196 written
4008 of 4196 written
4009 of 4196 written
4010 of 4196 

In [None]:
print("Result of getRecommended:\n",getRecommended(151))
print("Top 5 from the matrix: " ,prediction.getrow(151).toarray()[0])