In [8]:
from pyspark import SparkContext
import numpy as np
# Obtain the SparkContext used by every cell below (getOrCreate reuses an
# already-running context when there is one instead of starting a second).
sc = SparkContext.getOrCreate()

In [9]:
# Load the item-content matrix CSV and parse every data row into a pair of
# ints (first column, second column). The header row is recognised by its
# first field being 'itemId' and is dropped before the int conversion.
icm = (
    sc.textFile("../icm.csv")
    .map(lambda row: row.split(','))
    .filter(lambda fields: fields[0] != 'itemId')
    .map(lambda fields: (int(fields[0]), int(fields[1])))
)
icm.take(5)

[(2135, 1), (2303, 1), (6292, 1), (12395, 1), (24520, 1)]

In [10]:
# Group feature ids by item: wrap each feature id in a one-element list,
# then concatenate the lists of all rows that share the same item id.
itemFeature = icm.mapValues(lambda feature: [feature]).reduceByKey(lambda left, right: left + right)
# Driver-side copy of the same item -> feature-list mapping as a plain dict.
itemFeatureDict = itemFeature.collectAsMap()
itemFeature.take(5)

[(32768, [2405, 12362]),
 (2, [9004, 9857, 15230, 15634, 18904, 19606]),
 (32772, [8922, 12236, 12512, 16761]),
 (6, [5787, 11496, 12412, 13223, 13749, 15634, 18904]),
 (8, [1540, 4807, 5778, 12362, 15397, 15634, 18904])]

In [11]:
# Feature frequency: the number of ICM rows in which each feature id occurs.
featureFreq = icm.map(lambda pair: (pair[1], 1)).reduceByKey(lambda a, b: a + b)
featureFreqDict = featureFreq.collectAsMap()
# Number of distinct item ids present in the ICM.
prodCount = icm.keys().distinct().count()
print("ProdCount: ",prodCount)

# IDF per feature: log10(distinct item count / feature frequency).
featureIdf = featureFreq.mapValues(lambda freq: np.log10(prodCount/freq))
featureIdfDict = featureIdf.collectAsMap()
# Ten features with the highest IDF (the negated key gives descending order).
featureIdf.takeOrdered(10, key=lambda entry: -entry[1])


ProdCount:  36797


[(12691, 4.5658124127888637),
 (16384, 4.2647824171248825),
 (2, 4.2647824171248825),
 (19116, 4.2647824171248825),
 (12, 4.2647824171248825),
 (14, 4.2647824171248825),
 (16, 4.2647824171248825),
 (20, 4.2647824171248825),
 (24, 4.2647824171248825),
 (32, 4.2647824171248825)]

In [40]:
def computeTfForFeature(features):
    """Give every feature of an item the same normalised TF weight.

    Each feature receives the weight ``1 / sqrt(len(features))``.

    Args:
        features: sequence of feature ids belonging to one item.

    Returns:
        A list of ``(featureId, weight)`` tuples in the same order as
        ``features``; an empty list when ``features`` is empty.
    """
    count = len(features)
    if count == 0:
        # Nothing to weight; this also avoids evaluating 1/sqrt(0).
        return []
    # The weight is the same for every feature, so compute it once rather
    # than once per loop iteration.
    weight = 1/np.sqrt(count)
    return [(feature, weight) for feature in features]
# Replace every item's feature list with its list of (featureId, tf) tuples.
itemFeatureTf = itemFeature.mapValues(computeTfForFeature)
# Driver-side dict: item id -> list of (featureId, tf) tuples.
itemFeatureTfDict = itemFeatureTf.collectAsMap()
print("TF for each item")
itemFeatureTf.take(5)

TF for each item


[(32768, [(2405, 0.70710678118654746), (12362, 0.70710678118654746)]),
 (2,
  [(9004, 0.40824829046386307),
   (9857, 0.40824829046386307),
   (15230, 0.40824829046386307),
   (15634, 0.40824829046386307),
   (18904, 0.40824829046386307),
   (19606, 0.40824829046386307)]),
 (32772, [(8922, 0.5), (12236, 0.5), (12512, 0.5), (16761, 0.5)]),
 (6,
  [(5787, 0.3779644730092272),
   (11496, 0.3779644730092272),
   (12412, 0.3779644730092272),
   (13223, 0.3779644730092272),
   (13749, 0.3779644730092272),
   (15634, 0.3779644730092272),
   (18904, 0.3779644730092272)]),
 (8,
  [(1540, 0.3779644730092272),
   (4807, 0.3779644730092272),
   (5778, 0.3779644730092272),
   (12362, 0.3779644730092272),
   (15397, 0.3779644730092272),
   (15634, 0.3779644730092272),
   (18904, 0.3779644730092272)])]

In [13]:
# Read the training CSV and parse each data row into a triple of ints
# (used by later cells as user id, item id, rating). Any row containing a
# field exactly equal to 'userId' (i.e. the header) is skipped.
trainSet = (
    sc.textFile("../train.csv")
    .map(lambda row: row.split(','))
    .filter(lambda fields: 'userId' not in fields)
    .map(lambda fields: (int(fields[0]), int(fields[1]), int(fields[2])))
)
trainSet.take(5)

[(2738, 1, 1), (4716, 1, 4), (13298, 1, 8), (15122, 1, 4), (11326, 2, 5)]

In [14]:
def profileItem(item, rating, tfByItem=None):
    """Scale an item's feature TF weights by the rating given to that item.

    Args:
        item: id of the rated item.
        rating: numeric rating; every TF weight is multiplied by it.
        tfByItem: optional mapping ``item id -> [(featureId, tf), ...]``.
            When omitted, the notebook-level ``itemFeatureTfDict`` is used,
            which is the original behaviour.

    Returns:
        A list of ``(featureId, tf * rating)`` tuples, or an empty list when
        the item has no entry in the mapping.
    """
    lookup = itemFeatureTfDict if tfByItem is None else tfByItem
    # One .get() replaces the membership test on .keys() plus a second lookup.
    return [(feature, tf * rating) for feature, tf in lookup.get(item, [])]


In [15]:
# One record per training row: (user id, rating-weighted feature list of the
# rated item). cache() returns the same RDD, so it is chained onto the definition.
userProfileItem = trainSet.map(lambda row: (row[0], profileItem(row[1], row[2]))).cache()
userProfileItem.take(10)

[(2738,
  [(876, 0.40824829046386307),
   (3523, 0.40824829046386307),
   (6396, 0.40824829046386307),
   (9004, 0.40824829046386307),
   (12361, 0.40824829046386307),
   (18904, 0.40824829046386307)]),
 (4716,
  [(876, 1.6329931618554523),
   (3523, 1.6329931618554523),
   (6396, 1.6329931618554523),
   (9004, 1.6329931618554523),
   (12361, 1.6329931618554523),
   (18904, 1.6329931618554523)]),
 (13298,
  [(876, 3.2659863237109046),
   (3523, 3.2659863237109046),
   (6396, 3.2659863237109046),
   (9004, 3.2659863237109046),
   (12361, 3.2659863237109046),
   (18904, 3.2659863237109046)]),
 (15122,
  [(876, 1.6329931618554523),
   (3523, 1.6329931618554523),
   (6396, 1.6329931618554523),
   (9004, 1.6329931618554523),
   (12361, 1.6329931618554523),
   (18904, 1.6329931618554523)]),
 (11326,
  [(9004, 2.0412414523193152),
   (9857, 2.0412414523193152),
   (15230, 2.0412414523193152),
   (15634, 2.0412414523193152),
   (18904, 2.0412414523193152),
   (19606, 2.0412414523193152)]),
 (3

In [38]:
def aggUserFeatures(featuresX, featuresY):
    """Merge two ``(featureId, score)`` lists, summing scores of shared features.

    The result keeps every entry of ``featuresX`` in its original order (with
    the score of the first matching ``featuresY`` entry added when the feature
    also occurs there), followed by the ``featuresY`` entries whose feature id
    does not occur in ``featuresX``.

    Args:
        featuresX: list of ``(featureId, score)`` tuples.
        featuresY: list of ``(featureId, score)`` tuples.

    Returns:
        A new list of ``(featureId, score)`` tuples; the inputs are not modified.
    """
    # Index featuresY by feature id once, keeping the first score seen per id,
    # so each lookup is a hash probe instead of a scan over featuresY.
    firstScoreY = {}
    for elem in featuresY:
        firstScoreY.setdefault(elem[0], elem[1])

    finalList = [
        (elem[0], elem[1] + firstScoreY[elem[0]]) if elem[0] in firstScoreY else elem
        for elem in featuresX
    ]

    # Set membership replaces the linear search through a list of feature ids.
    idsInX = {elem[0] for elem in featuresX}
    finalList.extend(elem for elem in featuresY if elem[0] not in idsInX)
    return finalList

# Collapse the per-rating records into one merged feature list per user.
userProfileItem = userProfileItem.reduceByKey(aggUserFeatures)

[(2,
  [(2405, 1.0606601717798212),
   (3903, 1.0606601717798212),
   (4117, 1.0606601717798212),
   (4850, 1.0606601717798212),
   (9003, 1.0606601717798212),
   (10653, 1.0606601717798212),
   (12362, 1.0606601717798212),
   (18096, 1.0606601717798212)]),
 (4,
  [(2482, 3.4016802570830449),
   (5527, 3.4016802570830449),
   (9849, 3.4016802570830449),
   (12899, 3.4016802570830449),
   (15634, 6.8033605141660898),
   (16783, 3.4016802570830449),
   (18904, 6.8033605141660898),
   (3626, 3.4016802570830449),
   (5787, 3.4016802570830449),
   (8152, 3.4016802570830449),
   (15058, 3.4016802570830449),
   (15623, 3.4016802570830449)]),
 (8,
  [(6154, 0.40824829046386307),
   (6514, 0.40824829046386307),
   (10250, 0.40824829046386307),
   (12362, 10.54096707485945),
   (15507, 0.40824829046386307),
   (18904, 38.008281456753281),
   (3828, 4.0824829046386304),
   (4346, 4.0824829046386304),
   (8922, 4.0824829046386304),
   (9122, 4.0824829046386304),
   (15200, 4.0824829046386304),
   

In [56]:
# Keep only the key (first element) of every user-profile record.
userProfiles = userProfileItem.keys()


In [65]:
# Inspect the aggregated profile of one user (key 5841).
userProfileItemU = userProfileItem.filter(lambda profile: profile[0] == 5841)
userProfileItemU.take(5)


[(5841,
  [(15634, 5.0473917979696461),
   (18904, 5.0473917979696461),
   (500, 1.5118578920369088),
   (4715, 1.5118578920369088),
   (7974, 1.5118578920369088),
   (14586, 1.5118578920369088),
   (16965, 1.5118578920369088)])]

In [66]:
# Turn each (featureId, score) list into a dict so scores can be looked up by feature id.
userProfileItemU = userProfileItemU.mapValues(dict)

In [67]:
# Pull out the score stored under feature id 15634 in the profile dict.
userProfileItemU = userProfileItemU.values().map(lambda scores: scores[15634])
userProfileItemU.take(1)

[5.0473917979696461]

In [68]:
# Prints a fixed marker string; nothing else in the visible notebook reads it.
# NOTE(review): looks like a leftover debug/sanity cell — confirm and remove.
print("ciao")

ciao


In [69]:
# Persist the aggregated per-user feature profiles as text.
# NOTE(review): Spark normally writes a directory of part-files at this path
# (despite the ".txt" suffix) and raises if the path already exists, so this
# cell is expected to fail on a second run — confirm and clean up before re-running.
userProfileItem.saveAsTextFile("userProfileItem.txt")