In [1]:
from pyspark import SparkContext
import numpy as np
sc = SparkContext.getOrCreate()

In [2]:
# Parse ../icm.csv into an RDD of integer pairs built from the first two CSV
# columns; the header row (first field 'itemId') is discarded. Later cells
# treat each pair as (item, feature).
icm = (
    sc.textFile("../icm.csv")
    .map(lambda text: text.split(','))
    .filter(lambda fields: fields[0] != 'itemId')
    .map(lambda fields: (int(fields[0]), int(fields[1])))
)
icm.take(5)

[(2135, 1), (2303, 1), (6292, 1), (12395, 1), (24520, 1)]

In [3]:
# Group the pairs by item: wrap every feature in a one-element list, then
# concatenate the lists that share the same item key.
itemFeature = (
    icm.mapValues(lambda feature: [feature])
    .reduceByKey(lambda left, right: left + right)
)
# Driver-side copy of the same mapping as a plain dict.
itemFeatureDict = itemFeature.collectAsMap()
itemFeature.take(5)


[(32768, [2405, 12362]),
 (2, [9004, 9857, 15230, 15634, 18904, 19606]),
 (32772, [8922, 12236, 12512, 16761]),
 (6, [5787, 11496, 12412, 13223, 13749, 15634, 18904]),
 (8, [1540, 4807, 5778, 12362, 15397, 15634, 18904])]

In [4]:
# Feature frequency: number of (item, feature) pairs in which each feature occurs.
featureFreq = (
    icm.map(lambda pair: (pair[1], 1))
    .reduceByKey(lambda total, one: total + one)
)
featureFreqDict = featureFreq.collectAsMap()

# Number of distinct items present in the item-content matrix.
prodCount = icm.keys().distinct().count()
print("ProdCount: ",prodCount)

# Inverse document frequency of each feature: log10(prodCount / frequency).
featureIdf = featureFreq.mapValues(lambda freq: np.log10(prodCount/freq))
featureIdfDict = featureIdf.collectAsMap()
# Show the ten features with the highest IDF (negated key => descending order).
featureIdf.takeOrdered(10, key=lambda entry: -entry[1])


ProdCount:  36797


[(12691, 4.5658124127888637),
 (16384, 4.2647824171248825),
 (2, 4.2647824171248825),
 (19116, 4.2647824171248825),
 (12, 4.2647824171248825),
 (14, 4.2647824171248825),
 (16, 4.2647824171248825),
 (20, 4.2647824171248825),
 (24, 4.2647824171248825),
 (32, 4.2647824171248825)]

In [5]:
def computeTfForFeature(features):
    """Give every feature of one item the same length-normalised TF weight.

    Each feature receives ``1 / sqrt(len(features))``, so the squared weights
    of a non-empty feature list sum to 1.

    Args:
        features: sequence of the features attached to a single item.

    Returns:
        A list of ``(feature, weight)`` tuples in the same order as
        ``features``; an empty list when ``features`` is empty.
    """
    count = len(features)
    if count == 0:
        # Nothing to weight; also avoids evaluating 1/sqrt(0).
        return []
    # The weight depends only on the number of features, so it is computed
    # once instead of once per feature.
    weight = 1 / np.sqrt(count)
    return [(feature, weight) for feature in features]
# Replace each item's feature list with its list of (feature, tf) pairs.
itemFeatureTf = itemFeature.mapValues(computeTfForFeature)
# Driver-side dict of the same data; profileItem looks items up in it.
itemFeatureTfDict = itemFeatureTf.collectAsMap()
itemFeatureTf.count()

36797

In [12]:
# Parse ../train.csv into integer triples taken from the first three CSV
# columns, skipping any row that contains the field 'userId' (the header).
# Later cells read each triple as (user, item, rating).
trainSet = (
    sc.textFile("../train.csv")
    .map(lambda text: text.split(','))
    .filter(lambda fields: 'userId' not in fields)
    .map(lambda fields: tuple(int(fields[k]) for k in range(3)))
)
trainSet.take(5)

[(2738, 1, 1), (4716, 1, 4), (13298, 1, 8), (15122, 1, 4), (11326, 2, 5)]

In [32]:
def profileItem(item, rating):
    """Scale an item's TF feature weights by a rating.

    Looks ``item`` up in the module-level ``itemFeatureTfDict`` (item ->
    list of ``(feature, tf)`` pairs).

    Args:
        item: key to look up in ``itemFeatureTfDict``.
        rating: multiplier applied to every tf value.

    Returns:
        A list of ``(feature, tf * rating)`` tuples, or an empty list when
        the item has no entry in ``itemFeatureTfDict``.
    """
    weighted = itemFeatureTfDict.get(item, [])
    return [(entry[0], entry[1]*rating) for entry in weighted]


In [None]:
# One record per training triple: (user, rating-weighted feature scores of the
# item in that triple).
userProfileItem = trainSet.map(
    lambda triple: (triple[0], profileItem(triple[1], triple[2]))
)

#REDUCE FUNCTION

def aggUserFeatures(featuresX, featuresY):
    """Merge two lists of (feature, score) pairs, summing scores per feature.

    Intended as the combine function of ``reduceByKey``: the result has the
    same shape as the inputs, so it can be merged again with further lists.

    Args:
        featuresX: list of ``(feature, score)`` tuples.
        featuresY: list of ``(feature, score)`` tuples.

    Returns:
        A new list with exactly one ``(feature, total_score)`` tuple per
        distinct feature. Features appear in first-seen order, scanning
        ``featuresY`` and then ``featuresX``. Neither input is modified.
    """
    totals = {}
    order = []  # first-seen order of features, independent of dict ordering
    for source in (featuresY, featuresX):
        for feature, score in source:
            if feature in totals:
                totals[feature] += score
            else:
                totals[feature] = score
                order.append(feature)
    return [(feature, totals[feature]) for feature in order]

# Fold all per-item score lists of a user into a single list per user.
userProfileItem = userProfileItem.reduceByKey(aggUserFeatures)
userProfileItem.take(5)