In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import numpy as np
from pyspark.mllib.linalg.distributed import *
from pyspark.mllib.linalg import *
from scipy.sparse import *
from collections import defaultdict
from itertools import *
from sklearn.metrics import pairwise_distances
import tqdm

In [2]:
# Spark configuration for this notebook: application name and driver memory.
conf = SparkConf().setAppName("App")
conf.set("spark.driver.memory", '20g')
# getOrCreate makes this cell safe to re-run: constructing a second SparkContext
# in the same kernel raises an error. NOTE: when a context already exists it is
# returned as-is, so the settings in `conf` only apply on first creation.
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession(sc)

In [3]:
# Item-content triples (first column, second column, 1) parsed from icm.csv.
# The header row is dropped by checking whether its first field is 'itemId'.
# Cached because this RDD feeds several independent actions in this cell and
# later ones; without caching each action re-reads and re-parses the CSV.
icm = sc.textFile("../icm.csv")
icm = icm.map(lambda l: l.split(','))\
            .filter(lambda line: line[0] != 'itemId')\
            .map(lambda x: (int(x[0]), int(x[1]), 1))\
            .cache()
# Training triples of three ints per line (presumably userId, itemId, rating —
# verify against the CSV header). Rows containing a 'userId' field are dropped.
# Cached for the same reason as `icm`.
trainSet = sc.textFile("../train.csv")
trainSet = trainSet.map(lambda l: l.split(','))
trainSet = trainSet.filter(lambda line: 'userId' not in line)
trainSet = trainSet.map(lambda line: (int(line[0]), int(line[1]), int(line[2]))).cache()
#FOR THE TOP POP
# (second column, third column) pairs and per-item interaction counts.
itemSet = trainSet.map(lambda x: (x[1], x[2]))
itemsCount = trainSet.map(lambda x: (x[1],1)).reduceByKey(lambda x,y : x + y)
itemsCount_dict = itemsCount.collectAsMap()
#-----------------------------------------
# Number of items carrying each feature, then IDF = log10(#items / #items with feature).
featureFreq = icm.map(lambda x: (x[1],1)).reduceByKey(lambda x, y: x + y)
featureFreqDict = featureFreq.collectAsMap()
prodCount= icm.map(lambda x: x[0]).distinct().count()
featureIdf = featureFreq.map(lambda x: (x[0],np.log10(prodCount/x[1])))
featureIdfDict = featureIdf.collectAsMap()

# Target user ids, one int per line; lines containing "userId" (the header) are skipped.
targetUsers = sc.textFile("../target_user.csv").filter(lambda x: "userId" not in x).map(lambda x: int(x))
targets=targetUsers.collect()

# Per-item norm: sqrt of the number of (item, feature) rows for that item.
# Since every ICM value is 1, this is the Euclidean norm of the item's feature row
# as long as each (item, feature) pair appears once.
norms = icm.map(lambda x: (x[0],1))\
                .reduceByKey(lambda x, y: x+y).mapValues(lambda x: np.sqrt(x))\
                .collectAsMap()

# ICM triples with each value divided by its item's norm.
normalized = icm.map(lambda x: (x[0], x[1], x[2]/norms[x[0]]))

In [4]:
# Build the sparse user-item and item-feature matrices on the driver.
#
# Each RDD is collected ONCE (instead of one pass per column), so the three
# coordinate lists are guaranteed to be aligned. Dimensions are passed explicitly
# rather than forced by appending a dummy zero entry, and the item dimension is
# shared by both matrices so that userItem.dot(itemFeature) is always well-formed.

# Lower bounds on the last index of each axis; these are the sizes the notebook
# previously forced via sentinel entries, kept so shapes never shrink below them.
MIN_LAST_USER_ID = 15364
MIN_LAST_ITEM_ID = 37142
MIN_LAST_FEATURE_ID = 80

trainTriples = trainSet.collect()
icmTriples = normalized.collect()
print(len(trainTriples))

n_users = max(max((t[0] for t in trainTriples), default=-1), MIN_LAST_USER_ID) + 1
n_items = max(
    max((t[1] for t in trainTriples), default=-1),
    max((t[0] for t in icmTriples), default=-1),
    MIN_LAST_ITEM_ID,
) + 1
n_features = max(max((t[1] for t in icmTriples), default=-1), MIN_LAST_FEATURE_ID) + 1

# user x item matrix holding the third column of each training triple.
data = [t[2] for t in trainTriples]
rows = [t[0] for t in trainTriples]
cols = [t[1] for t in trainTriples]
userItem=csr_matrix((data,(rows,cols)), shape=(n_users, n_items))
print("userItem shape:",userItem.shape)

# item x feature matrix holding the norm-scaled ICM values.
data = [t[2] for t in icmTriples]
rows = [t[0] for t in icmTriples]
cols = [t[1] for t in icmTriples]
itemFeature = csc_matrix((data,(rows,cols)), shape=(n_items, n_features))
print("itemFeat shape:",itemFeature.shape)

170149
userItem shape: (15375, 37143)
itemFeat shape: (37143, 19716)


In [5]:
numberOfRecommendations=5
#TOP POPULAR
# Shrinkage term added to each item's interaction count in the denominator, so
# items with few interactions get a damped average.
cost=8
# Per-item sum of the third training column, then the damped average.
ratingSums=itemSet.reduceByKey(lambda x,y: x+y)
avgRatings=ratingSums.map(lambda x: (x[0],x[1]/(itemsCount_dict[x[0]]+cost)))
# Item ids ordered by damped average, best first.
itemOrderByPop=avgRatings.sortBy(lambda x: x[1], ascending=False)
itemPop = np.array(itemOrderByPop.map(lambda x: x[0]).collect())
# user id -> list of item ids that user already interacted with (plain dict;
# look up with .get(user, []) for users with no history).
seenItems= trainSet.map(lambda x: (x[0],[x[1]])).reduceByKey(lambda x,y: x + y)
seenItemsDict = seenItems.collectAsMap()
#--------------------------------------------------------------

In [6]:
# Project each user's interactions onto the feature space:
# (users x items) times (items x features) gives (users x features).
userFeature = userItem @ itemFeature
userFeature.shape

(15375, 19716)

In [7]:
# Turn the feature -> IDF mapping into a diagonal sparse matrix, so that
# right-multiplying by it scales every feature column by its IDF weight.
idfEntries = list(featureIdfDict.items())
data = [idf for _, idf in idfEntries]
cols = [feature for feature, _ in idfEntries]
rows = list(cols)  # diagonal: row index equals column index
featureIdf = csr_matrix((data,(rows,cols)))
featureIdf.shape

(19716, 19716)

In [8]:
# Weight each feature column of the user-feature matrix by its IDF
# (featureIdf is the diagonal IDF matrix at this point).
userProfile = userFeature @ featureIdf
userProfile

<15375x19716 sparse matrix of type '<class 'numpy.float64'>'
	with 653062 stored elements in Compressed Sparse Row format>

In [9]:
# User-user cosine similarity, obtained as 1 - cosine distance between the rows
# of userProfile. The result is a dense users x users array, so memory grows
# quadratically with the number of users.
userSimilarities = 1 - pairwise_distances(userProfile, metric='cosine')

In [10]:
# Sanity check: confirm which container type the similarity matrix is held in.
type(userSimilarities)

numpy.ndarray

In [11]:
# Zero the diagonal in place so a user is never picked as their own neighbour.
np.fill_diagonal(userSimilarities,0)

In [12]:
# Neighbourhood size for the user-based KNN step.
k_value = 50
# Distinct ids found in the first column of the training triples.
userList = (
    trainSet
    .map(lambda triple: triple[0])
    .distinct()
    .collect()
)
numUser = len(userList)

In [13]:
# Collect, for every user in userList, the k_value largest similarities as
# COO-style triples (values, rows, cols).
values, rows, cols = [], [], []
for user in userList:
    currentUserSimilarity = userSimilarities[user]
    # argsort is ascending, so the last k_value positions are the most similar users.
    top_k_idx = np.argsort(currentUserSimilarity)[-k_value:]
    values.extend(currentUserSimilarity[top_k_idx])
    # Integer row indices, one per selected neighbour. Using len(top_k_idx)
    # instead of k_value keeps rows/cols/values aligned even when fewer than
    # k_value candidates exist.
    rows.extend([user] * len(top_k_idx))
    cols.extend(top_k_idx)

In [14]:
# Sparse top-k similarity matrix. The shape is given explicitly: when it is
# inferred from the largest row/column index, the matrix can come out non-square
# (e.g. the highest-id user is nobody's neighbour), and adding it to its own
# transpose would then fail.
similarity_topK = csr_matrix((values, (rows, cols)), shape=userSimilarities.shape, dtype=np.float32)

In [15]:
# Symmetrise the top-k matrix by adding its transpose: a pair that appears in
# both users' top-k lists ends up with twice its similarity.
# NOTE(review): this overwrites its own input, so re-running the cell without
# rebuilding similarity_topK first applies the operation again (values double).
similarity_topK = similarity_topK + similarity_topK.transpose()

In [16]:
# Shape sanity check before multiplying: the similarity matrix must be
# users x users, and the transposed user-item matrix items x users.
for label, matrix in (
    ("Sim top k shape: ", similarity_topK),
    ("userItem: ", userItem),
    ("ItemUser: ", userItem.T),
):
    print(label, matrix.shape)

Sim top k shape:  (15375, 15375)
userItem:  (15375, 37143)
ItemUser:  (37143, 15375)


In [17]:
# Check which sparse formats the transposes come back in.
# NOTE: predRatings is only a placeholder here; it is reassigned with the real
# collaborative scores in a later cell.
predRatings = userItem.T
for sparseMatrix in (predRatings, similarity_topK.T):
    print(type(sparseMatrix))

<class 'scipy.sparse.csc.csc_matrix'>
<class 'scipy.sparse.csc.csc_matrix'>


In [18]:
# Collaborative scores: (items x users) times (users x users), transposed back
# to users x items. Each user's score for an item is the similarity-weighted sum
# of the neighbours' entries in userItem.
predRatings = userItem.T.dot(similarity_topK).T

In [19]:
# Sanity check: should match userItem's shape (users x items).
predRatings.shape

(15375, 37143)

In [20]:
# Spot check: the ten highest-scoring items for row 4, best first, with their scores.
userRow = predRatings[4].toarray()[0]
its = np.argsort(userRow)[-10:][::-1]
print(its)
print("Ratings: ", userRow[its])

[25656 23217 32578 17380 28102 32117 18741 15378 31731 34271]
Ratings:  [ 113.12700625   50.49530581   28.56198211   11.07715186   10.83322167
    9.91448776    9.37620401    8.47393513    8.4468416     8.08896518]


In [21]:
#CONTENT BASED
prediction = userProfile.dot(itemFeature.transpose())
prediction.shape

(15375, 37143)

In [22]:
# Per-user rescaling factors for the content-based scores: 9 / (row maximum),
# or 0 for users whose maximum score is 0. Stored as a diagonal matrix so the
# scaling can be applied with one sparse product.
# The row maxima come from the sparse matrix directly (implicit zeros included,
# same result as a dense row max) instead of densifying one row at a time.
cols = np.arange(prediction.shape[0])
rows = np.arange(prediction.shape[0])
rowMax = prediction.max(axis=1).toarray().ravel()
data = np.zeros(rowMax.shape, dtype=float)
# Divide only where the maximum is non-zero; the remaining entries stay 0.
np.divide(9, rowMax, out=data, where=rowMax != 0)
print(len(data))
diagonalCont = csr_matrix((data,(rows,cols)))
diagonalCont.shape

15375


(15375, 15375)

In [23]:
# Apply the per-user scale factors to the content-based scores: scale the
# columns of the transposed matrix (one column per user), then transpose back
# to users x items.
norm = prediction.T.dot(diagonalCont)
matrixCont = norm.T
matrixCont.shape

(15375, 37143)

In [24]:
# Per-user rescaling factors for the collaborative scores: 9 / (row maximum),
# or 0 for users whose maximum score is 0, stored as a diagonal matrix.
# Sized from predRatings itself (the matrix being normalised) rather than from
# the content-based matrix, and computed with the sparse row-wise max (implicit
# zeros included) instead of densifying one row at a time.
cols = np.arange(predRatings.shape[0])
rows = np.arange(predRatings.shape[0])
rowMax = predRatings.max(axis=1).toarray().ravel()
data = np.zeros(rowMax.shape, dtype=float)
# Divide only where the maximum is non-zero; the remaining entries stay 0.
np.divide(9, rowMax, out=data, where=rowMax != 0)
print(len(data))
diagonalColl = csr_matrix((data,(rows,cols)))
diagonalColl.shape

15375


(15375, 15375)

In [25]:
# Apply the per-user scale factors to the collaborative scores: scale the
# columns of the transposed matrix (one column per user), then transpose back
# to users x items.
norm = predRatings.T.dot(diagonalColl)
matrixColl = norm.T
matrixColl.shape

(15375, 37143)

In [42]:
# Hybrid score: 30% rescaled collaborative scores + 70% rescaled content-based scores.
finalMatrix = 0.30 * matrixColl + 0.70 * matrixCont
finalMatrix.shape

(15375, 37143)

In [43]:
#Time to generate the predictions!
f=open("predictionsCollViaCon&Content3070.csv",'w')
numberOfRecommendations = 5
f.write("userId,RecommendedItemIds\n")
i=0
for user in targets:
    currentItems = finalMatrix[user][:].toarray()[0]
    reccomended = np.argsort(currentItems)#[-numberOfRecommendations:][::-1]
    unseenMask = np.in1d(reccomended, seenItemsDict.get(user,[]), invert = True)
    reccomended = reccomended[unseenMask][-numberOfRecommendations:][::-1]
    print(reccomended)
    f.write(str(user)+',')
    for prod in reccomended:
        f.write(str(prod)+' ')
    f.write('\n')
    i = i + 1
    print(i,"of", len(targets), "written")
f.close()

[32578 35061    98 30408 14572]
1 of 4196 written
[ 2762    10 11472 24699  6321]
2 of 4196 written
[ 8134 19171 23663 33443 30279]
3 of 4196 written
[30287 26335 30445 34390 16803]
4 of 4196 written
[15469 23137  7130 15016  6081]
5 of 4196 written
[18741  1448 33215 17644 24547]
6 of 4196 written
[23786  3465 33944 20374 20182]
7 of 4196 written
[33562 24488  4735  6821 22389]
8 of 4196 written
[13069 23833 24256 26332 28592]
9 of 4196 written
[24217 21284  1041 31496 19215]
10 of 4196 written
[19798 17126 10963 18803 14820]
11 of 4196 written
[25987 17924 28719 15112 22814]
12 of 4196 written
[28346 10620  6539  3665 28738]
13 of 4196 written
[12866  2911 36350 35300 30071]
14 of 4196 written
[26550 11465 34168  9793 27731]
15 of 4196 written
[20941 15973 23336 24547 11752]
16 of 4196 written
[11557 18717 37101 28294 31800]
17 of 4196 written
[29833 25736 17925 23282  4708]
18 of 4196 written
[14841 19977 12530 29408 24299]
19 of 4196 written
[13782  7926   228 23338 15979]
20 of 41