In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
import numpy as np
from scipy.sparse import *
from collections import defaultdict
import pdb
from itertools import *
import operator
# Reuse the already-running SparkContext if there is one, otherwise create it.
sc = SparkContext.getOrCreate()
# Build a SparkSession on top of that context.
spark = SparkSession(sc)

In [67]:
#lines = sc.parallelize([(1,1,7),(1,2,6),(1,3,7),(1,4,4),(1,5,5),(1,6,4),(2,1,6),(2,2,7),(2,4,4),(2,5,3),(2,6,4)\
 #                      ,(3,2,3),(3,3,3),(3,4,1),(3,5,1),(4,1,1),(4,2,2),(4,3,2),(4,4,3),(4,5,3),(4,6,4),(5,1,1)\
#                    ,(5,3,1),(5,4,2),(5,5,3),(5,6,3)])

def parseVectorOnUser(line):
    '''
    Re-key one already-split rating record by its user.

    `line` is indexed positionally as (user_id, item_id, rating).
    Returns (user_id, (item_id, rating)) with the rating converted to a float.
    '''
    user_id, item_id, rating = line[0], line[1], line[2]
    return user_id, (item_id, float(rating))

def parseVectorOnItem(line):
    '''
    Re-key one already-split rating record by its item.

    `line` is indexed positionally as (user_id, item_id, rating).
    Returns (item_id, (user_id, rating)) with the rating converted to a float.
    '''
    user_id, item_id, rating = line[0], line[1], line[2]
    return item_id, (user_id, float(rating))

def sampleInteractions(item_id,users_with_rating,n):
    '''
    Cap the interaction history of one item at n entries.

    item_id           -- key, returned unchanged
    users_with_rating -- sized collection of (user, rating) entries
    n                 -- maximum number of entries to keep

    Returns (item_id, entries). When the collection holds more than n
    entries, `entries` is a list of n entries drawn at random without
    replacement; otherwise the original collection is returned untouched.
    '''
    # Imported locally on purpose: the notebook never imports the stdlib
    # `random` module, and a star import (e.g. `from scipy.sparse import *`)
    # can bind the global name `random` to something else entirely.
    import random

    if len(users_with_rating) > n:
        # random.sample() needs a real sequence; grouped Spark values are
        # only sized iterables, so take a list copy first.
        return item_id, random.sample(list(users_with_rating), n)
    return item_id, users_with_rating
    
def findUserPairs(item_id,users_with_rating):
    '''
    Build every ordered pair of distinct raters of one item.

    Each entry of users_with_rating is indexed as (user, rating). The result
    is a list of ((user_a, user_b), (rating_a, rating_b)); because ordered
    pairs are generated, both (a, b) and (b, a) appear. item_id is not used.
    '''
    return [
        ((first[0], second[0]), (first[1], second[1]))
        for first, second in permutations(users_with_rating, 2)
    ]

def keyOnFirstUser(user_pair,item_sim_data):
    '''
    Re-key a (user, user) record by the first user of the pair.

    Returns (first_user_id, (second_user_id, item_sim_data)).
    '''
    first_id, second_id = user_pair
    neighbor_entry = (second_id, item_sim_data)
    return first_id, neighbor_entry

def calcSim(user_pair,rating_pairs, shrink, similarity="cosine"):
    '''
    Compute a shrunk similarity for one user-user pair.

    user_pair    -- key, returned unchanged as the first element of the result
    rating_pairs -- iterable of (rating_of_first_user, rating_of_second_user),
                    one entry per co-rated item
    shrink       -- shrinkage term forwarded to cosine() / jaccard(), which
                    add it to their denominator
    similarity   -- "cosine" (default) or "jaccard"

    Returns (user_pair, (similarity_value, n)), where n is the number of
    rating pairs consumed (the co-rater count).

    Raises ValueError for an unknown similarity name instead of silently
    returning None.
    '''
    if similarity not in ("cosine", "jaccard"):
        raise ValueError(
            "unknown similarity {!r}; expected 'cosine' or 'jaccard'".format(similarity))

    sum_xx, sum_xy, sum_yy, n = (0.0, 0.0, 0.0, 0)

    # Both measures need the same three sums, so accumulate them once.
    # The builtin float() replaces np.float, which was only an alias for it
    # and no longer exists in current NumPy releases.
    for rating_pair in rating_pairs:
        x = float(rating_pair[0])
        y = float(rating_pair[1])
        sum_xx += x * x
        sum_yy += y * y
        sum_xy += x * y
        n += 1

    if similarity == "cosine":
        # cosine() multiplies its 2nd and 3rd arguments directly, so it is
        # given the vector norms (square roots of the squared sums).
        sim = cosine(sum_xy, np.sqrt(sum_xx), np.sqrt(sum_yy), shrink)
    else:
        sim = jaccard(sum_xy, sum_xx, sum_yy, shrink)

    return user_pair, (sim, n)

def cosine(dot_product,rating_norm_squared,rating2_norm_squared, shrink):
    '''
    Shrunk cosine similarity between two vectors A, B:
       dotProduct(A, B) / (norm(A) * norm(B) + shrink)

    Despite their names, rating_norm_squared and rating2_norm_squared are
    multiplied together as given, so they are expected to be the norms
    themselves. Returns 0.0 when the denominator is zero.
    '''
    scale = rating_norm_squared * rating2_norm_squared + shrink
    if not scale:
        return 0.0
    return dot_product / float(scale)

def jaccard(dot_product, den1, den2, shrink):
    '''
    Shrunk Jaccard-style similarity between two vectors A, B:
       dotProduct(A, B) / (dotProduct(A, A) + dotProduct(B, B) - dotProduct(A, B) + shrink)

    den1 and den2 are the two self dot products. Returns 0.0 when the
    denominator is zero.
    '''
    scale = den1 + den2 - dot_product + shrink
    if not scale:
        return 0.0
    return dot_product / float(scale)

def nearestNeighbors(user,users_and_sims,n):
    '''
    Keep the n most similar neighbors of one user.

    user           -- key, returned unchanged
    users_and_sims -- iterable of (neighbor_id, (similarity, count)) entries;
                      any iterable is accepted, not only a list
    n              -- number of neighbors to keep

    Returns (user, neighbors) where `neighbors` is a new list ordered by
    decreasing similarity (ties keep their input order) and cut to n entries.
    The input collection is left unmodified.
    '''
    # sorted() builds a fresh list, so the caller's data is not reordered
    # as a side effect and generators / grouped iterables work too.
    ranked = sorted(users_and_sims, key=lambda entry: entry[1][0], reverse=True)
    return user, ranked[:n]

def topNRecommendations(user_id,user_sims,users_with_rating,seenDict,n, shrink,
                        low_threshold=2, high_threshold=8):
    '''
    Predict item scores for one user with the weighted-sums method and keep
    up to n confidently low or confidently high predictions.

    user_id           -- user to generate predictions for
    user_sims         -- iterable of (neighbor_id, (similarity, count))
    users_with_rating -- mapping neighbor_id -> iterable of (item, rating)
    seenDict          -- mapping user_id -> collection of items the user has
                         already interacted with; a user missing from the
                         mapping is treated as having seen nothing
    n                 -- maximum number of (item, score) entries returned
    shrink            -- added to the similarity sum in the denominator
    low_threshold     -- scores <= this value are kept (default 2)
    high_threshold    -- scores >= this value are kept (default 8)

    Returns (user_id, [(item, score), ...]). Scores are floored. Entries
    follow the order in which items were first encountered; they are NOT
    sorted by score, so the first n qualifying items are returned.
    '''
    # Built once: a set gives O(1) membership tests in the inner loop, and
    # .get() avoids a KeyError for a user with no recorded interactions.
    seen_items = set(seenDict.get(user_id, ()))

    # An item can be rated by several neighbors, so accumulate per item.
    totals = defaultdict(float)
    sim_sums = defaultdict(float)

    for (neighbor,(sim,count)) in user_sims:
        # Only positively similar neighbors contribute.
        if not sim > 0:
            continue

        # Look up the ratings given by this neighbor, if any are known.
        neighbor_ratings = users_with_rating.get(neighbor,None)
        if not neighbor_ratings:
            continue

        for (item,rating) in neighbor_ratings:
            if item not in seen_items:
                totals[item] += sim * rating
                sim_sums[item] += sim

    # Similarity-weighted mean rating per item, floored.
    scored_items = [(np.floor(total/(sim_sums[item]+shrink)),item) for item,total in totals.items()]

    # Keep only the extreme predictions and flip each pair to (item, score).
    ranked_items = [(item,score) for (score,item) in scored_items
                    if (score<=low_threshold or score>=high_threshold)]

    return user_id,ranked_items[:n]

In [11]:
#ICM
# Read the CSV, drop the header row (first field 'itemId') and turn every
# remaining row into (int(field 0), int(field 1), 1).
icm = (
    sc.textFile("../icm.csv")
    .map(lambda row: row.split(','))
    .filter(lambda fields: fields[0] != 'itemId')
    .map(lambda fields: (int(fields[0]), int(fields[1]), 1))
)

In [13]:
#TRAIN SET
# Read the CSV, drop any row containing the 'userId' header field and parse
# the first three columns as ints.
trainSet = (
    sc.textFile("../train.csv")
    .map(lambda row: row.split(','))
    .filter(lambda fields: 'userId' not in fields)
    .map(lambda fields: (int(fields[0]), int(fields[1]), int(fields[2])))
)

# Mean of the third column per distinct first-column value, computed as
# (running sum, running count) and divided at the end.
meanVotePerUser = (
    trainSet.map(lambda rec: (rec[0], (rec[2], 1)))
    .reduceByKey(lambda acc, nxt: (acc[0] + nxt[0], acc[1] + nxt[1]))
    .map(lambda kv: (kv[0], kv[1][0] / kv[1][1]))
)
meanVotePerUserDict = meanVotePerUser.collectAsMap()

# Mean-center every rating by subtracting the mean of its first-column key.
lines = trainSet.map(
    lambda rec: (rec[0], rec[1], rec[2] - meanVotePerUserDict[rec[0]])
)
lines.take(5)

[(2738, 1, -5.666666666666667),
 (4716, 1, -2.9283333333333337),
 (13298, 1, 0.9902439024390244),
 (15122, 1, -0.829787234042553),
 (11326, 2, -0.833333333333333)]

In [14]:
# Group the mean-centered ratings by item and cache the grouped RDD.
item_user_pairs = (
    lines
    .map(parseVectorOnItem)
    .groupByKey()
    .cache()
)

In [15]:
# For every item with more than one rater, emit all ordered user pairs with
# their two ratings, then gather the rating pairs of each user pair.
pairwise_users = (
    item_user_pairs
    .filter(lambda grouped: len(grouped[1]) > 1)
    .flatMap(lambda grouped: findUserPairs(grouped[0], grouped[1]))
    .groupByKey()
)
pairwise_users.take(5)

[((4138, 10206), <pyspark.resultiterable.ResultIterable at 0x2a334598080>),
 ((5171, 5617), <pyspark.resultiterable.ResultIterable at 0x2a3345982e8>),
 ((8392, 12596), <pyspark.resultiterable.ResultIterable at 0x2a334598390>),
 ((13008, 9078), <pyspark.resultiterable.ResultIterable at 0x2a334598320>),
 ((11392, 5514), <pyspark.resultiterable.ResultIterable at 0x2a334598400>)]

In [52]:
# Cosine similarity (shrink 7) for every user pair, re-keyed by the first
# user, then reduced to each user's 50 most similar neighbors.
user_sims = (
    pairwise_users
    .map(lambda pair: calcSim(pair[0], pair[1], 7, 'cosine'))
    .map(lambda scored: keyOnFirstUser(scored[0], scored[1]))
    .groupByKey()
    .map(lambda grouped: (grouped[0], list(grouped[1])))
    .map(lambda grouped: nearestNeighbors(grouped[0], grouped[1], 50))
)
user_sims.take(5)

[(2,
  [(7586, (0.0, 1)),
   (9278, (0.0, 1)),
   (4806, (0.0, 1)),
   (6590, (0.0, 1)),
   (842, (0.0, 1)),
   (2436, (0.0, 1)),
   (6258, (0.0, 1)),
   (6438, (0.0, 1)),
   (8406, (0.0, 1)),
   (520, (0.0, 1)),
   (7040, (0.0, 1)),
   (8228, (0.0, 1)),
   (1710, (0.0, 1)),
   (1793, (0.0, 1)),
   (2107, (0.0, 1)),
   (1481, (0.0, 1)),
   (10651, (0.0, 1)),
   (9195, (0.0, 1)),
   (9243, (0.0, 1)),
   (13857, (0.0, 1)),
   (3169, (0.0, 1)),
   (9177, (0.0, 1)),
   (10165, (0.0, 1)),
   (11501, (0.0, 1)),
   (5321, (0.0, 1)),
   (3809, (0.0, 1)),
   (4813, (0.0, 1))]),
 (4,
  [(6438, (0.0, 1)),
   (8544, (0.0, 1)),
   (7232, (0.0, 1)),
   (12366, (0.0, 1)),
   (2124, (0.0, 1)),
   (2854, (0.0, 1)),
   (1426, (0.0, 1)),
   (5682, (0.0, 1)),
   (11708, (0.0, 1)),
   (8296, (0.0, 1)),
   (924, (0.0, 1)),
   (1726, (0.0, 1)),
   (2696, (0.0, 1)),
   (3576, (0.0, 1)),
   (1242, (0.0, 1)),
   (11958, (0.0, 1)),
   (2468, (0.0, 1)),
   (9996, (0.0, 1)),
   (206, (0.0, 1)),
   (10000, (0.0, 1)

In [53]:
# Keep only the users whose first (highest-ranked) neighbor has a similarity
# different from exactly 0.0.
l1=user_sims.map(lambda x: x[0]).collect()
l2=user_sims.filter(lambda x: x[1][0][1][0]==0.0).map(lambda x: x[0]).collect()

# Membership tests against sets are O(1); testing against the plain lists
# made the comprehension quadratic and the Spark filter linear per record.
_zero_sim_users = set(l2)
userPipelining=[i for i in l1 if i not in _zero_sim_users]
_pipelining_users = set(userPipelining)
userForPipelining = user_sims.filter(lambda x: x[0] in _pipelining_users)
userForPipelining.count()

11600

In [68]:
# Collect user -> (item, rating) history from the un-centered training set
# and broadcast it as a plain dict.
user_item_hist = trainSet.map(parseVectorOnUser).groupByKey().collect()
ui_dict = dict(user_item_hist)

uib = sc.broadcast(ui_dict)
# user -> list of item ids already present for that user
seenItemsDict = (
    lines
    .map(lambda rec: (rec[0], [rec[1]]))
    .reduceByKey(lambda left, right: left + right)
    .collectAsMap()
)
'''
Calculate the top-N item recommendations for each user
    user_id -> [item1,item2,item3,...]
'''

n=20
user_item_recs_pipelining = userForPipelining.map(
    lambda p: topNRecommendations(p[0], p[1], uib.value, seenItemsDict, n, 0)
)
user_item_recs_pipelining.take(1000)

[(8,
  [(5, 1.0),
   (32777, 9.0),
   (13655, 8.0),
   (16396, 8.0),
   (16402, 8.0),
   (12003, 9.0),
   (4, 10.0),
   (29, 10.0),
   (32799, 8.0),
   (16420, 1.0),
   (16421, 9.0),
   (39, 9.0),
   (6548, 8.0),
   (32812, 1.0),
   (19642, 8.0),
   (21168, 2.0),
   (16451, 9.0),
   (13665, 8.0),
   (32848, 2.0),
   (32849, 8.0)]),
 (10,
  [(3076, 10.0),
   (29707, 10.0),
   (21517, 8.0),
   (14351, 8.0),
   (5138, 2.0),
   (24596, 8.0),
   (22552, 8.0),
   (12292, 2.0),
   (29723, 9.0),
   (12318, 8.0),
   (16421, 10.0),
   (9257, 8.0),
   (20659, 10.0),
   (3140, 8.0),
   (6215, 2.0),
   (75, 9.0),
   (29777, 9.0),
   (33875, 10.0),
   (28761, 9.0),
   (30812, 10.0)]),
 (12,
  [(15374, 8.0),
   (30125, 10.0),
   (34715, 9.0),
   (16419, 9.0),
   (7720, 8.0),
   (15402, 8.0),
   (31275, 8.0),
   (15918, 8.0),
   (33163, 8.0),
   (4787, 8.0),
   (24109, 8.0),
   (32949, 8.0),
   (16451, 8.0),
   (1974, 9.0),
   (34441, 8.0),
   (75, 8.0),
   (505, 10.0),
   (27220, 8.0),
   (12886, 8.0

In [69]:
# Pull every (user, [(item, score), ...]) record back to the driver as a list.
newInteractions = user_item_recs_pipelining.collect()

In [72]:
# Write one "user,item,rating" CSV row per generated interaction and report
# how many rows were written. The `with` block guarantees the file is closed
# even if a write raises part-way through.
i=0
with open("newInteractions.csv",'w') as f:
    for (user, prods) in newInteractions:
        for (prod, rating) in prods:
            f.write("{},{},{}\n".format(user, prod, int(rating)))
            i=i+1
print(i)

230048
