In [1]:
import sys
from collections import defaultdict
from itertools import combinations
import random
import numpy as np
import pdb

from pyspark import SparkContext
# Shared Spark entry point for every cell below; getOrCreate() means re-running
# this cell does not try to construct a second context.
sc=SparkContext.getOrCreate()

In [202]:
# Small hand-made set of (user_id, item_id, rating) triples, kept for checking the pipeline by hand:
#lines = sc.parallelize([(1,1,7),(1,2,6),(1,3,7),(1,4,4),(1,5,5),(1,6,4),(2,1,6),(2,2,7),(2,4,4),(2,5,3),(2,6,4)\
 #                      ,(3,2,3),(3,3,3),(3,4,1),(3,5,1),(4,1,1),(4,2,2),(4,3,2),(4,4,3),(4,5,3),(4,6,4),(5,1,1)\
  #                     ,(5,3,1),(5,4,2),(5,5,3),(5,6,3)])

# Read the training CSV, split every row on commas and drop the header row
# (the row that has a field equal to 'userId').
trainSet = (
    sc.textFile("../train.csv")
    .map(lambda row: row.split(','))
    .filter(lambda fields: 'userId' not in fields)
)

# First three columns of every row, each cast to int.
lines = trainSet.map(lambda fields: tuple(int(fields[i]) for i in range(3)))
def parseVectorOnUser(line):
    '''
    Re-key an already-split rating record by its user.

    line is an indexable record whose first three fields are read as
    (user_id, item_id, rating); no delimiter parsing happens here.
    Returns (user_id, (item_id, rating)) with the rating cast to float.
    '''
    user_id, item_id, rating = line[0], line[1], line[2]
    return user_id, (item_id, float(rating))

def parseVectorOnItem(line):
    '''
    Re-key an already-split rating record by its item.

    line is an indexable record whose first three fields are read as
    (user_id, item_id, rating); no delimiter parsing happens here.
    Returns (item_id, (user_id, rating)) with the rating cast to float.
    '''
    user_id, item_id, rating = line[0], line[1], line[2]
    return item_id, (user_id, float(rating))

def sampleInteractions(item_id,users_with_rating,n):
    '''
    Cap an item's interaction history at n entries.

    item_id: key of the item, returned unchanged.
    users_with_rating: any iterable of interaction entries. It does not
        have to be a list: the notebook passes the grouped values produced
        by groupByKey(), which are iterable and sized but not indexable.
    n: maximum number of entries to keep.

    Returns (item_id, entries) where entries is always a list: the full
    history when it has at most n entries, otherwise a uniform random
    sample of n entries drawn without replacement.
    '''
    # random.sample() only accepts sequences, so materialise the iterable
    # first; passing a non-sequence straight through raises TypeError.
    interactions = list(users_with_rating)
    if len(interactions) > n:
        interactions = random.sample(interactions, n)
    return item_id, interactions
    
def findUserPairs(item_id,users_with_rating):
    '''
    Pair up every two users that interacted with the same item.

    users_with_rating is an iterable of (user_id, rating) entries for one
    item. Returns a list with one element per 2-combination, shaped as
    ((user_id_a, user_id_b), (rating_a, rating_b)). item_id is not used.

    NOTE(review): pair order follows the iteration order of
    users_with_rating, so the same two users can come out as (a, b) for
    one item and (b, a) for another.
    '''
    return [
        ((first[0], second[0]), (first[1], second[1]))
        for first, second in combinations(users_with_rating, 2)
    ]

def keyOnFirstUser(user_pair,item_sim_data):
    '''
    Re-key a pair record by the first user of the pair.

    user_pair must unpack into exactly two ids. Returns
    (first_id, (second_id, item_sim_data)); item_sim_data is passed
    through untouched.
    '''
    first_id, second_id = user_pair
    neighbor_entry = (second_id, item_sim_data)
    return first_id, neighbor_entry

def calcSim(user_pair,rating_pairs):
    '''
    Cosine similarity between two users over their co-rated items.

    user_pair: key identifying the two users, returned unchanged.
    rating_pairs: iterable of (rating_of_first_user, rating_of_second_user),
        one entry per co-rated item.

    Returns (user_pair, (cos_sim, n)) where n is the number of rating
    pairs consumed (the co-rater count).
    '''
    sum_xx = sum_xy = sum_yy = 0.0
    n = 0

    for rating_pair in rating_pairs:
        # Builtin float replaces np.float, which was only an alias for it
        # and no longer exists in current NumPy releases.
        x = float(rating_pair[0])
        y = float(rating_pair[1])
        sum_xx += x * x
        sum_yy += y * y
        sum_xy += x * y
        n += 1

    # cosine() expects the two vector norms, hence the square roots here.
    cos_sim = cosine(sum_xy,np.sqrt(sum_xx),np.sqrt(sum_yy))
    return user_pair, (cos_sim,n)

def cosine(dot_product,rating_norm_squared,rating2_norm_squared):
    '''
    Cosine of the angle between two vectors A and B:
       dotProduct(A, B) / (norm(A) * norm(B))

    The two trailing arguments are multiplied together to form the
    denominator. Despite the "_squared" in their names, calcSim in this
    file passes the square-rooted sums, i.e. the plain norms.
    Returns 0.0 when that product is falsy (zero).
    '''
    norm_product = rating_norm_squared * rating2_norm_squared
    if not norm_product:
        return 0.0
    return dot_product / float(norm_product)

def nearestNeighbors(user,users_and_sims,n):
    '''
    Keep the n most similar neighbors of a user.

    user: key of the user, returned unchanged.
    users_and_sims: iterable of (neighbor_id, (similarity, count)) entries.
    n: number of neighbors to keep.

    Returns (user, neighbors) with neighbors sorted by similarity, highest
    first; ties keep their input order. The input is not modified.
    '''
    # sorted() works on any iterable and leaves the caller's collection
    # untouched, unlike an in-place list.sort().
    ranked = sorted(users_and_sims, key=lambda entry: entry[1][0], reverse=True)
    return user, ranked[:n]

def topNRecommendations(user_id,user_sims,users_with_rating,n):
    '''
    Calculate the top-N item recommendations for a user using the
    weighted sums method.

    user_id: key of the user, returned unchanged.
    user_sims: iterable of (neighbor_id, (similarity, count)) entries for
        this user's neighbors.
    users_with_rating: mapping neighbor_id -> iterable of (item, rating);
        only .get() is called on it.
    n: number of recommendations to keep.

    Each item's score is sum(sim * rating) / sum(sim) over the neighbors
    that rated it. Returns (user_id, items) with items ordered by score,
    highest first (ties broken by item, highest first).
    '''

    # Accumulate per ITEM: an item can be rated by several neighbors.
    totals = defaultdict(float)
    sim_sums = defaultdict(float)

    for (neighbor,(sim,count)) in user_sims:

        # look up what this neighbor rated; neighbors without ratings are skipped
        neighbor_ratings = users_with_rating.get(neighbor)
        if not neighbor_ratings:
            continue

        for (item,rating) in neighbor_ratings:
            totals[item] += sim * rating
            sim_sums[item] += sim

    # Normalise by the summed similarity. Items whose similarities sum to
    # zero have no defined weighted average and are left out rather than
    # dividing by zero.
    scored_items = [(total/sim_sums[item],item)
                    for item,total in totals.items() if sim_sums[item]]

    # sort the scored items in descending order of score
    scored_items.sort(reverse=True)

    # drop the score, keep the item
    ranked_items = [item for _, item in scored_items]

    return user_id,ranked_items[:n]

In [203]:
# Group the ratings by item and cap each item's history at 500 interactions.
item_user_pairs = lines.map(parseVectorOnItem).groupByKey()\
                    .map(lambda p: sampleInteractions(p[0],p[1],500)).cache()
# Peek at one element with first() instead of collecting the whole RDD to
# the driver (twice) just to read its first entry.
first_item_id, first_item_users = item_user_pairs.first()
print("Item: ",first_item_id, "- User Pairs: ", list(first_item_users))

Item:  32768 - User Pairs:  [(7287, 4.0)]


In [204]:
# For every item with at least two interactions, emit all user-user pairs and
# group the rating pairs by (user, user) key. flatMap flattens the per-item
# lists directly, so no separate identity flatMap is needed.
pairwise_users = item_user_pairs.filter(
        lambda p: len(p[1]) > 1).flatMap(
        lambda p: findUserPairs(p[0],p[1])).groupByKey()
# take(4) fetches only the leading elements needed to show element 3,
# instead of a full collect() used purely for a peek.
print(list(pairwise_users.take(4)[3][1]))
pairwise_users.collect()

[(5.0, 9.0)]


[((4314, 10584), <pyspark.resultiterable.ResultIterable at 0x7fccde0ed908>),
 ((5977, 8459), <pyspark.resultiterable.ResultIterable at 0x7fccde0ed208>),
 ((1375, 8487), <pyspark.resultiterable.ResultIterable at 0x7fccde0ed940>),
 ((5564, 11634), <pyspark.resultiterable.ResultIterable at 0x7fccde0edbe0>),
 ((8523, 11967), <pyspark.resultiterable.ResultIterable at 0x7fccde0ed630>),
 ((7947, 8867), <pyspark.resultiterable.ResultIterable at 0x7fccde0ede10>),
 ((5281, 12783), <pyspark.resultiterable.ResultIterable at 0x7fccde0edf60>),
 ((12851, 14601), <pyspark.resultiterable.ResultIterable at 0x7fccde0ed518>),
 ((9440, 9898), <pyspark.resultiterable.ResultIterable at 0x7fccde0edeb8>),
 ((10684, 14546), <pyspark.resultiterable.ResultIterable at 0x7fccde0ed7f0>),
 ((10876, 13248), <pyspark.resultiterable.ResultIterable at 0x7fccde0ed6a0>),
 ((7578, 14636), <pyspark.resultiterable.ResultIterable at 0x7fccde0edba8>),
 ((12259, 14723), <pyspark.resultiterable.ResultIterable at 0x7fccde0eda90>),

In [206]:
# Similarity per user pair -> re-key by the pair's first user -> gather each
# user's neighbors into a list -> keep the 50 most similar ones.
user_sims = (
    pairwise_users
    .map(lambda pair_ratings: calcSim(*pair_ratings))
    .map(lambda pair_sim: keyOnFirstUser(*pair_sim))
    .groupByKey()
    .mapValues(list)
    .map(lambda user_neighbors: nearestNeighbors(user_neighbors[0], user_neighbors[1], 50))
)
user_sims.collect()

[(2,
  [(6258, (1.0, 1)),
   (9278, (1.0, 1)),
   (8406, (1.0, 1)),
   (7586, (1.0, 1)),
   (1710, (1.0, 1)),
   (7040, (1.0, 1)),
   (842, (1.0, 1)),
   (8228, (1.0, 1)),
   (2436, (1.0, 1)),
   (6590, (1.0, 1)),
   (520, (1.0, 1)),
   (6438, (1.0, 1)),
   (4806, (1.0, 1)),
   (9195, (1.0, 1)),
   (3169, (1.0, 1)),
   (1481, (1.0, 1)),
   (9243, (1.0, 1)),
   (3809, (1.0, 1)),
   (13857, (1.0, 1)),
   (11501, (1.0, 1)),
   (1793, (1.0, 1)),
   (10651, (1.0, 1)),
   (4813, (1.0, 1)),
   (2107, (1.0, 1)),
   (9177, (1.0, 1)),
   (10165, (1.0, 1)),
   (5321, (1.0, 1))]),
 (4,
  [(7232, (1.0, 1)),
   (14632, (1.0, 1)),
   (924, (1.0, 1)),
   (9996, (1.0, 1)),
   (206, (1.0, 1)),
   (6438, (1.0, 1)),
   (2854, (1.0, 1)),
   (9846, (1.0, 1)),
   (8296, (1.0, 1)),
   (3502, (1.0, 1)),
   (12366, (1.0, 1)),
   (4414, (1.0, 1)),
   (1242, (1.0, 1)),
   (5682, (1.0, 1)),
   (2468, (1.0, 1)),
   (1726, (1.0, 1)),
   (12822, (1.0, 1)),
   (2696, (1.0, 1)),
   (11958, (1.0, 1)),
   (13338, (1.0, 1