In [1]:
import re
import sys
import time
from pyspark import SparkConf, SparkContext

# Build the Spark configuration and obtain the driver-side SparkContext.
# getOrCreate() is used instead of the bare constructor so that re-running
# this cell while a context is still alive reuses it rather than raising
# "Cannot run multiple SparkContexts at once".
# NOTE(review): when an existing context is reused, `conf` is presumably not
# re-applied to it -- confirm against the PySpark docs if settings are added here.
conf = SparkConf()
sc = SparkContext.getOrCreate(conf=conf)

# Only show errors from Spark's own logging.
sc.setLogLevel("ERROR")

In [2]:
# Any pair of users from 'friends' has 'user' as their common friend
def build_pairs(line):
    """Emit one ((friend_a, friend_b), 1) record per pair of friends of a user.

    NOTE: a second ``build_pairs`` is defined further down in this same cell
    and replaces this one, so this version is not the one the pipeline runs.

    Args:
        line: indexable record where ``line[0]`` is the user id (not needed
            for the output) and ``line[1]`` is a comma-separated string of
            friend ids.

    Returns:
        A list of ``((a, b), 1)`` tuples, one for every pair taken from the
        string-sorted friend ids, with ``a`` preceding ``b`` in that ordering.
    """
    friend_ids = line[1].split(",")
    friend_ids.sort()

    records = []
    for position, first in enumerate(friend_ids):
        # Pair each friend only with the ones that come after it, so every
        # unordered pair is produced exactly once.
        for second in friend_ids[position + 1:]:
            records.append(((first, second), 1))
    return records

def top_10(user):
    """Pick the ten best recommendation candidates for one user.

    Args:
        user: a ``(user_id, candidates)`` pair, where ``candidates`` is any
            iterable (list, generator, grouped-values iterable, ...) of
            ``(candidate_id, common_friend_count)`` tuples.

    Returns:
        ``(user_id, [candidate_id, ...])`` holding at most ten candidate ids,
        ordered by descending common-friend count; ties are broken by
        ascending numeric value of the candidate id.

    Raises:
        ValueError: if a candidate id cannot be converted with ``int()``.
    """
    user_id, candidates = user[0], user[1]

    # Negating the count gives descending order on the primary key while the
    # integer id keeps ascending order on the tie-breaker.
    ranked = sorted(candidates, key=lambda pair: (-pair[1], int(pair[0])))

    # Slicing already clamps to the list length, so no len() call is needed;
    # this also lets `candidates` be a one-shot iterator without __len__.
    return (user_id, [candidate for candidate, _count in ranked[:10]])

def build_pairs(line):
    """Turn one adjacency record into weighted user-pair records.

    Two kinds of records are produced:

    * positive: ``((a, b), 1)`` for every pair of the user's friends -- the
      user is a common friend of ``a`` and ``b``;
    * negative: ``((user, friend), -2**32)`` (ids ordered) for every existing
      friendship, so that after the per-pair sum the total stays below zero
      and the ``> 0`` filter in the pipeline drops people who are already
      friends, however many common friends they share.

    Because the graph is undirected, each pair key is stored with its two ids
    in string-sorted order so that both directions map to the same key. Ids
    are compared as strings throughout (no integer conversion), which orders
    differently from numbers but is applied consistently.

    Args:
        line: indexable record where ``line[0]`` is the user id and
            ``line[1]`` is a comma-separated string of friend ids. A record
            without a friend field, or with an empty one, yields no pairs.

    Returns:
        A list of ``((id_a, id_b), weight)`` tuples: all positive records
        first, followed by all negative records.
    """
    from itertools import combinations

    # A record with no friend field at all (e.g. a blank line) has nothing
    # to contribute; skip it instead of failing on line[1].
    if len(line) < 2:
        return []

    user = line[0]

    # Drop empty tokens: "".split(",") returns [""] and would otherwise create
    # a phantom friend whose id is the empty string.
    friends = sorted(friend for friend in line[1].split(",") if friend)

    # Large enough to outweigh any realistic number of common friends.
    already_friends_weight = -2**32

    # combinations() over the sorted list yields each unordered pair once,
    # with the smaller id first.
    positive_pairs = [(pair, 1) for pair in combinations(friends, 2)]

    negative_pairs = [
        ((user, friend) if user < friend else (friend, user), already_friends_weight)
        for friend in friends
    ]

    return positive_pairs + negative_pairs

In [3]:
# Wall-clock timing for the whole cell.
time_start = time.time()

# Read the adjacency list; each record is split on the tab character into fields.
doc = sc.textFile("data/soc-LiveJournal1Adj.txt")
lines = doc.map(lambda raw: raw.split('\t'))

# Expand every record into weighted ((user_a, user_b), weight) pairs.
pairs = lines.flatMap(build_pairs)

# Sum the weights per pair and keep only the pairs whose total is positive.
count = (
    pairs
    .reduceByKey(lambda left, right: left + right)
    .filter(lambda entry: entry[1] > 0)
)

# Mirror each pair so that both users see the other one as a candidate:
# ((user1, user2), n) -> (user1, (user2, n)) and (user2, (user1, n))
two_way_count = count.flatMap(
    lambda entry: [
        (entry[0][0], (entry[0][1], entry[1])),
        (entry[0][1], (entry[0][0], entry[1])),
    ]
)

# Collect all candidates of a user together: (user, [(user2, n2), (user3, n3), ...])
count_by_user = two_way_count.groupByKey()

# Rank each user's candidates and keep the best ones.
recommendations = count_by_user.map(top_10)

# Only these users are written to the report; ids are strings in the data.
check = list(map(str, [924, 8941, 8942, 9019, 9020, 9021, 9022, 9990, 9992, 9993]))
result = sorted(
    recommendations.filter(lambda rec: rec[0] in check).collect(),
    key=lambda rec: int(rec[0]),
)

# Echo every report line to the notebook and persist it to disk.
with open('output.txt', 'w') as file:
    for rec in result:
        line = "%s\t%s\n"%(rec[0], ",".join(rec[1]))
        print(line, end="")
        file.write(line)

time_end = time.time()
print("processing completed, time elapsed: %.2fs\n"%(time_end - time_start))

924	439,2409,6995,11860,15416,43748,45881
8941	8943,8944,8940
8942	8939,8940,8943,8944
9019	9022,317,9023
9020	9021,9016,9017,9022,317,9023
9021	9020,9016,9017,9022,317,9023
9022	9019,9020,9021,317,9016,9017,9023
9990	13134,13478,13877,34299,34485,34642,37941
9992	9987,9989,35667,9991
9993	9991,13134,13478,13877,34299,34485,34642,37941
processing completed, time elapsed: 241.91s



In [4]:
# Stop the SparkContext that was created in the first cell; the job is finished.
sc.stop()