First, I parse the input data and convert the user IDs and friend IDs to integers.

Second, all pairs of direct friends are generated. These pairs are used later to remove direct friends when we form the recommendations.

Third, mutual-friend pairs are generated from each line. For example, every two friends of user 1 have user 1 as a mutual friend, so each such pair is emitted in both directions. Then we count how many times each pair occurs, which is the number of mutual friends the two users share.

Fourth, we remove the pairs that are already direct friends from the mutual-friend pairs.

Finally, for each user we sort the candidates by mutual-friend count (descending) and then by friend ID (ascending), and keep the top 10 as the recommendations.


In [1]:
from pyspark import SparkContext
from itertools import combinations
from pyspark.sql import SparkSession
from pyspark import SparkConf

# Spark configuration for this job (executor memory setting).
conf = SparkConf().set("spark.executor.memory", "8g")

# Stop any existing SparkContext (e.g. one left over from re-running this
# cell) before creating a new one.
if 'sc' in globals():
    sc.stop()

# Initialize a new SparkContext. `conf` is passed explicitly so the setting
# above is actually applied; previously it was built but never used.
sc = SparkContext("local", "MutualFriendsRecommendation", conf=conf)

# Load the data as an RDD of text lines
data = sc.textFile("/content/soc-LiveJournal1Adj.txt")




# Step 1: Parse the data into (user, friends) pairs
def parse_line(line):
    """Parse one adjacency line into a ``(user, friends)`` pair.

    A valid line has exactly two tab-separated fields: a user ID and a
    comma-separated list of friend IDs, both non-empty.

    Args:
        line: One raw text line from the input file.

    Returns:
        ``(user, friends)`` with ``user`` an int and ``friends`` a list of
        ints, or ``None`` when the line is empty, has the wrong number of
        fields, has an empty field, or contains a non-integer ID.
    """
    fields = line.split("\t")
    # Reject anything that is not exactly "<user><TAB><friends>"
    if len(fields) != 2:
        return None

    raw_user, raw_friends = fields
    if not raw_user or not raw_friends:
        return None

    try:
        user_id = int(raw_user)
        friend_ids = [int(token) for token in raw_friends.split(",")]
    except ValueError:
        # Any ID that is not an integer invalidates the whole line
        return None
    return (user_id, friend_ids)
# Parse every line and drop the ones parse_line rejected (returned None).
# The result is cached because it feeds both the direct-friend pairs and the
# mutual-friend pairs computed later; without caching, each action re-reads
# and re-parses the input file.
user_friends = (
    data
    .map(parse_line)
    .filter(lambda record: record is not None)
    .cache()
)


# check: Print sample of parsed data
print(user_friends.take(5))

[(0, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94]), (1, [0, 5, 20, 135, 2409, 8715, 8932, 10623, 12347, 12846, 13840, 13845, 14005, 20075, 21556, 22939, 23520, 28193, 29724, 29791, 29826, 30691, 31232, 31435, 32317, 32489, 34394, 35589, 35605, 35606, 35613, 35633, 35648, 35678, 38737, 43447, 44846, 44887, 49226, 49985, 623, 629, 4999, 6156, 13912, 14248, 15190, 17636, 19217, 20074, 27536, 29481, 29726, 29767, 30257, 33060, 34250, 34280, 34392, 34406, 34418, 34420, 34439, 34450, 34651, 45054, 49592]), (2, [0, 117, 135, 1220, 2755, 12453, 24539, 24714, 41456, 45046, 49927, 6893, 13795, 16659, 32828, 41878]), (3, [0, 12, 41, 55, 1532, 12636, 13185, 27552, 38737]), (4, [0,

In [2]:
# Step 2: Generate all (user, friend) pairs for direct friends
def direct_pairs(user, friends):
    """Return the direct-friend pairs for one user, in both directions.

    Args:
        user: The user ID.
        friends: Iterable of that user's friend IDs.

    Returns:
        A list containing, for each friend in order, ``(user, friend)``
        followed by ``(friend, user)``.
    """
    return [
        pair
        for friend in friends
        for pair in ((user, friend), (friend, user))
    ]

# Expand every (user, friends) record into its direct-friend pairs
friend_pairs = user_friends.flatMap(lambda record: direct_pairs(*record))

# Debugging: Print sample of (user, friend) pairs
print("Step 2: (user, friend) direct friends pairs")
print(friend_pairs.take(5))


Step 2: (user, friend) direct friends pairs
[(0, 1), (1, 0), (0, 2), (2, 0), (0, 3)]


In [3]:
# Step 3: Generate mutual friends pairs
def generate_mutual_friends(user, friends):
    """Emit every pair of ``user``'s friends, tagged with ``user``.

    Any two entries of ``friends`` share ``user`` as a mutual friend, so each
    unordered pair is emitted in both directions.

    Args:
        user: The user ID whose friend list is being expanded.
        friends: List of that user's friend IDs.

    Returns:
        A list of ``((a, b), user)`` tuples: first every pair in list order
        (earlier friend first), then the same pairs reversed.
    """
    ordered_pairs = list(combinations(friends, 2))
    forward = [((first, second), user) for first, second in ordered_pairs]
    backward = [((second, first), user) for first, second in ordered_pairs]
    return forward + backward

# Emit ((friend_a, friend_b), user) for every pair of each user's friends
mutual_friends = user_friends.flatMap(lambda record: generate_mutual_friends(*record))

# Check: mutual friends pairs sample
print(mutual_friends.take(5))

# Count mutual friends pairs: each occurrence of a pair contributes 1,
# and the contributions are summed per pair
mutual_friend_counts = (
    mutual_friends
    .map(lambda entry: (entry[0], 1))
    .reduceByKey(lambda total, one: total + one)
)

[((1, 2), 0), ((1, 3), 0), ((1, 4), 0), ((1, 5), 0), ((1, 6), 0)]


In [4]:
# Step 4: remove direct friends
#
# subtractByKey matches on keys only, so every direct-friend pair is turned
# into a key with a placeholder value; counted pairs with a matching key
# are dropped.
direct_friends = friend_pairs.map(lambda pair: (pair, 0))
filtered_recommendations = mutual_friend_counts.subtractByKey(direct_friends)


# check: Print sample of user pairs
# print(filtered_recommendations.take(5))


In [5]:
# Step 5: re-key each ((user, friend), count) record by user alone, giving
# (user, (friend, mutual_friend_count))

formated_recommendations = filtered_recommendations.map(
    lambda pair_count: (pair_count[0][0], (pair_count[0][1], pair_count[1]))
)

# Debugging: Print sample of mutual friends
# print("Step 5: Mutual Friends Sample")
#print(formated_recommendations.take(5))


In [6]:
# Step 6: sort and extract top 10 recommendations
def sort_and_extract_top_10(recs):
    """Rank one user's candidates and keep at most the ten best.

    Args:
        recs: Iterable of ``(friend_id, mutual_friend_count)`` tuples.

    Returns:
        A list of up to 10 tuples, ordered by mutual friend count
        (descending) and then by friend ID (ascending).
    """
    ranked = list(recs)
    # Negating the count gives descending order on count while keeping
    # ascending order on the friend ID used as tie-breaker
    ranked.sort(key=lambda rec: (-rec[1], rec[0]))
    return ranked[:10]

# Gather each user's candidates, then rank and truncate them per user
sorted_recommendations = (
    formated_recommendations
    .groupByKey()
    .mapValues(sort_and_extract_top_10)
)

In [7]:
# Step 7: keep only the recommendations of the users we want to inspect and
# bring them back to the driver
target_users = [1376, 1377, 1210, 9018, 9033, 9040, 9999, 9910, 9902, 9993, 11]
recommendations_for_check = (
    sorted_recommendations
    .filter(lambda user_recs: user_recs[0] in target_users)
    .collect()
)


In [8]:
# Step 8: print one line per user listing the recommended friend IDs
for user, recs in recommendations_for_check:
    # Only the friend ID is shown; the mutual friend count is dropped
    friend_list = [str(friend) for friend, _ in recs]
    print(f"User {user}: {', '.join(friend_list)}")

# Stop the SparkContext
sc.stop()

User 1210: 1158, 1204, 1233, 1169, 1223, 1159, 1167, 1175, 1182, 1183
User 1376: 1382, 6487, 1356, 1366, 2409, 4498, 5307, 8737, 12700, 16486
User 9910: 351, 622, 2554, 7651, 9920, 22338, 30169, 30403, 45111, 47577
User 9018: 9016, 9017, 317, 9023
User 9902: 9906, 18845, 1797, 9891, 9894, 18626, 24136, 236, 319, 351
User 9040: 9033, 9039, 9025, 9027, 9031, 9032, 9034, 9036, 503, 9029
User 9033: 9039, 9040, 9025, 9026, 9027, 9031, 9032, 9034, 9036, 503
User 9993: 9991, 13134, 13478, 13877, 34299, 34485, 34642, 37941
User 1377: 1349, 1382, 1387, 1390, 1405, 1410, 1428, 33772, 1352, 1354
User 9999: 36764, 44132, 10058, 44088, 36765, 36909, 10055, 44068, 44076, 10000
User 11: 27552, 7785, 27573, 27574, 27589, 27590, 27600, 27617, 27620, 27667
