# Homework 2

Download your notebook via `File` ⇒ `Download` ⇒ `Download .ipynb`, then submit the `.ipynb` file through Gradescope, together with your PDF, via a link to your repository.

### Setup

In [1]:
!pip install pyspark
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=7deaa57dc09cb45099361382ec362f0ce9340b6960820a23688f4bd17c217eb2
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0
The following additional packages will be installed:
  libxtst6 openjdk-8-jre-headless
Suggested packages:
  openjdk-8-demo openjdk-8-source libnss-mdns fonts-dejavu-extra fonts-nanum fonts-ipafont-gothic
  fonts-ipafont-mincho fonts-wqy-microhei fonts-wqy-zenhei fonts-indic

In [2]:
#@title Import PySpark and create SparkContext

import itertools
import pyspark
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf
import os
# Tell Spark which JVM to use before any context is started.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

# Spark configuration: serve the Spark UI on port 4050.
conf = SparkConf().set("spark.ui.port", "4050")

# Create the context, or reuse the one already running in this kernel.
# Calling the SparkContext constructor directly fails when this cell is executed a
# second time, because only one SparkContext may be active at once; getOrCreate makes
# the cell safe to re-run.
sc = pyspark.SparkContext.getOrCreate(conf=conf)

# Create (or fetch) the SQL session; it attaches to the context above.
spark = SparkSession.builder.getOrCreate()

In [3]:
#@title Download the data from the course website
!wget https://course.ccs.neu.edu/cs6220/fall2023/homework-3/soc-LiveJournal1Adj.txt

--2024-02-07 23:21:52--  https://course.ccs.neu.edu/cs6220/fall2023/homework-3/soc-LiveJournal1Adj.txt
Resolving course.ccs.neu.edu (course.ccs.neu.edu)... 129.10.117.35
Connecting to course.ccs.neu.edu (course.ccs.neu.edu)|129.10.117.35|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4156181 (4.0M) [text/plain]
Saving to: ‘soc-LiveJournal1Adj.txt’


2024-02-07 23:21:53 (21.6 MB/s) - ‘soc-LiveJournal1Adj.txt’ saved [4156181/4156181]



### Load the data in!

In [4]:
# Load the adjacency file (second argument: minimum number of partitions) and split
# every line on whitespace, so each record becomes a list of string tokens:
# [user_id, "friend,friend,..."].
lines = (
    sc.textFile("soc-LiveJournal1Adj.txt", 1)
      .map(lambda raw_line: raw_line.split())
)

In [9]:
# Peek at the first two tokenized records to confirm the [user_id, "friend,friend,..."] layout.
lines.take(2)

[['0',
  '1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94'],
 ['1',
  '0,5,20,135,2409,8715,8932,10623,12347,12846,13840,13845,14005,20075,21556,22939,23520,28193,29724,29791,29826,30691,31232,31435,32317,32489,34394,35589,35605,35606,35613,35633,35648,35678,38737,43447,44846,44887,49226,49985,623,629,4999,6156,13912,14248,15190,17636,19217,20074,27536,29481,29726,29767,30257,33060,34250,34280,34392,34406,34418,34420,34439,34450,34651,45054,49592']]

### Reasoning Process

1. Parse the social network data to create an RDD of users and their direct friends.

2. For each user, identify potential friends by excluding direct connections and calculating the number of mutual friends.

3. Rank these potential connections by their count of mutual friends and, for each user, output the top N recommendations.

In [5]:
# Parse Friends
def safe_parse(line):
    """
    Convert one tokenized input record into a (user_id, friends_list) tuple.

    Parameters:
    - line (list[str]): The whitespace-split tokens of one input line, normally
      [user_id, "friend1,friend2,..."].

    Returns:
    - (int, list[int]): The user ID and the IDs of that user's friends. The friends
      list is empty when the record does not consist of exactly two tokens
      (e.g. a user without any friends).
    - None: When the record is empty or contains a non-integer ID. The offending
      record is printed and the caller is expected to discard the None.
    """
    try:
        # Raises IndexError for an empty record and ValueError for a non-numeric ID.
        user_id = int(line[0])
        if len(line) == 2 and line[1].strip():
            # Tolerate stray commas ("1,,2,") by skipping empty fragments.
            friends_list = [int(friend) for friend in line[1].split(',') if friend.strip()]
            return (user_id, friends_list)
        return (user_id, [])
    except (ValueError, IndexError):
        # A blank input line splits into an empty list; treat it like any other
        # malformed record instead of letting the exception abort the Spark task.
        print(f"Error parsing line: {line}")
        return None


# Apply the parsing function. safe_parse returns None for records it cannot parse, so
# those are dropped here: every downstream lambda indexes into the (user_id, friends)
# tuple and would fail on a None. The RDD is cached because each recommendation query
# scans it more than once.
user_friends = lines.map(safe_parse).filter(lambda record: record is not None).cache()



In [6]:
def generate_recommendations(user_id, user_friends, N=10):
    """
    Generate recommendations for a given user based on the number of mutual friends.

    A candidate is any user who is neither the given user nor one of their direct
    friends, and who appears in the friends list of at least one direct friend.
    The number of direct friends listing the candidate is the mutual-friend count.

    Parameters:
    - user_id (int): The ID of the user for whom to generate recommendations.
    - user_friends (RDD): An RDD of (user_id, [friends_list]) tuples.
    - N (int): The maximum number of recommendations to generate.

    Returns:
    - A list of at most N user IDs representing the algorithm's recommendation of
      people that the user might know, ordered by decreasing number of mutual
      friends; ties are broken by ascending user ID. The list is empty when the
      user has no friends or does not appear in the data.
    """

    # Collect the user's direct friends once, as a set: the membership tests inside
    # the lambdas below run for every record, so they must be O(1).
    direct_friends = set(
        user_friends.filter(lambda x: x[0] == user_id).flatMap(lambda x: x[1]).collect()
    )

    # Without direct friends there cannot be any mutual friends; skip the Spark job.
    if not direct_friends:
        return []

    # Only the friends lists of the user's direct friends can contribute mutual
    # friends, so restrict the scan to those records before shuffling anything.
    # Each candidate is paired with the set of direct friends that list it; using a
    # set (rather than a counter) keeps duplicated entries from being counted twice.
    potential_recs = user_friends \
        .filter(lambda x: x[0] != user_id and x[0] in direct_friends) \
        .flatMap(lambda x: [(friend, {x[0]}) for friend in x[1]
                            if friend != user_id and friend not in direct_friends]) \
        .reduceByKey(lambda a, b: a | b) \
        .map(lambda x: (x[0], len(x[1])))

    # Top N by number of mutual friends (descending), then by user ID (ascending).
    # takeOrdered keeps only N elements per partition instead of sorting everything.
    top_recs = potential_recs.takeOrdered(N, key=lambda x: (-x[1], x[0]))

    return [candidate for candidate, _ in top_recs]


In [7]:
# Sanity check: compute the recommendations for a single, known user (ID 11).
user_id, N = 11, 10
recommendations = generate_recommendations(user_id, user_friends, N)

# Assemble the report (header first, then one line per recommended user) and print it.
report = [f"Top {N} recommendations for User ID {user_id}:"]
report += [f"User ID: {rec}" for rec in recommendations]
print("\n".join(report))

Top 10 recommendations for User ID 11:
User ID: 27552
User ID: 7785
User ID: 27573
User ID: 27574
User ID: 27589
User ID: 27590
User ID: 27600
User ID: 27617
User ID: 27620
User ID: 27667


Output for user IDs [924, 8941, 8942, 9019, 9020, 9021, 9022, 9990, 9992, 9993]

In [8]:
# Users whose recommendations are reported, and how many to report for each of them.
user_ids = [924, 8941, 8942, 9019, 9020, 9021, 9022, 9990, 9992, 9993]
N = 10

# Write one "<user_id><TAB><comma-separated recommendations>" line per user to
# output.txt; each line is written as soon as that user's recommendations are ready.
with open('output.txt', 'w') as file:
    for user_id in user_ids:
        recommendations = generate_recommendations(user_id, user_friends, N)
        recommendations_str = ','.join(str(rec_id) for rec_id in recommendations)
        file.write(f"{user_id}\t{recommendations_str}\n")

# Confirm completion to the reader of the notebook.
print("Recommendations for specified user IDs have been written to output.txt")


Recommendations for specified user IDs have been written to output.txt
