<a href="https://colab.research.google.com/github/sridhartroy/AIML/blob/main/MapReduce_FriendRecommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark==3.5.6

Collecting pyspark==3.5.6
  Downloading pyspark-3.5.6.tar.gz (317.4 MB)
     317.4/317.4 MB 4.2 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.5.6-py2.py3-none-any.whl size=317895798 sha256=f5426a2c9ce7578bb47deec121fe18d4ece9204a5688ded0813d6ea98af5301e
  Stored in directory: /root/.cache/pip/wheels/64/62/f3/ec15656ea4ada0523cae62a1827fe7beb55d3c8c87174aad4a
Successfully built pyspark
Installing collected packages: pyspark
  Attempting uninstall: pyspark
    Found existing installation: pyspark 3.5.1
    Uninstalling pyspark-3.5.1:
      Successfully uninstalled pyspark-3.5.1
Successfully installed pyspark-3.5.6


In [2]:
from pyspark.sql import SparkSession
# Create the Spark session for this notebook (or reuse one that already exists)
# and keep a handle on its SparkContext for the RDD API used below.
spark = (
    SparkSession.builder
    .appName("MapReduce-FriendRecommender")
    .getOrCreate()
)

sc = spark.sparkContext

In [3]:
!wget https://an-ml.s3.us-west-1.amazonaws.com/soc-LiveJournal1Adj.txt

--2025-10-19 18:21:31--  https://an-ml.s3.us-west-1.amazonaws.com/soc-LiveJournal1Adj.txt
Resolving an-ml.s3.us-west-1.amazonaws.com (an-ml.s3.us-west-1.amazonaws.com)... 16.15.4.218, 16.15.4.168, 16.15.0.105, ...
Connecting to an-ml.s3.us-west-1.amazonaws.com (an-ml.s3.us-west-1.amazonaws.com)|16.15.4.218|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4156181 (4.0M) [text/plain]
Saving to: ‘soc-LiveJournal1Adj.txt’


2025-10-19 18:21:32 (8.20 MB/s) - ‘soc-LiveJournal1Adj.txt’ saved [4156181/4156181]



In [4]:
# Load the downloaded adjacency file as an RDD with one element per text line.
# NOTE(review): the cells visible below run on the tiny `lines` example, not on
# `friends` — confirm where the full dataset is meant to be fed through the pipeline.
friends = sc.textFile("soc-LiveJournal1Adj.txt")

In [5]:
# Tiny example: each row is "<user>\t<comma-separated friend ids>".
# User 6 appears only inside other users' friend lists and has no row of its own.

lines = sc.parallelize([
    "1\t2,3,4,5",
    "2\t1,3,5,6",
    "3\t1,2,4,6",
    "4\t1,3,5",
    "5\t,1,2,4,6",  # leading comma yields an empty token; parse_line filters empties out
  ])


In [6]:
# Peek at the first three raw rows of the example RDD.
lines.take(3)

['1\t2,3,4,5', '2\t1,3,5,6', '3\t1,2,4,6']

In [7]:
# parse the file

def parse_line(line):
    """Parse one adjacency row of the form ``user<TAB>friend1,friend2,...``.

    Whitespace around the user id and around every friend id is stripped
    (including a stray carriage return at the end of the row), so padded ids
    such as ``" 3"`` are treated as ``"3"``. Empty tokens and references to
    the user itself are dropped; the order of the remaining ids, and any
    duplicates among them, are preserved.

    Args:
        line: Raw text row. The friend column may be missing or empty.

    Returns:
        A ``(user, friends)`` tuple where ``user`` is a str and ``friends``
        is a list of str.
    """
    fields = line.split("\t")
    user = fields[0].strip()
    raw_friends = fields[1].split(",") if len(fields) > 1 else []
    # Normalise before filtering so padded ids cannot slip past the
    # empty-token and self-reference checks.
    cleaned = (token.strip() for token in raw_friends)
    return (user, [token for token in cleaned if token and token != user])

In [8]:
# STEP 0: show the raw input rows exactly as they were loaded.
print("STEP 0 - Original List:")
print(lines.collect())

# STEP 1: turn every row into (user, [friends]); cached because later cells reuse it.
adj = lines.map(parse_line).cache()
print("STEP 1 — parsed adjacency:")
for x in adj.collect():
    print(x)

STEP 0 - Original List:
['1\t2,3,4,5', '2\t1,3,5,6', '3\t1,2,4,6', '4\t1,3,5', '5\t,1,2,4,6']
STEP 1 — parsed adjacency:
('1', ['2', '3', '4', '5'])
('2', ['1', '3', '5', '6'])
('3', ['1', '2', '4', '6'])
('4', ['1', '3', '5'])
('5', ['1', '2', '4', '6'])


In [15]:
# Invert the adjacency list: emit (friend, user) for every friend a user lists.
inv = adj.flatMap(lambda rec: [(friend, rec[0]) for friend in rec[1]])

# Group by friend, de-duplicate and sort each user list so it has a stable order,
# then order the index by key. cache() is applied to the *sorted* RDD — the one that
# every later action reads — so the sort is not recomputed by each take()/collect().
inv_grouped = (
    inv.groupByKey()
    .mapValues(lambda vals: sorted(set(vals)))
    .sortByKey(ascending=True)
    .cache()
)

print(inv_grouped.take(20))

print("\nSTEP 2 — inverted index:")
for f, users in inv_grouped.collect():
    print(f, ":", users)


[('1', ['2', '3', '4', '5']), ('2', ['1', '3', '5']), ('3', ['1', '2', '4']), ('4', ['1', '3', '5']), ('5', ['1', '2', '4']), ('6', ['2', '3', '5'])]

STEP 2 — inverted index:
1 : ['2', '3', '4', '5']
2 : ['1', '3', '5']
3 : ['1', '2', '4']
4 : ['1', '3', '5']
5 : ['1', '2', '4']
6 : ['2', '3', '5']


In [16]:
from itertools import combinations

In [17]:
# For each friend, every unordered pair of users who list that friend has it in
# common: emit ((a, b), 1) once per such pair.
pairs_by_dim = inv_grouped.flatMap(
    lambda entry: (((a, b), 1) for a, b in combinations(entry[1], 2))
).cache()

print("\nSTEP 3 — pair contributions (+1 per shared friend):")
for p in pairs_by_dim.collect():
    print(p)


STEP 3 — pair contributions (+1 per shared friend):
(('2', '3'), 1)
(('2', '4'), 1)
(('2', '5'), 1)
(('3', '4'), 1)
(('3', '5'), 1)
(('4', '5'), 1)
(('1', '3'), 1)
(('1', '5'), 1)
(('3', '5'), 1)
(('1', '2'), 1)
(('1', '4'), 1)
(('2', '4'), 1)
(('1', '3'), 1)
(('1', '5'), 1)
(('3', '5'), 1)
(('1', '2'), 1)
(('1', '4'), 1)
(('2', '4'), 1)
(('2', '3'), 1)
(('2', '5'), 1)
(('3', '5'), 1)


In [20]:
# 4) Add up the +1 contributions per pair. The total is the number of friends the
#    two users have in common (this is the sparse dot-product).
mutual_counts = (
    pairs_by_dim
    .reduceByKey(lambda left, right: left + right)
    .sortByKey(ascending=True)
    .cache()
)
print(mutual_counts.take(20))

print("\nSTEP 4 — mutual friend counts:")
for pair, cnt in mutual_counts.collect():
    print(pair, ":", cnt)

[(('1', '2'), 2), (('1', '3'), 2), (('1', '4'), 2), (('1', '5'), 2), (('2', '3'), 2), (('2', '4'), 3), (('2', '5'), 2), (('3', '4'), 1), (('3', '5'), 4), (('4', '5'), 1)]

STEP 4 — mutual friend counts:
('1', '2') : 2
('1', '3') : 2
('1', '4') : 2
('1', '5') : 2
('2', '3') : 2
('2', '4') : 3
('2', '5') : 2
('3', '4') : 1
('3', '5') : 4
('4', '5') : 1


In [21]:
# STEP 5: discard pairs that are already friends, leaving recommendation candidates.

# Show the adjacency list the edge set is built from.
print(adj.take(20))

# Every listed friendship as a sorted 2-tuple, de-duplicated and collected to the driver.
# Sorting each pair makes (u, v) and (v, u) collapse into one entry.
edges_unordered = (
    adj.flatMap(lambda rec: (tuple(sorted((rec[0], friend))) for friend in rec[1]))
    .distinct()
    .collect()
)
# Wrap the edge set in a broadcast variable for use inside the filter below.
b_edges = sc.broadcast(set(edges_unordered))

# Keep only the scored pairs whose key is not an existing edge.
nonfriends = mutual_counts.filter(lambda item: not (item[0] in b_edges.value)).cache()

print("\nSTEP 5 — non-friend candidate pairs (score = mutuals):")
for p in nonfriends.collect():
    print(p)

[('1', ['2', '3', '4', '5']), ('2', ['1', '3', '5', '6']), ('3', ['1', '2', '4', '6']), ('4', ['1', '3', '5']), ('5', ['1', '2', '4', '6'])]

STEP 5 — non-friend candidate pairs (score = mutuals):
(('2', '4'), 3)
(('3', '5'), 4)


In [17]:
# Scratch check: split the first raw row on the tab to inspect its two fields.
# take(1) returns a one-element list, so `l` is a list containing one [user, friends] list.
l=lines.map(lambda x:x.split("\t")).take(1)
l

[['1', '2,3,4,5']]

In [21]:
# Second field of the first row held in `l`: the still-unsplit comma-separated friend string.
l[0][1]

'2,3,4,5'