In [1]:
from pyspark import SparkConf, SparkContext
import time

# Reuse the active SparkContext when this cell is re-executed in a live kernel;
# calling SparkContext(conf=conf) a second time raises
# "ValueError: Cannot run multiple SparkContexts at once".
conf = SparkConf()
sc = SparkContext.getOrCreate(conf=conf)
# Only show Spark log messages at ERROR level to keep cell output readable.
sc.setLogLevel("ERROR")

In [2]:
# Node ids are 1-indexed: 1 ~ n (see `n` below).
data_fname = "data/graph-full.txt"

# Edge list RDD of int tuples. Each input line holds tab-separated integers;
# the cells below read field 0 as the source node and field 1 as the
# destination node.
#   * distinct() drops duplicate edges so each edge is counted once.
#   * cache() keeps the parsed edges in memory: every iteration of the loops
#     below launches jobs on this RDD, and without caching each job would
#     re-read and re-shuffle the text file.
data = (sc
    .textFile(data_fname)
    .map(lambda line: tuple(int(token) for token in line.split('\t')))
    .distinct()
    .cache())

# Number of power-iteration steps used by both PageRank and HITS.
iteration = 40
# Number of nodes in the graph.
n = 1000

#### Page Rank

In [3]:
time_start = time.time()

# Weight given to the link-following term; the remaining (1 - beta) is spread
# uniformly over all n nodes.
beta = 0.8
teleport = (1 - beta) / n

# initialize uniform ranking over node ids 1..n
r = {i + 1 : 1.0 / n for i in range(n)}

# column must sum to 1, sum (j, i) over j equal to 1
# note that x[0] is column index, x[1] is row index
a = data.map(lambda x: (x[0], 1)) # (col, 1)
b = a.reduceByKey(lambda x, y: x + y) # (col, nonzero count)
degree = dict(b.collect()) # col -> nonzero count

# key: (r, c), val: (1 / col nonzero count)
# Cached because the same matrix is reused in every iteration below.
m = data.map(lambda edge: ((edge[1], edge[0]), 1.0 / degree[edge[0]])).cache()

for _ in range(iteration):
    # One power-iteration step: r <- beta * (M r) + (1 - beta) / n
    m2 = m.map(lambda rdd: (rdd[0][0], rdd[1] * r[rdd[0][1]]))
    m3 = m2.reduceByKey(lambda x, y: x + y)
    r_next = m3.map(lambda x: (x[0], x[1] * beta + teleport))

    # reduceByKey only emits rows that have at least one incoming edge. Start
    # from the teleport share for every node so that nodes without incoming
    # edges keep a rank instead of vanishing from `r` (which would raise a
    # KeyError in the next iteration if such a node has outgoing edges).
    r_updated = {i + 1 : teleport for i in range(n)}
    r_updated.update(r_next.collect())
    r = r_updated

# `r` already holds the final ranks on the driver, so sort it directly rather
# than launching another Spark job. Ties are broken by node id so the order is
# deterministic.
ranking = sorted(r.items(), key=lambda item : (-item[1], item[0]))
print("\nTop 5 (DESC):")
for idx, score in ranking[:5]:
    print(idx, score)

print("\nBottom  (ASC):")
for idx, score in ranking[-5:][::-1]:
    print(idx, score)

time_end = time.time()
print("processing completed, time elapsed: %.2fs\n"%(time_end - time_start))


Top 5 (DESC):
263 0.002020291181518219
537 0.0019433415714531497
965 0.0019254478071662631
243 0.001852634016241731
285 0.001827372170064514

Bottom  (ASC):
558 0.0003286018525215297
93 0.0003513568937516577
62 0.00035314810510596274
424 0.0003548153864930145
408 0.00038779848719291705
processing completed, time elapsed: 9.44s



#### HITS

In [4]:
time_start = time.time()

# Link matrix as an edge list: a tuple (rc[0], rc[1]) is a nonzero entry at
# row rc[0], column rc[1].
L = data
# Start every hubbiness score at 1 for node ids 1..n.
h = {i + 1 : 1.0 for i in range(n)}
# Authority scores; filled in by the first iteration.
a = {}

for _ in range(iteration):
    # a = L.T * h
    # Collect once and normalize on the driver: the scores are needed as a
    # driver-side dict anyway (the next lambda looks them up by node id), so a
    # second collect() of a normalized RDD would only recompute the same job.
    a_raw = (L
        .map(lambda rc : (rc[1], h[rc[0]]))
        .reduceByKey(lambda x, y : x + y)
        .collect())

    # normalize so the largest authority score is 1
    a_max = max(a_raw, key=lambda x : x[1])
    a = {node : score / a_max[1] for node, score in a_raw}

    # h = L * a
    h_raw = (L
        .map(lambda rc : (rc[0], a[rc[1]]))
        .reduceByKey(lambda x, y : x + y)
        .collect())

    # normalize so the largest hubbiness score is 1
    h_max = max(h_raw, key=lambda x : x[1])
    h = {node : score / h_max[1] for node, score in h_raw}

# Rank on the driver (highest score first, ties broken by node id) instead of
# launching four more Spark jobs with RDD.top().
a_ranking = sorted(a.items(), key=lambda item : (-item[1], item[0]))
h_ranking = sorted(h.items(), key=lambda item : (-item[1], item[0]))

print("\nTop 5 authority (DESC):")
for idx, score in a_ranking[:5]:
    print(idx, score)

print("\nBottom 5 authority (ASC):")
for idx, score in a_ranking[-5:][::-1]:
    print(idx, score)

print("\nTop 5 hubbiness (DESC):")
for idx, score in h_ranking[:5]:
    print(idx, score)

print("\nBottom 5 hubbiness (ASC):")
for idx, score in h_ranking[-5:][::-1]:
    print(idx, score)

time_end = time.time()
print("processing completed, time elapsed: %.2fs\n"%(time_end - time_start))


Top 5 authority (DESC):
893 1.0
16 0.9635572849634398
799 0.9510158161074017
146 0.9246703586198444
473 0.8998661973604051

Bottom 5 authority (ASC):
19 0.05608316377607618
135 0.06653910487622795
462 0.07544228624641901
24 0.08171239406816945
910 0.08571673456144878

Top 5 hubbiness (DESC):
840 1.0
155 0.9499618624906541
234 0.8986645288972266
389 0.8634171101843793
472 0.8632841092495219

Bottom 5 hubbiness (ASC):
23 0.042066854890936534
835 0.05779059354433016
141 0.0645311764622518
539 0.06602659373418493
889 0.07678413939216454
processing completed, time elapsed: 18.28s



In [5]:
# Shut down the SparkContext created in the first cell.
sc.stop()