In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:

!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://dlcdn.apache.org/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz
!tar xf spark-3.3.1-bin-hadoop3.tgz
# set your spark folder to your system path environment. 
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.3.1-bin-hadoop3"


# install findspark using pip
!pip install -q findspark pyspark_dist_explore
import findspark
findspark.init()

In [5]:
# First import SparkConf and SparkContext from pyspark module
from pyspark import SparkConf, SparkContext

# Then, set SparkConf by setting up master as local(means stanalone local) and app Name
sConf = SparkConf().setMaster("local").setAppName("DegreesOfSeparation")
# Then, set SparkContext based on the SparkConf
sContext = SparkContext.getOrCreate(conf = sConf)

In [6]:
# The characters we wish to find the degree of separation between:
startCharacterID = 5306 #5306 spider man
targetCharacterID = 14923 #14 "ADAM 3,031"

# Our accumulator, used to signal when we find the target character duringour BFS traversal.
hitCounter = sContext.accumulator(0)

In [7]:
def convertToBFS(line):
    fields = line.split()
    heroID = int(fields[0])
    connections = [0]
    for connection in fields[1:]:
        connections.append(int(connection))
        
    color = 'WHITE'
    distance = 9999
    
    if(heroID == startCharacterID):
        color = 'GRAY'
        distance = 0
    
    return (heroID,(connections,distance,color))
        

In [8]:
def createStartingRDD():
    inputFile = sContext.textFile("/content/drive/MyDrive/BigData/marvel-network.txt")
    return inputFile.map(convertToBFS)

In [9]:
def bfsMap(node):
    characterID = node[0]
    data = node[1]
    connections = data[0]
    distance = data[1]
    color = data[2]
    
    results = []
    
#     if this node need to be expanded
    if(color == 'GRAY'):
        for connection in connections:
            newCharacterID = connection
            newDistance = distance + 1
            newColor = 'GRAY'
            if(targetCharacterID == connection):
                hitCounter.add(1)
            
            newEntry = (newCharacterID ,([],newDistance,newColor))
            results.append(newEntry)
#         this node has been processed
        color = 'BLACK'
#Emit the input node so we don't lose it.
    results.append((characterID,(connections,distance,color)))
    return results


In [10]:
def bfsReduce(data1,data2):
    edges1 = data1[0] #cạnh node 1
    edges2 = data2[0] #cạnh node 2
    distance1 = data1[1]
    distance2 = data2[1]
    color1 = data1[2]
    color2 = data2[2]
    
    distance = 9999
    color = 'WHITE'
    edges = []
# See if one is the original node with its connections.
# If so preserve them.   
# nếu k có cạnh nào thì coi node đó là gốc 
    if(len(edges1) > 0):
        edges = edges1
    elif (len(edges2) > 0):
        edges = edges2
# Preserve minimum distance    
    if(distance1 < distance):
        distance = distance1   
    if(distance2 < distance):
        distance = distance2
# Preserve darkest color    
    # if(color1 == 'WHITE'and (color2 == 'GRAY' or color2 == 'BLACK')):
    #     color = color2
    # if(color2 == 'GRAY' and color2 == 'BLACK'):
    #     color = color2
        # Preserve darkest color
    if (color1 == 'WHITE' and (color2 == 'GRAY' or color2 == 'BLACK')):
        color = color2

    if (color1 == 'GRAY' and color2 == 'BLACK'):
        color = color2

    if (color2 == 'WHITE' and (color1 == 'GRAY' or color1 == 'BLACK')):
        color = color1

    if (color2 == 'GRAY' and color1 == 'BLACK'):
        color = color1
        
    return (edges, distance, color)

In [11]:
#Main program here:
iterationRdd = createStartingRDD()
results = iterationRdd.collect()
for result in results:
    print( result)

[1;30;43mKết quả truyền trực tuyến bị cắt bớt đến 5000 dòng cuối.[0m
(3636, ([0, 18237, 18238, 8516, 11084, 11085, 8532, 11339, 11688, 11689, 11690, 10522, 6821, 6822, 6823, 6824], 9999, 'WHITE'))
(3636, ([0, 11693, 8533, 11694, 11695, 8536, 11696, 11697, 8517, 6845, 8538, 8542, 11698, 15310, 7786, 18239], 9999, 'WHITE'))
(3636, ([0, 11047, 6779, 6777, 6780, 6776, 11699, 11700, 14932, 10396, 11701, 9161, 7333, 12165, 10533, 7334], 9999, 'WHITE'))
(3637, ([0, 7471, 7472, 7473, 7474, 7481, 7335, 11785, 7504, 7505, 9848, 9850, 9849, 17825, 9125, 13949], 9999, 'WHITE'))
(3637, ([0, 9285, 13508, 13509, 13510, 13900, 15187], 9999, 'WHITE'))
(3638, ([0, 9243, 7495, 7496, 7499], 9999, 'WHITE'))
(3639, ([0, 12226], 9999, 'WHITE'))
(3640, ([0, 11820, 15024], 9999, 'WHITE'))
(3641, ([0, 13075, 8370, 8371, 8372, 14501, 14502, 7572, 7573, 7575, 7576, 7577, 7175, 9218, 6770], 9999, 'WHITE'))
(3642, ([0, 10494, 10495, 10496, 10497, 10498, 10501, 10500], 9999, 'WHITE'))
(3643, ([0, 10209], 9999, 'WH

In [12]:
for iteration in range(0,10):
    print ("Running BFS iteration" + str(iteration+1))
# Create new vertices as needed to darken or reduce distances in the
# reduce stage. If we encounter the node we're looking for as a GRAY
# node, increment our accumulator to signal that we're done.
# apply bfsMap function and return a new dataframe
    mapped = iterationRdd.flatMap(bfsMap)
    
# Note that mapped.count() action here forces the RDD to be evaluated, and that's the only reason our accumulator is actually updated.
    print("Processing " + str(mapped.count()) +" values")
    
    if(hitCounter.value > 0):
        print("Hit the target character! From " + str(hitCounter.value) \
             +" Different direction(s)")
        break

# Reducer combines data for each character ID, preserving the darkest color and shortest path.
    iterationRdd = mapped.reduceByKey(bfsReduce)
#     f = open('output.txt', 'w')

    results = iterationRdd.collect()
#     for result in results:
#         print (result)
#         f.write(str(result))

Running BFS iteration1
Processing 12824 values
Hit the target character! From 1 Different direction(s)
