# Graph

## 환경설정

In [None]:
!wget -q https://dlcdn.apache.org/spark/spark-3.2.4/spark-3.2.4-bin-hadoop3.2.tgz
!tar xf spark-3.2.4-bin-hadoop3.2.tgz
!pip install -q findspark graphframes

In [None]:
import findspark

# Point findspark at the Spark distribution unpacked under /content.
findspark.init(spark_home="/content/spark-3.2.4-bin-hadoop3.2")

In [None]:
from pyspark.sql import SparkSession
# Create (or reuse) the Spark session with 8g of driver memory and the
# GraphFrames package listed in spark.jars.packages.
spark = (
    SparkSession.builder
    .config("spark.driver.memory", "8g")
    .config("spark.jars.packages", "graphframes:graphframes:0.8.2-spark3.2-s_2.12")
    .getOrCreate()
)

# Handle to the underlying SparkContext of the session.
sc = spark.sparkContext

In [None]:
from google.colab import drive

# Mount Google Drive at /gdrive.
drive.mount(mountpoint='/gdrive')

# Directory prefix that the data-loading cells prepend to each file name.
gpath = '/gdrive/MyDrive/data/'

In [None]:
from pyspark.sql.functions import col, lit, when
from graphframes import *
import networkx as nx
import matplotlib.pylab as plt

## Pagerank

In [None]:
# Vertex DataFrame: GraphFrame expects the unique identifier in column "id".
v = spark.createDataFrame(
    data=[
        ("a", "Alice", 34),
        ("b", "Bob", 36),
        ("c", "Charlie", 30),
    ],
    schema=["id", "name", "age"],
)

# Edge DataFrame: each row is a directed edge from "src" to "dst".
e = spark.createDataFrame(
    data=[
        ("a", "b", "friend"),
        ("b", "c", "follow"),
        ("c", "b", "follow"),
    ],
    schema=["src", "dst", "relationship"],
)

# Combine vertices and edges into a GraphFrame.
g = GraphFrame(v, e)

In [None]:
def PlotGraph(edge_list, figsize_=(8,5)):
    """Draw the directed graph described by an edge DataFrame.

    Args:
        edge_list: Object supporting ``select('src', 'dst').collect()`` whose
            rows can be indexed by ``'src'`` and ``'dst'``.
        figsize_: Size passed to ``plt.figure`` as ``figsize``.
    """
    # Bring the edge endpoints to the driver; networkx works on local data.
    endpoint_rows = edge_list.select('src','dst').collect()

    Gplot = nx.DiGraph()
    Gplot.add_edges_from((row['src'], row['dst']) for row in endpoint_rows)

    plt.figure(figsize=figsize_)
    # Draw into the first cell of a 1-row, 2-column subplot grid.
    plt.subplot(121)
    nx.draw(Gplot, with_labels=True, font_weight='bold')

In [None]:
# Draw the current graph from its edge DataFrame.
PlotGraph(g.edges)

In [None]:
# Display the in-degree (count of incoming edges) per vertex.
g.inDegrees.show()

In [None]:
# Display the out-degree (count of outgoing edges) per vertex.
g.outDegrees.show()

In [None]:
# Run PageRank for a fixed 5 iterations with reset probability 0.15.
results = g.pageRank(maxIter=5, resetProbability=0.15)

# Show the resulting score of every vertex.
results.vertices.select(["id", "pagerank"]).show()

## 예제 : Citation network

In [None]:
# load data
# Read the header-less node label file and name its two columns.
nodeDF = (
    spark.read
    .option('header', 'false')
    .csv(gpath + 'citeseer.node_labels')
    .toDF("id", "label")
)

nodeDF.show()

In [None]:
# Read the header-less edge file; the third column is given the throwaway
# name "_".
edgeDF = (
    spark.read
    .option('header', 'false')
    .csv(gpath + 'citeseer.edges')
    .toDF("src", "dst", "_")
)

edgeDF.show()

In [None]:
# Keep only the columns used to build the graph: vertex ids and the two
# edge endpoints.
v = nodeDF.select(col('id'))
e = edgeDF.select(col('src'), col('dst'))

# Build the citation GraphFrame from the trimmed DataFrames.
g = GraphFrame(v, e)

In [None]:
# Five vertices with the highest in-degree.
g.inDegrees.sort("inDegree", ascending=False).show(5)

In [None]:
# Five vertices with the highest out-degree.
g.outDegrees.sort("outDegree", ascending=False).show(5)

In [None]:
# Run PageRank for a fixed 5 iterations with reset probability 0.15.
results = g.pageRank(maxIter=5, resetProbability=0.15)

# Show the five highest-scoring vertices.
(
    results.vertices
    .select("id", "pagerank")
    .sort("pagerank", ascending=False)
    .show(5)
)

## 도전과제 : dolphins.csv

In [None]:
filename = 'dolphins.csv'

# Load the header-less CSV and name its four columns. Columns 0 and 2 are
# used as the edge endpoints; the other two get distinct placeholder names
# (previously both were "_", which makes any later reference to "_"
# ambiguous).
dolphinDF = (
    spark.read
    .option('header', 'false')
    .csv(gpath + filename)
    .toDF("src", "_c1", "dst", "_c3")
)

dolphinDF.show(5)

In [None]:
# WRITE YOUR CODE HERE


## 도전과제 : lesmis.csv

In [None]:
filename = 'lesmis.csv'

# Load the header-less CSV and name its four columns. Columns 0 and 2 are
# used as the edge endpoints; the other two get distinct placeholder names
# (previously both were "_", which makes any later reference to "_"
# ambiguous).
lesmisDF = (
    spark.read
    .option('header', 'false')
    .csv(gpath + filename)
    .toDF("src", "_c1", "dst", "_c3")
)

lesmisDF.show(5)

In [None]:
# WRITE YOUR CODE HERE
