## The following section is for Colab Users.
### Just run the following code cells

In [None]:
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://bitbucket.org/habedi/datasets/raw/b6769c4664e7ff68b001e2f43bc517888cbe3642/spark/spark-3.0.2-bin-hadoop2.7.tgz
!tar xf spark-3.0.2-bin-hadoop2.7.tgz
!rm -rf spark-3.0.2-bin-hadoop2.7.tgz*
!pip -q install findspark pyspark graphframes

In [None]:
!wget https://repos.spark-packages.org/graphframes/graphframes/0.8.2-spark3.0-s_2.12/graphframes-0.8.2-spark3.0-s_2.12.jar -P /content/spark-3.0.2-bin-hadoop2.7/jars/
!cp /content/spark-3.0.2-bin-hadoop2.7/jars/graphframes-0.8.2-spark3.0-s_2.12.jar /content/spark-3.0.2-bin-hadoop2.7/graphframes-0.8.2-spark3.0-s_2.12.zip

In [None]:
import os

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.2-bin-hadoop2.7"
os.environ["HADOOP_HOME"] = os.environ["SPARK_HOME"]

os.environ["PYSPARK_DRIVER_PYTHON"] = "jupyter"
os.environ["PYSPARK_DRIVER_PYTHON_OPTS"] = "notebook"
os.environ["PYSPARK_SUBMIT_ARGS"] = "--master local[*] pyspark-shell"

In [None]:
import findspark
findspark.init()

In [None]:
!export PYSPARK_SUBMIT_ARGS="--master local[*] pyspark-shell"
!export PYSPARK_DRIVER_PYTHON=jupyter
!export PYSPARK_DRIVER_PYTHON_OPTS=notebook

In [None]:
from pyspark.sql import SparkSession
from graphframes import *

spark = SparkSession.builder.master("local[*]").appName("GraphFrames").getOrCreate()

In [None]:
os.environ["PYSPARK_SUBMIT_ARGS"] = "--packages graphframes:graphframes:0.8.1-spark3.0-s_2.12 pyspark-shell"

**************************************************************************
**************************************************************************
**************************************************************************

In [None]:
from IPython.display import display, HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

### Read departuredelays.csv in Edge DataFrame
### Read airport-codes-na.txt in Vertix DataFrame (the separator is Tab i.e sep = '\t' )

#### The US flight delays data set has five columns:
- The <b>date</b> column contains an integer like 02190925 . When converted, this maps to 02-19 09:25 am.
- The <b>delay</b> column gives the delay in minutes between the scheduled and actual departure times. Early departures show negative numbers.
- The <b>distance</b> column gives the distance in miles from the origin airport to the destination airport.
- The <b>origin</b> column contains the origin IATA airport code.
- The <b>destination</b> column contains the destination IATA airport code.

#### The airport-codes data set has four columns:
- The <b>IATA</b> column contains IATA airport code.
- The <b>City, State, and Country</b> columns contains information about the airport location. 

In [None]:
edge = spark.read.csv('departuredelays.csv', header=True, inferSchema=True)

vertix = spark.read.csv('airport-codes-na.txt', sep='\t', header=True, inferSchema=True)

In [None]:
vertix.show(5)

+----------+-----+-------+----+
|      City|State|Country|IATA|
+----------+-----+-------+----+
|Abbotsford|   BC| Canada| YXX|
|  Aberdeen|   SD|    USA| ABR|
|   Abilene|   TX|    USA| ABI|
|     Akron|   OH|    USA| CAK|
|   Alamosa|   CO|    USA| ALS|
+----------+-----+-------+----+
only showing top 5 rows



In [None]:
edge.show(5)

+-------+-----+--------+------+-----------+
|   date|delay|distance|origin|destination|
+-------+-----+--------+------+-----------+
|1011245|    6|     602|   ABE|        ATL|
|1020600|   -8|     369|   ABE|        DTW|
|1021245|   -2|     602|   ABE|        ATL|
|1020605|   -4|     602|   ABE|        ATL|
|1031245|   -4|     602|   ABE|        ATL|
+-------+-----+--------+------+-----------+
only showing top 5 rows



In [None]:
vertix.printSchema(),edge.printSchema()

root
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- IATA: string (nullable = true)

root
 |-- date: integer (nullable = true)
 |-- delay: integer (nullable = true)
 |-- distance: integer (nullable = true)
 |-- origin: string (nullable = true)
 |-- destination: string (nullable = true)



(None, None)

### In the vertix DataFrame, drop any duplicated rows with the same  IATA code.

In [None]:
# vertix
Vertix = vertix.dropDuplicates(subset=['IATA'])

### In the edges DataFrame:
- Rename the <b>date</b> columns to become <b>tripid</b>.
- Rename the <b>origin</b> columns to become <b>src</b>.
- Rename the <b>destination</b> columns to become <b>dst</b>.

In [None]:
Edge = edge.withColumnRenamed('date', 'tripid')\
              .withColumnRenamed('origin', 'src')\
                    .withColumnRenamed('destination', 'dst')

### In the Vertix DataFrame:
- Rename the <b>IATA</b> columns to become <b>id</b>.

In [None]:
Vertix = Vertix.withColumnRenamed('IATA', 'id')

### Create GraphFrame from Vertix and Edges DataFrames

In [None]:
graph = GraphFrame(Vertix, Edge)

In [None]:
graph.vertices.show(5)
graph.edges.show(5)

+-------------------+-----+-------+---+
|               City|State|Country| id|
+-------------------+-----+-------+---+
|         Binghamton|   NY|    USA|BGM|
|            Lebanon|   NH|    USA|LEB|
|           Montreal|   PQ| Canada|YUL|
|         Dillingham|   AK|    USA|DLG|
|International Falls|   MN|    USA|INL|
+-------------------+-----+-------+---+
only showing top 5 rows

+-------+-----+--------+---+---+
| tripid|delay|distance|src|dst|
+-------+-----+--------+---+---+
|1011245|    6|     602|ABE|ATL|
|1020600|   -8|     369|ABE|DTW|
|1021245|   -2|     602|ABE|ATL|
|1020605|   -4|     602|ABE|ATL|
|1031245|   -4|     602|ABE|ATL|
+-------+-----+--------+---+---+
only showing top 5 rows



In [None]:
graph.degrees.show(5)
graph.inDegrees.show(5)
graph.outDegrees.show(5)

+---+------+
| id|degree|
+---+------+
|PSE|   192|
|INL|    89|
|MSY| 20560|
|PPG|    27|
|GEG|  4087|
+---+------+
only showing top 5 rows

+---+--------+
| id|inDegree|
+---+--------+
|PSE|     192|
|INL|      89|
|MSY|   10283|
|PPG|      27|
|GEG|    2043|
+---+--------+
only showing top 5 rows

+---+---------+
| id|outDegree|
+---+---------+
|MSY|    10277|
|GEG|     2044|
|BUR|     5079|
|SNA|     9411|
|GRB|     1109|
+---+---------+
only showing top 5 rows



### Determine the number of airports

In [None]:
print(f'number of airports {graph.vertices.count()}')

number of airports 524


### Determine the number of trips 

In [None]:
print(f'number of trips {graph.edges.count()}')

number of trips 1391578


### What is the longest delay?

In [None]:
from pyspark.sql.functions import max as maxFn, desc, count as countFn

In [None]:
graph.edges.agg(maxFn('delay').alias('longest_delay')).show()

+-------------+
|longest_delay|
+-------------+
|         1642|
+-------------+



### Find out the number of delayed flights vs. early flights (flights that departed before actual time)

In [None]:
delayed = graph.edges.filter(graph.edges.delay > 0)
early = graph.edges.filter(graph.edges.delay <= 0)

print(f"number of early={early.count()}, number of delay={delayed.count()}")

number of early=799851, number of delay=591727


### What flight destinations departing SFO are most likely to have significant delays? Select the top 10
#### Hint: you should get the average delay for each destination for trips that depart from SFO only

In [None]:
delayed.filter(graph.edges.src == 'SFO')\
        .groupBy('src','dst').avg('delay')\
        .alias('avg delay')\
        .show(10)

+---+---+------------------+
|src|dst|        avg(delay)|
+---+---+------------------+
|SFO|TUS|39.853658536585364|
|SFO|BOI| 42.65482233502538|
|SFO|MSY|35.421052631578945|
|SFO|STL| 38.13513513513514|
|SFO|SMF|            34.936|
|SFO|MRY| 40.61764705882353|
|SFO|EUG|37.573913043478264|
|SFO|PIT|            43.875|
|SFO|ASE|44.285714285714285|
|SFO|MCI| 34.68571428571428|
+---+---+------------------+
only showing top 10 rows



### Find the Incoming connections to the airport sorted in Desc. order.

In [None]:
income = graph.inDegrees.orderBy('inDegree', ascending=False)
income.show()

+---+--------+
| id|inDegree|
+---+--------+
|ATL|   90434|
|DFW|   66050|
|ORD|   61967|
|LAX|   53601|
|DEN|   50921|
|IAH|   42700|
|PHX|   39721|
|SFO|   38988|
|LAS|   32994|
|CLT|   28388|
|MCO|   27959|
|EWR|   27652|
|LGA|   25469|
|BOS|   25360|
|SLC|   25323|
|JFK|   23484|
|DTW|   23310|
|SEA|   23074|
|MSP|   22385|
|MIA|   21805|
+---+--------+
only showing top 20 rows



### Find the Outgoing connections from the airport sorted in Desc. order.

In [None]:
outcome = graph.outDegrees.orderBy('outDegree', ascending=False)
outcome.show()

+---+---------+
| id|outDegree|
+---+---------+
|ATL|    91484|
|DFW|    68482|
|ORD|    64228|
|LAX|    54086|
|DEN|    53148|
|IAH|    43361|
|PHX|    40155|
|SFO|    39483|
|LAS|    33107|
|CLT|    28402|
|MCO|    28313|
|EWR|    27656|
|SLC|    25868|
|LGA|    25458|
|BOS|    25348|
|MSP|    24031|
|JFK|    23572|
|DTW|    23421|
|SEA|    23078|
|MIA|    21817|
+---+---------+
only showing top 20 rows



### Use motif finding to answer this question: which delays could we blame on SFO?
#### Hint: this practically means that SFO is a transit station

In [None]:
motif = graph.find("(v1)-[t1]->(v2); (v2)-[t2]->(v3)")\
              .filter("v2.id == 'SFO'")\
              .filter("t1.delay > 500 or t2.delay > 500")\
              .filter("t2.tripid > t1.tripid")\
              .filter("t2.tripid < t1.tripid + 10000")

In [None]:
motif.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                  v1|                  t1|                  v2|                  t2|                  v3|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|[Albuquerque, NM,...|[1020600, 0, 779,...|[San Francisco, C...|[1021507, 536, 22...|[New York, NY, US...|
|[Albuquerque, NM,...|[1210815, -12, 77...|[San Francisco, C...|[1211508, 593, 22...|[New York, NY, US...|
|[Eureka, CA, USA,...|[1011635, -15, 21...|[San Francisco, C...|[1021507, 536, 22...|[New York, NY, US...|
|[Eureka, CA, USA,...|[1012016, -4, 217...|[San Francisco, C...|[1021507, 536, 22...|[New York, NY, US...|
|[Eureka, CA, USA,...|[1020531, -2, 217...|[San Francisco, C...|[1021507, 536, 22...|[New York, NY, US...|
|[Eureka, CA, USA,...|[1020948, -11, 21...|[San Francisco, C...|[1021507, 536, 22...|[New York, NY, US...|
|[Eureka, CA, USA,...|[1021506, -3, 2

### Determine Airport Ranking in Desc. order using PageRank algorithm

In [None]:
rank = graph.pageRank(resetProbability=0.13, maxIter=7)
rank.vertices.orderBy(rank.vertices.pagerank, acending=False).show()

+---------------+-----+-------+---+-------------------+
|           City|State|Country| id|           pagerank|
+---------------+-----+-------+---+-------------------+
|         Clovis|   NM|    USA|CVN|0.25017083543043683|
|  Fort McMurray|   AB| Canada|YMM|0.25017083543043683|
|        Jackson|   TN|    USA|MKL|0.25017083543043683|
|     Greenbrier|   WV|    USA|LWB|0.25017083543043683|
|      El Dorado|   AR|    USA|ELD|0.25017083543043683|
|Fort Saint John|   BC| Canada|YXJ|0.25017083543043683|
|      Vancouver|   BC| Canada|YVR|0.25017083543043683|
|         Quincy|   IL|    USA|UIN|0.25017083543043683|
|      Lancaster|   PA|    USA|LNS|0.25017083543043683|
|       Ironwood|   MI|    USA|IWD|0.25017083543043683|
|     Binghamton|   NY|    USA|BGM|0.25017083543043683|
|     Ogdensburg|   NY|    USA|OGS|0.25017083543043683|
|         Eureka|   KS|    USA|TOP|0.25017083543043683|
|  Rouyn-Noranda|   PQ| Canada|YUY|0.25017083543043683|
|     Bar Harbor|   ME|    USA|BHB|0.25017083543

## Determine the most popular flights (single city hops)

In [None]:
popFlights = graph.edges.groupBy("src", "dst").agg(countFn("delay").alias("popular flights")).orderBy('popular flights', ascending=False)

In [None]:
popFlights.show()

+---+---+---------------+
|src|dst|popular flights|
+---+---+---------------+
|SFO|LAX|           3232|
|LAX|SFO|           3198|
|LAS|LAX|           3016|
|LAX|LAS|           2964|
|JFK|LAX|           2720|
|LAX|JFK|           2719|
|ATL|LGA|           2501|
|LGA|ATL|           2500|
|LAX|PHX|           2394|
|PHX|LAX|           2387|
|HNL|OGG|           2380|
|OGG|HNL|           2379|
|LAX|SAN|           2215|
|SAN|LAX|           2214|
|SJC|LAX|           2208|
|LAX|SJC|           2201|
|ATL|MCO|           2136|
|MCO|ATL|           2090|
|JFK|SFO|           2084|
|SFO|JFK|           2084|
+---+---+---------------+
only showing top 20 rows



### Find and Save a Subragph that obtained from the following pattern:
#### The flight starts from an airport and return back to the same airport through 2 other airports.

In [None]:
subgraph = graph.find("(v1)-[t1]->(v2); (v2)-[t2]->(v3); (v3)-[t3]->(v1)")
              # .filter("v2.id == 'SFO'")\
              # .filter("t1.delay > 500 or t2.delay > 500")\
              # .filter("t2.tripid > t1.tripid")\
              # .filter("t2.tripid < t1.tripid + 10000")