In [None]:
# Attach Google Drive to the Colab runtime; the data files read later in this
# notebook live under the mounted directory.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:

!pip install graphframes

Collecting graphframes
  Downloading graphframes-0.6-py2.py3-none-any.whl.metadata (934 bytes)
Collecting nose (from graphframes)
  Downloading nose-1.3.7-py3-none-any.whl.metadata (1.7 kB)
Downloading graphframes-0.6-py2.py3-none-any.whl (18 kB)
Downloading nose-1.3.7-py3-none-any.whl (154 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.7/154.7 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nose, graphframes
Successfully installed graphframes-0.6 nose-1.3.7


In [None]:
from pyspark.sql import SparkSession
from graphframes import GraphFrame

In [None]:
# Create (or reuse) the Spark session. The "spark.jars.packages" setting asks
# Spark to fetch the GraphFrames JAR identified by the given coordinates.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "graphframes:graphframes:0.8.2-spark3.2-s_2.12")
    .getOrCreate()
)

### Read departuredelays.csv into the edge DataFrame
### Read airport-codes-na.txt into the vertex DataFrame (the separator is a tab, i.e. sep = '\t')

#### The US flight delays data set has five columns:
- The <b>date</b> column contains an integer like 02190925 . When converted, this maps to 02-19 09:25 am.
- The <b>delay</b> column gives the delay in minutes between the scheduled and actual departure times. Early departures show negative numbers.
- The <b>distance</b> column gives the distance in miles from the origin airport to the destination airport.
- The <b>origin</b> column contains the origin IATA airport code.
- The <b>destination</b> column contains the destination IATA airport code.

#### The airport-codes data set has four columns:
- The <b>IATA</b> column contains IATA airport code.
- The <b>City, State, and Country</b> columns contain information about the airport location.

In [None]:
# Load the flight (edge) data with an explicit schema. Without a schema,
# spark.read.csv types every column as a string, so numeric aggregates on
# `delay` (e.g. max) would compare text lexicographically instead of numerically.
# `date` is deliberately kept as a string: values such as 01011245 carry a
# leading zero that an integer type would drop.
edg_df = spark.read.csv(
    "/content/drive/MyDrive/Data (1)/Data/departuredelays.csv",
    header=True,
    schema="date STRING, delay INT, distance INT, origin STRING, destination STRING",
)

# Load the airport (vertex) data; the file is tab-separated and every column is text.
ver_df = spark.read.csv(
    "/content/drive/MyDrive/Data (1)/Data/airport-codes-na.txt",
    sep='\t',
    header=True,
)

In [None]:
# Preview the first five flight records.
edg_df.show(n=5)

+--------+-----+--------+------+-----------+
|    date|delay|distance|origin|destination|
+--------+-----+--------+------+-----------+
|01011245|    6|     602|   ABE|        ATL|
|01020600|   -8|     369|   ABE|        DTW|
|01021245|   -2|     602|   ABE|        ATL|
|01020605|   -4|     602|   ABE|        ATL|
|01031245|   -4|     602|   ABE|        ATL|
+--------+-----+--------+------+-----------+
only showing top 5 rows



In [None]:
# Preview the first five airport records.
ver_df.show(n=5)

+----------+-----+-------+----+
|      City|State|Country|IATA|
+----------+-----+-------+----+
|Abbotsford|   BC| Canada| YXX|
|  Aberdeen|   SD|    USA| ABR|
|   Abilene|   TX|    USA| ABI|
|     Akron|   OH|    USA| CAK|
|   Alamosa|   CO|    USA| ALS|
+----------+-----+-------+----+
only showing top 5 rows



### In the vertex DataFrame, drop any duplicated rows with the same IATA code.

In [None]:
# Keep a single row per IATA code; rows are considered duplicates when their
# IATA values match, regardless of the other columns.
dropped_df = ver_df.drop_duplicates(subset=["IATA"])

### In the edges DataFrame:
- Rename the <b>date</b> column to <b>tripid</b>.
- Rename the <b>origin</b> column to <b>src</b>.
- Rename the <b>destination</b> column to <b>dst</b>.

In [None]:
# Rename the edge columns step by step; each step builds on the previous frame.
# After the last step `dst` holds the fully renamed edge DataFrame
# (tripid, delay, distance, src, dst).
tipid = edg_df.withColumnRenamed(existing="date", new="tripid")
src = tipid.withColumnRenamed(existing="origin", new="src")
dst = src.withColumnRenamed(existing="destination", new="dst")

### In the Vertix DataFrame:
- Rename the <b>IATA</b> column to <b>id</b>.

In [None]:
# Build the vertex table from the de-duplicated airports (`dropped_df`) rather
# than the raw `ver_df`, so that each IATA code produces exactly one vertex.
# The key column is renamed to "id" for use as the graph's vertex identifier.
# NOTE: the variable name `id` shadows Python's builtin id(); it is kept because
# the GraphFrame construction cell refers to it by this name.
id = dropped_df.withColumnRenamed("IATA", "id")

### Create a GraphFrame from the vertex and edge DataFrames

In [None]:
# Assemble the graph: `id` is the vertex DataFrame (airports) and `dst` is the
# edge DataFrame (flights) produced by the rename cells.
graph = GraphFrame(v=id, e=dst)



In [None]:
# Display the first 20 vertices (airports) of the graph.
graph.vertices.show(n=20)

+-----------+-----+-------+---+
|       City|State|Country| id|
+-----------+-----+-------+---+
| Abbotsford|   BC| Canada|YXX|
|   Aberdeen|   SD|    USA|ABR|
|    Abilene|   TX|    USA|ABI|
|      Akron|   OH|    USA|CAK|
|    Alamosa|   CO|    USA|ALS|
|     Albany|   GA|    USA|ABY|
|     Albany|   NY|    USA|ALB|
|Albuquerque|   NM|    USA|ABQ|
| Alexandria|   LA|    USA|AEX|
|  Allentown|   PA|    USA|ABE|
|   Alliance|   NE|    USA|AIA|
|     Alpena|   MI|    USA|APN|
|    Altoona|   PA|    USA|AOO|
|   Amarillo|   TX|    USA|AMA|
|Anahim Lake|   BC| Canada|YAA|
|  Anchorage|   AK|    USA|ANC|
|   Appleton|   WI|    USA|ATW|
|     Arviat|  NWT| Canada|YEK|
|  Asheville|   NC|    USA|AVL|
|      Aspen|   CO|    USA|ASE|
+-----------+-----+-------+---+
only showing top 20 rows



In [None]:
# Display the first 20 edges (flights) of the graph.
graph.edges.show(n=20)

+--------+-----+--------+---+---+
|  tripid|delay|distance|src|dst|
+--------+-----+--------+---+---+
|01011245|    6|     602|ABE|ATL|
|01020600|   -8|     369|ABE|DTW|
|01021245|   -2|     602|ABE|ATL|
|01020605|   -4|     602|ABE|ATL|
|01031245|   -4|     602|ABE|ATL|
|01030605|    0|     602|ABE|ATL|
|01041243|   10|     602|ABE|ATL|
|01040605|   28|     602|ABE|ATL|
|01051245|   88|     602|ABE|ATL|
|01050605|    9|     602|ABE|ATL|
|01061215|   -6|     602|ABE|ATL|
|01061725|   69|     602|ABE|ATL|
|01061230|    0|     369|ABE|DTW|
|01060625|   -3|     602|ABE|ATL|
|01070600|    0|     369|ABE|DTW|
|01071725|    0|     602|ABE|ATL|
|01071230|    0|     369|ABE|DTW|
|01070625|    0|     602|ABE|ATL|
|01071219|    0|     569|ABE|ORD|
|01080600|    0|     369|ABE|DTW|
+--------+-----+--------+---+---+
only showing top 20 rows



### Determine the number of airports

In [None]:
# Each row of the vertex DataFrame is one airport, so the row count is the answer.
airport_vertices = graph.vertices
num_airports = airport_vertices.count()
print("Number of airports:", num_airports)

Number of airports: 526


### Determine the number of trips

In [None]:
# Each row of the edge DataFrame is one trip, so the row count is the answer.
trip_edges = graph.edges
num_trips = trip_edges.count()
print("Number of trips:", num_trips)

Number of trips: 1391578


### What is the longest delay?

In [None]:
# Cast `delay` to INT inside the aggregate. If the column arrives as a string
# (a CSV read without a schema), max() compares text lexicographically, so e.g.
# "995" would sort after "1500" and the reported maximum would be wrong.
# The cast is a no-op when the column is already numeric.
# (The variable name keeps its original spelling so existing references still work.)
lonest_delay = graph.edges.selectExpr("max(CAST(delay AS INT)) AS max_delay").first()[0]
print("Longest delay:", lonest_delay)

Longest delay: 995


### Find out the number of delayed flights vs. early flights (flights that departed before the scheduled time)

In [None]:
# A positive delay means the flight left late; a negative delay means it left
# early. Flights with a delay of exactly zero fall into neither count.
flight_edges = graph.edges
num_delayed = flight_edges.where("delay > 0").count()
num_early = flight_edges.where("delay < 0").count()
print("Number of delayed flights:", num_delayed)
print("Number of early flights:", num_early)

Number of delayed flights: 591727
Number of early flights: 668729


### What flight destinations departing SFO are most likely to have significant delays? Select the top 10
#### Hint: you should get the average delay for each destination for trips that depart from SFO only

In [None]:
# Average departure delay per destination, considering only flights that leave
# SFO, ranked from worst to best and truncated to the ten worst destinations.
sfo_destinations = (
    graph.edges
    .where("src = 'SFO'")
    .groupBy("dst")
    .agg({"delay": "avg"})
    .sort("avg(delay)", ascending=False)
    .limit(10)
)
sfo_destinations.show()

+---+------------------+
|dst|        avg(delay)|
+---+------------------+
|JAC| 30.78846153846154|
|OKC|24.822222222222223|
|SUN|22.696629213483146|
|COS| 22.58888888888889|
|SAT|             22.16|
|STL|         20.203125|
|HNL|19.982608695652175|
|ASE|19.846153846153847|
|CEC|19.089820359281436|
|MDW|18.771929824561404|
+---+------------------+



### Find the Incoming connections to the airport sorted in Desc. order.

In [None]:
# In-degree = number of edges (flights) arriving at each airport, busiest first.
incoming_connections = graph.inDegrees.sort("inDegree", ascending=False)
incoming_connections.show()



+---+--------+
| id|inDegree|
+---+--------+
|ATL|   90434|
|DFW|   66050|
|ORD|   61967|
|LAX|   53601|
|DEN|   50921|
|IAH|   42700|
|PHX|   39721|
|SFO|   38988|
|LAS|   32994|
|CLT|   28388|
|MCO|   27959|
|EWR|   27652|
|LGA|   25469|
|BOS|   25360|
|SLC|   25323|
|JFK|   23484|
|DTW|   23310|
|SEA|   23074|
|MSP|   22385|
|MIA|   21805|
+---+--------+
only showing top 20 rows



### Find the Outgoing connections from the airport sorted in Desc. order.

In [None]:
# Out-degree = number of edges (flights) leaving each airport, busiest first.
outgoing_connections = graph.outDegrees.sort("outDegree", ascending=False)
outgoing_connections.show()



+---+---------+
| id|outDegree|
+---+---------+
|ATL|    91484|
|DFW|    68482|
|ORD|    64228|
|LAX|    54086|
|DEN|    53148|
|IAH|    43361|
|PHX|    40155|
|SFO|    39483|
|LAS|    33107|
|CLT|    28402|
|MCO|    28313|
|EWR|    27656|
|SLC|    25868|
|LGA|    25458|
|BOS|    25348|
|MSP|    24031|
|JFK|    23572|
|DTW|    23421|
|SEA|    23078|
|MIA|    21817|
+---+---------+
only showing top 20 rows



### Use motif finding to answer this question: which delays could we blame on SFO?
#### Hint: this practically means that SFO is a transit station

In [None]:
# Which delays could be blamed on SFO acting as a transit airport?
#
# The motif (a)-[ab]->(b); (b)-[bc]->(c) describes an inbound flight `ab`
# followed by an outbound flight `bc` through a common airport `b`. The filters
# then pin `b` to SFO, require a significant delay on either leg, and require
# the outbound leg to depart after the inbound one within a bounded window.
#
# Only flights touching SFO can take part in such a pattern, so the edges are
# restricted first; running the two-hop motif over every flight in the data set
# is an unbounded join that does not finish in a reasonable time.
sfo_edges = graph.edges.filter("src = 'SFO' OR dst = 'SFO'")
sfo_graph = GraphFrame(graph.vertices, sfo_edges)

# Casts are explicit so the comparisons are numeric even when the CSV columns
# were loaded as strings. `tripid` is formatted MMDDHHMM, so adding 10000
# advances the day field by one (roughly "within the following day"; month
# boundaries are not handled). The 500-minute threshold is a tunable choice.
blame_on_sfo = (
    sfo_graph.find("(a)-[ab]->(b); (b)-[bc]->(c)")
    .filter("b.id = 'SFO'")
    .filter("CAST(ab.delay AS INT) > 500 OR CAST(bc.delay AS INT) > 500")
    .filter("CAST(bc.tripid AS INT) > CAST(ab.tripid AS INT)")
    .filter("CAST(bc.tripid AS INT) < CAST(ab.tripid AS INT) + 10000")
)
blame_on_sfo.show(2)

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/socket.py", line 718, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

### Determine Airport Ranking in Desc. order using PageRank algorithm

In [None]:
# Run PageRank for a fixed number of iterations (5) with a reset probability of
# 0.15, then list the airports from highest to lowest rank.
airport_ranking = graph.pageRank(
    resetProbability=0.15,
    maxIter=5,
)
ranked_airports = airport_ranking.vertices.sort("pagerank", ascending=False)
ranked_airports.show()

## Determine the most popular flights (single city hops)

In [None]:
# Count trips per (origin, destination) pair and rank the pairs by frequency.
trips_per_route = graph.edges.groupBy("src", "dst").count()
most_popular_flights = trips_per_route.sort("count", ascending=False)
most_popular_flights.show()

### Find and save a subgraph obtained from the following pattern:
#### The flight starts from an airport and returns to the same airport through 2 other airports.

In [None]:
# Find round trips a -> b -> c -> a that pass through two other airports.
#
# The full graph has one edge per individual trip, so the same airport pair is
# connected by many parallel edges. A three-edge motif over that multigraph
# multiplies those parallel edges together and does not finish in a reasonable
# time. The pattern only asks which airports are connected, so the edges are
# first collapsed to distinct (src, dst) routes.
route_edges = graph.edges.select("src", "dst").distinct()
route_graph = GraphFrame(graph.vertices, route_edges)

# The edges in the pattern are anonymous, so the result holds only the three
# vertex columns a, b and c.
subgraph = route_graph.find("(a)-[]->(b); (b)-[]->(c); (c)-[]->(a)")

# Persist the result next to the input data, as the task asks for the subgraph
# to be saved. The output location is a suggestion; adjust it as needed.
subgraph.write.mode("overwrite").parquet(
    "/content/drive/MyDrive/Data (1)/Data/round_trip_subgraph.parquet"
)
subgraph.show()


ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/socket.py", line 718, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 