## **Number of carries' flights done per year for each origin-dest pair**

**Request** <br/>
In this notebook we are counting the yearly number of flights provided by each carrier for each route. 

In the other notebooks different aspects of the dataset were analysed but no info about the carriers were retrieved so we decided to focus on that feature.<br/>
We decided to keep the absolute value of flights managed by each carrier and not to use the percentage because we noticed a very deep variance of the possible results, some routes are much more covered than others.

### **Initialize PySpark**

In [1]:
# Find Apache Spark on this machine
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Dev mode: False when performing real analytics
DEV = False

### threads to be used to run spark worker nodes locally
spark_local_threads = 2 

# Build a Spark SQL Session for DataFrames
master = 'local[{}]'.format(spark_local_threads)
appName = 'Airport Weekly Penalty'
spark = SparkSession \
    .builder \
    .appName(appName) \
    .master(master) \
    .getOrCreate()


#### **Load Data**

In [3]:
from preprocessing_utils import *
if DEV:
    # DEV preprocessing
    perform_DEV_dataset_preprocessing(spark)
else:
    # Production preprocessing
    perform_dataset_preprocessing(spark)

Starting preprocessing of ../dataset/*.csv.bz2
Preprocessing NOT performed.
Preprocessed dataset already exists: ../dataset/preprocessed_dataset.parquet



In [4]:
# Load the parquet dataset# Load t 
if DEV:
    # Load DEV dataset
    df = load_DEV_preprocessed_dataset(spark)
else:
    # Load production dataset
    df = load_preprocessed_dataset(spark)

Peprocessed dataset loaded.
../dataset/preprocessed_dataset.parquet


In [5]:
# Keep only the dimensions we need
df = df.select(df['Year'], df['Cancelled'],df['Origin'], df['Dest'], df['UniqueCarrier'])
# Explore the data
#df.printSchema()
#df.show(10)

In [6]:
#df.select('UniqueCarrier').distinct().show(30, False)

#### **Compute analytics**

In [7]:
import pyspark.sql.functions as F

# Drop cancelled flights
df = df.drop(df['Cancelled'] == 1)

df = df.withColumn("Count", F.lit(1))

df = df.groupBy(['Year', 'Origin', 'Dest', 'UniqueCarrier' ])                \
       .sum('Count')                               \
       .withColumnRenamed('sum(Count)', 'Count')    

df.show(10)

+----+------+----+-------------+-----+
|Year|Origin|Dest|UniqueCarrier|Count|
+----+------+----+-------------+-----+
|2007|   BNA| TPA|           WN| 1756|
|2007|   LAX| OAK|           WN| 7496|
|2007|   PHX| DTW|           WN|  667|
|2007|   LAS| OKC|           YV|  123|
|2007|   CMH| DCA|           OH|   91|
|2007|   ORD| CVG|           OH| 1878|
|2007|   BOS| BWI|           OH| 2562|
|2007|   CVG| RIC|           OH|  628|
|2007|   SMF| SLC|           OO|  990|
|2007|   ACV| SFO|           OO| 2887|
+----+------+----+-------------+-----+
only showing top 10 rows



In [8]:
# Store output Dataframe (or load it if already existing)
final_dataset = '../dataset/uniqueCarrier_analitics.parquet'

path= Path(final_dataset)
if not path.is_dir():
    df.write.mode('overwrite').save(final_dataset, format='parquet')
df = spark.read.load(final_dataset)

In [9]:
# Output a list of tuples of schema:
# ('Data', 'Percentage')
cancel_data = df.rdd.map(tuple).collect()
#print(cancel_data[:100])

In [10]:
#We create a dictionary where for each origin there are associated all the possible destinations
origins = df.select(F.col('Origin')) \
                    .orderBy('Origin') \
                    .distinct() \
                    .rdd.map(lambda x : x[0]) \
                    .collect()  
originDestDict= {}
for origin in origins:
    airports_temp = df.filter(F.col('Origin') == origin) \
                    .select(F.col('Dest')) \
                    .orderBy('Dest') \
                    .distinct() \
                    .rdd.map(lambda x : x[0]) \
                    .collect()    
    originDestDict[origin] = airports_temp 

In [11]:
# Store output Dictionary (or load it if already existing)
import csv

final_dict = '../dataset/origin_dest_dictionary.csv'

path = Path(final_dict)

if not path.is_file():
    with open(final_dict, 'w') as csvfile:
        csv_writer = csv.writer(csvfile)
        for key, val in originDestDict.items():
            csv_writer.writerow([key, val])
else:
    for key, val in csv.reader(open(path)):
        originDestDict[key] = val

### **Data Visualization**


In [12]:
# Hide warnings if there are any
import warnings
warnings.filterwarnings('ignore')

%matplotlib ipympl

from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as dates
import matplotlib.ticker as ticker



In [13]:
#List of all possible carriers
uniqueCarriers = ['UA', 'AA', 'NW', 'EV', 'B6', 'HP','TW', 'DL', 'OO', 'F9', 'YV', 'TZ', 'US', 'AQ', 'MQ',
                  'OH', 'HA', 'XE', 'DH', 'AS', 'FL', 'CO', 'WN', '9E']

uniqueCarrier_values = [('UA', 0), ('AA', 1) , ('NW', 2), ('EV', 3), ('B6', 4), ('HP', 5),('TW', 6), ('DL', 7), ('OO', 8), 
                        ('F9', 9), ('YV', 10), ('TZ', 11), ('US', 12), ('AQ', 13), ('MQ', 14), ('OH', 15), ('HA', 16), 
                        ('XE', 17), ('DH', 18),('AS', 19), ('FL', 20), ('CO', 21), ('WN', 22), ('9E', 23)]

def get_index(carrier):
    for uc in uniqueCarrier_values:
        if uc[0] == carrier:
              return uc[1]

def get_count_df(airport_origin, airport_dest, years, df):
    rows = df.filter(F.col('Year').isin(*years))\
             .filter(F.col('Origin') == airport_origin)\
             .filter(F.col('Dest') == airport_dest)\
             .select('UniqueCarrier', 'Count' ) \
                .collect()
    
    nb_uniqueCarriers = 24
    data = np.zeros(nb_uniqueCarriers)
    for row in rows:
        uc = get_index(row[0])
        sum_count = row[1]
        data[uc] += sum_count 
        
    columns = [str(c) for c in uniqueCarriers]
    res = pd.DataFrame({'UniqueCarrier count': data}, index=uniqueCarriers)
    return res



def plot_count_carriers(airport_origin, airport_dest, years, df, ax):
    df = get_count_df( airport_origin, airport_dest, years, df)
    title = 'Unique Carrier count for Origin-Dest and Year'
    if df.empty:
        print('No data')
    else:
        df.plot.bar( title=title, rot=90, ax=ax)

In [14]:
def ui_callback(airport_origin, airport_dest, years, df):
    plt.figure(figsize=(9,12))
    plt.clf()
    ax = plt.subplot(211)
    plot_count_carriers(airport_origin, airport_dest,range(years[0], years[1] + 1), df, ax)
    
    plt.subplots_adjust(top=0.92, bottom=0.08, left=0.10, right=0.95, hspace=0.25,
                    wspace=0.35)
    plt.show()


# Years selection range
years = range(1994, 2009)
years = [(str(y), y) for y in years]
years_w = widgets.SelectionRangeSlider(options=years,
                                       index=(0, 2),
                                       description='Years',
                                       continuous_update=False)

In [15]:
#filled with the airports dest of airports_origin[0] a.k.a. 'ABE'
airports_dx = df.filter(F.col('Origin') == 'ABE').select(F.col('Dest')).distinct().rdd.map(lambda x : x[0]).collect()

In [16]:
# Airports
airports_origin = df.select('Origin').distinct().orderBy('Origin').rdd.map(lambda x : x[0]).collect()
airports_dest = df.select('Dest').distinct().orderBy('Dest').rdd.map(lambda x : x[0]).collect()


# Airport selection menu
airports1_w = widgets.Dropdown(options=airports_origin,
                              value=airports_origin[0],
                              description='Airport_Origin')
airports2_w = widgets.Dropdown(options=airports_dx,
                              value=airports_dx[0],
                              description='Airport_Dest')

In [17]:
out = widgets.interactive_output(ui_callback, {'airport_origin': airports1_w,'airport_dest': airports2_w, 'years': years_w, 'df': widgets.fixed(df)})
ui = widgets.HBox([airports1_w, airports2_w, years_w])

We allow to visualize in the Aiport_Dest only the airports that are destination of flights that start from the selected Airport_Origin.<br/>
N.B.: if the graph is empty means that for the selected years that are no flights, selecting all the available years at least a bar will be visualized.

In [18]:
def on_change(change):
    # exclude the origin ariport and all the aiports with count 0 for all the carriers
    global airports2_w
    global df
    airports_dx = originDestDict[change['new']]   
    airports2_w.options = airports_dx
    airports2_w.value = 'N/A' if not airports_dx else airports_dx[0]

airports1_w.observe(on_change, 'value')
display(ui, out)

HBox(children=(Dropdown(description='Airport_Origin', options=('ABE', 'ABI', 'ABQ', 'ABY', 'ACK', 'ACT', 'ACV'…

Output()