## **Additional data analysis**

1) Per ogni aereoporto e per ogni settimana, percentuale di voli gestiti da ogni compagnia aerea

2)Per ogni compagnia aerea, per ogni tratta, quanti voli fa annualmente su quella tratta
Selezini comagnia aerea, anno.. per ogni tratta un bar plot con numero voli

#Per ogni aeroporto la percentuale di voli gestiti da ciascuna comagnia aerea a settimana
#Per ogni anno-compagnia aerea, per ogni tratta il valore di quante volte é stata servita

### **Initialize PySpark**

In [1]:
# Find Apache Spark on this machine
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Dev mode: False when performing real analytics
DEV = True

### threads to be used to run spark worker nodes locally
spark_local_threads = 2 

# Build a Spark SQL Session for DataFrames
master = 'local[{}]'.format(spark_local_threads)
appName = 'Airport Weekly Penalty'
spark = SparkSession \
    .builder \
    .appName(appName) \
    .master(master) \
    .getOrCreate()


#### **Load Data**

In [3]:
from preprocessing_utils import *
if DEV:
    # DEV preprocessing
    perform_DEV_dataset_preprocessing(spark)
else:
    # Production preprocessing
    perform_dataset_preprocessing(spark)

--------- DEV mode ON ---------
Starting preprocessing of ../dataset/1994.csv.bz2
Preprocessing NOT performed.
Preprocessed dataset already exists: ../dataset/preprocessed_dataset_1994.parquet



In [4]:

# Load the parquet dataset# Load t 
if DEV:
    # Load DEV dataset
    df = load_DEV_preprocessed_dataset(spark)
else:
    # Load production dataset
    df = load_preprocessed_dataset(spark)

--------- DEV mode ON ---------
Peprocessed dataset loaded.
../dataset/preprocessed_dataset_1994.parquet


In [5]:
# Keep only the dimensions we need
df = df.select(df['Year'], df['Cancelled'], \
               df['Origin'], df['Dest'],\
              df['UniqueCarrier'])
# Explore the data
df.printSchema()
df.show(10)

root
 |-- Year: integer (nullable = true)
 |-- Cancelled: integer (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- UniqueCarrier: string (nullable = true)

+----+---------+------+----+-------------+
|Year|Cancelled|Origin|Dest|UniqueCarrier|
+----+---------+------+----+-------------+
|1994|        0|   CLT| ORF|           US|
|1994|        0|   CLT| ORF|           US|
|1994|        0|   CLT| ORF|           US|
|1994|        0|   CLT| ORF|           US|
|1994|        0|   CLT| ORF|           US|
|1994|        1|   CLT| ORF|           US|
|1994|        0|   CLT| ORF|           US|
|1994|        0|   CLT| ORF|           US|
|1994|        0|   CLT| ORF|           US|
|1994|        0|   CLT| ORF|           US|
+----+---------+------+----+-------------+
only showing top 10 rows



In [6]:
df.describe('UniqueCarrier').show()
df.select('UniqueCarrier').distinct().show()

+-------+-------------+
|summary|UniqueCarrier|
+-------+-------------+
|  count|      5180048|
|   mean|         null|
| stddev|         null|
|    min|           AA|
|    max|           WN|
+-------+-------------+

+-------------+
|UniqueCarrier|
+-------------+
|           UA|
|           AA|
|           NW|
|           HP|
|           TW|
|           DL|
|           US|
|           AS|
|           CO|
|           WN|
+-------------+



#### **Compute analytics**

In [7]:
import pyspark.sql.functions as F

# Drop cancelled flights
df = df.drop(df['Cancelled'] == 1)

df = df.withColumn("Count", F.lit(1))

df = df.groupBy(['Year', 'Origin', 'Dest', 'UniqueCarrier' ])                \
       .sum('Count')                               \
       .withColumnRenamed('sum(Count)', 'Count')    

df.show(10)

+----+------+----+-------------+-----+
|Year|Origin|Dest|UniqueCarrier|Count|
+----+------+----+-------------+-----+
|1994|   GSO| CLT|           US| 2906|
|1994|   PIT| BNA|           US| 1283|
|1994|   MIA| DCA|           US|  475|
|1994|   PIT| MHT|           US|  584|
|1994|   ATL| DTW|           NW| 2028|
|1994|   SDF| DTW|           NW| 1190|
|1994|   BNA| MSP|           NW|  699|
|1994|   AZO| MSP|           NW|  360|
|1994|   ORD| SDF|           UA|  922|
|1994|   ABE| MDT|           UA|  103|
+----+------+----+-------------+-----+
only showing top 10 rows



In [8]:
#Should we keep the int value or use a percentage?

In [9]:
# Store output Dataframe (or load it if already existing)
final_dataset = '../dataset/uniqueCarrier_analitics.parquet'

path= Path(final_dataset)
if not path.is_dir():
    df.write.mode('overwrite').save(final_dataset, format='parquet')
df = spark.read.load(final_dataset)

In [10]:
# Output a list of tuples of schema:
# ('Data', 'Percentage')
cancel_data = df.rdd.map(tuple).collect()
print(cancel_data[:100])

[(1994, 'JAX', 'CLT', 'US', 2137), (1994, 'TRI', 'CLT', 'US', 1711), (1994, 'PIT', 'ELM', 'US', 1062), (1994, 'TPA', 'IND', 'US', 709), (1994, 'PNS', 'TPA', 'US', 144), (1994, 'CMH', 'MDW', 'US', 472), (1994, 'SYR', 'BOS', 'US', 1107), (1994, 'BWI', 'CHS', 'US', 457), (1994, 'LAX', 'SJC', 'WN', 4157), (1994, 'MCO', 'PBI', 'TW', 28), (1994, 'ATL', 'RSW', 'TW', 400), (1994, 'JFK', 'PHX', 'TW', 335), (1994, 'JFK', 'BDL', 'TW', 56), (1994, 'SFO', 'JFK', 'UA', 2098), (1994, 'ORD', 'PDX', 'UA', 1417), (1994, 'ORD', 'GEG', 'UA', 678), (1994, 'ORD', 'DCA', 'UA', 4992), (1994, 'LAX', 'SEA', 'UA', 2127), (1994, 'HNL', 'DFW', 'DL', 333), (1994, 'ATL', 'CVG', 'DL', 3194), (1994, 'PDX', 'SEA', 'DL', 698), (1994, 'PHX', 'CVG', 'DL', 1176), (1994, 'MSP', 'ATL', 'DL', 1422), (1994, 'CVG', 'BUF', 'DL', 708), (1994, 'ATL', 'MOB', 'DL', 1755), (1994, 'BHM', 'JAN', 'DL', 350), (1994, 'EWR', 'FLL', 'DL', 489), (1994, 'ATL', 'ORD', 'AA', 1893), (1994, 'OAK', 'DFW', 'AA', 1146), (1994, 'MCI', 'DEN', 'AA', 1)

### **Data Visualization**


In [11]:
# Hide warnings if there are any
import warnings
warnings.filterwarnings('ignore')

%matplotlib ipympl

from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as dates
import matplotlib.ticker as ticker



In [12]:
uniqueCarriers = ['UA', 'AA', 'NW', 'HP','TW', 'DL','US', 'AS', 'CO', 'WN']
uniqueCarrier_values = [('UA', 0), ('AA', 1) , ('NW', 2), ('HP', 3),('TW', 4), ('DL', 5), ('US', 6), ('AS', 7),  ('CO', 8), ('WN', 9)]

def get_index(carrier):
    for uc in uniqueCarrier_values:
        if uc[0] == carrier:
              return uc[1]

def get_count_df(airport_origin, airport_dest, years, df):
    rows = df.filter(F.col('Year').isin(*years))\
             .filter(F.col('Origin') == airport_origin)\
             .filter(F.col('Dest') == airport_dest)\
             .select('UniqueCarrier', 'Count' ) \
                .collect()
    
    nb_uniqueCarriers = 10
    data = np.zeros(nb_uniqueCarriers)
    for row in rows:
        uc = get_index(row[0])
        sum_count = row[1]
        data[uc] += sum_count 
        
    columns = [str(c) for c in uniqueCarriers]
    res = pd.DataFrame({'UniqueCarrier count': data}, index=uniqueCarriers)
    return res



def plot_count_carriers(airport_origin, airport_dest, years, df, ax):
    df = get_count_df( airport_origin, airport_dest, years, df)
    title = 'Unique Carrier count for Origin-Dest \ Year'
    if df.empty:
        print('No data')
    else:
        df.plot.bar( title=title, rot=90, ax=ax)

In [13]:
def ui_callback(airport_origin, airport_dest, years, df):
    plt.figure(figsize=(9,12))
    plt.clf()
    ax = plt.subplot(211)
    plot_count_carriers(airport_origin, airport_dest,range(years[0], years[1] + 1), df, ax)
    
    plt.subplots_adjust(top=0.92, bottom=0.08, left=0.10, right=0.95, hspace=0.25,
                    wspace=0.35)
    plt.show()


# Years selection range
years = range(1994, 2009)
years = [(str(y), y) for y in years]
years_w = widgets.SelectionRangeSlider(options=years,
                                       index=(0, 2),
                                       description='Years',
                                       continuous_update=False)

In [14]:
airports_dx =['ATL','BWI','CLT','DCA','DTW','LGA','MDT','ORD','PIT'] 
#filled with the airports dest of airports_origin[0] a.k.a. 'ABE'

In [15]:
# Airports
airports_origin = df.select('Origin').distinct().orderBy('Origin').rdd.map(lambda x : x[0]).collect()
airports_dest = df.select('Dest').distinct().orderBy('Dest').rdd.map(lambda x : x[0]).collect()


# Airport selection menu
airports1_w = widgets.Dropdown(options=airports_origin,
                              value=airports_origin[0],
                              description='Airport_Origin')
airports2_w = widgets.Dropdown(options=airports_dx,
                              value=airports_dx[0],
                              description='Airport_Dest')

In [16]:
out = widgets.interactive_output(ui_callback, {'airport_origin': airports1_w,'airport_dest': airports2_w, 'years': years_w, 'df': widgets.fixed(df)})
ui = widgets.HBox([airports1_w, airports2_w, years_w])

In [17]:
def on_change(change):
    # exclude the origin ariport and all the aiports with count 0 for all the carriers
    global airports2_w
    global df
    airports_dx = df.filter(F.col('Origin') == change['new']) \
                    .select(F.col('Dest')) \
                    .orderBy('Dest') \
                    .distinct() \
                    .rdd.map(lambda x : x[0]) \
                    .collect()    
    airports2_w.options = airports_dx
    airports2_w.value = 'N/A' if not airports_dx else airports_dx[0]

airports1_w.observe(on_change, 'value')
display(ui, out)

HBox(children=(Dropdown(description='Airport_Origin', options=('ABE', 'ABQ', 'ACK', 'ACY', 'ADQ', 'AGS', 'AKN'…

Output()