# Task 3

In [0]:
# Directory part of every CSV path handed to task_3.
dbfs_fileStore_prefix: str = '/FileStore/tables'
# Leading part of each data file name: "<prefix>_flights_<size>.csv",
# "<prefix>_airlines.csv" and "<prefix>_aircrafts.csv".
prefix: str = 'ontimeperformance'
# Fills the "<size>" slot of the flights file name.
size: str = 'small'


In [0]:
def task_3(spark_session, flights_path, airlines_path, aircrafts_path, country):
    """Print the most-used aircraft types of every airline based in ``country``.

    Baseline (untuned) variant: plain left outer joins, no join hints and no
    caching.

    The three CSV files are read with a header row and inferred column types.
    Airlines whose ``country`` column equals ``country`` are joined to the rows
    of the flights file and then to the aircraft table. Rows are counted per
    (airline name, aircraft type), where an aircraft type is
    ``"<manufacturer> <model number>"`` and the model number is the first run
    of three digits in the model once its dashes have been removed. For every
    airline the types ranked 1-5 by row count are gathered into one list and
    the resulting table is printed.

    Ties share a rank (``rank()`` semantics), so a list may contain more than
    five entries.

    Args:
        spark_session: SparkSession used to read the CSV files.
        flights_path: Path of the flights CSV.
        airlines_path: Path of the airlines CSV.
        aircrafts_path: Path of the aircrafts CSV.
        country: Value the airlines' ``country`` column must equal.

    Returns:
        None. The table is printed with ``DataFrame.show``.
    """
    from pyspark.sql import functions as F
    from pyspark.sql.window import Window

    top_n = 5

    def read_csv(path):
        # Read through the session that was passed in, not a notebook global.
        return (
            spark_session.read.format("csv")
            .option("header", "true")
            .option("inferSchema", "true")
            .load(path)
        )

    def present(column_name):
        # `col != 'null'` alone already drops NULLs (NULL != 'null' evaluates
        # to NULL, which a filter treats as false); spelling out isNotNull()
        # makes that intent visible while keeping exactly the same rows.
        column = F.col(column_name)
        return column.isNotNull() & (column != "null")

    # The flights column is selected under the name ' tail_number' (leading
    # space) and renamed so it can serve as a join key.
    flights = read_csv(flights_path).select(
        "carrier_code",
        F.col(" tail_number").alias("tail_number"),
    )
    airlines = (
        read_csv(airlines_path)
        .filter(F.col("country") == country)
        .select("carrier_code", F.col("name").alias("airline_name"), "country")
    )
    aircrafts = read_csv(aircrafts_path).select(
        F.col("tailnum").alias("tail_number"),
        "manufacturer",
        "model",
    )

    # The left outer joins keep airlines without flights and flights without a
    # matching aircraft; those rows carry a NULL manufacturer and are removed
    # by the filter that follows.
    joined = (
        airlines.join(flights, "carrier_code", "left_outer")
        .join(aircrafts, "tail_number", "left_outer")
        .filter(present("manufacturer") & present("country") & present("airline_name"))
    )

    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    model_without_dashes = F.translate(F.col("model"), "-", "")
    model_number = F.regexp_extract(model_without_dashes, r"\d{3}", 0)
    aircraft_type = F.concat(F.col("manufacturer"), F.lit(" "), model_number)

    # No orderBy before the aggregation or the window: the shuffles they
    # perform would discard that ordering anyway.
    usage = (
        joined.select("airline_name", aircraft_type.alias("aircraft_type"))
        .groupBy("airline_name", "aircraft_type")
        .count()
    )

    by_airline = Window.partitionBy("airline_name").orderBy(F.col("count").desc())
    top_types = usage.withColumn("rank", F.rank().over(by_airline)).filter(
        F.col("rank") <= top_n
    )

    # collect_list gives no ordering guarantee after a shuffle, so the rank is
    # collected alongside each type, the structs are sorted (rank first, type
    # name as tie-breaker) and only the type names are kept.
    result = (
        top_types.groupBy("airline_name")
        .agg(
            F.sort_array(F.collect_list(F.struct("rank", "aircraft_type"))).alias(
                "ranked_types"
            )
        )
        .select(
            "airline_name",
            F.col("ranked_types.aircraft_type").alias("aircraft_type"),
        )
        .orderBy("airline_name")
    )

    # count() runs the whole pipeline once and show() runs it again; the count
    # is what makes show() print every row instead of its default first 20.
    result.show(result.count(), truncate=False)

# Tuning - Broadcast Join

In [0]:
def task_3(spark_session, flights_path, airlines_path, aircrafts_path, country):
    """Print the most-used aircraft types of every airline based in ``country``.

    Broadcast-join variant: the flights rows are joined to the airline and
    aircraft tables with a broadcast hint on each of those two tables, asking
    Spark to ship them to the executors instead of shuffling the flights.
    Both joins are inner joins.

    The three CSV files are read with a header row and inferred column types.
    Rows are counted per (airline name, aircraft type), where an aircraft type
    is ``"<manufacturer> <model number>"`` and the model number is the first
    run of three digits in the model once its dashes have been removed. For
    every airline the types ranked 1-5 by row count are gathered into one list
    and the resulting table is printed.

    Ties share a rank (``rank()`` semantics), so a list may contain more than
    five entries.

    Args:
        spark_session: SparkSession used to read the CSV files.
        flights_path: Path of the flights CSV.
        airlines_path: Path of the airlines CSV.
        aircrafts_path: Path of the aircrafts CSV.
        country: Value the airlines' ``country`` column must equal.

    Returns:
        None. The table is printed with ``DataFrame.show``.
    """
    from pyspark.sql import functions as F
    from pyspark.sql.window import Window

    top_n = 5

    def read_csv(path):
        # Read through the session that was passed in, not a notebook global.
        return (
            spark_session.read.format("csv")
            .option("header", "true")
            .option("inferSchema", "true")
            .load(path)
        )

    def present(column_name):
        # `col != 'null'` alone already drops NULLs (NULL != 'null' evaluates
        # to NULL, which a filter treats as false); spelling out isNotNull()
        # makes that intent visible while keeping exactly the same rows.
        column = F.col(column_name)
        return column.isNotNull() & (column != "null")

    # The flights column is selected under the name ' tail_number' (leading
    # space) and renamed so it can serve as a join key.
    flights = read_csv(flights_path).select(
        "carrier_code",
        F.col(" tail_number").alias("tail_number"),
    )
    airlines = (
        read_csv(airlines_path)
        .filter(F.col("country") == country)
        .select("carrier_code", F.col("name").alias("airline_name"), "country")
    )
    aircrafts = read_csv(aircrafts_path).select(
        F.col("tailnum").alias("tail_number"),
        "manufacturer",
        "model",
    )

    # broadcast() is a hint about the join *strategy*; the join *type* stays
    # inner. Joining on the column name (rather than an equality expression)
    # leaves a single carrier_code / tail_number column in the output instead
    # of one copy from each side.
    joined = (
        flights.join(F.broadcast(airlines), "carrier_code")
        .join(F.broadcast(aircrafts), "tail_number")
        .filter(present("manufacturer") & present("country") & present("airline_name"))
    )

    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    model_without_dashes = F.translate(F.col("model"), "-", "")
    model_number = F.regexp_extract(model_without_dashes, r"\d{3}", 0)
    aircraft_type = F.concat(F.col("manufacturer"), F.lit(" "), model_number)

    # No orderBy before the aggregation or the window: the shuffles they
    # perform would discard that ordering anyway.
    usage = (
        joined.select("airline_name", aircraft_type.alias("aircraft_type"))
        .groupBy("airline_name", "aircraft_type")
        .count()
    )

    by_airline = Window.partitionBy("airline_name").orderBy(F.col("count").desc())
    top_types = usage.withColumn("rank", F.rank().over(by_airline)).filter(
        F.col("rank") <= top_n
    )

    # collect_list gives no ordering guarantee after a shuffle, so the rank is
    # collected alongside each type, the structs are sorted (rank first, type
    # name as tie-breaker) and only the type names are kept.
    result = (
        top_types.groupBy("airline_name")
        .agg(
            F.sort_array(F.collect_list(F.struct("rank", "aircraft_type"))).alias(
                "ranked_types"
            )
        )
        .select(
            "airline_name",
            F.col("ranked_types.aircraft_type").alias("aircraft_type"),
        )
        .orderBy("airline_name")
    )

    # count() runs the whole pipeline once and show() runs it again; the count
    # is what makes show() print every row instead of its default first 20.
    result.show(result.count(), truncate=False)

# Tuning - Caching

In [0]:
def task_3(spark_session, flights_path, airlines_path, aircrafts_path, country):
    """Print the most-used aircraft types of every airline based in ``country``.

    Caching variant: the final per-airline table is the only DataFrame that is
    evaluated twice (once by ``count()``, once by ``show()``), so that table is
    cached before the two actions and released afterwards. The joins are plain
    left outer joins without hints.

    The three CSV files are read with a header row and inferred column types.
    Airlines whose ``country`` column equals ``country`` are joined to the rows
    of the flights file and then to the aircraft table. Rows are counted per
    (airline name, aircraft type), where an aircraft type is
    ``"<manufacturer> <model number>"`` and the model number is the first run
    of three digits in the model once its dashes have been removed. For every
    airline the types ranked 1-5 by row count are gathered into one list and
    the resulting table is printed.

    Ties share a rank (``rank()`` semantics), so a list may contain more than
    five entries.

    Args:
        spark_session: SparkSession used to read the CSV files.
        flights_path: Path of the flights CSV.
        airlines_path: Path of the airlines CSV.
        aircrafts_path: Path of the aircrafts CSV.
        country: Value the airlines' ``country`` column must equal.

    Returns:
        None. The table is printed with ``DataFrame.show``.
    """
    from pyspark.sql import functions as F
    from pyspark.sql.window import Window

    top_n = 5

    def read_csv(path):
        # Read through the session that was passed in, not a notebook global.
        return (
            spark_session.read.format("csv")
            .option("header", "true")
            .option("inferSchema", "true")
            .load(path)
        )

    def present(column_name):
        # `col != 'null'` alone already drops NULLs (NULL != 'null' evaluates
        # to NULL, which a filter treats as false); spelling out isNotNull()
        # makes that intent visible while keeping exactly the same rows.
        column = F.col(column_name)
        return column.isNotNull() & (column != "null")

    # The flights column is selected under the name ' tail_number' (leading
    # space) and renamed so it can serve as a join key.
    flights = read_csv(flights_path).select(
        "carrier_code",
        F.col(" tail_number").alias("tail_number"),
    )
    airlines = (
        read_csv(airlines_path)
        .filter(F.col("country") == country)
        .select("carrier_code", F.col("name").alias("airline_name"), "country")
    )
    aircrafts = read_csv(aircrafts_path).select(
        F.col("tailnum").alias("tail_number"),
        "manufacturer",
        "model",
    )

    # The left outer joins keep airlines without flights and flights without a
    # matching aircraft; those rows carry a NULL manufacturer and are removed
    # by the filter that follows.
    joined = (
        airlines.join(flights, "carrier_code", "left_outer")
        .join(aircrafts, "tail_number", "left_outer")
        .filter(present("manufacturer") & present("country") & present("airline_name"))
    )

    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    model_without_dashes = F.translate(F.col("model"), "-", "")
    model_number = F.regexp_extract(model_without_dashes, r"\d{3}", 0)
    aircraft_type = F.concat(F.col("manufacturer"), F.lit(" "), model_number)

    # No orderBy before the aggregation or the window: the shuffles they
    # perform would discard that ordering anyway.
    usage = (
        joined.select("airline_name", aircraft_type.alias("aircraft_type"))
        .groupBy("airline_name", "aircraft_type")
        .count()
    )

    by_airline = Window.partitionBy("airline_name").orderBy(F.col("count").desc())
    top_types = usage.withColumn("rank", F.rank().over(by_airline)).filter(
        F.col("rank") <= top_n
    )

    # collect_list gives no ordering guarantee after a shuffle, so the rank is
    # collected alongside each type, the structs are sorted (rank first, type
    # name as tie-breaker) and only the type names are kept.
    result = (
        top_types.groupBy("airline_name")
        .agg(
            F.sort_array(F.collect_list(F.struct("rank", "aircraft_type"))).alias(
                "ranked_types"
            )
        )
        .select(
            "airline_name",
            F.col("ranked_types.aircraft_type").alias("aircraft_type"),
        )
        .orderBy("airline_name")
    )

    # Cache where the reuse happens: count() materialises the table and show()
    # then reads it back instead of re-running the reads, joins and
    # aggregations. The input projections are each consumed once, so caching
    # them would only cost memory.
    result = result.cache()
    try:
        # The count makes show() print every row instead of its default 20.
        result.show(result.count(), truncate=False)
    finally:
        # Release the cached blocks even if an action fails.
        result.unpersist()

In [0]:
# Run the task for US airlines against the configured data files.
# `spark` is not defined in this notebook; it is expected to be supplied by
# the notebook runtime.
task_3(
    spark_session=spark,
    flights_path=f"{dbfs_fileStore_prefix}/{prefix}_flights_{size}.csv",
    airlines_path=f"{dbfs_fileStore_prefix}/{prefix}_airlines.csv",
    aircrafts_path=f"{dbfs_fileStore_prefix}/{prefix}_aircrafts.csv",
    country="United States",
)