# Task 1 Final

In [None]:
# Path prefix prepended to every input file name when the task is invoked.
dbfs_fileStore_prefix = "/FileStore/tables"
# Common file-name stem shared by the input CSV files.
prefix = "ontimeperformance"
# Suffix selecting which flights file is loaded: "<prefix>_flights_<size>.csv".
size = "massive"
#

In [None]:
# DataFrame version (original, without a broadcast hint)
def task_1(spark_session, flights_path, aircrafts_path):
  """Display the three most-flown Cessna aircraft, formatted like "Cessna 172".

  Reads both CSV files (header row, inferred schema), keeps aircraft whose
  manufacturer is "CESSNA" and whose model is known, counts the matching
  flight rows per tail number, and shows the three highest counts as
  ``initcap(manufacturer + ' ' + first three consecutive digits of model)``.

  NOTE(review): flights are counted per tail number, not per model, so the
  same model label can appear more than once in the output. Confirm whether
  a per-model ranking is what is actually wanted.

  Args:
    spark_session: SparkSession used to read the CSV files.
    flights_path: Path of the flights CSV. The flight number and tail number
      are taken by position (2nd and 6th columns of the file).
    aircrafts_path: Path of the aircrafts CSV; it must provide the columns
      ``tailnum``, ``manufacturer`` and ``model``.

  Returns:
    None. The result is rendered with the notebook's ``display``.
  """
  from pyspark.sql import functions as F

  def read_csv(path):
    # Use the session handed in by the caller rather than a notebook global.
    return (spark_session.read.format("csv")
            .option("header", "true")
            .option("inferSchema", "true")
            .load(path))

  aircrafts_df = read_csv(aircrafts_path)
  flights_df = read_csv(flights_path)

  # Both filters are chained so that neither one is lost. Comparing with the
  # string "null" removes literal "null" values and, because a comparison
  # against SQL NULL is never true, real NULLs as well.
  cessna_df = (
    aircrafts_df
    .select(F.col('tailnum').alias('tail_number'), F.col('manufacturer'), F.col('model'))
    .filter(F.col('model') != "null")
    .filter(F.col('manufacturer') == "CESSNA")
  )

  # Columns are picked by position and then renamed positionally.
  flight_columns = flights_df.columns
  flightnum_clean_df = (
    flights_df
    .select(flight_columns[1], flight_columns[5])
    .toDF("flight_number", "tail_number")
    .filter(F.col('tail_number') != "null")
  )

  # Inner join: with an outer join an aircraft without any flight would still
  # produce one (all-NULL) row and therefore be reported with a count of 1.
  flights_per_tail = (
    cessna_df
    .join(flightnum_clean_df, 'tail_number')
    .groupBy('tail_number')
    .count()
  )

  # Re-attach manufacturer/model, which the aggregation dropped.
  ranked = flights_per_tail.join(cessna_df, 'tail_number', 'left_outer')

  labelled = (
    ranked
    # First run of three consecutive digits in the model string.
    .withColumn("new_model", F.regexp_extract(F.col('model'), r'\d{3}', 0))
    .orderBy('count', ascending=False)
    .withColumn('Flights',
                F.concat(F.col('manufacturer'), F.lit(' '), F.col('new_model')))
  )

  final = labelled.select(F.initcap("Flights")).limit(3)
  display(final)




In [None]:
# DataFrame version optimised with a broadcast join

def task_1(spark_session, flights_path, aircrafts_path):
  """Display the three most-flown Cessna aircraft, using a broadcast join.

  Reads both CSV files with a header row, keeps aircraft whose manufacturer
  is "CESSNA" (rows with a NULL model or manufacturer are dropped), joins
  them to the flights with a broadcast hint on the aircraft side, counts the
  flights per tail number, and shows the three highest counts as
  ``initcap(manufacturer + ' ' + first three consecutive digits of model)``.

  NOTE(review): flights are counted per tail number, not per model, so the
  same model label can appear more than once in the output. Confirm whether
  a per-model ranking is what is actually wanted.

  Args:
    spark_session: SparkSession used to read the CSV files.
    flights_path: Path of the flights CSV; it must provide the columns
      `` tail_number`` and `` flight_number`` (names with a leading space).
    aircrafts_path: Path of the aircrafts CSV; it must provide the columns
      ``tailnum``, ``manufacturer`` and ``model``.

  Returns:
    None. The result is rendered with the notebook's ``display``.
  """
  from pyspark.sql import functions as F

  # Read through the session supplied by the caller instead of constructing a
  # separate "local" SparkConf/SparkContext and a deprecated SQLContext.
  aircrafts_clean = (
    spark_session.read.csv(aircrafts_path, header=True)
    .select(F.col('tailnum').alias('tail_number'), F.col('manufacturer'), F.col('model'))
    .na.drop(subset=["model", "manufacturer"])
    .filter(F.col('manufacturer') == "CESSNA")
  )

  # The flights columns are addressed with a leading space in their names,
  # presumably because the CSV header has a space after each comma; verify
  # against the file.
  flights_df = (
    spark_session.read.csv(flights_path, header=True)
    .select(F.col(' tail_number').alias('tail_number'),
            F.col(' flight_number').alias('flight_number'))
    .na.drop(subset=["flight_number", "tail_number"])
  )

  # Hint Spark to broadcast the filtered aircraft table for this join.
  model_flight = F.broadcast(aircrafts_clean).join(flights_df, 'tail_number')

  model_flight_count = model_flight.groupBy('tail_number').count()

  # Re-attach manufacturer/model, which the aggregation dropped.
  full_model = model_flight_count.join(aircrafts_clean, 'tail_number', 'left_outer')

  labelled = (
    full_model
    # First run of three consecutive digits in the model string.
    .withColumn("new_model", F.regexp_extract(F.col('model'), r'\d{3}', 0))
    .orderBy('count', ascending=False)
    .withColumn('Flights',
                F.concat(F.col('manufacturer'), F.lit(' '), F.col('new_model')))
  )

  final = labelled.select(F.initcap("Flights")).limit(3)
  display(final)




In [None]:
# Run the task on "<prefix>_flights_<size>.csv" and "<prefix>_aircrafts.csv"
# under dbfs_fileStore_prefix. Both definitions in this notebook are named
# task_1, so this calls whichever one was defined most recently.
task_1(spark, f"{dbfs_fileStore_prefix}/{prefix}_flights_{size}.csv", f"{dbfs_fileStore_prefix}/{prefix}_aircrafts.csv")