In [1]:
from pyspark.sql.functions import *
from pyspark.sql.functions import col, expr, lit
from pyspark.sql.types import *
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Obtain a SparkSession registered under this application name.
# getOrCreate() hands back the already-running session when there is one.
spark = (
    SparkSession.builder
    .appName("Airport_Weather_prep_xiomara_1")
    .getOrCreate()
)
# Final expression of the cell: evaluates the session's SparkContext.
spark.sparkContext

23/11/21 22:49:59 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [26]:
# Load the IATA/ICAO airport lookup CSV from GCS.
# The first line is treated as the header and column types are inferred from the data.
df_ICAO = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("gs://msca-bdp-student-gcs/Group7_Final_Project/flight_status/airport_weather/iata-icao.csv")
)

In [27]:
# Preview the airport lookup table: first 20 rows, long strings truncated (show()'s defaults).
df_ICAO.show(n=20, truncate=True)

+------------+---------------+----+----+--------------------+--------+---------+
|country_code|    region_name|iata|icao|             airport|latitude|longitude|
+------------+---------------+----+----+--------------------+--------+---------+
|          AE|   Ash Shariqah| SHJ|OMSJ|Sharjah Internati...| 25.3286|  55.5172|
|          AE|       Abu Zaby| AZI|OMAD|Al Bateen Executi...| 24.4283|  54.4581|
|          AE|    Al Fujayrah| FJR|OMFJ|Fujairah Internat...| 25.1122|   56.324|
|          AE|       Abu Zaby| XSB|OMBY|Sir Bani Yas Airport| 24.2836|  52.5803|
|          AE|Ra's al Khaymah| RKT|OMRK|Ras Al Khaimah In...| 25.6135|  55.9388|
|          AE|         Dubayy| DWC|OMDW|Al Maktoum Intern...| 24.8964|  55.1614|
|          AE|       Abu Zaby| DHF|OMAM|  Al Dhafra Air Base| 24.2482|  54.5477|
|          AE|       Abu Zaby| ZDY|OMDL|       Dalma Airport|   24.51|  52.3352|
|          AE|         Dubayy| DCG|null|Dubai Creek Seapl...| 25.2422|  55.3314|
|          AE|       Abu Zab

In [28]:
# Restrict the airport lookup table to rows whose country_code is 'US'.
df_ICAO_US = df_ICAO.where(col("country_code") == "US")

In [29]:
# Report how many US airport rows would be lost by dropna().
# NOTE: dropna() with default arguments removes any row that has a null in at
# least one column, so the "empty rows" wording in the messages below really
# means "rows with at least one missing value", not rows that are entirely blank.
num_rows_before = df_ICAO_US.count()
df_clean = df_ICAO_US.dropna()
num_rows_after = df_clean.count()
diff = num_rows_before - num_rows_after

print(
    f"Number of rows before dropping empty rows: {num_rows_before}",
    f"Number of rows after dropping empty rows: {num_rows_after}",
    sep="\n",
)

if diff == 0:
    print("No empty rows in the DataFrame.")
else:
    print(f"There were {diff} empty rows in the DataFrame.")

Number of rows before dropping empty rows: 1994
Number of rows after dropping empty rows: 1706
There were 288 empty rows in the DataFrame.


In [30]:
# Inspect US airports that have no ICAO code (first 20 rows, truncated strings).
df_ICAO_US.where(col("ICAO").isNull()).show(n=20, truncate=True)

+------------+-------------+----+----+--------------------+--------+---------+
|country_code|  region_name|iata|icao|             airport|latitude|longitude|
+------------+-------------+----+----+--------------------+--------+---------+
|          US|        Texas| WIB|null|Wilbarger County ...| 34.2257| -99.2838|
|          US|       Alaska| HGZ|null|   Hog River Airport| 66.2161| -155.669|
|          US|   California| FOB|null|  Fort Bragg Airport| 39.4743| -123.796|
|          US|       Alaska| HED|null|Herendeen Bay Air...| 55.8014| -160.899|
|          US|     New York| HCC|null|Columbia County A...| 42.2913| -73.7103|
|          US|      Arizona| HBK|null|Holbrook Municipa...| 34.9407| -110.138|
|          US|       Alaska| HBH|null|Entrance Island S...| 57.4122| -133.438|
|          US|   New Mexico| HBB|null|Hobbs Industrial ...| 32.7668| -103.209|
|          US|       Alaska| SOL|null|Solomon State Fie...| 64.5605| -164.446|
|          US|West Virginia| GWV|null|Glendale Fokke

In [35]:
# Remove the coordinate columns and country_code from the US airport lookup.
# The name is rebound in place, so every later cell sees the narrower table.
df_ICAO_US = df_ICAO_US.drop(*["latitude", "longitude", "country_code"])

In [40]:
# Print the column names and types of the US airport lookup table.
df_ICAO_US.printSchema()

root
 |-- region_name: string (nullable = true)
 |-- iata: string (nullable = true)
 |-- icao: string (nullable = true)
 |-- airport: string (nullable = true)



In [36]:
# Load the Jan 2016 - Dec 2022 weather-events CSV from GCS.
# The first line is treated as the header and column types are inferred from the data.
df_weather = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("gs://msca-bdp-student-gcs/Group7_Final_Project/flight_status/airport_weather/WeatherEvents_Jan2016-Dec2022.csv")
)

                                                                                

In [37]:
# Report how many weather-event rows would be lost by dropna().
# NOTE: dropna() with default arguments removes any row that has a null in at
# least one column, so "empty rows" in the messages below means "rows with at
# least one missing value".
# NOTE(review): this rebinds df_clean, which an earlier cell used for the
# airport table; df_weather itself is left unfiltered.
num_rows_before = df_weather.count()
df_clean = df_weather.dropna()
num_rows_after = df_clean.count()
diff = num_rows_before - num_rows_after

print(
    f"Number of rows before dropping empty rows: {num_rows_before}",
    f"Number of rows after dropping empty rows: {num_rows_after}",
    sep="\n",
)

if diff == 0:
    print("No empty rows in the DataFrame.")
else:
    print(f"There were {diff} empty rows in the DataFrame.")



Number of rows before dropping empty rows: 8627181
Number of rows after dropping empty rows: 8557982
There were 69199 empty rows in the DataFrame.


                                                                                

In [42]:
# Attach airport metadata to each weather event by matching the event's
# AirportCode against the lookup table's icao column. An inner join keeps only
# events with a matching US airport; both key columns remain in the result.
weather_df = df_weather.join(
    df_ICAO_US,
    on=df_weather["AirportCode"] == df_ICAO_US["icao"],
    how="inner",
)

In [44]:
# Remove the event latitude/longitude columns from the joined table.
weather_df = weather_df.drop(*["LocationLat", "LocationLng"])

In [45]:
# Normalise the weather columns to snake_case, applying the renames in order.
# NOTE(review): 'icao_aiport_code' is misspelled ("aiport"), but later cells and
# the exported CSV use this exact name, so it is kept as-is here.
column_renames = (
    ('EventId', 'event_id'),
    ('Type', 'type'),
    ('Severity', 'severity'),
    ('StartTime(UTC)', 'start_time_utc'),
    ('EndTime(UTC)', 'end_time_utc'),
    ('Precipitation(in)', 'precipitation_inches'),
    ('TimeZone', 'timezone'),
    ('AirportCode', 'icao_aiport_code'),
    ('City', 'city'),
    ('County', 'county'),
    ('State', 'state'),
    ('ZipCode', 'zip_code'),
)
for old_name, new_name in column_renames:
    weather_df = weather_df.withColumnRenamed(old_name, new_name)

In [46]:
# Print the column names and types of the joined, renamed weather table.
weather_df.printSchema()

root
 |-- event_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- severity: string (nullable = true)
 |-- start_time_utc: timestamp (nullable = true)
 |-- end_time_utc: timestamp (nullable = true)
 |-- precipitation_inches: double (nullable = true)
 |-- timezone: string (nullable = true)
 |-- icao_aiport_code: string (nullable = true)
 |-- city: string (nullable = true)
 |-- county: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zip_code: integer (nullable = true)
 |-- region_name: string (nullable = true)
 |-- iata: string (nullable = true)
 |-- icao: string (nullable = true)
 |-- airport: string (nullable = true)



In [49]:
# Spot-check the key weather fields: first 20 rows, long strings truncated.
weather_df.select(
    ["type", "severity", "precipitation_inches", "icao_aiport_code", "start_time_utc"]
).show(n=20, truncate=True)

+----+--------+--------------------+----------------+-------------------+
|type|severity|precipitation_inches|icao_aiport_code|     start_time_utc|
+----+--------+--------------------+----------------+-------------------+
|Rain|   Light|                 0.0|            KAXS|2016-01-05 21:45:00|
|Rain|   Light|                0.03|            KAXS|2016-01-05 22:53:00|
|Rain|   Light|                 0.0|            KAXS|2016-01-06 00:10:00|
|Rain|   Light|                 0.0|            KAXS|2016-01-06 20:37:00|
|Rain|   Light|                0.03|            KAXS|2016-01-06 21:07:00|
|Rain|Moderate|                0.11|            KAXS|2016-01-06 22:58:00|
|Rain|   Light|                 0.0|            KAXS|2016-01-06 23:03:00|
|Rain|   Light|                 0.0|            KAXS|2016-01-06 23:36:00|
|Rain|   Light|                 0.0|            KAXS|2016-01-07 00:01:00|
|Rain|   Light|                 0.0|            KAXS|2016-01-07 00:37:00|
|Rain|   Light|                 0.0|  

In [50]:
# Export the preprocessed weather table to GCS as CSV with a header row,
# replacing whatever already exists at the destination path.
(
    weather_df.write
    .format("csv")
    .mode('overwrite')
    .option('header', 'true')
    .save("gs://msca-bdp-student-gcs/Group7_Final_Project/flight_status/airport_weather/airport_weather_preprocessed.csv")
)

                                                                                