# VisionAirport

## Imports + Init

In [22]:
import boto3
from pyspark import SparkContext
import numpy as np
from pyspark.sql.types import *
from pyspark.sql import *
from pyspark.sql.functions import col, desc, asc, lit
from pyspark.sql import functions as F
import matplotlib.pyplot as plt
from matplotlib.pyplot import rc
import folium

In [23]:
try:
    sc = SparkContext("local").getOrCreate()
except:
    print("SC already exists")
sqlContext = SQLContext(sc)
spark = SparkSession(sc)
plt.style.use('ggplot')
DATADIR = "./data"

SC already exists




## Airport

In [24]:
# AIRPORT DATAFRAME

airport_schema = StructType([
    StructField('airport', StringType(), True),
    StructField('city', StringType(), True),
    StructField('country', StringType(), True),
    StructField('IATA', StringType(), True),
    StructField('ICAO', StringType(), True),
    StructField('lat', FloatType(), True),
    StructField('lon', FloatType(), True),
    StructField('alt', StringType(), True),
    StructField('TZ', FloatType(), True),
    StructField('dst', StringType(), True),
    StructField('tz2', StringType(), True),
])


airport_df = spark.read.csv(
    "./data/export_luchthavens.txt", 
    # header=True,
    sep="\t",
    multiLine=True,
    schema=airport_schema
)

try:
    airport_df.write.parquet("aws/airport.parquet")
except:
    print("")
airport_df.show(3, False)


+--------------+------------+-----------+----+----+---------+---------+----+----+---+----------+
|airport       |city        |country    |IATA|ICAO|lat      |lon      |alt |TZ  |dst|tz2       |
+--------------+------------+-----------+----+----+---------+---------+----+----+---+----------+
|Airport       |City        |Country    |IATA|ICAO|null     |null     |Alt |null|DST|Tz        |
|Bamyan Airport|Bamyan      |Afghanistan|BIN |OABN|34.816666|67.816666|2550|4.5 |N  |Asia/Kabul|
|Camp Bastion  |Camp Bastion|Afghanistan|null|OAZI|31.865557|64.195274|2808|4.5 |N  |Asia/Kabul|
+--------------+------------+-----------+----+----+---------+---------+----+----+---+----------+
only showing top 3 rows



## Weather

In [26]:
# WEATHER DATAFRAME

weather_schema = StructType([
    StructField('date', DateType(), True),
    StructField('DDVEC', IntegerType(), True),
    StructField('FHVEC', IntegerType(), True),
    StructField('FG', IntegerType(), True),
    StructField('FHX', IntegerType(), True),
    StructField('FHXH', IntegerType(), True),
    StructField('FHN', IntegerType(), True),
    StructField('FHNH', IntegerType(), True),
    StructField('FXX', IntegerType(), True),
    StructField('FXXH', IntegerType(), True),
    StructField('TG', IntegerType(), True),
    StructField('TN', IntegerType(), True),
    StructField('TNH', IntegerType(), True),
    StructField('TX', IntegerType(), True),
    StructField('TXH', IntegerType(), True),
    StructField('T10N', IntegerType(), True),
    StructField('T10NH', IntegerType(), True),
    StructField('SQ', IntegerType(), True),
    StructField('Q', IntegerType(), True),
    StructField('DR', IntegerType(), True),
    StructField('RH', IntegerType(), True),
    StructField('RHX', IntegerType(), True),
    StructField('RHXH', IntegerType(), True),
    StructField('PG', IntegerType(), True),
    StructField('PX', IntegerType(), True),
    StructField('PXH', IntegerType(), True),
    StructField('PN', IntegerType(), True),
    StructField('PNH', IntegerType(), True),
    StructField('VVN', IntegerType(), True),
    StructField('VVNH', IntegerType(), True),
    StructField('VVX', IntegerType(), True),
    StructField('VVXH', IntegerType(), True),
    StructField('NG', IntegerType(), True),
    StructField('UG', IntegerType(), True),
    StructField('UX', IntegerType(), True),
    StructField('UXH', IntegerType(), True),
    StructField('UN', IntegerType(), True),
    StructField('UNH', IntegerType(), True),
    StructField('EV2', IntegerType(), True),
])

weather_df = spark.read.csv(
    "./data/export_weer.txt", 
    header=True,
    sep="\t",
    multiLine=True,
    schema=weather_schema
)

try:
    weather_df.write.parquet("aws/weather.parquet")
except:
    print("")
weather_df.show(3, False)

IndentationError: expected an indented block (2640438435.py, line 55)

## Customers

In [None]:
# CUSTOMERS DATAFRAME

customers_schema = StructType([
    StructField('id', IntegerType(), True),
    StructField('operation', FloatType(), True),
    StructField('facilities', FloatType(), True),
    StructField('shops', FloatType(), True),
])


customers_df = spark.read.csv(
    "./data/export_klant.csv", 
    header=True,
    sep=";",
    multiLine=True,
    schema=customers_schema
)

try:
    customers_df.write.parquet("aws/customers.parquet")
except:
    print("")
customers_df.show(3, False)

## Vertrek

In [None]:
vertrek_schema = StructType([
    StructField("Vluchtid", IntegerType(), False),
    StructField("Vliegtuigcode",StringType(),True),
    StructField("Terminal", StringType(),True),
    StructField("Gate", StringType(), True),
    StructField("Baan", ShortType(), True),
    StructField("Bezetting", IntegerType(), True),
    StructField("Vracht", IntegerType(), True),
    StructField("Vertrektijd", TimestampType(), True)
  ])

vertrek_df = spark.read.csv(
    DATADIR + "/export_vertrek.txt", 
    header=True,
    sep='\t',
    schema=vertrek_schema
)
try:
    vertrek_df.write.parquet("aws/vertrek.parquet")
except:
    print("")

vertrek_df.show(3, False)

## Aankomst

In [None]:
aankomst_schema = StructType([
    StructField("Vluchtid", IntegerType(), False),
    StructField("Vliegtuigcode",StringType(),True),
    StructField("Terminal", StringType(), True),
    StructField("Gate", StringType(), True),
    StructField("Baan", ShortType(), True),
    StructField("Bezetting", IntegerType(), True),
    StructField("Vracht", IntegerType(), True),
    StructField("Aankomsttijd", TimestampType(), True)
  ])

aankomst_df = spark.read.csv(
    DATADIR + "/export_aankomst.txt", 
    header=True,
    sep='\t',
    schema=aankomst_schema
)

try:
    aankomst_df.write.parquet("aws/aankomst.parquet")
except:
    print("")
aankomst_df.show(3, False)

## Planning

In [None]:
planning_schema = StructType([
    StructField("Vluchtnr", IntegerType(), False),
    StructField("Airlinecode", StringType(), True),
    StructField("Destcode", StringType(), True),
    StructField("Planterminal", StringType(), True),
    StructField("Plangate", StringType(), True),
    StructField("Plantijd", StringType(), True)
  ])

planning_df = spark.read.csv(
    DATADIR + "/export_planning.txt", 
    header=True,
    sep='\t',
    schema=planning_schema
)
try:
    planning_df.write.parquet("aws/planning.parquet")
except:
    print("")
planning_df.show(3, False)

# Analysis

## Airport

In [None]:
location_df = airport_df.dropna().collect()

locations = list(map(lambda r : [r['airport'], r['city'],(r['lat'], r['lon'])], location_df)) 
map_tweets = folium.Map(location=[65,26], zoom_start=4)

for location_airport, location_city, location_coords in locations:
    folium.Circle(location=location_coords,
                  popup = f"{location_city}: {location_airport}",
                  radius = 1000,
                  color="crimson",
                  fill_color="crimson",
                  tooltip=location_airport
                  ).add_to(map_tweets)
map_tweets

## Customers

In [None]:
operation_reviews = customers_df. groupby("operation").count().dropna().sort(asc("operation")).collect()
facilities_reviews = customers_df. groupby("facilities").count().dropna().sort(asc("facilities")).collect()
shops_reviews = customers_df. groupby("shops").count().dropna().sort(asc("shops")).collect()

operation_y = list(map(lambda r : r['count'], operation_reviews))
facilities_y = list(map(lambda r : r['count'], facilities_reviews))
shops_y = list(map(lambda r : r['count'], shops_reviews))

operation_x = list(map(lambda r : (r['operation']), operation_reviews)) 
facilities_x = list(map(lambda r : (r['facilities']), facilities_reviews)) 
shops_x = list(map(lambda r : (r['shops']), shops_reviews)) 


In [None]:
print(f"Mean: {np.mean(operation_x)}")
fig, ax = plt.subplots(figsize=(16,8))
ax.plot(operation_x, operation_y)
ax.tick_params(axis='y')
ax.set_xlabel('Rating')
ax.set_ylabel('Frequency')
ax.legend()
plt.title("Frequency of operation ratings")
plt.show()

In [None]:
print(f"Mean: {np.mean(facilities_x)}")
fig, ax = plt.subplots(figsize=(16,8))
ax.plot(facilities_x, facilities_y)
ax.tick_params(axis='y')
ax.set_xlabel('Rating')
ax.set_ylabel('Frequency')
ax.legend()
plt.title("Frequency of facility ratings")
plt.show()

In [None]:
print(f"Mean: {np.mean(shops_x)}")
fig, ax = plt.subplots(figsize=(16,8))
ax.plot(shops_x, shops_y)
ax.tick_params(axis='y')
ax.set_xlabel('Rating')
ax.set_ylabel('Frequency')
ax.legend()
plt.title("Frequency of shop ratings")
plt.show()