# Data Wrangling of the Flight Analysis DataFrame (2009-2015)

#### Loading Required Libraries on the EMR Cluster

In [0]:
# Install the plotting/analysis dependencies on the cluster through the
# SparkContext so they are importable from the Spark session.
PYPI_INDEX_URL = "https://pypi.org/simple"

# pandas is pinned to 0.25.1.
sc.install_pypi_package("pandas==0.25.1")
# matplotlib is fetched from the explicitly given PyPI repository.
sc.install_pypi_package("matplotlib", PYPI_INDEX_URL)

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import * 
import pandas as pd 

#### Data Loading

In [0]:
# Location of the flight-info CSV on the distributed file system (S3).
path = "s3://flightanalysis/cleaned-dataset/flight-profile/flight-info.csv"

# Treat the first row as the header and let Spark infer the column types.
profile_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(path)
)

# Preview two rows that contain no nulls; profile_df itself is left unchanged.
profile_df.dropna().show(n=2)

#### Exploratory Data Analysis

In [0]:
# Summary statistics.
# Count the distinct flight numbers present in the dataset.
num_of_flights = (
    profile_df
    .select('flight_number')
    .distinct()
    .count()
)
print(num_of_flights)  # original author's note: 7606 unique flights

# Show the first two rows of the describe() summary table.
profile_df.describe().show(n=2)

In [0]:
# Replace the 'date' column with its year, drop every row that still has a
# null in any column, then rename the column to 'year'.
calendar = (
    profile_df
    .withColumn('date', year(profile_df.date))
    .na.drop()
    .withColumnRenamed('date', 'year')
)
calendar.show(n=2)

The graph below shows that the number of flights varied little across the years, with a peak in the year xxxx. (TODO: replace "xxxx" with the peak year read from the plot.)

In [0]:
%%local 
%matplotlib inline
import matplotlib.pyplot as plt

plt.clf()
#Number of flights by year 
num_of_flights_by_year = calendar.groupBy('year').count().orderBy('year').toPandas()
num_of_flights_by_year.plot(kind='area', x='year',y='count', rot=70, color='#bc5090', legend=None, figsize=(8,6))

plt.xticks(num_of_flights_by_year.year)
plt.xlim(2009, 2015)
plt.title('Number of flights across years')
plt.xlabel('Year')
plt.ylabel('Number of flights')

In [0]:
# Ten most frequent 'flight_identifier' values, ordered by row count (highest first).
top_10_airlines = (
    calendar
    .groupBy('flight_identifier')
    .count()
    .orderBy(desc('count'))
    .limit(10)
)
# Print all ten rows without truncating long cell values.
top_10_airlines.show(n=10, truncate=False)