# Flight data in Brazil - Visualization

## Import libraries and data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import folium
from folium.plugins import FastMarkerCluster

In [None]:
# Read the 2021 flight records into a DataFrame.
# low_memory=False makes pandas parse the file in a single pass instead of in
# chunks, which avoids mixed-dtype inference on the columns.
flights = pd.read_csv(
    '../input/flights-in-brazil-compilation-jun2019-jun2021/raw_2021.csv',
    encoding='utf-8',
    low_memory=False,
)
flights

In [None]:
# Read the airport-code reference table into a DataFrame, using the
# pure-Python parser engine.
airports = pd.read_csv(
    '../input/airportcodes/airport-codes_csv.csv',
    engine='python',
    encoding='utf-8',
)
airports

## Data Preprocessing

### Flights

In [None]:
# Count the missing values in every column of the flight table
flights.isna().sum()

In [None]:
# Drop the columns that are not used anywhere in this analysis
flights.drop(
    columns=[
        'auth_code',
        'scheduled_dep',
        'real_dep',
        'scheduled_arr',
        'situation',
        'jus_code',
    ],
    inplace=True,
)
flights

In [None]:
# Remove every row that has a null value in at least one column
flights.dropna(axis='index', how='any', inplace=True)
flights

In [None]:
# Confirm that no null values remain in the flight table
flights.isna().sum()

### Airports

In [None]:
# Display the airport table as loaded, before any preprocessing
airports

In [None]:
# Count the missing values in every column of the airport table
airports.isna().sum()

In [None]:
# Split the comma-separated 'coordinates' string once and reuse both parts.
# The first part is stored as longitude and the second as latitude.
# NOTE(review): confirm this order against the dataset's documentation.
coordinate_parts = airports['coordinates'].str.split(',', expand=True)

# str.split() yields strings; strip any whitespace around the separator and
# convert to numbers so the columns hold real coordinates rather than text.
airports['long'] = pd.to_numeric(coordinate_parts[0].str.strip())
airports['lat'] = pd.to_numeric(coordinate_parts[1].str.strip())
airports

In [None]:
# Drop the airport attributes that are not needed; only the identifier, the
# name and the split coordinates are kept
airports.drop(
    columns=[
        'type',
        'elevation_ft',
        'continent',
        'iso_country',
        'iso_region',
        'municipality',
        'gps_code',
        'iata_code',
        'local_code',
        'coordinates',
    ],
    inplace=True,
)
airports

In [None]:
# Rename 'ident' to 'ICAO', then put the columns into a fixed order
airports = airports.rename(columns={'ident': 'ICAO'})
airports = airports.loc[:, ['ICAO', 'name', 'lat', 'long']]
airports

In [None]:
# Count the null values left in the reduced airport table
airports.isna().sum()

## Data Preparation

In [None]:
# Print each column name followed by the distinct values found in that column.
# DataFrame.items() yields (label, Series) pairs in column order, so no
# positional indexing or per-iteration column list is needed.
for column_name, column_values in flights.items():
    print(column_name)
    print(column_values.unique())

In [None]:
# Keep an independent (deep) copy of the cleaned flight table as a backup
flights_copy = flights.copy(deep=True)
flights_copy

In [None]:
# The meaning of the line_type values C, G, L, R, E, H and X is unknown to the
# author, so only rows whose line_type is 'N' or 'I' are kept.
target = ['N', 'I']

# Build the mask from the same frame that is being filtered, and take an
# explicit copy so that columns assigned to `flights` in later cells do not
# raise SettingWithCopyWarning.
flights = flights_copy.loc[flights_copy['line_type'].isin(target)].copy()
flights

In [None]:
# Print each column name followed by the distinct values found in that column.
# DataFrame.items() yields (label, Series) pairs in column order, so no
# positional indexing or per-iteration column list is needed.
for column_name, column_values in flights.items():
    print(column_name)
    print(column_values.unique())

In [None]:
# Parse the 'real_arr' strings into pandas datetimes so that the .dt accessor
# can be used on the column. assign() returns a new frame, which avoids
# writing into a filtered slice (SettingWithCopyWarning).
# NOTE(review): no explicit format is passed to to_datetime — confirm that the
# day/month order is inferred correctly for this dataset.
flights = flights.assign(real_arr=pd.to_datetime(flights['real_arr']))
flights.dtypes

In [None]:
# Derive year, month and day columns from 'real_arr' through the .dt accessor.
# assign() appends the columns in the order given and returns a new frame, so
# nothing is written into a filtered slice (no SettingWithCopyWarning).
flights = flights.assign(
    real_arr_Year=flights['real_arr'].dt.year,
    real_arr_Month=flights['real_arr'].dt.month,
    real_arr_Day=flights['real_arr'].dt.day,
)

flights

## Data Reshaping

### Departure

In [None]:
# Attach the airport name and coordinates to each flight by matching the
# flight's 'origin' code against the airports' 'ICAO' code.
# A left join keeps exactly one row per flight: flights without a matching
# airport get NaN coordinates (and are removed by the later dropna), while
# airports that no flight departs from are never materialised.
aviation_origin = pd.merge(flights, airports, left_on='origin', right_on='ICAO', how='left')
aviation_origin.rename(columns={'lat': 'origin_lat', 'long': 'origin_long'}, inplace=True)
aviation_origin

In [None]:
# Count the NaN values per column of the merged origin table
aviation_origin.isna().sum()

In [None]:
# Discard every merged row that has a NaN in at least one column
aviation_origin.dropna(axis='index', how='any', inplace=True)
aviation_origin

In [None]:
# Take only the origin latitude/longitude columns from 'aviation_origin'
origin_coordinates = aviation_origin.loc[:, ['origin_lat', 'origin_long']]
origin_coordinates

### Arrival

In [None]:
# Attach the airport name and coordinates to each flight by matching the
# flight's 'destination' code against the airports' 'ICAO' code.
# A left join keeps exactly one row per flight: flights without a matching
# airport get NaN coordinates (and are removed by the later dropna), while
# airports that no flight arrives at are never materialised.
aviation_destination = pd.merge(flights, airports, left_on='destination', right_on='ICAO', how='left')
aviation_destination.rename(columns={'lat': 'destination_lat', 'long': 'destination_long'}, inplace=True)
aviation_destination

In [None]:
# Count the NaN values per column of the merged destination table
aviation_destination.isna().sum()

In [None]:
# Discard every merged row that has a NaN in at least one column
aviation_destination.dropna(axis='index', how='any', inplace=True)
aviation_destination

In [None]:
# Take only the destination latitude/longitude columns from 'aviation_destination'
destination_coordinates = aviation_destination.loc[:, ['destination_lat', 'destination_long']]
destination_coordinates

## Visualization

### Matplotlib

In [None]:
# Group the merged origin table by company and count the non-null entries of
# every other column; 'company' stays a regular column (as_index=False)
aviation_origin_count = aviation_origin.groupby('company', as_index=False).count()
aviation_origin_count

In [None]:
# Bar plot: one bar per company, bar height = count of the 'flight' column
plt.bar(aviation_origin_count['company'], aviation_origin_count['flight'])

plt.xlabel('company')
plt.xticks(rotation=90)  # vertical tick labels so the company codes do not overlap
plt.ylabel('flight')
plt.title('How many flights does each company have')
# savefig() has no 'overwrite' keyword (an existing file is always replaced);
# passing it raises TypeError on current Matplotlib, so it is omitted.
# The figure is saved before show().
plt.savefig('Figure1.png')

plt.show()

In [None]:
# Rank the companies by flight count, highest first, with a fresh 0..n-1
# index, so the top three appear in the first three rows
aviation_origin_top = (
    aviation_origin_count
    .sort_values('flight', ascending=False)
    .reset_index(drop=True)
)
aviation_origin_top

In [None]:
# Remove the first three rows (the three companies with the most flights) so
# that the remaining companies can be compared on one scale, then renumber
top_three_labels = aviation_origin_top.index[:3]
aviation_origin_top = aviation_origin_top.drop(index=top_three_labels).reset_index(drop=True)
aviation_origin_top

In [None]:
# Bar plot of the companies left after the top three were removed:
# one bar per company, bar height = count of the 'flight' column
plt.bar(aviation_origin_top['company'], aviation_origin_top['flight'])

plt.xlabel('company')
plt.xticks(rotation=90)  # vertical tick labels so the company codes do not overlap
plt.ylabel('flight')
plt.title('How many flights does each company have (Except Top 3)')
# savefig() has no 'overwrite' keyword (an existing file is always replaced);
# passing it raises TypeError on current Matplotlib, so it is omitted.
# The figure is saved before show().
plt.savefig('Figure2.png')

plt.show()

### Map

In [None]:
# Build the folium base map. The centre point is presumably chosen to frame
# Brazil at this zoom level — TODO confirm.
# NOTE(review): the name `map` shadows Python's builtin map(); renaming it
# would also require updating the later cell that adds layers to it.
map = folium.Map(
    location=(-12.109923, -57.8987643),
    zoom_start=5.32,
)
map

In [None]:
# Add the origin coordinates of every flight to the map as a clustered
# marker layer, followed by a layer-control widget

origin_markers = FastMarkerCluster(data=aviation_origin[['origin_lat', 'origin_long']])
origin_markers.add_to(map)
folium.LayerControl().add_to(map)

map