# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import folium
import matplotlib as mpl

# Load dataset

In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Raw flight records (raw_2021.csv) and the airport code table, read as-is.
# NOTE(review): the two paths use different roots ('../input' vs '/kaggle/input');
# presumably both resolve on Kaggle — confirm.
df_brut = pd.read_csv('../input/flights-in-brazil-compilation-jun2019-jun2021/raw_2021.csv')
airports_code_brut = pd.read_csv('/kaggle/input/air-codeg/airport-codes.csv', engine='python')

In [None]:
df_brut.head(2)

In [None]:
airports_code_brut.head(2)

# Clean datasets

## 0) quick view

In [None]:
df_brut

In [None]:
df_brut['line_type'].hist();

In [None]:
# Work on copies so the raw frames (df_brut, airports_code_brut) stay untouched
# and can still be used as a reference, e.g. for row-count comparisons.
temp_df = df_brut.copy()
temp_airports_code = airports_code_brut.copy()

## 1) Drop useless columns

In [None]:
# Columns considered unnecessary for the analysis
non_necessary_df = ['auth_code', 'scheduled_dep', 'real_dep', 'jus_code']
temp_df = temp_df.drop(columns=non_necessary_df)

## 2) define columns type

### a) Datetime col

In [None]:
date_cols = ['scheduled_arr', 'real_arr']

# Parse both arrival columns into datetimes; existing columns keep their position.
temp_df = temp_df.assign(**{col: pd.to_datetime(temp_df[col]) for col in date_cols})

temp_df.head(2)

## 3) Delete some rows

### a) delete 'non realizado' flights

In [None]:
def n_rows_deleted(df_init, df_final):
    """Print how many rows separate two frames.

    Compares the row counts (``shape[0]``) of ``df_init`` and ``df_final`` and
    prints the absolute difference. Nothing is returned.
    """
    rows_before, rows_after = df_init.shape[0], df_final.shape[0]
    print(f"{abs(rows_before - rows_after)} rows have been deleted")

In [None]:
# Keep only the flights whose situation is 'REALIZADO', then report how many
# rows were removed relative to the raw frame.
realized = temp_df['situation'] == 'REALIZADO'
temp_df = temp_df[realized]
n_rows_deleted(df_brut, temp_df)

### b) keep N and I flights

In [None]:
# Keep only rows whose line_type is 'N' or 'I'; every other line type is dropped.
df_N = temp_df.loc[(temp_df['line_type'] == 'N')]
df_I = temp_df.loc[(temp_df['line_type'] == 'I')]

# The concatenation places all 'I' rows before the 'N' rows (original index labels are kept).
frames = [df_I, df_N]
temp_df = pd.concat(frames)

## 4) Delete out-of-range dates

In [None]:
def looking_for_dt(df):
    """Report the time span covered by the ``real_arr`` column.

    Prints the smallest and largest ``real_arr`` values and their difference,
    then returns them as the tuple ``(tmin, tmax, dt)``.
    """
    arrivals = df['real_arr']
    tmin, tmax = arrivals.min(), arrivals.max()
    print(f"t min = {tmin} \nt max = {tmax}")

    dt = tmax - tmin
    print(f"dt = {dt}")

    return tmin, tmax, dt

In [None]:
looking_for_dt(temp_df)

In [None]:
# Keep only rows whose real arrival is strictly before 1 August 2021.
# Only an upper bound is applied here; there is no lower bound on the date.
split_date = datetime.datetime(2021,8,1)
temp_df = temp_df.loc[temp_df['real_arr'] < split_date]

In [None]:
looking_for_dt(temp_df)

In [None]:
n_rows_deleted(df_brut, temp_df)

## 5) Create new columns

### a) Year

In [None]:
temp_df['year'] = temp_df['real_arr'].dt.year

### b) Month

In [None]:
temp_df['month'] = temp_df['real_arr'].dt.month

### c) Hour

In [None]:
temp_df['hour'] = temp_df['real_arr'].dt.hour

### d) dt_arr_min

In [None]:
# dt_arr = scheduled - real, so a negative value means the flight really
# arrived after its scheduled time.
temp_df['dt_arr'] = temp_df['scheduled_arr'] - temp_df['real_arr']
temp_df['dt_arr_min'] = temp_df['dt_arr'] / np.timedelta64(1, 'm')
# Anything not strictly negative (including missing values) is labelled 'Non Late'.
temp_df['delay'] = np.where(temp_df['dt_arr_min'] < 0, 'Late', 'Non Late')

### e) airports code : lat & long

In [None]:
# Split the 'coordinates' text on the comma once; the first piece is stored as
# 'long' and the second as 'lat'. Both stay strings exactly as split.
coord_parts = temp_airports_code['coordinates'].str.split(',', expand=True)
temp_airports_code['long'] = coord_parts[0]
temp_airports_code['lat'] = coord_parts[1]

## 6) NaN values ?

In [None]:
temp_df.isna().sum()

In [None]:
temp_df = temp_df.dropna()
temp_df.isna().sum()

## 7) prepare df for exploration

In [None]:
# Freeze the cleaned frames under their final names for the exploration part.
df = temp_df.copy()
airports_code = temp_airports_code.copy()
# Subset restricted to the rows labelled 'Late'.
df_delay = df.loc[df['delay'] == 'Late']

# Exploration : The Delay

This part focuses on delayed flights. I try to highlight some information about delays by airport, company, etc.

## 1) Percentage of delayed flight

In [None]:
# Share of flights labelled 'Late' among all cleaned flights.
total_flight = len(df)
delayed_flight = len(df_delay)
p_delayed_flight = delayed_flight / total_flight

print(f'percentage of delayed flight = {p_delayed_flight:.0%}')

In [None]:
ax = sns.countplot(x="delay", data=df)
ax.yaxis.set_major_formatter(mpl.ticker.StrMethodFormatter('{x:,.0f}'))

## 2) Statistics

### a) global

In [None]:
df_delay.boxplot(showfliers=False, column=['dt_arr_min']);

In [None]:
s_delay = df_delay['dt_arr_min']
s_delay.hist();

In [None]:
s_delay.hist(range=[-180,0])

### b) drop delays longer than 2 hours

In [None]:
# Keep only the delayed flights whose dt_arr_min is above -120 minutes,
# then print the share of delayed flights that remains.
within_two_hours = df_delay['dt_arr_min'] > -120
df_delay_120 = df_delay[within_two_hours]
p = len(df_delay_120) / len(df_delay)
print(f"{p:.2%} of delayed flights are conserved")

## 3) Delay by origin

In [None]:
def count_sum_by(df, criteria, colmn):
    """Aggregate one column per group of another column.

    The values of ``df[criteria]`` are grouped by ``df[colmn]``. The result is
    a DataFrame indexed by group with three columns: ``sum_delay`` (group
    sum), ``count_delay`` (group count) and ``ratio_sum_count`` (sum divided
    by count).
    """
    grouped = df[criteria].groupby(df[colmn])
    totals = grouped.sum()
    counts = grouped.count()

    return pd.DataFrame({
        'sum_delay': totals,
        'count_delay': counts,
        'ratio_sum_count': totals / counts,
    })

In [None]:
df_delay_origin = count_sum_by(df_delay_120, 'dt_arr_min', 'origin')

In [None]:
# Rank origins by number of delayed flights and keep the first n rows.
# (The variable is called worst_10_origin although n is 20.)
n = 20
df_delay_origin = df_delay_origin.sort_values(by='count_delay', ascending=False)
worst_10_origin = df_delay_origin.head(n)

### map

#### add lat and long

In [None]:
# Extract lat and long
cols = ['ident', 'name', 'long', 'lat']

# DataFrame.append was removed in pandas 2.0, so the matching airports are
# selected with a single boolean mask instead of growing a frame in a loop.
# The selection always carries the 'ident' column, even when nothing matches.
# Rows come out in airports_code order; the lookup is consumed through an
# index-keyed join on 'ident', which keeps the order of the left-hand frame.
t_df = airports_code.loc[airports_code['ident'].isin(worst_10_origin.index), cols]

In [None]:
worst_10_origin = worst_10_origin.join(t_df.set_index('ident'))

#### show

In [None]:
# 20-step colour list going from red, through orange and gold, to yellow.
# It is passed to map_airports, which picks one entry per row by position.
colors_grad = ['red',
 '#ff0d00',
 '#ff1b00',
 '#ff2800',
 '#ff3600',
 '#ff4300',
 '#ff5100',
 '#ff5e00',
 '#ff6b00',
 '#ff7900',
 '#ff8600',
 '#ff9400',
 '#ffa100',
 '#ffae00',
 '#ffbc00',
 '#ffc900',
 'gold',
 '#ffe400',
 '#fff200',
 'yellow']

In [None]:
def map_airports(df, colors):
    """Return a folium map with one marker per row of ``df``.

    df: frame providing 'lat', 'long' and 'ratio_sum_count' for each row.
        Rows are processed in their current order.
    colors: sequence of icon colours. The row at position i uses colors[i],
        and the last colour is reused once the sequence is exhausted.

    Rows whose 'lat' or 'long' is missing are skipped, because folium refuses
    NaN locations.
    """
    m = folium.Map(location=(-12.109923, -57.8987643), zoom_start=4.32)
    last_color = len(colors) - 1

    for i in range(len(df)):
        r = df.iloc[i]
        lat, lon = r['lat'], r['long']

        # Missing coordinates presumably come from airports absent from the
        # lookup table; such a row cannot be placed on the map.
        if pd.isna(lat) or pd.isna(lon):
            continue

        lbl = "{:.0f} min".format(r['ratio_sum_count'])
        # Clamp the position so extra rows reuse the final colour instead of
        # raising IndexError.
        c = colors[min(i, last_color)]

        folium.Marker(
            location=[lat, lon],
            popup=lbl,
            icon=folium.Icon(color="white", icon_color=c, icon="info-sign"),
        ).add_to(m)

    return m

In [None]:
worst_10_origin = worst_10_origin.sort_values(by=['ratio_sum_count'], ascending=False)
map_airports(worst_10_origin, colors_grad)

## 4) Delay by destination

In [None]:
# Aggregate the delays per destination, rank by number of delayed flights
# and keep the first n rows (worst_10_dest holds n = 20 rows).
n = 20
dest_stats = count_sum_by(df_delay_120, 'dt_arr_min', 'destination')
df_delay_dest = dest_stats.sort_values(by='count_delay', ascending=False)
worst_10_dest = df_delay_dest.head(n)

In [None]:
cols = ['ident', 'name', 'long', 'lat']

# DataFrame.append was removed in pandas 2.0, so the matching airports are
# selected with a single boolean mask instead of growing a frame in a loop.
# The selection always carries the 'ident' column, even when nothing matches.
t_df = airports_code.loc[airports_code['ident'].isin(worst_10_dest.index), cols]

# Index-keyed left join: the row order of worst_10_dest is preserved.
worst_10_dest = worst_10_dest.join(t_df.set_index('ident'))

In [None]:
worst_10_dest = worst_10_dest.sort_values(by=['ratio_sum_count'], ascending=False)
map_airports(worst_10_dest, colors_grad)

## 5) Delay by airports

In [None]:
#df_airports = df_destination.add(df_origin, fill_value=0)
df_delay_airports = df_delay_dest.add(df_delay_origin, fill_value=0)
df_delay_airports['ratio_sum_count'] = df_delay_airports['sum_delay'] / df_delay_airports['count_delay']
df_delay_airports = df_delay_airports.sort_values(by='count_delay', ascending=False)
n=20
worst_10_airports = df_delay_airports.iloc[:n]

In [None]:
cols = ['ident', 'name', 'long', 'lat']

# DataFrame.append was removed in pandas 2.0, so the matching airports are
# selected with a single boolean mask instead of growing a frame in a loop.
# The selection always carries the 'ident' column, even when nothing matches.
t_df = airports_code.loc[airports_code['ident'].isin(worst_10_airports.index), cols]

# Index-keyed left join: the row order of worst_10_airports is preserved.
worst_10_airports = worst_10_airports.join(t_df.set_index('ident'))

In [None]:
worst_10_airports = worst_10_airports.sort_values(by=['ratio_sum_count'], ascending=False)
map_airports(worst_10_airports, colors_grad)