# Tasks
- Drop a variable with too many null values
- Replace null values in the variables we want to feed into the ML model
- Minimize the dimensions of the variables


In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats

%matplotlib inline

# Load the data: 

In [None]:
# Read the raw flight records from a CSV located one directory above the notebook.
airports_df = pd.read_csv('../2008.csv')
# Bare last expression: render the loaded frame as the cell output.
airports_df

# Drop the variables that we don't need: 

In [None]:
# Remove the columns that are not used in the rest of the notebook and
# rebind the name to the narrower frame.
airports_df = airports_df.drop(
    columns=[
        "ArrTime", "ActualElapsedTime", "AirTime", "TaxiIn", "Diverted",
        "CarrierDelay", "WeatherDelay", "NASDelay", "SecurityDelay",
        "LateAircraftDelay",
    ]
)
airports_df

# 1) Feature Engineering
## 1. Handling missing values
## 1.1. Locating missing values

In [None]:
# Per column: True when the column contains at least one null value.
airports_df.isnull().any()

In [None]:
# Per column: number of null values.
airports_df.isnull().sum() 

## 1.2. Dropping the `CancellationCode` variable

In [None]:
# Share of missing entries above which a column or a row counts as "mostly null".
threshold = 0.9
# Column labels whose fraction of null values exceeds the threshold.
columns_with_null = airports_df.columns[airports_df.isna().mean(axis=0) > threshold]
# Rows whose fraction of null fields exceeds the threshold.
rows_with_null = airports_df[airports_df.isna().mean(axis=1) > threshold]

In [None]:
# Column labels whose null fraction exceeded `threshold`.
columns_with_null

In [None]:
# Rows whose null fraction exceeded `threshold`.
rows_with_null

<span style="color:blue"> <b> Drop `CancellationCode`: its share of null values exceeds the threshold (0.9). </b> </span>

In [None]:
# Rebind instead of using inplace=True: the previous frame object is left
# untouched, and the statement reads the same as the other drops in this notebook.
airports_df = airports_df.drop(columns="CancellationCode")

## 1.3. Dropping the rows where `Cancelled` is 1

Checking the relation among `DepTime`, `DepDelay`,  `TaxiOut`

In [None]:
# Subset of flights flagged as cancelled (Cancelled == 1).
cancelled_flights = airports_df.loc[airports_df['Cancelled'].eq(1)]
# Number of cancelled flights.
len(cancelled_flights)

In [None]:
# Per column: share of all null values that sit in cancelled flights.
# Columns without any null value give 0 / 0 and therefore show NaN.
cancelled_flights.isnull().sum() / airports_df.isnull().sum() 

Therefore, all the null values in `DepTime`, `DepDelay`, and `TaxiOut` belong to cancelled flights (`Cancelled` == 1). <br>
Most (over 90%) of the null values in `TailNum`, `ArrDelay`, `ResDelay` belong to cancelled flights (`Cancelled` == 1).

In [None]:
# True when no cancelled flight carries a non-null ArrDelay.
not cancelled_flights['ArrDelay'].notnull().any()

No cancelled flight has a non-null `ArrDelay` (the target variable); therefore, cancelled flights should be dropped. <br>
<span style="color:blue"> <b> Drop rows where `Cancelled` is 1.</b> </span>

In [None]:
# Keep only the flights that were not cancelled (Cancelled == 0).
airports_df = airports_df.loc[airports_df['Cancelled'].eq(0)]

<span style="color:blue"> <b> Drop the variable `Cancelled`, because every remaining value is 0. </b> </span>

In [None]:
# DataFrame.pop() accepts only the column label, so passing `axis` raised a
# TypeError and the column was never removed. Drop the now-constant column
# and rebind, which also avoids mutating the filtered frame in place.
airports_df = airports_df.drop(columns='Cancelled')

## 1.4. Dropping the rows where the `ArrDelay` is null

Since `ArrDelay` is our target variable, rows with null values in `ArrDelay` should be dropped.

In [None]:
# Keep only the rows whose target value (ArrDelay) is present.
airports_df = airports_df.loc[airports_df['ArrDelay'].notna()]

In [None]:
# Per-column null counts of the frame as it stands at this point.
airports_df.isnull().sum()

<span style="color:blue"> <b> In conclusion, we dropped the variable `CancellationCode`, the rows of cancelled flights (`Cancelled` == 1), and the rows with a null `ArrDelay`.</b> </span>

In [None]:
# (number of rows, number of columns) of the current frame.
airports_df.shape

## 1.2. Treat categorical variables: 

## 1.2.1. Converting time-relevant columns into one `datetime` field

Although `DepTime`, `CRSDepTime`, and `CRSArrTime` look numerical, they are not true quantities: each value is a time of day encoded as `HHMM` (e.g. `1955` means 19:55).

In [None]:
def _hhmm_to_timestamp(flight_date, hhmm):
    """Combine per-row dates with HHMM-encoded clock times into timestamps.

    Parameters
    ----------
    flight_date : pandas.Series of datetime64
        Midnight of each row's calendar date.
    hhmm : pandas.Series
        Clock times encoded as numbers, e.g. 1955 for 19:55.

    Returns
    -------
    pandas.Series of datetime64
        ``flight_date`` shifted by the encoded hour and minute. Rows whose
        value is missing, non-numeric, or not a valid clock time (hour outside
        0-23 or minute outside 0-59, e.g. 2400) are NaT.
    """
    hhmm = pd.to_numeric(hhmm, errors='coerce')
    hours = hhmm // 100
    minutes = hhmm % 100
    # NaN compares as False here, so missing values are masked out too.
    valid = hours.between(0, 23) & minutes.between(0, 59)
    offset = (pd.to_timedelta(hours.where(valid), unit='h')
              + pd.to_timedelta(minutes.where(valid), unit='m'))
    return flight_date + offset


def merge_datetime_columns(df):
    """Add actual and scheduled departure timestamps to a flights frame.

    Combines ``Year``/``Month``/``DayofMonth`` with the HHMM-encoded
    ``DepTime`` and ``CRSDepTime`` columns into two datetime columns:

    * ``DepTS``    - actual departure timestamp
    * ``CSRDepTS`` - scheduled departure timestamp (the existing "CSR"
      spelling is kept so the column name does not change)

    Values that cannot be turned into a valid timestamp (missing time, hour 24,
    invalid calendar date, ...) become NaT instead of raising. The scheduled
    arrival time (``CRSArrTime``) is not converted.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Year``, ``Month``, ``DayofMonth``, ``DepTime`` and
        ``CRSDepTime``.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when both timestamp columns already exist; otherwise a
        new frame with the two columns appended. The input frame is not
        modified, so it is safe to pass a filtered slice.
    """
    if 'DepTS' in df and 'CSRDepTS' in df:
        return df

    # pd.to_datetime assembles dates from columns named year / month / day.
    date_parts = df[["Year", "Month", "DayofMonth"]].rename(
        columns={"Year": "year", "Month": "month", "DayofMonth": "day"}
    )
    flight_date = pd.to_datetime(date_parts, errors='coerce')

    return df.assign(
        DepTS=_hhmm_to_timestamp(flight_date, df["DepTime"]),
        CSRDepTS=_hhmm_to_timestamp(flight_date, df["CRSDepTime"]),
    )

In [None]:
# Rebind to the returned frame, which carries the DepTS / CSRDepTS columns.
airports_df = merge_datetime_columns(airports_df)
airports_df

<span style="color:red"> <b> Arrival date? </b> </span>

In [None]:
# Flights whose scheduled arrival clock time is smaller than the scheduled
# departure clock time.
# NOTE(review): presumably these arrive on the next calendar day or cross
# time zones - confirm before deriving an arrival date.
airports_df[airports_df['CRSArrTime'] < airports_df['CRSDepTime']]

In [None]:
# Split the frame by storage dtype.
# cat_mask is True for columns stored as object dtype.
cat_mask = (airports_df.dtypes == object)
cat_cols = [column for column, is_object in cat_mask.items() if is_object]

# df_cat: the object-dtype (categorical) columns.
df_cat = airports_df[cat_cols]
# df_num: every remaining column, i.e. all non-object dtypes
# (this includes the datetime columns DepTS / CSRDepTS when present).
df_num = airports_df.drop(columns=cat_cols)

df_cat

In [None]:
# Display the non-object-dtype columns that were split off into df_num.
df_num

In [None]:
# Fill missing values in every categorical column with that column's most
# frequent value. ("CancellationCode" was dropped earlier in this notebook,
# so it is not among the columns imputed here.)
from sklearn.impute import SimpleImputer

imp_cat = SimpleImputer(strategy='most_frequent')

# The transformed values are wrapped back into a DataFrame so that the
# original column labels and row index are kept.
df_cat = pd.DataFrame(imp_cat.fit_transform(df_cat),
                     columns=df_cat.columns, index=df_cat.index)
df_cat