# Predicting Flight Delays

This example shows how to use classification models to predict flight delays. The models are trained to predict the `Cancelled` column of the flights dataset.
Original example can be found [here](https://github.com/frenchlam/dask_CDSW/blob/master/03_Dask_ML-LargeDS.ipynb).

### Notes on running this example:

By default, the code runs with Bodo, so the data is distributed in chunks across processes.

To run the code:
1. Make sure you [add your AWS account credentials to Saturn Cloud](https://saturncloud.io/docs/examples/python/load-data/qs-load-data-s3/#create-aws-credentials) to access the data.
2. If you want to run the example using pandas only (without Bodo):
    1. Comment out the magic expression (`%%px`) and the Bodo decorator (`@bodo.jit`) in all the code cells.
    2. Then, re-run cells from the beginning.

### Start an IPyParallel cluster
Run the following code in a cell to start an IPyParallel cluster. The cluster uses one engine per physical CPU core, up to a maximum of 8.

In [None]:
import ipyparallel as ipp
import psutil

# One MPI engine per physical core, capped at 8.
# psutil.cpu_count(logical=False) returns None when the physical core count
# cannot be determined, which would make min() raise a TypeError; fall back to
# a single engine in that case.
n = min(psutil.cpu_count(logical=False) or 1, 8)
# activate=True registers the %px/%%px magics for the returned client.
rc = ipp.Cluster(engines="mpi", n=n).start_and_connect_sync(activate=True)

### Verifying your setup
Run the following code to verify that your IPyParallel cluster is set up correctly:

In [None]:
%%px
import bodo

# Every engine reports its own rank and the total number of ranks.
rank, size = bodo.get_rank(), bodo.get_size()
print(f"Hello World from rank {rank}. Total ranks={size}")

## Importing the Packages

These are the main packages we are going to work with:
 - Bodo to parallelize Python code automatically
 - Pandas to work with data
 - Scikit-learn to build and evaluate classification models

In [None]:
%%px
import time

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

## Part 1. Pre-processing in Pandas

### 1. Read flights dataset

In [None]:
%%px
@bodo.jit(distributed=["flight_df"], cache=True)
def read_flights(input_file):
    """Read the flights CSV file, keeping only the columns used in this example.

    Args:
        input_file: Location of a comma-separated file whose first row is the
            header (the call below passes a ``.csv.bz2`` file stored on S3).

    Returns:
        DataFrame containing only the columns listed in ``usecols``. The
        variable name ``flight_df`` is the one listed in the decorator's
        ``distributed`` flag.
    """
    flight_df = pd.read_csv(
        input_file,
        sep=",",
        header=0,
        # Only the scheduling, carrier, airport, and cancellation columns
        # are loaded; every other column in the file is skipped.
        usecols=[
            "Month",
            "DayofMonth",
            "DayOfWeek",
            "CRSDepTime",
            "CRSArrTime",
            "UniqueCarrier",
            "FlightNum",
            "Origin",
            "Dest",
            "Cancelled",
        ],
    )
    # Quick look at the first rows to confirm the read worked.
    print(flight_df.head())
    return flight_df


# Flights data for the year 1988 (bz2-compressed CSV in an S3 bucket).
input_file = "s3://bodo-example-data/flights/1988.csv.bz2"
flight_df = read_flights(input_file)

### 2. Feature Engineering
1. Create routes from origin and destination

In [None]:
%%px
@bodo.jit(distributed=["flight_df"], cache=True)
def create_routes(flight_df):
    """Add a ``route`` column and keep only flights on the 50 busiest routes.

    Args:
        flight_df: Flights DataFrame with ``Origin`` and ``Dest`` columns.

    Returns:
        The rows of ``flight_df`` whose route is among the 50 routes with the
        most flights, including the new ``route`` column.
    """
    # A route is the origin and destination joined as "<Origin>_<Dest>".
    flight_df["route"] = flight_df["Origin"] + "_" + flight_df["Dest"]

    # Number of flights per route, busiest first; show the top 10.
    route_counts = flight_df["route"].value_counts(ascending=False)
    print(route_counts.head(10))

    # Keep only the 50 routes with the most flights.
    busiest_routes = route_counts.head(50)
    on_busy_route = flight_df["route"].isin(busiest_routes.index)
    flight_df = flight_df[on_busy_route]
    return flight_df


flight_df = create_routes(flight_df)

2. Look at their cancellations

In [None]:
%%px
@bodo.jit(distributed=["flight_df"], cache=True)
def check_cancelations(flight_df):
    """Print flight counts and summed ``Cancelled`` values for the top 10 routes.

    Args:
        flight_df: Flights DataFrame with ``route``, ``Cancelled`` and
            ``Month`` columns.
    """
    by_route = flight_df[["route", "Cancelled", "Month"]].groupby(by="route")
    # "Month" is only used to count the rows of each route; "Cancelled" is summed.
    totals = by_route.agg({"Month": "size", "Cancelled": "sum"})
    renamed = totals.rename(columns={"Month": "count", "Cancelled": "nb_cancelled"})
    # Routes with the most flights come first.
    ranked = renamed.reset_index().sort_values(["count"], ascending=False)
    print(ranked.head(10))


check_cancelations(flight_df)

In [None]:
%%px
@bodo.jit(distributed=["flight_df"])
def print_info(flight_df):
    """Print the ``shape`` of ``flight_df``."""
    dimensions = flight_df.shape
    print(dimensions)


print_info(flight_df)

3. Quick sanity check - count the number of null values in each column

In [None]:
%%px
@bodo.jit(distributed=["flight_df"])
def check_count(flight_df):
    """Print the number of null values found in each column of ``flight_df``."""
    nulls_per_column = flight_df.isnull().sum()
    print(nulls_per_column)


check_count(flight_df)

### 3. Feature and label encoding

#### 1. Encode Labels using Cancelled column

In [None]:
%%px
@bodo.jit(distributed=["flight_df"], cache=True)
def encode_labels(flight_df):
    """Replace the ``Cancelled`` column with an integer ``Label`` column.

    Args:
        flight_df: Flights DataFrame with a ``Cancelled`` column.

    Returns:
        ``flight_df`` with ``Label`` holding the categorical codes of
        ``Cancelled`` and with ``Cancelled`` itself removed.
    """
    # Convert to a categorical first so each distinct value gets a code.
    flight_df["Cancelled"] = pd.Categorical(flight_df["Cancelled"])
    flight_df["Label"] = flight_df["Cancelled"].cat.codes
    # The raw column is no longer needed once the codes are stored.
    flight_df.drop(["Cancelled"], axis=1, inplace=True)
    return flight_df


flight_df = encode_labels(flight_df)

#### 2. Feature Encoding

This is needed because sklearn only supports numerical values

a. Get airport unique values

b. Encode origin, destination, and route features

In [None]:
%%px
@bodo.jit(distributed=["flight_df"], cache=True)
def get_airport_list(flight_df):
    """Return the sorted unique values found in the ``Origin`` and ``Dest`` columns.

    Args:
        flight_df: Flights DataFrame with ``Origin`` and ``Dest`` columns.

    Returns:
        Sorted array of every distinct value appearing in either column.
    """
    # Stack both columns so an airport seen only as an origin or only as a
    # destination is still included.
    all_airports = pd.concat((flight_df["Origin"], flight_df["Dest"]))
    distinct_airports = all_airports.unique()
    airport_list = np.sort(distinct_airports)
    return airport_list


airport_list = get_airport_list(flight_df)

In [None]:
%%px
@bodo.jit(distributed=["flight_df", "airport_list"], cache=True)
def encode_features(flight_df, airport_list):
    """Add integer-encoded versions of the carrier, airport and route columns.

    Args:
        flight_df: Flights DataFrame with ``UniqueCarrier``, ``Origin``,
            ``Dest`` and ``route`` columns.
        airport_list: All airport values that can appear in ``Origin`` or
            ``Dest``; used to fit the shared airport encoder.

    Returns:
        ``flight_df`` with the added columns ``Carrier_encoded``,
        ``Origin_encoded``, ``Dest_encoded`` and ``route_encoded``.
    """
    t1 = time.time()
    # NOTE: the encoded arrays are assigned to the columns directly instead of
    # being wrapped in pd.Series(...). A freshly built Series carries a new
    # 0..n-1 index, and plain pandas aligns on index labels when a Series is
    # assigned to a column, so the values would be misplaced (or NaN) whenever
    # flight_df's index is not 0..n-1, e.g. after rows have been filtered out.
    # Assigning the array keeps the encoding positional.

    # Encode airlines
    le_carrier = LabelEncoder()
    flight_df["Carrier_encoded"] = le_carrier.fit_transform(
        flight_df["UniqueCarrier"].values
    )
    # Encode airports: the same encoder is used for both origin and
    # destination so an airport gets the same code in both columns.
    le_airport = LabelEncoder()
    le_airport.fit(airport_list)
    flight_df["Origin_encoded"] = le_airport.transform(flight_df["Origin"])
    flight_df["Dest_encoded"] = le_airport.transform(flight_df["Dest"])
    # Encode routes
    le_route = LabelEncoder()
    flight_df["route_encoded"] = le_route.fit_transform(flight_df["route"].values)
    print("Encoding time: ", (time.time() - t1), " sec")
    return flight_df


flight_df = encode_features(flight_df, airport_list)

In [None]:
%%px
@bodo.jit(distributed=["flight_df"], cache=True)
def sample(flight_df):
    """Print 10 sampled rows showing each raw column next to its encoded column."""
    # Each original column is followed by its encoded counterpart so the
    # mapping can be checked by eye.
    side_by_side = flight_df[
        [
            "UniqueCarrier",
            "Carrier_encoded",
            "Origin",
            "Origin_encoded",
            "Dest",
            "Dest_encoded",
            "route",
            "route_encoded",
        ]
    ]
    picked = side_by_side.sample(10)
    print(picked)


sample(flight_df)

In [None]:
%%px
@bodo.jit(
    distributed=["flight_df", "X_train", "X_test", "y_train", "y_test"], cache=True
)
def split_data(flight_df):
    """Split the encoded flights data into train and test sets (70% / 30%).

    Args:
        flight_df: Flights DataFrame containing the encoded feature columns
            and the ``Label`` column.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``. The feature frames hold
        every column except the raw string columns and ``Label``; the target
        series hold ``Label``.
    """
    t1 = time.time()
    X_train, X_test, y_train, y_test = train_test_split(
        # "Label" must be dropped from the features as well: leaving the
        # target inside X leaks the answer to the models and makes the
        # reported accuracy meaningless.
        flight_df.drop(["UniqueCarrier", "Origin", "Dest", "route", "Label"], axis=1),
        flight_df["Label"],
        test_size=0.3,
        train_size=0.7,
        # Fixed seed so the split is reproducible between runs.
        random_state=100,
    )
    print("Data splitting time: ", (time.time() - t1), " sec")

    return X_train, X_test, y_train, y_test


X_train, X_test, y_train, y_test = split_data(flight_df)

## Part 2: Model Training - Using Scikit-learn

### 1. RandomForestClassifier

In [None]:
%%px
@bodo.jit(distributed=["X_train", "y_train", "X_test", "y_test"], cache=True)
def rf_model(X_train, X_test, y_train, y_test):
    """Fit a default RandomForestClassifier and print its timing and accuracy.

    Args:
        X_train: Training features (DataFrame).
        X_test: Test features (DataFrame).
        y_train: Training labels (Series).
        y_test: Test labels, compared against the predictions.
    """
    t0 = time.time()
    clf = RandomForestClassifier()
    train_features = X_train.to_numpy()
    clf.fit(train_features, y_train.values)
    test_features = X_test.to_numpy()
    y_pred = clf.predict(test_features)
    # The reported time covers model construction, fitting and prediction.
    print("RandomForestClassifier fit and predict time: ", time.time() - t0)
    print("Accuracy score {}".format(accuracy_score(y_test, y_pred)))


rf_model(X_train, X_test, y_train, y_test)

### 2. Logistic Regression

In [None]:
%%px
@bodo.jit(distributed=["X_train", "y_train", "X_test", "y_test"], cache=True)
def lr_model(X_train, X_test, y_train, y_test):
    """Fit a default LogisticRegression and print its timing and accuracy.

    Args:
        X_train: Training features (DataFrame).
        X_test: Test features (DataFrame).
        y_train: Training labels (Series).
        y_test: Test labels, compared against the predictions.
    """
    t0 = time.time()
    clf = LogisticRegression()
    train_features = X_train.to_numpy()
    clf.fit(train_features, y_train.values)
    test_features = X_test.to_numpy()
    y_pred = clf.predict(test_features)
    # The reported time covers model construction, fitting and prediction.
    print("Logistic Regression fit and predict time: ", time.time() - t0)
    print("Accuracy score {}".format(accuracy_score(y_test, y_pred)))


lr_model(X_train, X_test, y_train, y_test)

In [None]:
# To stop the cluster run the following command once you are done with the
# notebook. `rc` is the client returned by `start_and_connect_sync` in the
# cell that started the cluster; the %%px cells cannot run after this.
rc.cluster.stop_cluster_sync()