In [None]:
import sys

# For hiding passwords
import getpass

# For working with data
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# For building models
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import mlflow
from mlflow.models.signature import infer_signature
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf

# Ignoring user warning
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning)
warnings.filterwarnings(action='ignore', category=FutureWarning)
warnings.filterwarnings(action='ignore', message='Unverified HTTPS request')

# For registering models
from sasctl import publish_model, pzmm, Session
from pathlib import Path
import os
import requests
import json

In [None]:
# Load the raw arrival statistics from the notebook's local data folder
arrivals_csv = "./data/Detailed_Statistics_Arrivals.csv"
data = pd.read_csv(arrivals_csv)

In [None]:
# Preview the first rows of the freshly loaded frame
data.head()

In [None]:
# Show the dtype of every column
data.dtypes

In [None]:
# Remap Airline Names
# The mapping gets a descriptive name instead of `dict`: binding it to `dict`
# would shadow the built-in type for every cell that runs afterwards.
carrier_names = {
    "AA": 'American Airlines',
    "AS": 'Alaska Airlines',
    "B6": 'JetBlue Airways',
    "DL": 'Delta Airlines',
    "F9": 'Frontier Airlines',
    "NK": 'Spirit Airlines',
    "WN": 'Southwest Airlines',
}

# Only the 'Carrier' column is rewritten; values not present in the mapping stay as they are
data = data.replace({"Carrier": carrier_names})

In [None]:
# Clean up date columns
# Parse the month/day/year strings once and keep the day of the month as a feature
flight_date = pd.to_datetime(data['Date'], format='%m/%d/%Y')
data['DayofMonth'] = flight_date.dt.day

# Concatenate the date and scheduled-arrival strings and parse them as a single timestamp
arrival_stamp = data['Date'] + data['ScheduledArrival']
data['ScheduledArrivalDatetime'] = pd.to_datetime(arrival_stamp, format='%m/%d/%Y%H:%M')

In [None]:
# Create Delay
# Binary target: 1 when ArrivalDelay is greater than 15, otherwise 0
# (units follow the source data, presumably minutes; confirm)
late_arrival = data['ArrivalDelay'] > 15
data['Delay'] = np.where(late_arrival, 1, 0)

In [None]:
# Create Departure Time
# Scheduled arrival minus the scheduled elapsed time (interpreted as minutes)
elapsed = pd.to_timedelta(data['ScheduledElapsedTime'], unit='minute')
data['DepartureDatetime'] = data['ScheduledArrivalDatetime'] - elapsed

# Extract Arrival and Departure Hour
for hour_col, stamp_col in (('ArrivalHour', 'ScheduledArrivalDatetime'),
                            ('DepartureHour', 'DepartureDatetime')):
    data[hour_col] = pd.to_datetime(data[stamp_col]).dt.hour

In [None]:
# Clean up time columns
# '24:00' is outside the range %H accepts, so it is mapped to '23:59' wherever it appears in the frame
data = data.replace({'24:00': '23:59'})

# Reduce the clock strings to plain time-of-day values
for clock_col in ('ScheduledArrival', 'ActualArrival'):
    parsed_clock = pd.to_datetime(data[clock_col], format='%H:%M')
    data[clock_col] = parsed_clock.dt.time

# Time-of-day part of the derived departure timestamp
departure_stamp = pd.to_datetime(data['DepartureDatetime'], format='%H:%M')
data['DepartureTime'] = departure_stamp.dt.time

In [None]:
# Preview the frame after the date/time clean-up and feature derivation
data.head()

In [None]:
# Count missing values per column
data.isnull().sum()

In [None]:
# Preview the first rows again before exploring the data
data.head()

In [None]:
# Bar chart of the 20 most frequent origin airports (value_counts sorts by descending frequency)
origin_counts = data['OriginAirport'].value_counts()
origin_counts[:20].plot(kind='bar', xlabel='OriginAirport')

In [None]:
# Number of distinct origin airports
n_origins = data['OriginAirport'].nunique()
print("There are", n_origins, "unique origin airports in this dataset.")

# Rows whose origin airport is RDU
from_rdu = data[data['OriginAirport'] == 'RDU']
print("There were", from_rdu.shape[0], "direct flights from RDU.")

In [None]:
# Side-by-side bars per carrier: total flights and number of delayed flights
flights_per_carrier = data['Carrier'].value_counts()
delays_per_carrier = data.groupby('Carrier')['Delay'].sum()
carrier_summary = pd.concat([flights_per_carrier, delays_per_carrier], axis=1)
carrier_summary.plot.bar(xlabel='Carrier')

In [None]:
# Share of flights flagged as delayed, as a percentage
delayed_pct = data['Delay'].sum() / len(data) * 100
print(round(delayed_pct, 2), "% of flights were delayed")

In [None]:
# Specify inputs and target
target = 'Delay'
inputs = [
    'Carrier',
    'DayofMonth',
    'OriginAirport',
    'ScheduledElapsedTime',
    'ArrivalHour',
    'DepartureHour',
]

# Create X and y datasets: predictor frame and label series
X, y = data[inputs], data[target]

In [None]:
# Separate training and test set
# 30% of the rows are held out; the fixed random_state makes the split repeatable
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Create one-hot-encoding step
cat_cols = ['Carrier', 'OriginAirport']

# scikit-learn renamed OneHotEncoder's `sparse` argument to `sparse_output` and
# later dropped the old spelling. Try the current name first and fall back so the
# cell also runs on older installations.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

cat_onehot_step = ('onehot', onehot_encoder)
cat_pipe = Pipeline([cat_onehot_step])

# remainder='passthrough' forwards the columns that are not one-hot encoded to the
# estimator. With ColumnTransformer's default (remainder='drop') every column
# outside cat_cols is discarded, so the pipelines would train on the two
# categorical columns only.
ct = ColumnTransformer(transformers=[('cat', cat_pipe, cat_cols)], remainder='passthrough')

In [None]:
# Scikit-learn
# Logistic regression stacked on the shared column transformer
logreg_steps = [
    ('transform', ct),
    ('logreg', LogisticRegression(solver='newton-cg')),
]
logreg_pipe = Pipeline(logreg_steps)
logreg_pipe.fit(X_train, y_train)

In [None]:
# Score the logistic-regression pipeline on the training split, then on the test split
train, test = (
    logreg_pipe.score(features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

print("Training accuracy: ", train)
print("Test accuracy: ", test)

In [None]:
# XGBoost
# Gradient-boosted classifier stacked on the shared column transformer
xgb_classifier = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_pipe = Pipeline([
    ('transform', ct),
    ('xgb', xgb_classifier),
])
xgb_pipe.fit(X_train, y_train)

In [None]:
# Score the XGBoost pipeline on both splits
xgb_scores = {
    'train': xgb_pipe.score(X_train, y_train),
    'test': xgb_pipe.score(X_test, y_test),
}
train, test = xgb_scores['train'], xgb_scores['test']

print("Training accuracy: ", train)
print("Test accuracy :", test)

In [None]:
# MLflow

# Random forest stacked on the shared column transformer.
# random_state is pinned (same seed as the train/test split) so that
# Restart & Run All rebuilds the same forest and the logged metrics are repeatable.
forest_pipe = Pipeline([
    ('transform', ct),
    ('forest', RandomForestClassifier(n_estimators=50, max_depth=7, random_state=42)),
])
forest_pipe.fit(X_train, y_train)

In [None]:
# Infer the MLflow model signature from the training inputs and the pipeline's predictions on them
forest_train_preds = forest_pipe.predict(X_train)
signature = infer_signature(X_train, forest_train_preds)

In [None]:
# Score the random-forest pipeline on the training and test splits
train = forest_pipe.score(X_train, y_train)
test = forest_pipe.score(X_test, y_test)

# Print each score and record it as a metric with MLflow
print("Training accuracy: ", train)
mlflow.log_metric("Training accuracy", train)

# The held-out figure must be `test`; reporting `train` here would make the
# logged "Test accuracy" a copy of the training score.
print("Test accuracy: ", test)
mlflow.log_metric("Test accuracy", test)

In [None]:
# Log the fitted pipeline, with its signature, under the "model" artifact path
mlflow.sklearn.log_model(forest_pipe, "model", signature=signature)

# Report which run received the model
active_run_id = mlflow.active_run().info.run_uuid
print("Model save in run %s" % active_run_id)

In [None]:
# Keras
# The network is trained on the four numeric columns only
numeric_cols = ['DayofMonth', 'ScheduledElapsedTime', 'ArrivalHour', 'DepartureHour']
X_train_keras = X_train[numeric_cols]
X_test_keras = X_test[numeric_cols]

# Two ReLU hidden layers (42 and 72 units) followed by a single sigmoid output unit
hidden_layers = [tf.keras.layers.Dense(units, activation="relu") for units in (42, 72)]
output_layer = tf.keras.layers.Dense(1, activation="sigmoid")
keras = tf.keras.Sequential(hidden_layers + [output_layer])

keras.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["accuracy"])
keras.fit(X_train_keras, y_train)

# evaluate() returns the loss followed by the compiled metrics; index 1 is the accuracy
train_metrics = keras.evaluate(X_train_keras, y_train)
train = train_metrics[1]
test_metrics = keras.evaluate(X_test_keras, y_test)
test = test_metrics[1]

print("Training accuracy: ", train)
print("Test accuracy: ", test)

In [None]:
# Register Models
# Settings shared by every registration cell below.

# Folder (relative to the working directory) that holds the generated model files
output_folder = "output"

# Name recorded as the model developer
modeler = "jpnpul"

# Target project within SAS Model Manager
project = "Flight Delay Prediction"

# Names of the model outputs
score_metrics = ["EM_CLASSIFICATION", "EM_EVENTPROBABILITY"]

In [None]:
# Connect to SAS Viya.
# `getpass` and `Session` are already imported in the notebook's import cell, so
# they are not re-imported here. All three values are prompted for at run time,
# which keeps them out of the saved notebook.
hostname = getpass.getpass("Hostname: ")
username = getpass.getpass("Username: ")
password = getpass.getpass("Password: ")

# NOTE(review): verify_ssl=False turns off TLS certificate verification for this
# session. Acceptable for a demo host with a self-signed certificate, but it
# should be enabled against any real environment.
sess = Session(hostname, username, password, verify_ssl=False, protocol="https")

# Derive a SWAT (CAS) connection from the session and display it
conn = sess.as_swat()
conn

In [None]:
# SKLearn

# STEP 1: Initialize Variables
model = logreg_pipe                                # the trained model
model_prefix = 'SKLearn Logistic Regression v1'    # model name
algorithm = 'Logistic Regression'                  # model algorithm

# STEP 2: Create subfolder
model_path = Path.cwd() / output_folder / model_prefix
if not model_path.exists():
    model_path.mkdir(parents=True)

# STEP 3: Save binary model representation
pzmm.PickleModel.pickle_trained_model(trained_model=model, model_prefix=model_prefix,
                                      pickle_path=model_path)

# STEP 4: Create metadata files

# Model inputs
pzmm.JSONFiles.write_var_json(input_data=X, is_input=True, json_path=model_path)

# Model outputs: a one-row example frame carrying the score metric names
output_var = pd.DataFrame(data=[["A", 0.5]], columns=score_metrics)
pzmm.JSONFiles.write_var_json(input_data=output_var, is_input=False, json_path=model_path)

# Model performance: actual label, thresholded prediction and event probability per split
threshold = 0.25
scored = {}
for split, features, labels in (('train', X_train, y_train), ('test', X_test, y_test)):
    frame = labels.to_frame(name='actual').reset_index(drop=True)
    frame['probability'] = model.predict_proba(features)[:, 1]
    frame['predict'] = np.where(frame['probability'] > threshold, 1, 0)
    scored[split] = frame[['actual', 'predict', 'probability']]
train_data, test_data = scored['train'], scored['test']

pzmm.JSONFiles.calculate_model_statistics(train_data=train_data, test_data=test_data,
                                          target_value=1, prob_value=threshold,
                                          json_path=model_path)

# Basic model information
pzmm.JSONFiles.write_file_metadata_json(model_prefix=model_prefix, json_path=model_path)

pzmm.JSONFiles.write_model_properties_json(model_name=model_prefix, target_variable=target,
                                           target_values=['1', '0'], json_path=model_path,
                                           model_algorithm=algorithm, modeler=modeler)

# Model requirements
requirements_json = pzmm.JSONFiles.create_requirements_json(model_path)
with open(Path(model_path) / 'requirements.json', 'w') as req_file:
    json.dump(requirements_json, req_file, indent=4)

# STEP 5: Import model
# Clear any score code left on the ScoreCode class before generating this model's code
pzmm.ScoreCode.score_code = ''
lreg = pzmm.ImportModel.import_model(
    model_files=model_path,
    model_prefix=model_prefix,
    project=project,
    input_data=X,
    predict_method=[model.predict_proba, [float, float]],
    score_metrics=score_metrics,
    overwrite_model=True,
    target_values=['0', '1'],
    model_file_name=model_prefix + ".pickle",
)

In [None]:
# XGBoost

# STEP 1: Initialize Variables

# Trained pipeline, registered name and algorithm label
model = xgb_pipe
model_prefix = 'XGBoost v1'
algorithm = 'XGBoost'

# STEP 2: Create subfolder
model_path = Path.cwd() / output_folder / model_prefix
folder_missing = not os.path.exists(model_path)
if folder_missing:
    os.makedirs(model_path)

# STEP 3: Save binary model representation
pzmm.PickleModel.pickle_trained_model(
    model_prefix=model_prefix, trained_model=model, pickle_path=model_path
)

# STEP 4: Create metadata files

# Model inputs
pzmm.JSONFiles.write_var_json(input_data=X, is_input=True, json_path=model_path)

# Model outputs: a one-row example frame carrying the score metric names
example_outputs = [["A", 0.5]]
output_var = pd.DataFrame(columns=score_metrics, data=example_outputs)
pzmm.JSONFiles.write_var_json(input_data=output_var, is_input=False, json_path=model_path)

# Model performance: actual label, thresholded prediction and event probability per split
train_data = (
    y_train.to_frame(name='actual')
    .reset_index(drop=True)
    .assign(probability=model.predict_proba(X_train)[:, 1])
    .assign(predict=lambda scored: np.where(scored['probability'] > 0.25, 1, 0))
    [['actual', 'predict', 'probability']]
)

test_data = (
    y_test.to_frame(name='actual')
    .reset_index(drop=True)
    .assign(probability=model.predict_proba(X_test)[:, 1])
    .assign(predict=lambda scored: np.where(scored['probability'] > 0.25, 1, 0))
    [['actual', 'predict', 'probability']]
)

pzmm.JSONFiles.calculate_model_statistics(
    target_value=1, prob_value=0.25,
    train_data=train_data, test_data=test_data, json_path=model_path,
)

# Basic model information
pzmm.JSONFiles.write_file_metadata_json(model_prefix=model_prefix, json_path=model_path)

pzmm.JSONFiles.write_model_properties_json(
    model_name=model_prefix, target_variable=target,
    target_values=['1', '0'], json_path=model_path,
    model_algorithm=algorithm, modeler=modeler,
)

# Model requirements
requirements_json = pzmm.JSONFiles.create_requirements_json(model_path)
requirements_text = json.dumps(requirements_json, indent=4)
with open(Path(model_path) / 'requirements.json', 'w') as req_file:
    req_file.write(requirements_text)

# STEP 5: Import model
# Clear any score code left on the ScoreCode class before generating this model's code
pzmm.ScoreCode.score_code = ''
lreg = pzmm.ImportModel.import_model(
    model_files=model_path, model_prefix=model_prefix, project=project,
    input_data=X, predict_method=[model.predict_proba, [float, float]],
    score_metrics=score_metrics, overwrite_model=True,
    target_values=['0', '1'], model_file_name=model_prefix + ".pickle",
)

In [None]:
# MLFlow

# STEP 1: Initialize Variables

# The trained model
model = forest_pipe

# Model name
model_prefix = 'MLflow Forest v1'

# Model algorithm
algorithm = 'Random Forest'

# Probability cutoff used for the predicted class and for the model statistics.
# Defined once so the three places that need it cannot drift apart.
threshold = 0.25

# Model location.
# The path is built from the active run's own experiment and run ids, so it also
# resolves when the run was logged under an experiment other than the default "0".
# It still assumes the local ./mlruns file store.
run_info = mlflow.active_run().info
mlflow_model_path = Path("./mlruns") / run_info.experiment_id / run_info.run_id / "artifacts" / "model"

# MLflow model files
# NOTE(review): inputs_dict and outputs_dict are read here but not used below; the
# variable metadata is written from X and output_var instead. Confirm this is intended.
metadata_dict, inputs_dict, outputs_dict = pzmm.MLFlowModel.read_mlflow_model_file(mlflow_model_path)


# STEP 2: Create subfolder
# exist_ok avoids the check-then-create race of a separate existence test
model_path = Path.cwd() / output_folder / model_prefix
model_path.mkdir(parents=True, exist_ok=True)

# STEP 3: Save binary model representation
pzmm.PickleModel.pickle_trained_model(model_prefix=model_prefix, pickle_path=model_path, mlflow_details=metadata_dict)

# STEP 4: Create metadata files

# Model inputs
pzmm.JSONFiles.write_var_json(input_data=X, is_input=True, json_path=model_path)

# Model outputs: a one-row example frame carrying the score metric names
output_var = pd.DataFrame(columns=score_metrics, data=[["A", 0.5]])
pzmm.JSONFiles.write_var_json(input_data=output_var, is_input=False, json_path=model_path)

# Model performance: actual label, thresholded prediction and event probability per split
train_data = y_train.to_frame(name='actual').reset_index(drop=True)
train_data['probability'] = model.predict_proba(X_train)[:, 1]
train_data['predict'] = np.where(train_data['probability'] > threshold, 1, 0)
train_data = train_data[['actual', 'predict', 'probability']]

test_data = y_test.to_frame(name='actual').reset_index(drop=True)
test_data['probability'] = model.predict_proba(X_test)[:, 1]
test_data['predict'] = np.where(test_data['probability'] > threshold, 1, 0)
test_data = test_data[['actual', 'predict', 'probability']]

pzmm.JSONFiles.calculate_model_statistics(target_value=1, prob_value=threshold,
                                          train_data=train_data, test_data=test_data, json_path=model_path)

# Basic model information
pzmm.JSONFiles.write_file_metadata_json(model_prefix=model_prefix, json_path=model_path)

# NOTE(review): this cell registers "tensor" as the target variable and score metric,
# unlike the other registration cells. Presumably it mirrors the output name in the
# MLflow signature; confirm.
pzmm.JSONFiles.write_model_properties_json(model_name=model_prefix, target_variable="tensor",
                                           target_values=['1', '0'], json_path=model_path,
                                           model_algorithm=algorithm, modeler=modeler)

# Model requirements
requirements_json = pzmm.JSONFiles.create_requirements_json(model_path)
with open(Path(model_path) / 'requirements.json', 'w') as req_file:
    json.dump(requirements_json, req_file, indent=4)

# STEP 5: Import model
# Clear any score code left on the ScoreCode class before generating this model's code
pzmm.ScoreCode.score_code = ''
lreg = pzmm.ImportModel.import_model(model_files=model_path, model_prefix=model_prefix, project=project,
                                     input_data=X, predict_method=[model.predict_proba, [float, float]],
                                     score_metrics=["tensor"], overwrite_model=True,
                                     target_values=['0', '1'], pickle_type=metadata_dict["serialization_format"],
                                     model_file_name=model_prefix + ".pickle")

In [None]:
# Keras

# STEP 1: Initialize Variables

# The trained model
model = keras

# Model name
model_prefix = 'Keras Neural Network v1'

# Model algorithm
algorithm = 'Neural Network'

# File name of the saved network. The same name is used for saving and for
# import_model; a file saved under a different name than the one passed as
# model_file_name would not be the file the registration refers to.
model_file_name = model_prefix + ".h5"

# Probability cutoff used for the predicted class and for the model statistics
threshold = 0.25

# STEP 2: Create subfolder
# exist_ok avoids the check-then-create race of a separate existence test
model_path = Path.cwd() / output_folder / model_prefix
model_path.mkdir(parents=True, exist_ok=True)

# STEP 3: Save binary model representation
model.save(str(model_path / model_file_name))

# STEP 4: Create metadata files

# Model inputs
# The network was fitted on X_train_keras (numeric columns only), so the input
# metadata is written from that frame, matching input_data passed to import_model below.
pzmm.JSONFiles.write_var_json(input_data=X_train_keras, is_input=True, json_path=model_path)

# Model outputs: a one-row example frame carrying the score metric names
output_var = pd.DataFrame(columns=score_metrics, data=[["A", 0.5]])
pzmm.JSONFiles.write_var_json(input_data=output_var, is_input=False, json_path=model_path)

# Model performance: actual label, thresholded prediction and event probability per split.
# predict() yields one column per output unit; ravel() flattens the single-unit
# output to a 1-D array before it becomes a DataFrame column.
train_data = y_train.to_frame(name='actual').reset_index(drop=True)
train_data['probability'] = model.predict(X_train_keras).ravel()
train_data['predict'] = np.where(train_data['probability'] > threshold, 1, 0)
train_data = train_data[['actual', 'predict', 'probability']]

test_data = y_test.to_frame(name='actual').reset_index(drop=True)
test_data['probability'] = model.predict(X_test_keras).ravel()
test_data['predict'] = np.where(test_data['probability'] > threshold, 1, 0)
test_data = test_data[['actual', 'predict', 'probability']]

pzmm.JSONFiles.calculate_model_statistics(target_value=1, prob_value=threshold,
                                          train_data=train_data, test_data=test_data, json_path=model_path)

# Basic model information
pzmm.JSONFiles.write_file_metadata_json(model_prefix=model_prefix, json_path=model_path)

pzmm.JSONFiles.write_model_properties_json(model_name=model_prefix, target_variable=target,
                                           target_values=['1', '0'], json_path=model_path,
                                           model_algorithm=algorithm, modeler=modeler)

# Model requirements
requirements_json = pzmm.JSONFiles.create_requirements_json(model_path)
with open(Path(model_path) / 'requirements.json', 'w') as req_file:
    json.dump(requirements_json, req_file, indent=4)

# STEP 5: Import model
# Clear any score code left on the ScoreCode class before generating this model's code
pzmm.ScoreCode.score_code = ''
# NOTE(review): predict_method declares two int outputs while the network has a
# single sigmoid unit; confirm the expected return types against the sasctl docs.
lreg = pzmm.ImportModel.import_model(model_files=model_path, model_prefix=model_prefix, project=project,
                                     input_data=X_train_keras, predict_method=[model.predict, [int, int]],
                                     score_metrics=score_metrics, overwrite_model=True,
                                     target_values=['0', '1'], model_file_name=model_file_name,
                                     tf_keras_model=True)