In [None]:
import sys

# For hiding passwords
import getpass

# For working with data
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# For building models
from sklearn.linear_model import LogisticRegression

# Silence warnings from this point on. Called with only the action,
# filterwarnings("ignore") is not restricted to UserWarning: every warning
# category is suppressed, including deprecation notices from the libraries
# used below. The two narrower, category-specific filters are kept commented out.
import warnings
# warnings.filterwarnings(action='ignore', category=UserWarning)
# warnings.filterwarnings(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")

# For registering models
from sasctl import publish_model, pzmm, Session
from pathlib import Path
import os
import requests
import json

In [None]:
# Load the flight-arrival statistics CSV (path is relative to the notebook's
# working directory) into a DataFrame
data = pd.read_csv(
    "./data/Detailed_Statistics_Arrivals_DM.csv"
)

In [None]:
# Leave the DataFrame as the cell's final expression so the notebook renders it
data

In [None]:
# Columns used as model inputs
inputs = [
    'Carrier',
    'DayofMonth',
    'OriginAirport',
    'ScheduledElapsedTime',
    'ArrivalHour',
    'DepartureHour',
]

# Column the model is trained to predict
target = 'Delay'

# Split the loaded frame into the feature table (X) and the label column (y)
X = data[inputs]
y = data[target]

In [None]:
# Hold out 30% of the rows as a test set; the fixed random_state makes the
# split repeatable between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Create one-hot-encoding step

# Categorical inputs that are expanded into indicator columns
cat_cols = ['Carrier', 'OriginAirport']

# Dense one-hot encoder that ignores categories it did not see during fit.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
# removed the old spelling, so try the current name first and fall back to the
# legacy one on older installations (an unknown keyword raises TypeError).
try:
    onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

cat_onehot_step = ('onehot', onehot_encoder)
cat_pipe = Pipeline([cat_onehot_step])

# NOTE(review): only `cat_cols` are routed to a transformer and `remainder` is
# left at its default, so the other input columns are presumably dropped
# before they reach the classifier -- confirm this is intended (otherwise pass
# remainder='passthrough').
ct = ColumnTransformer(transformers=[('cat', cat_pipe, cat_cols)])

In [None]:
# Scikit-learn
# Chain the column transformer with a logistic-regression classifier and train
# the whole pipeline on the training split. The fit call stays last so the
# fitted pipeline is shown as the cell output.
logreg_pipe = Pipeline(
    [
        ('transform', ct),
        ('logreg', LogisticRegression(solver='newton-cg')),
    ]
)
logreg_pipe.fit(X_train, y_train)

In [None]:
# Score the fitted pipeline on both splits so the two numbers can be compared
train = logreg_pipe.score(
    X_train,
    y_train,
)
test = logreg_pipe.score(
    X_test,
    y_test,
)

# Report the two scores
print("Training accuracy: ", train)
print("Test accuracy: ", test)

In [None]:
# Register Models
# Settings shared by the model-registration cells further down.

# Folder (relative to the working directory) that receives the model files
output_folder = "output"

# User ID recorded as the model developer
modeler = "jpnpul"

# Name of the SAS Model Manager project the model is registered under
project = "Flight Delay Prediction"

# Names of the model's output variables
score_metrics = [
    "EM_CLASSIFICATION",
    "EM_EVENTPROBABILITY",
]

In [None]:
# Open a session against the SAS environment.
# `Session` and `getpass` are already imported in the notebook's import cell,
# so they are not imported again here.

# Ask for the connection details. Whitespace is stripped from the hostname and
# user name because a stray space from copy/paste would otherwise end up in
# the connection request.
hostname = input("Hostname: ").strip()
username = input("Username: ").strip()
# getpass prompts without echoing the typed password
password = getpass.getpass("Password: ")

# NOTE(review): verify_ssl=False disables TLS certificate verification for
# this session. Keep it only for environments with self-signed certificates;
# confirm whether verification can be enabled for your server.
sess = Session(hostname, username, password, verify_ssl=False, protocol="https")

# Derive a SWAT connection from the session; `conn` is left as the final
# expression so the notebook displays it.
conn = sess.as_swat()
conn

In [None]:
# SKLearn
# Writes the model files and metadata for the fitted scikit-learn pipeline
# into its own sub-folder, then imports the model into the configured project.

# STEP 1: Initialize Variables

# The trained model
model = logreg_pipe

# Model name
model_prefix = 'SKLearn Logistic Regression v1'

# Model algorithm
algorithm = 'Logistic Regression'

# Probability cutoff used to turn event probabilities into 0/1 predictions for
# the performance statistics (kept in one place so the train frame, the test
# frame and the statistics call cannot drift apart)
prob_threshold = 0.25


def _build_scored_frame(fitted_model, features, actuals, threshold):
    """Return a frame with the columns 'actual', 'predict' and 'probability'.

    Parameters
    ----------
    fitted_model : estimator exposing ``predict_proba``
        Model used to score ``features``.
    features : pandas.DataFrame
        Rows to score.
    actuals : pandas.Series
        Observed target values for the same rows; its index is discarded.
    threshold : float
        A row is predicted as 1 when its probability is strictly greater
        than this value, otherwise 0.

    Returns
    -------
    pandas.DataFrame
        One row per input row, columns ordered actual, predict, probability.
    """
    scored = actuals.to_frame(name='actual').reset_index(drop=True)
    # Second predict_proba column is used as the event probability
    # (assumes it corresponds to class 1 -- TODO confirm against model.classes_)
    scored['probability'] = fitted_model.predict_proba(features)[:, 1]
    scored['predict'] = np.where(scored['probability'] > threshold, 1, 0)
    return scored[['actual', 'predict', 'probability']]


# STEP 2: Create subfolder
model_path = Path.cwd() / output_folder / model_prefix
# Creates missing parents and is a no-op when the folder already exists,
# replacing the separate exists-check followed by makedirs
model_path.mkdir(parents=True, exist_ok=True)

# STEP 3: Save binary model representation
pzmm.PickleModel.pickle_trained_model(model_prefix=model_prefix, trained_model=model, pickle_path=model_path)

# STEP 4: Create metadata files

# Model inputs
pzmm.JSONFiles.write_var_json(input_data=X, is_input=True, json_path=model_path)

# Model outputs (a one-row example frame: a string label and a float probability)
output_var = pd.DataFrame(columns=score_metrics, data=[["A", 0.5]])
pzmm.JSONFiles.write_var_json(input_data=output_var, is_input=False, json_path=model_path)

# Model performance
train_data = _build_scored_frame(model, X_train, y_train, prob_threshold)
test_data = _build_scored_frame(model, X_test, y_test, prob_threshold)

pzmm.JSONFiles.calculate_model_statistics(target_value=1, prob_value=prob_threshold,
                                          train_data=train_data, test_data=test_data, json_path=model_path)

# Basic model information
pzmm.JSONFiles.write_file_metadata_json(model_prefix=model_prefix, json_path=model_path)

pzmm.JSONFiles.write_model_properties_json(model_name=model_prefix, target_variable=target,
                                           target_values=['1', '0'], json_path=model_path,
                                           model_algorithm=algorithm, modeler=modeler)

# Model requirements
requirements_json = pzmm.JSONFiles.create_requirements_json(model_path)
with open(model_path / 'requirements.json', 'w') as req_file:
    req_file.write(json.dumps(requirements_json, indent=4))

# STEP 5: Import model
# Clear any score code left on the ScoreCode class before importing
pzmm.ScoreCode.score_code = ''
# NOTE(review): no cutoff is passed to import_model, so the generated score
# code may not classify with the same 0.25 threshold used for the statistics
# above -- confirm (recent sasctl releases appear to expose a
# predict_threshold argument for this).
lreg = pzmm.ImportModel.import_model(model_files=model_path, model_prefix=model_prefix, project=project,
                                     input_data=X, predict_method=[model.predict_proba, [float, float]],
                                     score_metrics=score_metrics, overwrite_model=True,
                                     target_values=['0', '1'], model_file_name=model_prefix + ".pickle")