In [1]:
import sys

# For hiding passwords
import getpass

# For working with data
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# For building models
from sklearn.linear_model import LogisticRegression

# Ignoring user warning
import warnings
# warnings.filterwarnings(action='ignore', category=UserWarning)
# warnings.filterwarnings(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")

# For registering models
from sasctl import publish_model, pzmm, Session
from pathlib import Path
import os
import requests
import json

In [2]:
# Import data
data = pd.read_csv("./data/Detailed_Statistics_Arrivals_DM.csv")

In [3]:
data

Unnamed: 0.1,Unnamed: 0,Carrier,Date,FlightNumber,TailNumber,OriginAirport,ScheduledArrival,ActualArrival,ScheduledElapsedTime,ActualElapsedTime,...,DelayNAS,DelaySecurity,DelayLateAircraftArrival,DayofMonth,ScheduledArrivalDatetime,Delay,DepartureDatetime,ArrivalHour,DepartureHour,DepartureTime
0,0,American Airlines,09/01/2022,317,N904AA,DFW,22:27:00,00:18:00,167,161,...,0,0,30,1,2022-09-01 22:27:00,1,2022-09-01 19:40:00,22,19,19:40:00
1,1,American Airlines,09/01/2022,425,N934AA,PHX,21:38:00,21:50:00,71,63,...,0,0,0,1,2022-09-01 21:38:00,0,2022-09-01 20:27:00,21,20,20:27:00
2,2,American Airlines,09/01/2022,739,N997AA,LAX,18:49:00,18:53:00,74,79,...,0,0,0,1,2022-09-01 18:49:00,0,2022-09-01 17:35:00,18,17,17:35:00
3,3,American Airlines,09/01/2022,750,N537UW,CLT,16:09:00,15:54:00,276,268,...,0,0,0,1,2022-09-01 16:09:00,0,2022-09-01 11:33:00,16,11,11:33:00
4,4,American Airlines,09/01/2022,862,N323RM,PHX,18:01:00,17:54:00,70,70,...,0,0,0,1,2022-09-01 18:01:00,0,2022-09-01 16:51:00,18,16,16:51:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13510,13510,Southwest Airlines,09/30/2022,4577,N8805L,MSY,14:15:00,14:07:00,230,216,...,0,0,0,30,2022-09-30 14:15:00,0,2022-09-30 10:25:00,14,10,10:25:00
13511,13511,Southwest Airlines,09/30/2022,6508,N8530W,LAX,09:05:00,08:59:00,70,66,...,0,0,0,30,2022-09-30 09:05:00,0,2022-09-30 07:55:00,9,7,07:55:00
13512,13512,Southwest Airlines,09/30/2022,6513,N8773Q,SBA,20:55:00,20:38:00,75,67,...,0,0,0,30,2022-09-30 20:55:00,0,2022-09-30 19:40:00,20,19,19:40:00
13513,13513,Southwest Airlines,09/30/2022,6516,N939WN,SJC,10:05:00,10:11:00,90,85,...,0,0,0,30,2022-09-30 10:05:00,0,2022-09-30 08:35:00,10,8,08:35:00


In [4]:
# Specify inputs and target
inputs = ['Carrier', 'DayofMonth', 'OriginAirport', 'ScheduledElapsedTime', 'ArrivalHour', 'DepartureHour']
target = 'Delay'

# Create X and y datasets
X = data[inputs]
y = data[target]

In [5]:
# Separate training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [6]:
# Create one-hot-encoding step
cat_cols = ['Carrier', 'OriginAirport']
cat_onehot_step = ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'))
cat_pipe = Pipeline([cat_onehot_step])
ct = ColumnTransformer(transformers=[('cat', cat_pipe, cat_cols)])

In [7]:
# Scikit-learn
# Create pipeline with one-hot-encoding and logistic regression
logreg_pipe = Pipeline([('transform', ct), ('logreg', LogisticRegression(solver='newton-cg'))])
logreg_pipe.fit(X_train, y_train)

In [8]:
train = logreg_pipe.score(X_train, y_train)
test = logreg_pipe.score(X_test, y_test)

print("Training accuracy: ", train)
print("Test accuracy: ", test)

Training accuracy:  0.7717758985200845
Test accuracy:  0.7694204685573366


In [9]:
# Register Models

# The folder where we can store our model files
output_folder = 'output'

# The model developer, in this case ours truly
modeler = "jpnpul"

# The project within SAS Model Manager
project = "Flight Delay Prediction"

# Model outputs
score_metrics = ["EM_CLASSIFICATION", "EM_EVENTPROBABILITY"]

In [10]:
from sasctl import Session
import getpass

hostname = input("Hostname: ")
username = input("Username: ")
password = getpass.getpass("Password: ")
 
sess = Session(hostname, username, password, verify_ssl=False, protocol="https")
conn = sess.as_swat()
conn

Hostname: cisviya.sas.com
Username: jpnpul
Password: ········




CAS('cisviya.sas.com', 443, protocol='https', name='py-session-1', session='25902130-fb52-cd4e-b228-753d5aac3fb3')

In [11]:
# SKLearn

# STEP 1: Initialize Variables

# The trained model
model = logreg_pipe

# Model name
model_prefix = 'SKLearn Logistic Regression v1'

# Model algorithm
algorithm = 'Logistic Regression'

# STEP 2: Create subfolder
model_path = Path.cwd() / output_folder / model_prefix
if not os.path.exists(model_path):
    os.makedirs(model_path)

# STEP 3: Save binary model representation
pzmm.PickleModel.pickle_trained_model(model_prefix=model_prefix, trained_model=model, pickle_path=model_path)

# STEP 4: Create metadata files

# Model inputs
pzmm.JSONFiles.write_var_json(input_data=X, is_input=True, json_path=model_path)

# Model outputs
output_var = pd.DataFrame(columns=score_metrics, data=[["A", 0.5]])
pzmm.JSONFiles.write_var_json(input_data=output_var, is_input=False, json_path=model_path)

# Model performance
train_data = y_train.to_frame(name='actual').reset_index(drop=True)
train_data['probability'] = model.predict_proba(X_train)[:,1]
train_data['predict'] = np.where(train_data['probability'] > 0.25, 1, 0)
train_data = train_data[['actual', 'predict', 'probability']]

test_data = y_test.to_frame(name='actual').reset_index(drop=True)
test_data['probability'] = model.predict_proba(X_test)[:,1]
test_data['predict'] = np.where(test_data['probability'] > 0.25, 1, 0)
test_data = test_data[['actual', 'predict', 'probability']]

pzmm.JSONFiles.calculate_model_statistics(target_value=1, prob_value=0.25, 
                                          train_data=train_data, test_data=test_data, json_path=model_path)

# Basic model information
pzmm.JSONFiles.write_file_metadata_json(model_prefix=model_prefix, json_path=model_path)

pzmm.JSONFiles.write_model_properties_json(model_name=model_prefix, target_variable=target,
                                           target_values=['1', '0'], json_path=model_path,
                                           model_algorithm=algorithm, modeler=modeler)

# Model requirements
requirements_json = pzmm.JSONFiles.create_requirements_json(model_path)
with open(Path(model_path) / 'requirements.json', 'w') as req_file:
    req_file.write(json.dumps(requirements_json, indent=4))
    
# STEP 5: Import model
pzmm.ScoreCode.score_code = ''
lreg = pzmm.ImportModel.import_model(model_files=model_path, model_prefix=model_prefix, project=project,
                                     input_data=X, predict_method=[model.predict_proba, [float, float]],
                                     score_metrics=score_metrics, overwrite_model=True,
                                     target_values=['0', '1'], model_file_name=model_prefix + ".pickle")

Model SKLearn Logistic Regression v1 was successfully pickled and saved to C:\code\sascode\python\output\SKLearn Logistic Regression v1\SKLearn Logistic Regression v1.pickle.
inputVar.json was successfully written and saved to C:\code\sascode\python\output\SKLearn Logistic Regression v1\inputVar.json
outputVar.json was successfully written and saved to C:\code\sascode\python\output\SKLearn Logistic Regression v1\outputVar.json




dmcas_fitstat.json was successfully written and saved to C:\code\sascode\python\output\SKLearn Logistic Regression v1\dmcas_fitstat.json
dmcas_roc.json was successfully written and saved to C:\code\sascode\python\output\SKLearn Logistic Regression v1\dmcas_roc.json
dmcas_lift.json was successfully written and saved to C:\code\sascode\python\output\SKLearn Logistic Regression v1\dmcas_lift.json
fileMetadata.json was successfully written and saved to C:\code\sascode\python\output\SKLearn Logistic Regression v1\fileMetadata.json
ModelProperties.json was successfully written and saved to C:\code\sascode\python\output\SKLearn Logistic Regression v1\ModelProperties.json




TypeError: list indices must be integers or slices, not NoneType

In [12]:
# STEP 5: Import model
pzmm.ScoreCode.score_code = ''
lreg = pzmm.ImportModel.import_model(model_files=model_path, model_prefix=model_prefix, project=project,
                                     input_data=X, predict_method=[model.predict_proba, [float, float]],
                                     score_metrics=score_metrics, overwrite_model=True,
                                     target_values=['0', '1'], model_file_name=model_prefix + ".pickle")

TypeError: list indices must be integers or slices, not NoneType

In [None]:
pzmm.ImportModel.import_model(model_files=model_path, model_prefix=model_prefix, project=mm_project, input_data=input_data, 
predict_method=[model.predict_proba, [float, float]], score_metrics=score_metrics, overwrite_model=True, target_values=["1", "0"], 
model_file_name=model_prefix + ".pickle")