Copyright © 2021, SAS Institute Inc., Cary, NC, USA.  All Rights Reserved.
SPDX-License-Identifier: Apache-2.0

# HMEQ Dataset : Build and Import Trained Models into SAS Model Manager

This notebook provides an example of how to build and train an H2O model in Python and then import the model into SAS Model Manager using the home equity (HMEQ) data set. Lines of code that must be modified by the user, such as directory paths, are noted with the comment "_Changes required by user._".

_**Note:** If you download only this notebook and not the rest of the repository, you must also download the hmeq.csv file from the data folder in the examples directory. This file is used when executing this notebook example._

Here are the steps shown in this notebook:

1. Import and review data and preprocess for model training.
2. Build, train, and assess an H2O generalized linear model (GLM).
3. Serialize the models into separate pickle files.
4. Write the metadata JSON files needed for importing into SAS Model Manager as well as optional files for fit statistics and ROC/Lift charts.
5. Write a score code Python file for model scoring.
6. Zip the pickle, JSON, and score code files into an archive file.
7. Import the ZIP archive file to SAS Model Manager via the Session object and relevant function call.

### Python Package Imports

In [None]:
# Dataframes for data manipulations
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
# Mathematical calculations and array handling
import numpy as np

# Data partitioning for TRAIN and TEST data sets
from sklearn.model_selection import train_test_split
# Decision tree, random forest, and gradient boosting models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# Model assessments 
from sklearn.metrics import classification_report, confusion_matrix

# Embedded plotting
import matplotlib.pyplot as plt 
plt.rc("font", size=14)

# Pathing support
from pathlib import Path

# Import H2O and check the version
import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

# sasctl interface for importing models
import sasctl.pzmm as pzmm
from sasctl import Session
from sasctl.services import model_repository as modelRepo

In [None]:
# Display the version of the installed h2o package.
h2o.__version__

In [None]:
# Initialize the H2O connection used by the data import and training cells below.
h2o.init()

### Import and Review Data Set

In [None]:
# Changes required by user: location of the HMEQ CSV file (relative to the
# working directory of the notebook).
hmeqData = h2o.import_file('data/hmeq.csv', sep=',')

# Show the dimensions of the imported frame.
hmeqData.shape

### Preprocess Data

In [None]:
# Name of the target column.
y = 'BAD'

# Convert the target to a factor column before fitting the binomial model.
hmeqData[y] = hmeqData[y].asfactor()

# Every column other than the target is used as a predictor.
x = [column for column in hmeqData.columns if column != y]

# Split into three frames; the fixed seed makes the partition reproducible.
train, validation, test = hmeqData.split_frame(ratios=[0.6, 0.2], seed=42)

### Create, Train, and Assess Model

In [None]:
# Binomial GLM with lambda search enabled.
glmFit = H2OGeneralizedLinearEstimator(
    family='binomial',
    model_id='glmfit',
    lambda_search=True,
)

# Fit on the training frame and track metrics on the validation frame.
glmFit.train(
    x=x,
    y=y,
    training_frame=train,
    validation_frame=validation,
)

In [None]:
# Score the model on the test frame, which was not passed to train() as either
# the training or the validation frame, and print the accuracy reported by the
# resulting performance object.
glmPerf = glmFit.model_performance(test)
print(glmPerf.accuracy())

In [None]:
# Save the trained GLM in the same folder that the registration cells below use as zipFolder.
# Changes required by user: adjust the path if the model files should be stored elsewhere.
h2o.save_model(glmFit, path='data/hmeqModels/H2OBinaryGLM')

### Register Model in SAS Model Manager with pzmm

In [None]:
# Prefix passed to the pzmm calls below; it is also used as the model name in
# the model properties file.
modelPrefix = 'glmFit'

# Changes required by user: folder that collects the files to be imported.
zipFolder = Path.cwd() / 'data/hmeqModels/H2OBinaryGLM/'

# The first positional argument only fills the method's placeholder slot. Pass
# an explicit None rather than IPython's implicit `_` (the previous cell's
# output), so the call does not depend on whatever the last cell returned and
# does not raise NameError when the code is run outside IPython.
pzmm.PickleModel.pickleTrainedModel(None, glmFit, modelPrefix, zipFolder, isH2OModel=True, isBinaryModel=True)

In [None]:
# pzmm works on pandas objects, so convert the H2O training frame first.
trainDF = train.as_data_frame()
J = pzmm.JSONFiles()

# Write input variable mapping to a json file
J.writeVarJSON(trainDF[x], isInput=True, jPath=zipFolder)

# Set output variables and assign an event threshold, then write output variable mapping
outputVar = pd.DataFrame(columns=['EM_EVENTPROBABILITY', 'EM_CLASSIFICATION'])
# One row per target level, taken from the levels present in the training data.
outputVar['EM_CLASSIFICATION'] = trainDF[y].astype('category').cat.categories.astype('str')
outputVar['EM_EVENTPROBABILITY'] = 0.5 # Event threshold
J.writeVarJSON(outputVar, isInput=False, jPath=zipFolder)

# Write model properties to a json file.
# The target is binary (the model above is fit with family='binomial'), so it
# has two categories rather than one.
J.writeModelPropertiesJSON(modelName=modelPrefix,
                            modelDesc='',
                            targetVariable=y,
                            modelType='',
                            modelPredictors=x,
                            targetEvent=1,
                            numTargetCategories=2,
                            eventProbVar='EM_EVENTPROBABILITY',
                            jPath=zipFolder,
                            modeler='sasdemo')

# Write model metadata to a json file
J.writeFileMetadataJSON(modelPrefix, jPath=zipFolder)

In [None]:
import getpass

# Prompt with explicit labels so the two inputs can be told apart; without a
# prompt argument both calls show the same default "Password: " text.
# getpass is kept for the username as well so it is not echoed into the
# notebook output.
username = getpass.getpass('Username: ')
password = getpass.getpass('Password: ')

# Changes required by user: host name of the SAS Viya server.
host = 'fall2020patch3.edm.sashq-r.openstack.sas.com'#'sas.demo.com'

# NOTE(review): protocol='http' sends the credentials unencrypted; switch to
# 'https' if the server supports it.
# The session object is not passed explicitly to the import call in the next
# cell; presumably sasctl uses the most recently created session.
sess = Session(host, username, password, protocol='http')

In [None]:
# Import the contents of zipFolder into SAS Model Manager.
# 'BinaryH2OExample' is presumably the name of the target project, and
# force=True presumably allows an existing project or model to be replaced;
# confirm both against the sasctl documentation.
pzmm.ImportModel.pzmmImportModel(
    zipFolder,
    modelPrefix,
    'BinaryH2OExample',
    trainDF[x],
    trainDF[y],
    predictmethod=None,
    isH2OModel=True,
    force=True,
)
