Copyright © 2021, SAS Institute Inc., Cary, NC, USA.  All Rights Reserved.
SPDX-License-Identifier: Apache-2.0

# HMEQ Dataset : Build and Import Trained Models into SAS Model Manager

This notebook provides an example of how to build and train a Python model and then import the model into SAS Model Manager using the HMEQ data set. Lines of code that must be modified by the user, such as directory paths, are noted with the comment "_Changes required by user._".

_**Note:** If you download only this notebook and not the rest of the repository, you must also download the hmeq.csv file from the data folder in the examples directory. This file is used when executing this notebook example._

Here are the steps shown in this notebook:

1. Import and review data and preprocess for model training.
2. Build, train, and assess a decision tree, random forest, and gradient boosting model.
3. Serialize the models into separate pickle files.
4. Write the metadata JSON files needed for importing into SAS Model Manager as well as optional files for fit statistics and ROC/Lift charts.
5. Write a score code Python file for model scoring.
6. Zip the pickle, JSON, and score code files into an archive file.
7. Import the ZIP archive file to SAS Model Manager via the Session object and relevant function call.

### Python Package Imports

In [None]:
# Dataframes for data manipulations
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
# Mathematical calculations and array handling
import numpy as np

# Data partitioning for TRAIN and TEST data sets
from sklearn.model_selection import train_test_split
# Decision tree, random forest, and gradient boosting models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# Model assessments 
from sklearn.metrics import classification_report, confusion_matrix

# Embedded plotting
import matplotlib.pyplot as plt 
plt.rc("font", size=14)

# Pathing support
from pathlib import Path

# sasctl interface for importing models
import sasctl.pzmm as pzmm
from sasctl import Session

### Import and Review Data Set

In [None]:
# Load the comma-separated HMEQ file from the local data folder
hmeqData = pd.read_csv(filepath_or_buffer='data/hmeq.csv', sep=',')
# Show the (rows, columns) dimensions of the loaded frame
hmeqData.shape

In [None]:
# Preview the first five rows (the default row count, stated explicitly)
hmeqData.head(n=5)

In [None]:
# Draw the column histograms on a 4 x 4 grid; the trailing semicolon hides the axes repr
hmeqData.hist(layout=(4, 4), figsize=(15, 15));

In [None]:
# List the column labels of the loaded data set
hmeqData.columns

### Preprocess Data

In [None]:
# Predictor columns used to train the models
predictorColumns = ['LOAN', 'MORTDUE', 'VALUE', 'YOJ', 'DEROG', 'DELINQ', 'CLAGE', 'NINQ', 'CLNO', 'DEBTINC']

targetColumn = 'BAD'
x = hmeqData[predictorColumns]
y = hmeqData[targetColumn]

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size=0.3, random_state=42)

# For missing values, impute the TRAIN partition's column means into both partitions.
# Using the test partition's own means would leak hold-out statistics into the
# evaluation and would not match what is available when new rows are scored.
# fillna returns new frames, so re-running this cell is idempotent and no
# in-place write on a slice of hmeqData is needed.
trainMeans = xTrain.mean()
xTrain = xTrain.fillna(trainMeans)
xTest = xTest.fillna(trainMeans)

### Create, Train, and Assess Model

In [None]:
# Fit each classifier on the training partition; fit() returns the fitted
# estimator itself, so construction and training can be chained.
# The fixed seed keeps the fitted models repeatable between runs.
treeModel = DecisionTreeClassifier(random_state=42).fit(xTrain, yTrain)
forestModel = RandomForestClassifier(random_state=42).fit(xTrain, yTrain)
gradientModel = GradientBoostingClassifier(random_state=42).fit(xTrain, yTrain)

In [None]:
def sortFeatureImportance(model, xData):
    """Map each feature name to its importance, formatted as a percentage string.

    Parameters
    ----------
    model : object
        Fitted estimator exposing a ``feature_importances_`` sequence.
    xData : pandas.DataFrame
        Frame whose ``columns`` are paired positionally with the importances.

    Returns
    -------
    dict
        ``{feature name: 'NN.NN%'}`` with keys inserted from the most to the
        least important feature (ties fall back to reverse name order).
    """
    ranked = sorted(zip(model.feature_importances_, xData.columns), reverse=True)
    # str() is kept on purpose so the text matches numpy's own scalar rendering
    return {column: str(np.round(score*100, 2)) + '%' for score, column in ranked}

# Start the comparison table with the decision tree; its ranking sets the row order
importances = pd.DataFrame.from_dict(sortFeatureImportance(treeModel, xTrain), orient='index').rename(columns={0: 'DecisionTree'})
# Add one column per remaining model; assignment aligns on the feature-name index
for columnLabel, fittedModel in (('RandomForest', forestModel), ('GradientBoosting', gradientModel)):
    importances[columnLabel] = pd.DataFrame.from_dict(sortFeatureImportance(fittedModel, xTrain), orient='index')
importances

In [None]:
# Score the hold-out partition with the decision tree
yTreePredict = treeModel.predict(xTest)
yTreeProba = treeModel.predict_proba(xTest)
# Print the confusion matrix, then the per-class classification report
for report in (confusion_matrix, classification_report):
    print(report(yTest, yTreePredict))
# Accuracy on the hold-out partition, as a percentage rounded to two decimals
treeAccuracy = np.round(treeModel.score(xTest, yTest)*100, 2)
print('Decision Tree Model Accuracy = ' + str(treeAccuracy) + '%')

In [None]:
# Score the hold-out partition with the random forest
yForestPredict = forestModel.predict(xTest)
yForestProba = forestModel.predict_proba(xTest)
# Print the confusion matrix, then the per-class classification report
for report in (confusion_matrix, classification_report):
    print(report(yTest, yForestPredict))
# Accuracy on the hold-out partition, as a percentage rounded to two decimals
forestAccuracy = np.round(forestModel.score(xTest, yTest)*100, 2)
print('Random Forest Model Accuracy = ' + str(forestAccuracy) + '%')

In [None]:
# Score the hold-out partition with the gradient boosting model
yGradientPredict = gradientModel.predict(xTest)
yGradientProba = gradientModel.predict_proba(xTest)
# Print the confusion matrix, then the per-class classification report
for report in (confusion_matrix, classification_report):
    print(report(yTest, yGradientPredict))
# Accuracy on the hold-out partition, as a percentage rounded to two decimals
gradientAccuracy = np.round(gradientModel.score(xTest, yTest)*100, 2)
print('Gradient Boosting Model Accuracy = ' + str(gradientAccuracy) + '%')

### Register Model in SAS Model Manager with pzmm

In [None]:
# The three lists below are index-aligned: entry i of each describes the same model.
modelPrefix = ['DecisionTreeClassifier', 'RandomForest', 'GradientBoosting']
zipFolder = [Path.cwd() / 'data/hmeqModels/DecisionTreeClassifier/',
             Path.cwd() / 'data/hmeqModels/RandomForest/',
             Path.cwd() / 'data/hmeqModels/GradientBoosting'] # Created below if missing
model = [treeModel, forestModel, gradientModel]

# Call through an instance instead of passing IPython's `_` (last cell output) as a
# stand-in for `self`: `_` is hidden kernel state and is undefined outside IPython.
pickler = pzmm.PickleModel()
for (m, prefix, path) in zip(model, modelPrefix, zipFolder):
    # Make sure the destination folder exists before anything is written into it
    path.mkdir(parents=True, exist_ok=True)
    pickler.pickleTrainedModel(m, prefix, path)

In [None]:
def writeJSONFiles(data, predict, target, zipFolder, yTrain, modelPrefix):
    """Write the metadata JSON files for one model into its folder.

    Parameters
    ----------
    data : pandas.DataFrame
        Data set that contains the predictor columns.
    predict : list of str
        Names of the predictor (input) columns in ``data``.
    target : str
        Name of the target variable.
    zipFolder : pathlib.Path or str
        Folder passed to pzmm as ``jPath`` for every file that is written.
    yTrain : pandas.Series
        Training target values; its distinct values define the output classes.
    modelPrefix : str
        Model name, also passed to the file-metadata writer.
    """
    J = pzmm.JSONFiles()

    # Write input variable mapping to a json file
    J.writeVarJSON(data[predict], isInput=True, jPath=zipFolder)

    # Distinct target classes seen in training; reused for the output mapping and
    # for the category count so the two can never disagree
    targetCategories = yTrain.astype('category').cat.categories

    # Set output variables and assign an event threshold, then write output variable mapping
    outputVar = pd.DataFrame(columns=['EM_EVENTPROBABILITY', 'EM_CLASSIFICATION'])
    outputVar['EM_CLASSIFICATION'] = targetCategories.astype('str')
    outputVar['EM_EVENTPROBABILITY'] = 0.5 # Event threshold
    J.writeVarJSON(outputVar, isInput=False, jPath=zipFolder)

    # Write model properties to a json file.
    # numTargetCategories is derived from the data rather than hard-coded to 1.
    # NOTE(review): assumes pzmm expects the number of distinct target classes
    # here (2 for a binary target) - confirm against the installed sasctl version.
    J.writeModelPropertiesJSON(modelName=modelPrefix,
                               modelDesc='',
                               targetVariable=target,
                               modelType='',
                               modelPredictors=predict,
                               targetEvent=1,
                               numTargetCategories=len(targetCategories),
                               eventProbVar='EM_EVENTPROBABILITY',
                               jPath=zipFolder,
                               modeler='sasdemo')

    # Write model metadata to a json file
    J.writeFileMetadataJSON(modelPrefix, jPath=zipFolder)

# Generate the JSON metadata for every model inside that model's own folder
for jsonPrefix, jsonFolder in zip(modelPrefix, zipFolder):
    writeJSONFiles(data=hmeqData,
                   predict=predictorColumns,
                   target=targetColumn,
                   zipFolder=jsonFolder,
                   yTrain=yTrain,
                   modelPrefix=jsonPrefix)

In [None]:
import getpass
def writeModelStats(xTrain, yTrain, testProba, yTest, model, target, zipFolder, conn):
    """Write the fit statistics and ROC/Lift JSON files for one model.

    Parameters
    ----------
    xTrain, yTrain : training predictors and training target values.
    testProba : array returned by ``predict_proba`` for the test partition.
    yTest : test target values.
    model : fitted estimator exposing ``predict_proba``.
    target : name of the target variable, forwarded to the ROC/Lift writer.
    zipFolder : folder passed to pzmm as ``jPath``.
    conn : connection object forwarded to the ROC/Lift writer.
    """
    def pairActualWithProbability(actual, proba):
        # Drop the original row labels so the actual values line up positionally
        # with column 1 of the probability array (presumably the event class).
        return pd.concat([actual.reset_index(drop=True), pd.Series(data=proba[:,1])], axis=1)

    # Train probabilities are computed here; test probabilities come from the caller
    trainData = pairActualWithProbability(yTrain, model.predict_proba(xTrain))
    testData = pairActualWithProbability(yTest, testProba)

    # Calculate the model statistics and write them to json files
    statsWriter = pzmm.JSONFiles()
    statsWriter.calculateFitStat(trainData=trainData, testData=testData, jPath=zipFolder)
    statsWriter.generateROCLiftStat(target, 1, conn, trainData=trainData, testData=testData, jPath=zipFolder)
    
# Ask for the credentials at run time so they are never stored in the notebook.
# The username is read with a visible, labelled prompt; only the password is hidden.
username = input('Username: ')
password = getpass.getpass('Password: ')
host = 'demo.sas.com' # Changes required by user.
# NOTE(review): protocol='http' sends the credentials unencrypted; prefer 'https'
# when the server supports it.
sess = Session(host, username, password, protocol='http')
conn = sess.as_swat()

# Test-partition probabilities, index-aligned with the `model` and `zipFolder` lists
testProba = [yTreeProba, yForestProba, yGradientProba]
for (m, proba, path) in zip(model, testProba, zipFolder):
    writeModelStats(xTrain, yTrain, proba, yTest, m, targetColumn, path, conn)

In [None]:
# Import every model from its folder with pzmm.
# NOTE(review): 'HMEQModels' is presumably the destination project name - confirm.
I = pzmm.ImportModel()
for modelFolder, modelName in zip(zipFolder, modelPrefix):
    I.pzmmImportModel(modelFolder, modelName, 'HMEQModels', x, y, '{}.predict({})', force=True)