Copyright © 2020, SAS Institute Inc., Cary, NC, USA.  All Rights Reserved.
SPDX-License-Identifier: Apache-2.0

### Python Package Imports

In [1]:
# Dataframes for data manipulations
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
# Mathematical calculations and array handling
import numpy as np

# Data partitioning for TRAIN and TEST data sets
from sklearn.model_selection import train_test_split

# Embedded plotting
import matplotlib.pyplot as plt 
plt.rc("font", size=14)

# Pathing support
from pathlib import Path

# sasctl interface for importing models
import sasctl.pzmm as pzmm
from sasctl import Session
from sasctl.services import model_repository as modelRepo

### Import and Review Data Set

In [14]:
housingData = pd.read_csv('data/USA_Housing.csv',sep= ',')
housingData.shape

(5000, 7)

In [15]:
housingData = housingData.drop(['Address'], axis=1)
housingData.head()

Unnamed: 0,Avg_Area_Income,Avg_Area_House_Age,Avg_Area_Number_of_Rooms,Avg_Area_Number_of_Bedrooms,Area_Population,Price
0,79545.45857,5.682861,7.009188,4.09,23086.8005,1059034.0
1,79248.64245,6.0029,6.730821,3.09,40173.07217,1505891.0
2,61287.06718,5.86589,8.512727,5.13,36882.1594,1058988.0
3,63345.24005,7.188236,5.586729,3.26,34310.24283,1260617.0
4,59982.19723,5.040555,7.839388,4.23,26354.10947,630943.5


In [16]:
housingData.columns

Index(['Avg_Area_Income', 'Avg_Area_House_Age', 'Avg_Area_Number_of_Rooms',
       'Avg_Area_Number_of_Bedrooms', 'Area_Population', 'Price'],
      dtype='object')

### Preprocess Data

In [17]:
# Input 
predictorColumns = ['Avg_Area_Income', 'Avg_Area_House_Age', 'Avg_Area_Number_of_Rooms', 
                    'Avg_Area_Number_of_Bedrooms', 'Area_Population']

# Target
targetColumn = 'Price'
x = housingData[predictorColumns]
y = housingData[targetColumn]

xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size=0.3, random_state=42)

# For missing values, impute the data set's mean value
xTest.fillna(xTest.mean(), inplace=True)
xTrain.fillna(xTrain.mean(), inplace=True)
print(xTest.shape)
print(xTrain.shape)

(1500, 5)
(3500, 5)


### Create, Train, and Assess Model

In [18]:
# Linear Regression Training
from sklearn.linear_model import LinearRegression
linReg = LinearRegression(normalize=True)
linReg.fit(xTrain, yTrain)

LinearRegression(normalize=True)

In [20]:
# Test Predictions
from sklearn import metrics
LRPredict = linReg.predict(xTest)
print(metrics.r2_score(yTest, LRPredict))

0.9146818498916267


### Zip file for registering into SAS Model Manager

In [22]:
modelPrefix = 'LinearRegression'
zipFolder = Path.cwd() / 'data/USAHousingModels/LinearRegression'

pzmm.PickleModel.pickle_trained_model(_, linReg, modelPrefix, zipFolder)

Model LinearRegression was successfully pickled and saved to c:\Users\sclind\Documents\Python Scripts\GitHub\sassoftware\python-sasctl\examples\data\USAHousingModels\LinearRegression\LinearRegression.pickle.


In [23]:
def writeJSONFiles(data, predict, target, zipFolder, modelPrefix):
    J = pzmm.JSONFiles()
    
    # Write input variable mapping to a json file
    J.writeVarJSON(data[predict], isInput=True, jPath=zipFolder)
    
    # Set output variables and assign an event threshold, then write output variable mapping
    outputVar = pd.DataFrame(columns=['EM_PREDICTION'])
    outputVar['EM_PREDICTION'].loc[1] = 0.5
    J.writeVarJSON(outputVar, isInput=False, jPath=zipFolder)
    
    # Write model properties to a json file
    J.writeModelPropertiesJSON(modelName=modelPrefix,
                               modelDesc='',
                               targetVariable=target,
                               modelType='',
                               modelPredictors=predict,
                               targetEvent=None,
                               numTargetCategories=1,
                               eventProbVar='EM_PREDICTION',
                               jPath=zipFolder,
                               modeler='sasdemo')
    
    # Write model metadata to a json file
    J.writeFileMetadataJSON(modelPrefix, jPath=zipFolder)

writeJSONFiles(housingData, predictorColumns, targetColumn, zipFolder, modelPrefix)

inputVar.json was successfully written and saved to c:\Users\sclind\Documents\Python Scripts\GitHub\sassoftware\python-sasctl\examples\data\USAHousingModels\LinearRegression\inputVar.json
outputVar.json was successfully written and saved to c:\Users\sclind\Documents\Python Scripts\GitHub\sassoftware\python-sasctl\examples\data\USAHousingModels\LinearRegression\outputVar.json
ModelProperties.json was successfully written and saved to c:\Users\sclind\Documents\Python Scripts\GitHub\sassoftware\python-sasctl\examples\data\USAHousingModels\LinearRegression\ModelProperties.json
fileMetaData.json was successfully written and saved to c:\Users\sclind\Documents\Python Scripts\GitHub\sassoftware\python-sasctl\examples\data\USAHousingModels\LinearRegression\fileMetaData.json


In [24]:
import getpass
username = getpass.getpass()
password = getpass.getpass()
host = 'demo.sas.com'
sess = Session(host, username, password, protocol='http')

In [27]:
I = pzmm.ImportModel()
I.pzmmImportModel(zipFolder, modelPrefix, 'LinearRegressionModelExample', x, y, '{}.predict({})', force=True, metrics=['EM_PREDICTION', 'EM_PREDICTION'])

All model files were zipped to c:\Users\sclind\Documents\Python Scripts\GitHub\sassoftware\python-sasctl\examples\data\USAHousingModels\LinearRegression.
Model was successfully imported into SAS Model Manager as LinearRegression with UUID: cb485d04-6cb9-48ce-a829-67b8167303ce.
Model score code was written successfully to c:\Users\sclind\Documents\Python Scripts\GitHub\sassoftware\python-sasctl\examples\data\USAHousingModels\LinearRegression\LinearRegressionScore.py and uploaded to SAS Model Manager
