These helper functions will eventually move into a PyPI package.

In [None]:
import pickle
import pandas
import json
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
import os
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

def hyperparameter(s):
    """Look up a single hyperparameter by name.

    The ``SC_MODEL_HYPERPARAMS`` environment variable is parsed as JSON on
    every call.

    Args:
        s: Key to look up in the decoded JSON object.

    Returns:
        The value stored under ``s``, or ``None`` when the key is absent.

    Raises:
        KeyError: If ``SC_MODEL_HYPERPARAMS`` is not set.
    """
    raw = os.environ['SC_MODEL_HYPERPARAMS']
    params = json.loads(raw)
    return params.get(s)
def model_cfg(s):
    """Look up a single model-configuration entry by name.

    The ``SC_MODEL_CFG`` environment variable is parsed as JSON on every call.

    Args:
        s: Key to look up in the decoded JSON object.

    Returns:
        The value stored under ``s``, or ``None`` when the key is absent.

    Raises:
        KeyError: If ``SC_MODEL_CFG`` is not set.
    """
    raw = os.environ['SC_MODEL_CFG']
    settings = json.loads(raw)
    return settings.get(s)

def write_fit(fit):
    """Pickle ``fit`` to the file named by ``SC_MODEL_FIT_DEST``.

    Args:
        fit: The object to serialise (the training cell passes the fitted model).

    Raises:
        KeyError: If ``SC_MODEL_FIT_DEST`` is not set.
    """
    destination = os.environ['SC_MODEL_FIT_DEST']
    with open(destination, 'wb') as sink:
        # Use the newest pickle protocol available to this interpreter.
        pickle.dump(fit, sink, pickle.HIGHEST_PROTOCOL)

def set_metrics(ms):
    """Write the metrics ``ms`` as JSON to the file named by ``SC_MODEL_METRICS_DEST``.

    Args:
        ms: A JSON-serialisable object (the training cell passes a dict of
            metric name to value).

    Raises:
        KeyError: If ``SC_MODEL_METRICS_DEST`` is not set.
    """
    destination = os.environ['SC_MODEL_METRICS_DEST']
    with open(destination, 'w') as sink:
        json.dump(ms, sink)

def data_file_path():
    """Return the training-data path taken from ``SC_MODEL_DATA_FILE``.

    Raises:
        KeyError: If ``SC_MODEL_DATA_FILE`` is not set.
    """
    path = os.environ['SC_MODEL_DATA_FILE']
    return path

This is the code that trains the model.

In [None]:
# Load the dataset: every column except the last is a feature, the last
# column is the target.
df = pandas.read_csv(data_file_path())
dfxs, dfy = df.iloc[:, :-1], df.iloc[:, -1]

# Run configuration: which regressor to fit and whether to hold out test data.
modeltype = model_cfg('regression_model')
do_split = model_cfg('split_test_train')

if not do_split:
    # No hold-out: the "test" data is the training data itself, so any score
    # computed on it is in-sample.
    X_train, y_train = dfxs, dfy
    X_test, y_test = dfxs, dfy
else:
    # 80/20 split with a fixed seed so the partition is repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        dfxs, dfy, test_size=0.20, random_state=42
    )

# Fit the regressor selected by the `regression_model` config entry.
# Each estimator is imported lazily inside its own branch, so only the
# selected one is loaded.
if modeltype == "Linear":
    from sklearn.linear_model import LinearRegression
    model = LinearRegression().fit(X_train, y_train)
elif modeltype == "Ridge":
    from sklearn.linear_model import Ridge
    model = Ridge(alpha=hyperparameter('regularization')).fit(X_train, y_train)
elif modeltype == "Lasso":
    from sklearn.linear_model import Lasso
    model = Lasso(alpha=hyperparameter('regularization')).fit(X_train, y_train)
elif modeltype == "Random Forest":
    from sklearn.ensemble import RandomForestRegressor
    # NOTE(review): the forest is not seeded, so repeated runs can give
    # different fits even though the train/test split is seeded.
    if hyperparameter('pca'):
        # Optional preprocessing: standardise, project onto `components`
        # principal components, then fit the forest on the projection.
        model = Pipeline(steps=[("scaler", StandardScaler()),
                                ("pca", PCA(n_components=hyperparameter('components'))),
                                ("randomforest", RandomForestRegressor())]).fit(X_train, y_train)
    else:
        model = RandomForestRegressor().fit(X_train, y_train)
elif modeltype == "Partial Least Squares":
    from sklearn.cross_decomposition import PLSRegression
    model = PLSRegression(n_components=hyperparameter('components')).fit(X_train, y_train)
elif modeltype == "Custom Python Code":
    # SECURITY: this executes arbitrary Python taken from SC_MODEL_CFG in the
    # notebook's namespace; only run it with trusted configuration.
    # The executed code is presumably expected to bind `model`, which the
    # statements after this chain rely on — TODO confirm the contract.
    exec(model_cfg("model_code"))
elif modeltype == "Support Vector Machine":
    from sklearn.svm import SVR
    kernel = hyperparameter('kernel')
    if kernel == 'poly':
        # `degree` is only passed for the polynomial kernel.
        regr = SVR(kernel=kernel, C=hyperparameter('C'), degree=hyperparameter('degree'))
    else:
        regr = SVR(kernel=kernel, C=hyperparameter('C'))
    model = regr.fit(X_train, y_train)
else:
    # Without this guard an unrecognised (or missing) model type would leave
    # `model` unbound and only fail later with an unhelpful NameError.
    raise ValueError("Unsupported regression_model: %r" % (modeltype,))

# Persist the fitted estimator first, then record its score on the test data
# under the metric name 'R2'.
write_fit(model)
set_metrics(dict(R2=model.score(X_test, y_test)))

In [None]:
# Show whether a train/test split was requested; when this is falsy, the test
# data used for the metrics above is the training data itself.
do_split

A residual plot to assess model quality

In [None]:
from matplotlib import pyplot as plt

predicted = model.predict(X_test)
# Some estimators may return a column-shaped (n, 1) array rather than a flat
# one; flatten it so the subtraction below pairs each prediction with its
# observed value.
if getattr(predicted, "ndim", 1) == 2 and predicted.shape[1] == 1:
    predicted = predicted.ravel()
residuals = predicted - y_test

plt.figure(figsize=(6, 5))
plt.scatter(y_test, residuals)
plt.axhline(0, color="grey", linewidth=1)  # zero-error reference line
plt.xlabel("Observed")
plt.ylabel("Residual (predicted - observed)")
plt.title("Residuals vs observed")
plt.show()

Predicted vs. observed outcomes

In [None]:
from matplotlib import pyplot as plt

# model_cfg returns None when the key is missing, which would make the label
# concatenation fail; fall back to a generic label instead.
outcome_field = model_cfg("outcome_field") or "outcome"

# `predicted` and `y_test` come from the cells above.
plt.figure(figsize=(6, 5))
plt.scatter(predicted, y_test)
plt.xlabel("Predicted {}".format(outcome_field))
plt.ylabel("Observed {}".format(outcome_field))
plt.title("Predicted vs observed")
plt.show()

This is the prediction code.

In [None]:
import pickle
import pandas

def predict(model_pickle_file, data_csv_file):
    """Score a CSV file with a previously pickled model.

    Args:
        model_pickle_file: Path to a pickle file holding an object that has a
            ``predict`` method.
        data_csv_file: Path to a CSV file; it is read with
            ``pandas.read_csv`` and handed to the model unchanged.

    Returns:
        A dict with the single key ``'yhat'`` mapping to the predictions as a
        list.

    Note:
        ``pickle.load`` can execute arbitrary code while loading; only pass
        model files that come from a trusted source.
    """
    with open(model_pickle_file, "rb") as model_stream:
        fitted_model = pickle.load(model_stream)
    features = pandas.read_csv(data_csv_file)
    predictions = fitted_model.predict(features)
    return {'yhat': list(predictions)}