In [0]:
import os
import shutil
import dataiku
import pandas as pd
import mlflow

In [0]:
# Obtain a DSS API client and a handle on its default project.
# `project` is used by later cells to look up and create saved models.
client = dataiku.api_client()
project = client.get_default_project()

In [0]:
# get handles on the training and evaluation datasets
train_dataset = dataiku.Dataset("training_data")
# NOTE(review): `evaluation_dataset` is not referenced again in the visible cells;
# the metadata/evaluation cells pass the dataset name "eval_data_prepared" instead.
# Confirm which dataset is intended.
evaluation_dataset = dataiku.Dataset("eval_data")

In [0]:
# get output saved model
# NOTE(review): `sm` is rebound by the later "Get or create SavedModel" cell, so
# this lookup by hard-coded id appears to be superseded - confirm it is still needed.
sm = project.get_saved_model("VdHxdbkg")

# get train dataset as a pandas dataframe
df = train_dataset.get_dataframe()

In [0]:
# get the path of a local managed folder where to temporarily save the trained model
# (the same folder id is passed again, as a literal, when the model version is imported)
mf = dataiku.Folder("cTcsiBqc")
path = mf.get_path()

In [0]:
# Sub-directory of the managed folder that receives the serialized model:
# `model_subdir` is the folder-relative name, `model_dir` the resulting filesystem path.
model_subdir = "my_subdir"
model_dir = os.path.join(path, model_subdir)

In [0]:
from sklearn.linear_model import LogisticRegression

In [0]:
# Separate the feature columns (X) from the target column (y).
X = df.drop(columns=['y'])
y = df['y']

In [0]:
# One-hot encode the categorical columns; each generated column is prefixed
# with the name of the column it was derived from.
job_df = pd.get_dummies(df['job'], prefix='job')
marital_df = pd.get_dummies(df['marital'], prefix='marital')
education_df = pd.get_dummies(df['education'], prefix='education')

In [0]:
# Map three-letter month abbreviations to their calendar month number (1-12).
# Listed in calendar order so a wrong or duplicated number is easy to spot:
# 'apr' was previously mapped to 5, which made April indistinguishable from May.
month_dic = {
    'jan': 1,
    'feb': 2,
    'mar': 3,
    'apr': 4,
    'may': 5,
    'jun': 6,
    'jul': 7,
    'aug': 8,
    'sep': 9,
    'oct': 10,
    'nov': 11,
    'dec': 12}

In [0]:
# Build the final feature matrix and fit the classifier.
#
# Every step rebinds `X` to a new frame instead of mutating it in place:
# - `X.month.replace(..., inplace=True)` was a chained in-place update on a
#   column accessor, which does not propagate to `X` when pandas Copy-on-Write
#   is active (and emits a chained-assignment warning otherwise);
# - in-place `drop` made the cell harder to reason about across re-runs.

# Swap the raw categorical columns for their one-hot encoded counterparts.
X = pd.concat(
    [X.drop(columns=['job', 'marital', 'education']), job_df, marital_df, education_df],
    axis=1,
)

# Encode month abbreviations as numbers, writing the result back explicitly.
X['month'] = X['month'].replace(month_dic)

# Encode the remaining "no"/"yes" values as 0/1.
X = X.replace({"no": 0, "yes": 1})

clf = LogisticRegression(random_state=0).fit(X, y)

In [0]:
# Get or create SavedModel
sm_name = "catboost-uci-bank"

# Look the saved model up by name. The iteration variable is deliberately not
# called `sm`, so the saved-model handle is never overwritten by a listing dict.
sm_id = next(
    (listed["id"] for listed in project.list_saved_models() if listed["name"] == sm_name),
    None,
)

if sm_id:
    print("Found SavedModel {} with id {}".format(sm_name, sm_id))
    sm = project.get_saved_model(sm_id)
else:
    # "BINARY_CLASSIFICATION" is the value behind
    # DSSPredictionMLTaskSettings.PredictionTypes.BINARY. That class was never
    # imported in this notebook, so referencing it raised NameError whenever
    # the model had to be created.
    sm = project.create_mlflow_pyfunc_model(name=sm_name,
                                            prediction_type="BINARY_CLASSIFICATION")
    sm_id = sm.id
    print("SavedModel not found, created new one with id {}".format(sm_id))



In [0]:
# Train the model, serialize it with MLflow into the managed folder, and import
# it into the saved model as a new version. The temporary model directory is
# removed both before (stale export from a previous run) and after the import.
if os.path.exists(model_dir):
    shutil.rmtree(model_dir)

try:
    # ...train your model...
    clf = LogisticRegression(random_state=0).fit(X, y)

    # ...save it with package specific MLflow method (here, SKlearn)...
    mlflow.sklearn.save_model(clf, model_dir)

    # import the model, creating a new version
    mlflow_version = sm.import_mlflow_version_from_managed_folder("v03", "cTcsiBqc", model_subdir, "py36_mlflow")
finally:
    # The directory only exists once save_model has written it. An unconditional
    # rmtree raised FileNotFoundError when training or saving failed early,
    # which replaced the real exception with a misleading one.
    if os.path.exists(model_dir):
        shutil.rmtree(model_dir)

In [0]:
# Attach the core metadata to the imported version: the target column name,
# the class labels, and the dataset from which the feature list is taken.
mlflow_version.set_core_metadata(
    target_column_name="y",
    class_labels=["no", "yes"],
    get_features_from_dataset="eval_data_prepared",
)

In [0]:
# evaluate the performance of this new version, to populate the performance screens of the saved model version in DSS
# NOTE(review): the dataset is referenced by name ("eval_data_prepared"), not through the
# `evaluation_dataset` handle ("eval_data") created earlier - confirm this is the intended dataset.
mlflow_version.evaluate("eval_data_prepared")