In [5]:
import os
from snowflake.snowpark import Session
from snowflake.snowpark.functions import sproc, servicesproc

In [6]:
connection_parameters = {
    "account": "VUA92284",
    "user": "snowflake_nvidia",
    "password": os.environ['SNOWFLAKE_TEMP_PASSWORD'],
    "role": "SNOWFLAKE_NVIDIA",  # optional
    "warehouse": "SNOWFLAKE_NVIDIA",  # medium snowpark-optimized
    "database": "SNOWFLAKE_NVIDIA",  
    "schema": "PUBLIC",  
  }  

In [7]:
session = Session.builder.configs(connection_parameters).create()

In [8]:
rows = 10_000

In [9]:
import joblib
from datetime import datetime

import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from xgboost import XGBRegressor


start = datetime.now()

snowdf = session.table('"SNOWFLAKE_NVIDIA"."PUBLIC"."TPCDS_SF10TCL_TRAINING"').limit(rows) #62,726,989 / 1.4 GB compressed
snowdf = snowdf.drop(['CUSTOMER_SK', 'C_CURRENT_HDEMO_SK', 'C_CURRENT_ADDR_SK', 'C_CUSTOMER_ID', 'CA_ADDRESS_SK', 'CD_DEMO_SK'])

train_x = snowdf.drop("TOTAL_SALES").to_pandas() # drop labels for training set
train_y = snowdf.select("TOTAL_SALES").to_pandas()

cat_cols = ['CA_ZIP', 'CD_MARITAL_STATUS', 'CD_CREDIT_RATING', 'CD_EDUCATION_STATUS']
num_cols = ['C_BIRTH_YEAR', 'CD_DEP_COUNT']

num_pipeline = Pipeline(
    [
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_cols),
        ('encoder', OneHotEncoder(handle_unknown="ignore"), cat_cols)
    ]
)

feature_pipe = Pipeline(
    [
        ('preprocessor', preprocessor), 
    ]
)

feature_pipe.fit(train_x, train_y)

xgb = XGBRegressor(tree_method='hist')

xgb.fit(feature_pipe.transform(train_x),train_y)

end = datetime.now()

In [13]:
xgb.save_model('../triton/models/xgboost/1/xgboost.txt')
#xgb.save_model('../triton/models/xgboost/1/xgboost.json')

In [24]:
joblib.dump(feature_pipe, '../triton/models/pipeline/1/feature_pipeline_nvidia.joblib')

['../triton/models/pipeline/1/feature_pipeline_nvidia.joblib']

In [26]:
type(joblib.load('../triton/models/pipeline/1/feature_pipeline_nvidia.joblib'))

sklearn.pipeline.Pipeline

In [9]:
features = train_x.shape[1]
num_classes = 1
bytes_per_sample = (features + num_classes) * 4
max_batch_size = 60_000_000 // bytes_per_sample

In [12]:
config_text = f"""backend: "fil"
max_batch_size: {max_batch_size}
input [                                 
 {{  
    name: "input__0"
    data_type: TYPE_FP32
    dims: [ {features} ]                    
  }} 
]
output [
 {{
    name: "output__0"
    data_type: TYPE_FP32
    dims: [ {num_classes} ]
  }}
]
instance_group [{{ kind: KIND_CPU }}]
parameters [
  {{
    key: "model_type"
    value: {{ string_value: "xgboost_json" }}
  }},
  {{
    key: "predict_proba"
    value: {{ string_value: "false" }}
  }},
  {{
    key: "output_class"
    value: {{ string_value: "true" }}
  }},
  {{
    key: "threshold"
    value: {{ string_value: "0.5" }}
  }},
  {{
    key: "storage_type"
    value: {{ string_value: "AUTO" }}
  }}
]

dynamic_batching {{ }}"""

with open('../triton/triton-model-repo/xgboost/config.pbtxt', 'w') as file_:
    file_.write(config_text)

In [12]:
TRITON_IMAGE = 'nvcr.io/nvidia/tritonserver:22.12-py3'
REPO_PATH = '/Users/madkins/Documents/snowservices/nvidia/triton/models'

print(f'''
docker run -d -p 8000:8000 -p 8001:8001 -p 8002:8002 -v {REPO_PATH}:/models {TRITON_IMAGE} --name triton tritonserver --model-repository=/models
   ''')             
                
                


docker run -d -p 8000:8000 -p 8001:8001 -p 8002:8002 -v /Users/madkins/Documents/snowservices/nvidia/triton/models:/models nvcr.io/nvidia/tritonserver:22.12-py3 --name triton tritonserver --model-repository=/models
   


In [14]:
import time
import tritonclient.grpc as triton_grpc
from tritonclient import utils as triton_utils
HOST = 'localhost'
PORT = 8001
TIMEOUT = 60

In [20]:
!pip freeze

aiohttp==3.8.3
aiosignal==1.3.1
anyio==3.6.2
appnope==0.1.3
argon2-cffi==21.3.0
argon2-cffi-bindings==21.2.0
arrow==1.2.3
asn1crypto==1.5.1
asttokens==2.2.1
async-timeout==4.0.2
attrs==22.2.0
backcall==0.2.0
beautifulsoup4==4.11.1
bleach==5.0.1
Brotli==1.0.9
certifi==2022.12.7
cffi==1.15.1
charset-normalizer==2.1.1
click==8.1.3
cloudpickle==2.0.0
comm==0.1.2
crcmod==1.7
cryptography==38.0.4
debugpy==1.6.5
decorator==5.1.1
defusedxml==0.7.1
dill==0.3.1.1
docopt==0.6.2
entrypoints==0.4
executing==1.2.0
fastapi==0.89.1
fastavro==1.7.0
fasteners==0.18
fastjsonschema==2.16.2
filelock==3.9.0
fqdn==1.5.1
frozenlist==1.3.3
gevent==22.10.2
geventhttpclient==2.0.2
greenlet==2.0.1
grpcio==1.42.0
h11==0.14.0
hdfs==2.7.0
httplib2==0.20.4
idna==3.4
importlib-metadata==6.0.0
importlib-resources==5.10.2
ipykernel==6.20.2
ipython==8.8.0
ipython-genutils==0.2.0
ipywidgets==8.0.4
isoduration==20.11.0
jedi==0.18.2
Jinja2==3.1.2
joblib==1.2.0
jsonpoint

In [6]:
@sproc(packages=['snowflake-snowpark-python','scikit-learn', 'xgboost'])
def train_model(session: Session) -> str:
    
    import joblib
    from datetime import datetime
    
    from sklearn.pipeline import Pipeline
    from sklearn.compose import ColumnTransformer
    
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import StandardScaler
    
    from xgboost import XGBRegressor

    
    start = datetime.now()

    snowdf = session.table('"SNOWFLAKE_NVIDIA"."PUBLIC"."TPCDS_SF10TCL_TRAINING"').limit(rows) #62,726,989 / 1.4 GB compressed
    snowdf = snowdf.drop(['CUSTOMER_SK', 'C_CURRENT_HDEMO_SK', 'C_CURRENT_ADDR_SK', 'C_CUSTOMER_ID', 'CA_ADDRESS_SK', 'CD_DEMO_SK'])

    train_x = snowdf.drop("TOTAL_SALES").to_pandas() # drop labels for training set
    train_y = snowdf.select("TOTAL_SALES").to_pandas()

    cat_cols = ['CA_ZIP', 'CD_GENDER', 'CD_MARITAL_STATUS', 'CD_CREDIT_RATING', 'CD_EDUCATION_STATUS']
    num_cols = ['C_BIRTH_YEAR', 'CD_DEP_COUNT']

    num_pipeline = Pipeline(
        [
            ('imputer', SimpleImputer(strategy="median")),
            ('std_scaler', StandardScaler()),
        ]
    )

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', num_pipeline, num_cols),
        ]
    )

    pipe = Pipeline(
        [
            ('preprocessor', preprocessor), 
            ('xgboost', XGBRegressor(tree_method='hist', enable_categorical=True))
        ]
    )
    
    pipe.fit(train_x, train_y)

    end = datetime.now()

    #test_preds = pipe.predict(test_x)
    #rmse = mean_squared_error(test_y, test_preds)
    #model_file = os.path.join('/tmp', 'model.joblib')
    #joblib.dump(pipe, model_file)
    #session.file.put(model_file, "@ml_models",overwrite=True)
    return str(end-start)+'s'

The version of package scikit-learn in the local environment is 1.2.1, which does not fit the criteria for the requirement scikit-learn. Your UDF might not work when the package version is different between the server and your local environment
The version of package xgboost in the local environment is 1.7.3, which does not fit the criteria for the requirement xgboost. Your UDF might not work when the package version is different between the server and your local environment


In [7]:
%%time
train_model()

CPU times: user 27.4 ms, sys: 9.53 ms, total: 37 ms
Wall time: 4min 51s


'0:04:49.208784s'

Unnamed: 0,C_BIRTH_YEAR,CD_DEP_COUNT,CA_ZIP,CD_GENDER,CD_MARITAL_STATUS,CD_CREDIT_RATING,CD_EDUCATION_STATUS
0,-1.626503,0.0,36192,M,W,Low Risk,College
1,-1.271713,0.0,69583,M,W,Low Risk,College
2,0.755656,0.0,28014,M,W,Low Risk,College
3,-1.525134,0.0,28059,M,W,Low Risk,College
4,0.096761,0.0,49237,M,W,Low Risk,College
...,...,...,...,...,...,...,...
9995,0.907709,0.0,18059,M,W,Low Risk,College
9996,0.857025,0.0,34536,M,W,Low Risk,College
9997,-1.373082,0.0,78877,M,W,Low Risk,College
9998,-1.322397,0.0,71521,M,W,Low Risk,College


In [13]:
@servicesproc(snowservice='NVIDIA', packages=['snowflake-snowpark-python','scikit-learn', 'xgboost'])
def train_model(session: Session) -> str:
    
    import joblib
    from datetime import datetime
    
    from sklearn.pipeline import Pipeline
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import StandardScaler, OneHotEncoder

    from cuml.preprocessing import StandardScaler, SimpleImputer

    from xgboost import XGBRegressor


    start = datetime.now()

    snowdf = session.table('"SNOWFLAKE_NVIDIA"."PUBLIC"."TPCDS_SF10TCL_TRAINING"').limit(rows) #62,726,989 / 1.4 GB compressed
    snowdf = snowdf.drop(['CUSTOMER_SK', 'C_CURRENT_HDEMO_SK', 'C_CURRENT_ADDR_SK', 'C_CUSTOMER_ID', 'CA_ADDRESS_SK', 'CD_DEMO_SK'])

    train_x = snowdf.drop("TOTAL_SALES").to_pandas() # drop labels for training set
    train_y = snowdf.select("TOTAL_SALES").to_pandas()

    cat_cols = ['CA_ZIP', 'CD_MARITAL_STATUS', 'CD_CREDIT_RATING', 'CD_EDUCATION_STATUS']
    num_cols = ['C_BIRTH_YEAR', 'CD_DEP_COUNT']

    num_pipeline = Pipeline(
        [
            ('imputer', SimpleImputer(strategy="median")),
            ('std_scaler', StandardScaler()),
        ]
    )

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', num_pipeline, num_cols),
            ('encoder', OneHotEncoder(handle_unknown="ignore"), cat_cols)
        ]
    )

    feature_pipe = Pipeline(
        [
            ('preprocessor', preprocessor), 
        ]
    )

    feature_pipe.fit(train_x, train_y)

    xgb = XGBRegressor(tree_method='gpu_hist')

    xgb.fit(feature_pipe.transform(train_x),train_y)

    end = datetime.now()

    return str(end-start)+'s'

In [14]:
%%time
train_model()

CPU times: user 5.3 ms, sys: 2.76 ms, total: 8.07 ms
Wall time: 34.6 s


{'result': '0:00:33.870573s'}