In [1]:
import os
from snowflake.snowpark import Session
from snowflake.snowpark.functions import sproc, servicesproc

In [2]:
# Settings handed to Session.builder.configs() when the Snowpark session is created.
# The password is read from the SNOWFLAKE_TEMP_PASSWORD environment variable, so this
# cell raises KeyError if that variable is not set in the kernel's environment.
connection_parameters = {
    "account": "VUA92284",
    "user": "snowflake_nvidia",
    "password": os.environ['SNOWFLAKE_TEMP_PASSWORD'],
    "role": "SNOWFLAKE_NVIDIA",  # optional
    "warehouse": "SNOWFLAKE_NVIDIA",  # medium snowpark-optimized
    "database": "SNOWFLAKE_NVIDIA",  
    "schema": "PUBLIC",  
  }  

In [3]:
# Open the Snowpark session from connection_parameters; the local training cell
# reads its table through this object.
session = Session.builder.configs(connection_parameters).create()

In [4]:
# Row cap passed to .limit(rows) by the local training cell and by every
# train_model variant below.
rows = 10_000

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor
import joblib
import os
from datetime import datetime

# Local baseline: pull a sample of the training table into pandas, fit the
# preprocessing + XGBoost pipeline in this kernel, and time the whole thing.
start = datetime.now()

snowdf = session.table('"SNOWFLAKE_NVIDIA"."PUBLIC"."TPCDS_SF10TCL_TRAINING"').limit(rows) #62,726,989 / 1.4 GB compressed
snowdf = snowdf.drop(['CUSTOMER_SK', 'C_CURRENT_HDEMO_SK', 'C_CURRENT_ADDR_SK', 'C_CUSTOMER_ID', 'CA_ADDRESS_SK', 'CD_DEMO_SK'])

# Fetch the sample ONCE and split it locally. Calling to_pandas() separately for
# features and labels runs two independent LIMIT queries with no ORDER BY, so the
# two results are not guaranteed to contain the same rows in the same order and
# the labels could end up misaligned with the features.
train_df = snowdf.to_pandas()
train_x = train_df.drop(columns=["TOTAL_SALES"]) # drop labels for training set
train_y = train_df[["TOTAL_SALES"]]  # kept as a one-column DataFrame

print(str(train_x.memory_usage(deep=True, index=True).sum()/1_000_000_000) + 'GB') #GB

cat_cols = ['CA_ZIP', 'CD_GENDER', 'CD_MARITAL_STATUS', 'CD_CREDIT_RATING', 'CD_EDUCATION_STATUS']
num_cols = ['C_BIRTH_YEAR', 'CD_DEP_COUNT']

# Numeric columns: fill missing values with the median, then standardise.
num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

# Categorical columns are one-hot encoded; categories unseen during fit are
# ignored at transform time instead of raising.
preprocessor = ColumnTransformer(
transformers=[('num', num_pipeline, num_cols),
              ('encoder', OneHotEncoder(handle_unknown="ignore"), cat_cols) ])

pipe = Pipeline([('preprocessor', preprocessor), 
                    ('xgboost', XGBRegressor())])
pipe.fit(train_x, train_y)

# Timing stops here on purpose: it covers data fetch + fit, not serialisation.
end = datetime.now()

# Persist the fitted pipeline (preprocessing + model) for the Triton model repo.
joblib.dump(pipe, '../triton/triton-model-repo/model.joblib')
#session.file.put(model_file, "@ml_models",overwrite=True)
str(end-start)+'s'

0.003143442GB


'0:00:01.610848s'

In [None]:
def generate_config(model_dir, deployment_type='gpu', storage_type='AUTO',
                    max_batch_size=None, features=None, num_classes=None):
    """Write a Triton FIL-backend ``config.pbtxt`` into ``model_dir``.

    Args:
        model_dir: Existing directory that receives ``config.pbtxt``.
        deployment_type: ``'cpu'`` (case-insensitive) selects ``KIND_CPU``;
            any other value selects ``KIND_GPU``.
        storage_type: Value written for the ``storage_type`` parameter.
        max_batch_size: Value for ``max_batch_size``. When ``None``, the
            module-level name ``max_batch_size`` is used, as the original
            implementation did.
        features: Input dimension (``dims`` of ``input__0``). When ``None``,
            the module-level name ``features`` is used.
        num_classes: Output dimension (``dims`` of ``output__0``). When
            ``None``, the module-level name ``num_classes`` is used.

    Returns:
        The path of the written ``config.pbtxt``.

    Raises:
        NameError: If one of the three sizes is neither passed in nor defined
            at module level.

    NOTE(review): the config declares ``model_type`` "xgboost_json" and
    classifier-style options (predict_proba / output_class / threshold), while
    the training cell in this notebook dumps a joblib-pickled sklearn Pipeline
    around an XGBRegressor. Confirm the model file placed in ``model_dir`` and
    these options actually match before serving.
    """
    def _resolve(name, value):
        # Explicit argument wins; otherwise fall back to the notebook global
        # of the same name so existing call sites keep working.
        if value is not None:
            return value
        try:
            return globals()[name]
        except KeyError:
            raise NameError(
                f"{name} was not passed to generate_config() and is not "
                f"defined at module level"
            ) from None

    max_batch_size = _resolve('max_batch_size', max_batch_size)
    features = _resolve('features', features)
    num_classes = _resolve('num_classes', num_classes)

    if deployment_type.lower() == 'cpu':
        instance_kind = 'KIND_CPU'
    else:
        instance_kind = 'KIND_GPU'

    # Doubled braces are literal braces in the emitted protobuf text.
    config_text = f"""backend: "fil"
max_batch_size: {max_batch_size}
input [                                 
 {{  
    name: "input__0"
    data_type: TYPE_FP32
    dims: [ {features} ]                    
  }} 
]
output [
 {{
    name: "output__0"
    data_type: TYPE_FP32
    dims: [ {num_classes} ]
  }}
]
instance_group [{{ kind: {instance_kind} }}]
parameters [
  {{
    key: "model_type"
    value: {{ string_value: "xgboost_json" }}
  }},
  {{
    key: "predict_proba"
    value: {{ string_value: "true" }}
  }},
  {{
    key: "output_class"
    value: {{ string_value: "true" }}
  }},
  {{
    key: "threshold"
    value: {{ string_value: "0.5" }}
  }},
  {{
    key: "storage_type"
    value: {{ string_value: "{storage_type}" }}
  }}
]

dynamic_batching {{
  max_queue_delay_microseconds: 100
}}"""
    config_path = os.path.join(model_dir, 'config.pbtxt')
    with open(config_path, 'w') as file_:
        file_.write(config_text)

    return config_path

In [11]:
!ls

Dockerfile    demo.ipynb    model.joblib
[34m__pycache__[m[m   manifest.yaml server.py


In [6]:
@sproc(packages=['snowflake-snowpark-python','scikit-learn', 'xgboost'])
def train_model(session: Session) -> str:
    """Fit the preprocessing + XGBoost pipeline inside a Snowpark stored procedure.

    Reads at most ``rows`` rows (a notebook-level variable) from the training
    table, fits the same pipeline as the local training cell, and reports how
    long the fetch + fit took.

    Args:
        session: Snowpark session the procedure runs with.

    Returns:
        The elapsed time as ``str(timedelta) + 's'``.
    """
    from sklearn.pipeline import Pipeline
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
    from sklearn.metrics import mean_squared_error
    from sklearn.compose import ColumnTransformer
    from xgboost import XGBRegressor
    import joblib
    import os
    from datetime import datetime
    
    start = datetime.now()
    
    snowdf = session.table('"SNOWFLAKE_NVIDIA"."PUBLIC"."TPCDS_SF10TCL_TRAINING"').limit(rows) #62,726,989 / 1.4 GB compressed
    snowdf = snowdf.drop(['CUSTOMER_SK', 'C_CURRENT_HDEMO_SK', 'C_CURRENT_ADDR_SK', 'C_CUSTOMER_ID', 'CA_ADDRESS_SK', 'CD_DEMO_SK'])

    # Fetch once and split locally: two separate to_pandas() calls would run two
    # independent LIMIT queries (no ORDER BY), which are not guaranteed to return
    # the same rows in the same order, so labels could be misaligned with features.
    train_df = snowdf.to_pandas()
    train_x = train_df.drop(columns=["TOTAL_SALES"]) # drop labels for training set
    train_y = train_df[["TOTAL_SALES"]]  # kept as a one-column DataFrame

    cat_cols = ['CA_ZIP', 'CD_GENDER', 'CD_MARITAL_STATUS', 'CD_CREDIT_RATING', 'CD_EDUCATION_STATUS']
    num_cols = ['C_BIRTH_YEAR', 'CD_DEP_COUNT']

    # Numeric columns: median imputation followed by standardisation.
    num_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy="median")),
            ('std_scaler', StandardScaler()),
        ])

    # Categorical columns: one-hot encoding that ignores categories unseen at fit time.
    preprocessor = ColumnTransformer(
    transformers=[('num', num_pipeline, num_cols),
                  ('encoder', OneHotEncoder(handle_unknown="ignore"), cat_cols) ])

    pipe = Pipeline([('preprocessor', preprocessor), 
                        ('xgboost', XGBRegressor())])
    pipe.fit(train_x, train_y)

    end = datetime.now()

    # Evaluation and model upload are currently disabled.
    #test_preds = pipe.predict(test_x)
    #rmse = mean_squared_error(test_y, test_preds)
    #model_file = os.path.join('/tmp', 'model.joblib')
    #joblib.dump(pipe, model_file)
    #session.file.put(model_file, "@ml_models",overwrite=True)
    return str(end-start)+'s'

The version of package scikit-learn in the local environment is 1.2.1, which does not fit the criteria for the requirement scikit-learn. Your UDF might not work when the package version is different between the server and your local environment
The version of package xgboost in the local environment is 1.7.3, which does not fit the criteria for the requirement xgboost. Your UDF might not work when the package version is different between the server and your local environment


In [7]:
%%time
train_model()

CPU times: user 27.4 ms, sys: 9.53 ms, total: 37 ms
Wall time: 4min 51s


'0:04:49.208784s'

In [24]:
@servicesproc(snowservice='NVIDIA', packages=['snowflake-snowpark-python','scikit-learn', 'xgboost'])
def train_model(session: Session) -> str:
    """Fit the scikit-learn + XGBoost pipeline via the 'NVIDIA' snowservice.

    Same pipeline as the ``@sproc`` variant, registered with ``servicesproc``
    and with ``verbose=True`` on the ColumnTransformer and Pipeline. XGBoost
    uses its default tree method here; the GPU setting is left commented out.

    Args:
        session: Snowpark session the procedure runs with.

    Returns:
        The elapsed fetch + fit time as ``str(timedelta) + 's'``.
    """
    from sklearn.pipeline import Pipeline
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
    from sklearn.metrics import mean_squared_error
    from sklearn.compose import ColumnTransformer
    from xgboost import XGBRegressor
    import joblib
    import os
    from datetime import datetime
    
    start = datetime.now()
    
    snowdf = session.table('"SNOWFLAKE_NVIDIA"."PUBLIC"."TPCDS_SF10TCL_TRAINING"').limit(rows) #62,726,989 / 1.4 GB compressed
    snowdf = snowdf.drop(['CUSTOMER_SK', 'C_CURRENT_HDEMO_SK', 'C_CURRENT_ADDR_SK', 'C_CUSTOMER_ID', 'CA_ADDRESS_SK', 'CD_DEMO_SK'])

    # Fetch once and split locally: two separate to_pandas() calls would run two
    # independent LIMIT queries (no ORDER BY), which are not guaranteed to return
    # the same rows in the same order, so labels could be misaligned with features.
    train_df = snowdf.to_pandas()
    train_x = train_df.drop(columns=["TOTAL_SALES"]) # drop labels for training set
    train_y = train_df[["TOTAL_SALES"]]  # kept as a one-column DataFrame

    cat_cols = ['CA_ZIP', 'CD_GENDER', 'CD_MARITAL_STATUS', 'CD_CREDIT_RATING', 'CD_EDUCATION_STATUS']
    num_cols = ['C_BIRTH_YEAR', 'CD_DEP_COUNT']

    # Numeric columns: median imputation followed by standardisation.
    num_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy="median")),
            ('std_scaler', StandardScaler()),
        ])

    # Categorical columns: one-hot encoding that ignores categories unseen at fit time.
    preprocessor = ColumnTransformer(
    transformers=[('num', num_pipeline, num_cols),
                  ('encoder', OneHotEncoder(handle_unknown="ignore"), cat_cols) ], verbose=True)

    pipe = Pipeline([('preprocessor', preprocessor), 
                        ('xgboost', XGBRegressor())], verbose=True)
                        #('xgboost', XGBRegressor(tree_method='gpu_hist', gpu_id=0))])
    pipe.fit(train_x, train_y)
    
    end = datetime.now()

    # Evaluation and model upload are currently disabled.
    #test_preds = pipe.predict(test_x)
    #rmse = mean_squared_error(test_y, test_preds)
    #model_file = os.path.join('/tmp', 'model.joblib')
    #joblib.dump(pipe, model_file)
    #session.file.put(model_file, "@ml_models",overwrite=True)
    return str(end-start)+'s'

In [25]:
%%time
train_model()

CPU times: user 5.92 ms, sys: 2.77 ms, total: 8.69 ms
Wall time: 1min 29s


{'result': '0:01:29.022231s'}

In [22]:
@servicesproc(snowservice='NVIDIA', packages=['snowflake-snowpark-python','scikit-learn', 'xgboost'])
def train_model(session: Session) -> str:
    """Fit the pipeline via the 'NVIDIA' snowservice using cuML and GPU XGBoost.

    Differs from the other ``train_model`` variants in two ways: the numeric
    imputer and scaler come from ``cuml.preprocessing`` (the one-hot encoder is
    still scikit-learn's), and XGBoost is built with ``tree_method='gpu_hist'``.

    Args:
        session: Snowpark session the procedure runs with.

    Returns:
        The elapsed fetch + fit time as ``str(timedelta) + 's'``.
    """
    from sklearn.pipeline import Pipeline
    # from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import OneHotEncoder
    from cuml.preprocessing import StandardScaler, SimpleImputer, MinMaxScaler #OneHotEncoder,

    from sklearn.metrics import mean_squared_error
    from sklearn.compose import ColumnTransformer
    from xgboost import XGBRegressor
    import joblib
    import os
    from datetime import datetime

    start = datetime.now()

    snowdf = session.table('"SNOWFLAKE_NVIDIA"."PUBLIC"."TPCDS_SF10TCL_TRAINING"').limit(rows) #62,726,989 / 1.4 GB compressed
    snowdf = snowdf.drop(['CUSTOMER_SK', 'C_CURRENT_HDEMO_SK', 'C_CURRENT_ADDR_SK', 'C_CUSTOMER_ID', 'CA_ADDRESS_SK', 'CD_DEMO_SK'])

    # Fetch once and split locally: two separate to_pandas() calls would run two
    # independent LIMIT queries (no ORDER BY), which are not guaranteed to return
    # the same rows in the same order, so labels could be misaligned with features.
    train_df = snowdf.to_pandas()
    train_x = train_df.drop(columns=["TOTAL_SALES"]) # drop labels for training set
    train_y = train_df[["TOTAL_SALES"]]  # kept as a one-column DataFrame

    cat_cols = ['CA_ZIP', 'CD_GENDER', 'CD_MARITAL_STATUS', 'CD_CREDIT_RATING', 'CD_EDUCATION_STATUS']
    num_cols = ['C_BIRTH_YEAR', 'CD_DEP_COUNT']

    # Numeric columns: median imputation followed by standardisation (cuML classes).
    num_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy="median")),
            ('std_scaler', StandardScaler()),
        ])

    # Categorical columns: scikit-learn one-hot encoding that ignores unseen categories.
    preprocessor = ColumnTransformer(
    transformers=[('num', num_pipeline, num_cols),
                  ('encoder', OneHotEncoder(handle_unknown="ignore"), cat_cols) ], verbose=True)

    pipe = Pipeline([('preprocessor', preprocessor), 
                        #('xgboost', XGBRegressor())])
                        ('xgboost', XGBRegressor(tree_method='gpu_hist'))], verbose=True)
    pipe.fit(train_x, train_y)

    end = datetime.now()

    # Evaluation and model upload are currently disabled.
    #test_preds = pipe.predict(test_x)
    #rmse = mean_squared_error(test_y, test_preds)
    #model_file = os.path.join('/tmp', 'model.joblib')
    #joblib.dump(pipe, model_file)
    #session.file.put(model_file, "@ml_models",overwrite=True)
    return str(end-start)+'s'

In [23]:
%%time
train_model()

CPU times: user 6.66 ms, sys: 2.89 ms, total: 9.55 ms
Wall time: 33.6 s


{'result': '0:00:32.629955s'}

In [37]:
# Sample payload: a 'data' list of three-element rows, used below to try out
# column selection on the rows.
payload = {'data': [[0, 1001, 's'], [1, 1002, 's'], [2, 1003, 's'], [3, 1004, 's'], [4, 1005, 's']]}

In [27]:
import pandas as pd

In [34]:
# Load the payload rows into a DataFrame; columns get the default integer labels 0, 1, 2.
df = pd.DataFrame(payload['data'])

In [35]:
# Keep only columns 0 and 1 and convert back to a list of row lists.
df[[0,1]].values.tolist()

[[0, 1001], [1, 1002], [2, 1003], [3, 1004], [4, 1005]]

In [36]:
# Same two-column selection, re-wrapped under the 'data' key like the original payload.
{'data': df[[0,1]].values.tolist()}

{'data': [[0, 1001], [1, 1002], [2, 1003], [3, 1004], [4, 1005]]}

In [14]:
#no cold start

In [15]:
#ram = snowdf.memory_usage(deep=True, index=True).sum()/1_000_000_000 #GB