In [1]:
import os
from snowflake.snowpark import Session
from snowflake.snowpark.functions import sproc, servicesproc

In [2]:
connection_parameters = {
    "account": "VUA92284",
    "user": "snowflake_nvidia",
    "password": os.environ['SNOWFLAKE_TEMP_PASSWORD'],
    "role": "SNOWFLAKE_NVIDIA",  # optional
    "warehouse": "SNOWFLAKE_NVIDIA",  # medium snowpark-optimized
    "database": "SNOWFLAKE_NVIDIA",  
    "schema": "PUBLIC",  
  }  

In [3]:
session = Session.builder.configs(connection_parameters).create()

In [4]:
rows = 10_000_000

In [9]:
@sproc(packages=['snowflake-snowpark-python','scikit-learn', 'xgboost'])
def train_model(session: Session) -> str:
    
    import joblib
    from datetime import datetime
    
    from sklearn.pipeline import Pipeline
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import StandardScaler, OneHotEncoder, SimpleImputer
    
    from xgboost import XGBRegressor

    start = datetime.now()

    snowdf = session.table('"SNOWFLAKE_NVIDIA"."PUBLIC"."TPCDS_SF10TCL_TRAINING"').limit(rows) #62,726,989 / 1.4 GB compressed
    snowdf = snowdf.drop(['CUSTOMER_SK', 'C_CURRENT_HDEMO_SK', 'C_CURRENT_ADDR_SK', 'C_CUSTOMER_ID', 'CA_ADDRESS_SK', 'CD_DEMO_SK'])

    train_x = snowdf.drop("TOTAL_SALES").to_pandas() # drop labels for training set
    train_y = snowdf.select("TOTAL_SALES").to_pandas()

    cat_cols = ['CA_ZIP', 'CD_MARITAL_STATUS', 'CD_CREDIT_RATING', 'CD_EDUCATION_STATUS']
    num_cols = ['C_BIRTH_YEAR', 'CD_DEP_COUNT']

    num_pipeline = Pipeline(
        [
            ('imputer', SimpleImputer(strategy="median")),
            ('std_scaler', StandardScaler()),
        ]
    )

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', num_pipeline, num_cols),
            ('encoder', OneHotEncoder(handle_unknown="ignore"), cat_cols)
        ]
    )

    feature_pipe = Pipeline(
        [
            ('preprocessor', preprocessor), 
        ]
    )

    feature_pipe.fit(train_x, train_y)

    xgb = XGBRegressor(tree_method='hist')

    xgb.fit(feature_pipe.transform(train_x),train_y)

    end = datetime.now()

    return str(end-start)+'s'

The version of package scikit-learn in the local environment is 1.2.1, which does not fit the criteria for the requirement scikit-learn. Your UDF might not work when the package version is different between the server and your local environment
The version of package xgboost in the local environment is 1.7.3, which does not fit the criteria for the requirement xgboost. Your UDF might not work when the package version is different between the server and your local environment


In [10]:
%%time
train_model()

Failed to execute query [queryID: 01aa29ef-0402-812f-002a-c9030e10704e] CALL "SNOWFLAKE_NVIDIA"."PUBLIC".SNOWPARK_TEMP_PROCEDURE_X0EOUT2I2N()
100357 (P0000): Python Interpreter Error:
Traceback (most recent call last):
  File "_udf_code.py", line 7, in compute
  File "/var/folders/8v/b1vh6rq55zq1b4bwqr_gxm3m0000gn/T/ipykernel_84162/3087821579.py", line 9, in train_model
ImportError: cannot import name 'SimpleImputer' from 'sklearn.preprocessing' (/usr/lib/python_udf/c75f666b5683e2febf86552d7f5dd4b652525a34f7845f2b2a1aa91e95fe51ca/lib/python3.8/site-packages/sklearn/preprocessing/__init__.py)
 in function SNOWPARK_TEMP_PROCEDURE_X0EOUT2I2N with handler compute


SnowparkSQLException: (1304): 01aa29ef-0402-812f-002a-c9030e10704e: 100357 (P0000): Python Interpreter Error:
Traceback (most recent call last):
  File "_udf_code.py", line 7, in compute
  File "/var/folders/8v/b1vh6rq55zq1b4bwqr_gxm3m0000gn/T/ipykernel_84162/3087821579.py", line 9, in train_model
ImportError: cannot import name 'SimpleImputer' from 'sklearn.preprocessing' (/usr/lib/python_udf/c75f666b5683e2febf86552d7f5dd4b652525a34f7845f2b2a1aa91e95fe51ca/lib/python3.8/site-packages/sklearn/preprocessing/__init__.py)
 in function SNOWPARK_TEMP_PROCEDURE_X0EOUT2I2N with handler compute

In [11]:
@servicesproc(snowservice='NVIDIA', packages=['snowflake-snowpark-python','scikit-learn', 'xgboost'])
def train_model(session: Session) -> str:
    
    import joblib
    from datetime import datetime
    
    from sklearn.pipeline import Pipeline
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import OneHotEncoder

    from cuml.preprocessing import StandardScaler, SimpleImputer

    from xgboost import XGBRegressor


    start = datetime.now()

    snowdf = session.table('"SNOWFLAKE_NVIDIA"."PUBLIC"."TPCDS_SF10TCL_TRAINING"').limit(rows) #62,726,989 / 1.4 GB compressed
    snowdf = snowdf.drop(['CUSTOMER_SK', 'C_CURRENT_HDEMO_SK', 'C_CURRENT_ADDR_SK', 'C_CUSTOMER_ID', 'CA_ADDRESS_SK', 'CD_DEMO_SK'])

    train_x = snowdf.drop("TOTAL_SALES").to_pandas() # drop labels for training set
    train_y = snowdf.select("TOTAL_SALES").to_pandas()

    cat_cols = ['CA_ZIP', 'CD_MARITAL_STATUS', 'CD_CREDIT_RATING', 'CD_EDUCATION_STATUS']
    num_cols = ['C_BIRTH_YEAR', 'CD_DEP_COUNT']

    num_pipeline = Pipeline(
        [
            ('imputer', SimpleImputer(strategy="median")),
            ('std_scaler', StandardScaler()),
        ]
    )

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', num_pipeline, num_cols),
            ('encoder', OneHotEncoder(handle_unknown="ignore"), cat_cols)
        ]
    )

    feature_pipe = Pipeline(
        [
            ('preprocessor', preprocessor), 
        ]
    )

    feature_pipe.fit(train_x, train_y)

    xgb = XGBRegressor(tree_method='gpu_hist')

    xgb.fit(feature_pipe.transform(train_x),train_y)

    end = datetime.now()

    return str(end-start)+'s'

In [8]:
%%time
train_model()

CPU times: user 5.81 ms, sys: 3.11 ms, total: 8.92 ms
Wall time: 35.6 s


{'result': '0:00:34.919994s'}