In [1]:
import os
from snowflake.snowpark import Session
from snowflake.snowpark.functions import sproc, servicesproc

In [2]:
connection_parameters = {
    "account": "VUA92284",
    "user": "snowflake_nvidia",
    "password": os.environ['SNOWFLAKE_TEMP_PASSWORD'],
    "role": "SNOWFLAKE_NVIDIA",  # optional
    "warehouse": "SNOWFLAKE_NVIDIA",  # medium snowpark-optimized
    "database": "SNOWFLAKE_NVIDIA",  
    "schema": "PUBLIC",  
  }  

In [3]:
session = Session.builder.configs(connection_parameters).create()

In [11]:
rows = 50_000_000

In [3]:
import os


'July2892!'

In [12]:
@sproc(packages=['snowflake-snowpark-python','scikit-learn', 'xgboost'])
def train_model(session: Session) -> str:
    
    import joblib
    from datetime import datetime
    
    from sklearn.pipeline import Pipeline
    from sklearn.impute import SimpleImputer
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import StandardScaler, OneHotEncoder
    
    from xgboost import XGBRegressor

    start = datetime.now()

    snowdf = session.table('"SNOWFLAKE_NVIDIA"."PUBLIC"."TPCDS_SF10TCL_TRAINING"').limit(rows) #62,726,989 / 1.4 GB compressed
    snowdf = snowdf.drop(['CUSTOMER_SK', 'C_CURRENT_HDEMO_SK', 'C_CURRENT_ADDR_SK', 'C_CUSTOMER_ID', 'CA_ADDRESS_SK', 'CD_DEMO_SK'])

    train_x = snowdf.drop("TOTAL_SALES").to_pandas() # drop labels for training set
    train_y = snowdf.select("TOTAL_SALES").to_pandas()

    cat_cols = ['CA_ZIP', 'CD_MARITAL_STATUS', 'CD_CREDIT_RATING', 'CD_EDUCATION_STATUS']
    num_cols = ['C_BIRTH_YEAR', 'CD_DEP_COUNT']

    num_pipeline = Pipeline(
        [
            ('imputer', SimpleImputer(strategy="median")),
            ('std_scaler', StandardScaler()),
        ]
    )

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', num_pipeline, num_cols),
            ('encoder', OneHotEncoder(handle_unknown="ignore"), cat_cols)
        ]
    )

    feature_pipe = Pipeline(
        [
            ('preprocessor', preprocessor), 
        ]
    )

    feature_pipe.fit(train_x, train_y)

    xgb = XGBRegressor(tree_method='hist')

    xgb.fit(feature_pipe.transform(train_x),train_y)

    end = datetime.now()

    return str(end-start)+'s'

The version of package xgboost in the local environment is 1.7.4, which does not fit the criteria for the requirement xgboost. Your UDF might not work when the package version is different between the server and your local environment


In [13]:
%%time
train_model()

CPU times: user 37.2 ms, sys: 10.9 ms, total: 48.2 ms
Wall time: 4min 41s


'0:04:39.379196s'

In [7]:
@servicesproc(snowservice='GPU', packages=['snowflake-snowpark-python','scikit-learn', 'xgboost'])
def train_model(session: Session) -> str:
    
    import joblib
    from datetime import datetime
    
    from sklearn.pipeline import Pipeline
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import OneHotEncoder

    from cuml.preprocessing import StandardScaler, SimpleImputer

    from xgboost import XGBRegressor


    start = datetime.now()

    snowdf = session.table('"SNOWFLAKE_NVIDIA"."PUBLIC"."TPCDS_SF10TCL_TRAINING"').limit(rows) #62,726,989 / 1.4 GB compressed
    snowdf = snowdf.drop(['CUSTOMER_SK', 'C_CURRENT_HDEMO_SK', 'C_CURRENT_ADDR_SK', 'C_CUSTOMER_ID', 'CA_ADDRESS_SK', 'CD_DEMO_SK'])

    train_x = snowdf.drop("TOTAL_SALES").to_pandas() # drop labels for training set
    train_y = snowdf.select("TOTAL_SALES").to_pandas()

    cat_cols = ['CA_ZIP', 'CD_MARITAL_STATUS', 'CD_CREDIT_RATING', 'CD_EDUCATION_STATUS']
    num_cols = ['C_BIRTH_YEAR', 'CD_DEP_COUNT']

    num_pipeline = Pipeline(
        [
            ('imputer', SimpleImputer(strategy="median")),
            ('std_scaler', StandardScaler()),
        ]
    )

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', num_pipeline, num_cols),
            ('encoder', OneHotEncoder(handle_unknown="ignore"), cat_cols)
        ]
    )

    feature_pipe = Pipeline(
        [
            ('preprocessor', preprocessor), 
        ]
    )

    feature_pipe.fit(train_x, train_y)

    xgb = XGBRegressor(tree_method='gpu_hist')

    xgb.fit(feature_pipe.transform(train_x),train_y)

    end = datetime.now()

    return str(end-start)+'s'

In [8]:
%%time
train_model()

CPU times: user 7.04 ms, sys: 2.49 ms, total: 9.52 ms
Wall time: 52.8 s


{'result': '0:00:50.143147s'}