In [1]:
import os
from snowflake.snowpark import Session
from snowflake.snowpark.functions import sproc, servicesproc

In [2]:
connection_parameters = {
    "account": "VUA92284",
    "user": "snowflake_nvidia",
    "password": os.environ['SNOWFLAKE_TEMP_PASSWORD'],
    "role": "SNOWFLAKE_NVIDIA",  # optional
    "warehouse": "SNOWFLAKE_NVIDIA",  # medium snowpark-optimized
    "database": "SNOWFLAKE_NVIDIA",  
    "schema": "PUBLIC",  
  }  

In [3]:
session = Session.builder.configs(connection_parameters).create()

In [4]:
rows = 1_000_000

In [5]:
#@sproc(packages=['snowflake-snowpark-python','scikit-learn', 'xgboost'])
@servicesproc(snowservice='NVIDIA', packages=['snowflake-snowpark-python','scikit-learn', 'xgboost'])
def train_model(session: Session) -> str:
    
    import joblib
    from datetime import datetime
    
    from sklearn.pipeline import Pipeline
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import OneHotEncoder
    
    
    #from sklearn.impute import SimpleImputer
    #from sklearn.preprocessing import StandardScaler
    from cuml.preprocessing import StandardScaler, SimpleImputer
    
    from xgboost import XGBRegressor

    start = datetime.now()

    snowdf = session.table('"SNOWFLAKE_NVIDIA"."PUBLIC"."TPCDS_SF10TCL_TRAINING"').limit(rows) #62,726,989 / 1.4 GB compressed
    snowdf = snowdf.drop(['CUSTOMER_SK', 'C_CURRENT_HDEMO_SK', 'C_CURRENT_ADDR_SK', 'C_CUSTOMER_ID', 'CA_ADDRESS_SK', 'CD_DEMO_SK'])

    train_x = snowdf.drop("TOTAL_SALES").to_pandas() # drop labels for training set
    train_y = snowdf.select("TOTAL_SALES").to_pandas()

    cat_cols = ['CA_ZIP', 'CD_MARITAL_STATUS', 'CD_CREDIT_RATING', 'CD_EDUCATION_STATUS']
    num_cols = ['C_BIRTH_YEAR', 'CD_DEP_COUNT']

    num_pipeline = Pipeline(
        [
            ('imputer', SimpleImputer(strategy="median")),
            ('std_scaler', StandardScaler()),
        ]
    )

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', num_pipeline, num_cols),
            ('encoder', OneHotEncoder(handle_unknown="ignore"), cat_cols)
        ]
    )

    feature_pipe = Pipeline(
        [
            ('preprocessor', preprocessor), 
        ]
    )

    feature_pipe.fit(train_x, train_y)

    #xgb = XGBRegressor(tree_method='hist')
    xgb = XGBRegressor(tree_method='gpu_hist')

    xgb.fit(feature_pipe.transform(train_x),train_y)

    end = datetime.now()

    return str(end-start)+'s'

In [6]:
%%time
train_model()

CPU times: user 4.56 ms, sys: 2 ms, total: 6.56 ms
Wall time: 9.19 s


{'result': '0:00:08.758439s'}

In [None]:
@sproc(packages=['snowflake-snowpark-python'])
def memory_usage(session: Session) -> str:
    from datetime import datetime

    start = datetime.now()
    df = session.sql('SELECT * FROM SNOWFLAKE_TEST_DATA_100_10000').to_pandas()
    end = datetime.now()
    
    return str(end-start)+'s '+str(df.memory_usage(deep=True).sum()/1000000000)+'GB'

In [5]:
memory_usage()

'0:00:02.191049s 0.008000128GB'

In [6]:
@servicesproc(snowservice='NVIDIA', packages=['snowflake-snowpark-python'])
def memory_usage(session: Session) -> str:
    from datetime import datetime

    start = datetime.now()
    df = session.sql('SELECT * FROM SNOWFLAKE_TEST_DATA_100_10000').to_pandas()
    end = datetime.now()
    
    return str(end-start)+'s '+str(df.memory_usage(deep=True).sum()/1000000000)+'GB'

In [7]:
memory_usage()

{'result': '0:00:00.521847s 0.008000128GB'}

In [12]:
@servicesproc(snowservice='NVIDIA', packages=['snowflake-snowpark-python'])
def train_model_cpu(session: Session) -> dict:
    from datetime import datetime
    from xgboost import XGBRegressor
    
    cols = 100
    rows = 10_000
    opt='hist'
    
    start = datetime.now()
    df = session.sql(f'SELECT * FROM "SNOWFLAKE_NVIDIA"."PUBLIC"."SNOWFLAKE_TEST_DATA_{cols}_{rows}"').to_pandas()
    end = datetime.now()
    load = end-start
    
    start = datetime.now()
    xgb = XGBRegressor(tree_method=opt, n_jobs=10)
    model  = xgb.fit(df,df['COLUMN1'])
    end = datetime.now()
    train = end-start
    
    return {
        'opt': opt,
        'rows': str(rows),
        'cols': str(cols),
        'data_size': str(df.memory_usage(deep=True).sum()/1000000000)+'GB',
        'load': str(load)+'s',
        'train': str(train)+'s'
    }

In [13]:
@servicesproc(snowservice='NVIDIA', packages=['snowflake-snowpark-python'])
def train_model_gpu(session: Session) -> dict:
    from datetime import datetime
    from xgboost import XGBRegressor
    
    cols = 100
    rows = 10_000
    opt='gpu_hist'
    
    start = datetime.now()
    df = session.sql(f'SELECT * FROM "SNOWFLAKE_NVIDIA"."PUBLIC"."SNOWFLAKE_TEST_DATA_{cols}_{rows}"').to_pandas()
    end = datetime.now()
    load = end-start
    
    start = datetime.now()
    xgb = XGBRegressor(tree_method=opt, n_jobs=10)
    model  = xgb.fit(df,df['COLUMN1'])
    end = datetime.now()
    train = end-start
    
    return {
        'opt': opt,
        'rows': str(rows),
        'cols': str(cols),
        'data_size': str(df.memory_usage(deep=True).sum()/1000000000)+'GB',
        'load': str(load)+'s',
        'train': str(train)+'s'
    }

In [14]:
%%time
train_model_cpu()

CPU times: user 6.57 ms, sys: 6.47 ms, total: 13 ms
Wall time: 2.33 s


{'result': {'opt': 'hist',
  'rows': '10000',
  'cols': '100',
  'data_size': '0.008000128GB',
  'load': '0:00:00.978266s',
  'train': '0:00:00.808715s'}}

In [15]:
%%time
train_model_gpu()

CPU times: user 6.45 ms, sys: 2.49 ms, total: 8.93 ms
Wall time: 1.31 s


{'result': {'opt': 'gpu_hist',
  'rows': '10000',
  'cols': '100',
  'data_size': '0.008000128GB',
  'load': '0:00:00.301539s',
  'train': '0:00:00.697223s'}}