In [1]:
import os
from snowflake.snowpark import Session
from snowflake.snowpark.functions import sproc, servicesproc

In [2]:
connection_parameters = {
    "account": "VUA92284",
    "user": "snowflake_nvidia",
    "password": os.environ['SNOWFLAKE_TEMP_PASSWORD'],
    "role": "SNOWFLAKE_NVIDIA",  # optional
    "warehouse": "SNOWFLAKE_NVIDIA",  # medium snowpark-optimized
    "database": "SNOWFLAKE_NVIDIA",  
    "schema": "PUBLIC",  
  }  

In [3]:
session = Session.builder.configs(connection_parameters).create()

In [9]:
def generate_snowflake_table(session, num_columns, num_rows):
    columns_sql = ',\n'.join(['column{} NUMBER(38,0)'.format(i+1) for i in range(num_columns)])
    create_table_sql = f'CREATE OR REPLACE TABLE snowflake_test_data_{num_columns}_{num_rows} (\n{columns_sql}\n);'
    session.sql(create_table_sql).collect()
    
    insert_values_sql = ',\n'.join(['RANDOM() as column{}'.format(i+1) for i in range(num_columns)])
    insert_sql = f'INSERT INTO snowflake_test_data_{num_columns}_{num_rows}\nSELECT\n{insert_values_sql}\nFROM TABLE(GENERATOR(ROWCOUNT => {num_rows}));'
    session.sql(insert_sql).collect()



In [10]:
cols = [5, 10, 20, 50, 100]
rows = [1_000, 10_000, 100_000, 1_000_000, 10_000_000]

for col in cols:
    for row in rows:
        
        print(row, col)
        generate_snowflake_table(session, col, row)

1000 5
10000 5
100000 5
1000000 5
10000000 5
1000 10
10000 10
100000 10
1000000 10
10000000 10
1000 20
10000 20
100000 20
1000000 20
10000000 20
1000 50
10000 50
100000 50
1000000 50
10000000 50
1000 100
10000 100
100000 100
1000000 100
10000000 100


In [13]:
@sproc(packages=['snowflake-snowpark-python'])
def memory_usage(session: Session) -> str:
    from datetime import datetime

    start = datetime.now()
    df = session.sql('SELECT * FROM SNOWFLAKE_TEST_DATA_100_1000000').to_pandas()
    end = datetime.now()
    
    return str(end-start)+'s '+str(df.memory_usage(deep=True).sum()/1000000000)+'GB'

In [14]:
memory_usage()

'0:00:55.359998s 0.800000128GB'

In [7]:
@servicesproc(snowservice='NVIDIA', packages=['snowflake-snowpark-python'])
def memory_usage(session: Session) -> str:
    from datetime import datetime

    start = datetime.now()
    df = session.sql('SELECT * FROM SNOWFLAKE_TEST_DATA_100_10000').to_pandas()
    end = datetime.now()
    
    return str(end-start)+'s '+str(df.memory_usage(deep=True).sum()/1000000000)+'GB'

In [None]:
memory_usage()

In [17]:
@servicesproc(snowservice='NVIDIA', packages=['snowflake-snowpark-python'])
def train_model(session: Session) -> dict:
    from datetime import datetime
    from xgboost import XGBRegressor
    
    cols = 20
    rows = 100000
    
    start = datetime.now()
    df = session.sql(f'SELECT * FROM "SNOWFLAKE_NVIDIA"."PUBLIC"."SNOWFLAKE_TEST_DATA_{cols}_{rows}"').to_pandas()
    end = datetime.now()
    load = end-start
    
    start = datetime.now()
    xgb = XGBRegressor(tree_method='hist')
    model  = xgb.fit(df,df['COLUMN1'])
    end = datetime.now()
    train = end-start
    
    return {
        'rows': str(rows),
        'data_size': str(df.memory_usage(deep=True).sum()/1000000000)+'GB',
        'load': str(load)+'s',
        'train': str(train)+'s'
    }

In [18]:
train_model()

KeyboardInterrupt: 

In [16]:
train_model(session)

{'rows': '100000',
 'data_size': '0.016000128GB',
 'load': '0:00:04.119624s',
 'train': '0:00:00.361934s'}

In [4]:
rows = 10_000_000

In [5]:
@sproc(packages=['snowflake-snowpark-python','scikit-learn', 'xgboost'])
def train_model(session: Session) -> dict:
    
    import joblib
    from datetime import datetime
    
    from sklearn.pipeline import Pipeline
    from sklearn.compose import ColumnTransformer
    
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import StandardScaler, OneHotEncoder
    
    from xgboost import XGBRegressor
    from sklearn.ensemble import RandomForestRegressor

    #Load
    start = datetime.now()

    snowdf = session.table('"SNOWFLAKE_NVIDIA"."PUBLIC"."TPCDS_SF10TCL_TRAINING"').limit(rows) #62,726,989 / 1.4 GB compressed
    snowdf = snowdf.drop(['CUSTOMER_SK', 'C_CURRENT_HDEMO_SK', 'C_CURRENT_ADDR_SK', 'C_CUSTOMER_ID', 'CA_ADDRESS_SK', 'CD_DEMO_SK'])

    train_x = snowdf.drop("TOTAL_SALES").to_pandas() # drop labels for training set
    train_y = snowdf.select("TOTAL_SALES").to_pandas()
    
    end = datetime.now()
    load = end-start

    #preprocess
    start = datetime.now()
    
    cat_cols = ['CA_ZIP', 'CD_MARITAL_STATUS', 'CD_CREDIT_RATING', 'CD_EDUCATION_STATUS']
    num_cols = ['C_BIRTH_YEAR', 'CD_DEP_COUNT']

    num_pipeline = Pipeline(
        [
            ('imputer', SimpleImputer(strategy="median")),
            ('std_scaler', StandardScaler()),
        ]
    )

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', num_pipeline, num_cols),
            ('encoder', OneHotEncoder(handle_unknown="ignore"), cat_cols)
        ]
    )

    feature_pipe = Pipeline(
        [
            ('preprocessor', preprocessor), 
        ]
    )

    feature_pipe.fit(train_x, train_y)
    
    end = datetime.now()
    preprocess = end-start

    #train
    start = datetime.now()
    #xgb = XGBRegressor(tree_method='hist')
    model = RandomForestRegressor(n_estimators=100, max_depth=15, random_state=0)

    model.fit(feature_pipe.transform(train_x),train_y)

    end = datetime.now()
    train = end-start
    
    return {
        'rows': str(rows),
        'data_size': str(train_x.memory_usage(deep=True).sum()/1000000000 + train_y.memory_usage(deep=True).sum()/1000000000)+'GB',
        'load': str(load)+'s',
        'preprocess': str(preprocess)+'s',
        'train': str(train)+'s'
    }

The version of package scikit-learn in the local environment is 1.2.1, which does not fit the criteria for the requirement scikit-learn. Your UDF might not work when the package version is different between the server and your local environment
The version of package xgboost in the local environment is 1.7.3, which does not fit the criteria for the requirement xgboost. Your UDF might not work when the package version is different between the server and your local environment


In [6]:
%%time
out = train_model()
print(out)

{
  "data_size": "3.239589311GB",
  "load": "0:00:42.735166s",
  "preprocess": "0:00:18.872349s",
  "rows": "10000000",
  "train": "0:56:00.065744s"
}
CPU times: user 313 ms, sys: 65.7 ms, total: 379 ms
Wall time: 57min 10s


In [7]:
#@sproc(packages=['snowflake-snowpark-python','scikit-learn', 'xgboost'])
@servicesproc(snowservice='NVIDIA', packages=['snowflake-snowpark-python','scikit-learn', 'xgboost'])
def train_model(session: Session) -> str:
    
    import joblib
    from datetime import datetime
    
    from sklearn.pipeline import Pipeline
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import OneHotEncoder

    from cuml.preprocessing import StandardScaler, SimpleImputer
    

    from xgboost import XGBRegressor
    from cuml.ensemble import RandomForestRegressor


    #Load
    start = datetime.now()

    snowdf = session.table('"SNOWFLAKE_NVIDIA"."PUBLIC"."TPCDS_SF10TCL_TRAINING"').limit(rows) #62,726,989 / 1.4 GB compressed
    snowdf = snowdf.drop(['CUSTOMER_SK', 'C_CURRENT_HDEMO_SK', 'C_CURRENT_ADDR_SK', 'C_CUSTOMER_ID', 'CA_ADDRESS_SK', 'CD_DEMO_SK'])

    train_x = snowdf.drop("TOTAL_SALES").to_pandas() # drop labels for training set
    train_y = snowdf.select("TOTAL_SALES").to_pandas()
    
    end = datetime.now()
    load = end-start

    #preprocess
    start = datetime.now()
    
    cat_cols = ['CA_ZIP', 'CD_MARITAL_STATUS', 'CD_CREDIT_RATING', 'CD_EDUCATION_STATUS']
    num_cols = ['C_BIRTH_YEAR', 'CD_DEP_COUNT']

    num_pipeline = Pipeline(
        [
            ('imputer', SimpleImputer(strategy="median")),
            ('std_scaler', StandardScaler()),
        ]
    )

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', num_pipeline, num_cols),
            ('encoder', OneHotEncoder(handle_unknown="ignore"), cat_cols)
        ]
    )

    feature_pipe = Pipeline(
        [
            ('preprocessor', preprocessor), 
        ]
    )

    feature_pipe.fit(train_x, train_y)
    
    end = datetime.now()
    preprocess = end-start

    #train
    start = datetime.now()
    #model = XGBRegressor(tree_method='gpu_hist')
    model = RandomForestRegressor(random_state=0, n_estimators=100, max_depth=15, )

    model.fit(feature_pipe.transform(train_x),train_y)

    end = datetime.now()
    train = end-start
    
    return {
        'rows': str(rows),
        'data_size': str(train_x.memory_usage(deep=True).sum()/1000000000 + train_y.memory_usage(deep=True).sum()/1000000000)+'GB',
        'load': str(load)+'s',
        'preprocess': str(preprocess)+'s',
        'train': str(train)+'s'
    }

In [8]:
%%time
out = train_model()
print(out['result'])

KeyboardInterrupt: 