In [2]:
import json
from snowflake.snowpark import Session
from cryptography.hazmat.primitives import serialization
from cryptography.hazmat.backends import default_backend

# Load the Snowflake connection settings from the JSON file two directories up.
with open('../../creds.json') as f:
    data = json.load(f)

USERNAME = data['user']
SF_ACCOUNT = data['account']
SF_WH = data['warehouse']
# The passphrase is optional: the key is loaded without a password further
# down when no passphrase is set, so a creds file that omits the entry must
# not fail with a KeyError here.
passphrase = data.get('passphrase')

# Read the raw bytes of the private key from the .p8 file
with open('../../rsa_key.p8', 'rb') as key_file:
    private_key = key_file.read()

# Parse the PEM key; the passphrase from creds.json is used when one is
# configured, otherwise the key is loaded with no password.
private_key_obj = serialization.load_pem_private_key(
    private_key,
    password=passphrase.encode() if passphrase else None,
    backend=default_backend()
)

# Define connection parameters including the private key
CONNECTION_PARAMETERS = {
    'user': USERNAME,
    'account': SF_ACCOUNT,
    'private_key': private_key_obj,
    'warehouse': SF_WH,
}

# Create a session with the specified connection parameters
session = Session.builder.configs(CONNECTION_PARAMETERS).create()

from snowflake.core.warehouse import Warehouse
from snowflake.core import Root
from snowflake.snowpark.functions import col
from time import time

# Root object of the snowflake.core API, bound to the session created above
root = Root(session)

In [1]:
# Import python packages
import streamlit as st
import pandas as pd

# We can also use Snowpark for our analyses!
from snowflake.snowpark.context import get_active_session
#session = get_active_session()


SnowparkSessionException: (1403): No default Session is found. Please create a session before you call function 'udf' or use decorator '@udf'.

In [3]:
from snowflake.ml.modeling.xgboost import XGBRegressor
from snowflake.ml.modeling.preprocessing import KBinsDiscretizer, OneHotEncoder
from snowflake.ml.modeling.impute import SimpleImputer

from snowflake.ml.modeling.compose import ColumnTransformer
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.preprocessing import StandardScaler, OrdinalEncoder
from snowflake.ml.modeling.impute import SimpleImputer
from snowflake.ml.modeling.model_selection import GridSearchCV
from snowflake.ml.modeling.xgboost import XGBRegressor

import snowflake.snowpark.functions as F

In [4]:
#!pip install xgboost==2.0.3

from time import time
import random

In [5]:
# Columns removed from the feature table before modelling.
DROPPED_COLUMNS = [
    'CA_ZIP',
    'CUSTOMER_SK',
    'C_CURRENT_HDEMO_SK',
    'C_CURRENT_ADDR_SK',
    'C_CUSTOMER_ID',
    'CA_ADDRESS_SK',
    'CD_DEMO_SK',
]

# Reference the feature table, drop the columns listed above, and preview it.
snowdf = session.table("tpcds_xgboost.demo.feature_store").drop(DROPPED_COLUMNS)
snowdf.show()

------------------------------------------------------------------------------------------------------------------------------------
|"TOTAL_SALES"  |"C_BIRTH_YEAR"  |"CD_GENDER"  |"CD_MARITAL_STATUS"  |"CD_CREDIT_RATING"  |"CD_EDUCATION_STATUS"  |"CD_DEP_COUNT"  |
------------------------------------------------------------------------------------------------------------------------------------
|30368.88       |1972            |M            |U                    |Good                |4 yr Degree            |0               |
|39077.29       |1976            |M            |U                    |Good                |4 yr Degree            |0               |
|33980.60       |1976            |M            |U                    |Good                |4 yr Degree            |0               |
|40226.64       |1949            |M            |U                    |Good                |4 yr Degree            |0               |
|25446.41       |1952            |M            |U                    

In [6]:
## Drop every row that has a NULL in any column
from snowflake.snowpark.functions import col, is_null

# One IS NULL predicate per column of the feature table
null_checks = [is_null(col(name)) for name in snowdf.columns]

# OR the predicates together: the combined condition holds for a row as soon
# as one of its columns is NULL. (Despite the variable's name, only NULL is
# tested here - is_null is the only check applied.)
non_finite_filter = None
for check in null_checks:
    if non_finite_filter is None:
        non_finite_filter = check
    else:
        non_finite_filter = non_finite_filter | check

# Keep only the rows for which the combined condition is false
df_filtered = snowdf.filter(~non_finite_filter)


## Clean up cats
def fix_values(columnn):
    """Build a normalised version of a categorical column.

    Args:
        columnn: Name of the column to normalise.

    Returns:
        A column expression in which every run of characters other than
        ASCII letters and digits is replaced by a single '_' and the result
        is upper-cased.
    """
    return F.upper(F.regexp_replace(F.col(columnn), '[^a-zA-Z0-9]+', '_'))


categorical_cols = ['CD_GENDER', 'CD_MARITAL_STATUS', 'CD_CREDIT_RATING', 'CD_EDUCATION_STATUS']

# The loop variable is deliberately NOT named `col`: that would rebind the
# imported Snowpark `col` function to a string, so re-running the null-filter
# code above (or any later `col(...)` call) would fail with
# "'str' object is not callable".
for cat_col in categorical_cols:
    df_filtered = df_filtered.with_column(cat_col, fix_values(cat_col))
    

In [7]:
# The label column, and every other column of the cleaned table as a feature.
target_col = 'TOTAL_SALES'
feature_cols = list(df_filtered.columns)
feature_cols.remove(target_col)

# 80/20 split with a fixed seed; the training side is then capped at 1,000 rows.
train_split, snowdf_test = df_filtered.random_split([0.8, 0.2], seed=82)
snowdf_train = train_split.limit(1_000)

# Row count of the capped training set (shown as the cell output)
snowdf_train.count()

1000

In [10]:
 ## Distributed Preprocessing - 25X to 50X faster (speed-up figure is the original author's note)

# Column groups handled by the two preprocessing branches
numeric_features = ['C_BIRTH_YEAR', 'CD_DEP_COUNT']
categorical_cols = ['CD_GENDER', 'CD_MARITAL_STATUS', 'CD_CREDIT_RATING', 'CD_EDUCATION_STATUS']

# Numeric branch: median imputation only
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: most-frequent imputation followed by one-hot encoding
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder()),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Apply each branch to its own column group
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_cols),
    ]
)

# Full pipeline: preprocessing followed by an XGBoost regressor with default settings
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', XGBRegressor()),
    ]
)

In [11]:
 ## Distributed HyperParameter Optimization

# Search grid: 2 x 2 combinations. The `model__` prefix presumably addresses
# the pipeline step named 'model' - confirm against the library docs.
hyper_param = {
    'model__max_depth': [2, 4],
    'model__learning_rate': [0.1, 0.3],
}

# Options left disabled by the original author:
#   cv=5
#   verbose=4   (noted as "verbose not working")
grid_search_options = dict(
    estimator=pipeline,
    param_grid=hyper_param,
    input_cols=numeric_features + categorical_cols,
    label_cols=['TOTAL_SALES'],
    output_cols=["TOTAL_SALES_PREDICT"],
)
xg_model = GridSearchCV(**grid_search_options)

# Fit on the capped training set (original author's note: takes 25 seconds)
xg_model.fit(snowdf_train)

Package 'fastparquet' is not installed in the local environment. Your UDF might not work when the package is installed on the server but not on your local environment.
The version of package 'cachetools' in the local environment is 5.3.2, which does not fit the criteria for the requirement 'cachetools<6'. Your UDF might not work when the package version is different between the server and your local environment.


<snowflake.ml.modeling.model_selection.grid_search_cv.GridSearchCV at 0x169d007d0>

In [None]:
# Turn off the result cache for this session. session.sql() only builds a lazy
# DataFrame; the statement is not sent to Snowflake until an action such as
# collect() is called, so collect() is required for the setting to take effect.
session.sql('ALTER SESSION SET USE_CACHED_RESULT=FALSE').collect()

In [None]:
# Time one grid-search fit per training-set size on the current warehouse.
lengths = [1_000_000,5_000_000,10_000_000,25_000_000,50_000_000]
#lengths = [10_000_000,25_000_000,50_000_000]
random.seed(9001)  # fixed seed so the per-iteration split seeds repeat across runs

for length in lengths:
    # New random split for every size, then cap the training side at `length` rows
    seedv = random.randint(1, 1000)
    snowdf_train, snowdf_test = df_filtered.random_split([0.8, 0.2], seed=seedv)  #82
    snowdf_train = snowdf_train.limit(length)
    print (snowdf_train.count())

    # Wall-clock time of the fit, reported in minutes
    init = time()
    xg_model.fit(snowdf_train)
    total_time = (time() - init) / 60
    print(f'total rows: {length} total time: {total_time} seed: {seedv}')

    # Small throw-away query issued after each fit. It is held in its own
    # variable so that `snowdf_train` keeps pointing at real training data
    # instead of a 4-row dummy frame once the loop ends.
    # NOTE(review): the purpose of this query is not evident from the code - confirm it is still needed.
    scratch_df = session.create_dataframe([1, 2, 3, 4]).to_df("a")
    scratch_df.show()

In [13]:
# An evaluation loop to see how the model does at the different series lengths
lengths = [10_000, 50_000, 100_000, 500_000, 1_000_000, 2_000_000]
#lengths = [1_000]
lengths = [1_000_000, 25_000_000]
TEST_ID = 2
train_df = session.table('TPCDS_XGBOOST.DEMO.SERIES2M')
print(session.get_current_warehouse())
model_name = "xgboost_approx"
dataset_name = "TPCDS"

WAREHOUSE_SIZES = [
    # "XSMALL",
   # "SMALL",
    "MEDIUM",
    "LARGE",
    "XLARGE",
  "XXLARGE",
]

for whouse in WAREHOUSE_SIZES:  
  print ("build warehouse ", whouse)
  wh_size = whouse
  wh_type = "SNOWPARK-OPTIMIZED"
  #wh_type = "STANDARD"

  session.sql(
  f"""CREATE OR REPLACE WAREHOUSE {"snowpark_opt_wh"}
          WITH
              WAREHOUSE_SIZE= '{wh_size}'
              WAREHOUSE_TYPE = '{wh_type}'
              AUTO_SUSPEND = 60
              AUTO_RESUME = TRUE
              INITIALLY_SUSPENDED = FALSE
              MAX_CONCURRENCY_LEVEL = 1
              MIN_CLUSTER_COUNT = 1
              MAX_CLUSTER_COUNT = 1
  """
  ).collect()

  for length in lengths:
    print ("prepping data for ", length)
    print(session.get_current_warehouse())
    seedv = random.randint(1, 1000)
    snowdf_train, snowdf_test = df_filtered.random_split([0.8, 0.2], seed=seedv)  #82
    snowdf_train=snowdf_train.limit(length)
    print (snowdf_train.count())
    print ("starting training")
    init = time()
    # Run the regression model
    xg_model.fit(snowdf_train)
    total_time = (time() - init) / 60
    print(f'total rows: {length} total time: {total_time} seed: {seedv}')
    query_id = session.sql("SELECT LAST_QUERY_ID()").collect()[0].as_dict()["LAST_QUERY_ID()"]
    query = f"""
    INSERT INTO TPCDS_XGBOOST.DEMO.RESULTS VALUES (
      '{TEST_ID}',
      '{model_name}',
      '{query_id}',
      '{length}',
      '{total_time}',
      '{dataset_name}',
      '{wh_size}',
      '{wh_type}'
      )
    """
    session.sql(query).collect()

"SNOWPARK_OPT_WH"
build warehouse  MEDIUM
prepping data for  1000000
"SNOWPARK_OPT_WH"
1000000
starting training


  dataset = snowpark_dataframe_utils.cast_snowpark_dataframe_column_types(self.dataset)
Package 'snowflake-telemetry-python' is not installed in the local environment. Your UDF might not work when the package is installed on the server but not on your local environment.


total rows: 1000000 total time: 1.5104982813199361 seed: 830
prepping data for  25000000
"SNOWPARK_OPT_WH"
25000000
starting training


  dataset = snowpark_dataframe_utils.cast_snowpark_dataframe_column_types(self.dataset)
Package 'snowflake-telemetry-python' is not installed in the local environment. Your UDF might not work when the package is installed on the server but not on your local environment.


total rows: 25000000 total time: 9.28427733182907 seed: 386
build warehouse  LARGE
prepping data for  1000000
"SNOWPARK_OPT_WH"
1000000
starting training


Package 'fastparquet' is not installed in the local environment. Your UDF might not work when the package is installed on the server but not on your local environment.
The version of package 'cachetools' in the local environment is 5.3.2, which does not fit the criteria for the requirement 'cachetools<6'. Your UDF might not work when the package version is different between the server and your local environment.


total rows: 1000000 total time: 2.9549667199452716 seed: 157
prepping data for  25000000
"SNOWPARK_OPT_WH"
25000000
starting training


Package 'fastparquet' is not installed in the local environment. Your UDF might not work when the package is installed on the server but not on your local environment.
The version of package 'cachetools' in the local environment is 5.3.2, which does not fit the criteria for the requirement 'cachetools<6'. Your UDF might not work when the package version is different between the server and your local environment.


total rows: 25000000 total time: 16.84705551067988 seed: 100
build warehouse  XLARGE
prepping data for  1000000
"SNOWPARK_OPT_WH"
1000000
starting training


Package 'fastparquet' is not installed in the local environment. Your UDF might not work when the package is installed on the server but not on your local environment.
The version of package 'cachetools' in the local environment is 5.3.2, which does not fit the criteria for the requirement 'cachetools<6'. Your UDF might not work when the package version is different between the server and your local environment.


total rows: 1000000 total time: 3.029693067073822 seed: 920
prepping data for  25000000
"SNOWPARK_OPT_WH"
25000000
starting training


Package 'fastparquet' is not installed in the local environment. Your UDF might not work when the package is installed on the server but not on your local environment.
The version of package 'cachetools' in the local environment is 5.3.2, which does not fit the criteria for the requirement 'cachetools<6'. Your UDF might not work when the package version is different between the server and your local environment.


total rows: 25000000 total time: 16.91207328637441 seed: 735
build warehouse  XXLARGE
prepping data for  1000000
"SNOWPARK_OPT_WH"
1000000
starting training


Package 'fastparquet' is not installed in the local environment. Your UDF might not work when the package is installed on the server but not on your local environment.
The version of package 'cachetools' in the local environment is 5.3.2, which does not fit the criteria for the requirement 'cachetools<6'. Your UDF might not work when the package version is different between the server and your local environment.


total rows: 1000000 total time: 3.0922807335853575 seed: 353
prepping data for  25000000
"SNOWPARK_OPT_WH"
25000000
starting training


Package 'fastparquet' is not installed in the local environment. Your UDF might not work when the package is installed on the server but not on your local environment.
The version of package 'cachetools' in the local environment is 5.3.2, which does not fit the criteria for the requirement 'cachetools<6'. Your UDF might not work when the package version is different between the server and your local environment.


total rows: 25000000 total time: 17.462916429837545 seed: 812
