Here are several examples of how to use Snowpark ML to train a model in Snowflake.
They include:
    - Snowflake ML with XGBoost
    - Snowflake ML with SciKit
    - Stored Procedure for Training a Model with Scikit
    - Stored Procedures for Training a Model with Pytorch

One of the biggest source of errors is versioning. If you are using Snowflake warehouses, make sure your local environment matches the version of the warehouse. Similarly, when you define stored procedures you can specify the version of packages you are using. Check out the custom packages in the stored procedures for more information. https://docs.snowflake.com/en/developer-guide/snowpark/reference/python/latest/snowpark/api/snowflake.snowpark.Session.custom_package_usage_config#snowflake.snowpark.Session.custom_package_usage_config


Resources:
- Snowflake ML Model Docs: https://docs.snowflake.com/en/developer-guide/snowpark-ml/modeling (suppoorts XGBoost, SciKit, and LightGBM) 
- Snowpark ML API: https://docs.snowflake.com/en/developer-guide/snowpark-ml/reference/latest/index- 
- ML Quickstart: https://quickstarts.snowflake.com/guide/intro_to_machine_learning_with_snowpark_ml_for_python/ 
- Rajiv's Snowflake Notebooks: https://github.com/rajshah4/snowflake-notebooks


Tips: 
- Use Optimized Warehouses: https://docs.snowflake.com/en/user-guide/warehouses-snowpark-optimized
- You can use the MAX_CONCURRENCY_LEVEL parameter to limit the number of concurrent queries running in a warehouse.
- Container services for GPUs and support any python package 

In [26]:
# Snowpark for Python
from snowflake.snowpark.session import Session
from snowflake.snowpark.types import Variant
from snowflake.snowpark.version import VERSION

# Snowpark ML
# Misc
import pandas as pd
import json
import logging 
logger = logging.getLogger("snowflake.snowpark.session")
logger.setLevel(logging.ERROR)

from snowflake import connector
from snowflake.ml.utils import connection_params

from snowflake.ml import version
SOURCE = "SnowML"
MLVersion = version.VERSION

## Establish Secure Connection to Snowflake

Using the Snowpark Python API, it’s quick and easy to establish a secure connection between Snowflake and Notebook.
 *Note: Other connection options include Username/Password, MFA, OAuth, Okta, SSO*

I like to store my credentials in creds.json so they aren't in the notebook.
The file should look like this:
```
{
    "account": "awb99999",
    "user": "your_user_name",
    "password": "your_password",
    "warehouse": "your_warehouse"
  }

In [2]:
from snowflake.snowpark import Session
from cryptography.hazmat.primitives import serialization
from cryptography.hazmat.backends import default_backend

with open('../creds.json') as f:
    data = json.load(f)
    USERNAME = data['user']
    SF_ACCOUNT = data['account']
    SF_WH = data['warehouse']
    passphrase = data['passphrase']

# Read the private key from the .p8 file
with open('../rsa_key.p8', 'rb') as key_file:
    private_key = key_file.read()

# If the private key is encrypted, load it with a passphrase
# Replace 'your_key_passphrase' with your actual passphrase if needed
private_key_obj = serialization.load_pem_private_key(
    private_key,
    password=passphrase.encode() if passphrase else None,
    backend=default_backend()
)

# Define connection parameters including the private key
CONNECTION_PARAMETERS = {
    'user': USERNAME,
    'account': SF_ACCOUNT,
    'private_key': private_key_obj,
    'warehouse': SF_WH,
   # 'password': PASSWORD,
}

# Create a session with the specified connection parameters
session = Session.builder.configs(CONNECTION_PARAMETERS).create()


In [30]:
snowflake_environment = session.sql('select current_user(), current_version()').collect()
snowpark_version = VERSION

# Current Environment Details
print('User                      : {}'.format(snowflake_environment[0][0]))
print('Role                      : {}'.format(session.get_current_role()))
print('Database                  : {}'.format(session.get_current_database()))
print('Schema                    : {}'.format(session.get_current_schema()))
print('Warehouse                 : {}'.format(session.get_current_warehouse()))
print('Snowflake version         : {}'.format(snowflake_environment[0][1]))
print('Snowpark-snowpark-python  : {}.{}.{}'.format(snowpark_version[0],snowpark_version[1],snowpark_version[2]))
print('Snowflake-ml-python       : {}'.format(MLVersion))

User                      : RSHAH
Role                      : "RAJIV"
Database                  : "RAJIV"
Schema                    : "PUBLIC"
Warehouse                 : "RAJIV"
Snowflake version         : 8.27.1
Snowpark-snowpark-python  : 1.15.0
Snowflake-ml-python       : 1.5.3


## Train a Model with Snowpark ML and XGBoost

In [4]:
import numpy as np
import pandas as pd
import random
import string

from sklearn.datasets import make_regression
from snowflake.ml.modeling.preprocessing import MinMaxScaler, OrdinalEncoder
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.snowpark import Session

from snowflake.ml.modeling.xgboost import XGBRegressor
from snowflake.ml.modeling.model_selection import GridSearchCV

from snowflake.ml.modeling.preprocessing import StandardScaler, OrdinalEncoder
from snowflake.ml.modeling.impute import SimpleImputer
from snowflake.ml.modeling.compose import ColumnTransformer

# Create a session with your preferred method
# session =

NUMERICAL_COLS = ["X1", "X2", "X3"]
CATEGORICAL_COLS = ["C1", "C2", "C3"]
FEATURE_COLS = NUMERICAL_COLS + CATEGORICAL_COLS
CATEGORICAL_OUTPUT_COLS = ["C1_OUT", "C2_OUT", "C3_OUT"]
FEATURE_OUTPUT_COLS = ["X1_FEAT_OUT", "X2_FEAT_OUT", "X3_FEAT_OUT"]

# Create a dataset with numerical and categorical features
X, y = make_regression(
    n_samples=1000,
    n_features=3,
    noise=0.1,
    random_state=0,
)
X = pd.DataFrame(X, columns=NUMERICAL_COLS)
X['TARGET'] = y

def generate_random_string(length):
    return "".join(random.choices(string.ascii_uppercase, k=length))

categorical_feature_length = 2
categorical_features = {}
for c in CATEGORICAL_COLS:
    categorical_column = [generate_random_string(categorical_feature_length) for _ in range(X.shape[0])]
    categorical_features[c] = categorical_column

X = X.assign(**categorical_features)

features_df = session.create_dataframe(X)

# Fit a pipeline with OrdinalEncoder and MinMaxScaler on Snowflake
numeric_features = ["X1", "X2", "X3"]
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

categorical_cols = ["C1", "C2", "C3"]
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-99999))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_cols)
        ])

pipeline = Pipeline(steps=[('preprocessor', preprocessor),('model', XGBRegressor())])

In [5]:
X

Unnamed: 0,X1,X2,X3,TARGET,C1,C2,C3
0,-1.180286,1.012168,-0.280448,-6.685136,ZJ,VW,ZI
1,0.733595,2.011864,0.303011,261.472430,DP,VM,RM
2,-0.513903,-0.768849,0.988241,-52.651253,EB,FJ,AN
3,1.037586,0.018792,1.392518,158.522375,YZ,JK,IH
4,-0.355029,-1.892362,-0.300479,-222.425070,IG,ZH,QJ
...,...,...,...,...,...,...,...
995,-0.298760,-1.102230,0.699136,-85.555059,XI,RN,GM
996,0.400157,0.978738,1.764052,224.479103,AO,WK,PV
997,-0.114335,0.743554,0.026247,63.462753,CC,ZB,VF
998,1.648135,0.164228,-1.471835,50.151860,JV,MS,KF


In [6]:
 ## Distributed HyperParameter Optimization
hyper_param = dict(
        model__max_depth=[2,4],
        model__learning_rate=[0.1,0.3],
    )

xg_model = GridSearchCV(
    estimator=pipeline,
    param_grid=hyper_param,
    #cv=5,
    input_cols= numeric_features + categorical_cols
,    label_cols=['TARGET'],
    output_cols=["TARGET_FORECAST"],
)

# Fit and Score
xg_model.fit(features_df)
##Takes 25 seconds

<snowflake.ml.modeling.model_selection.grid_search_cv.GridSearchCV at 0x16d864dd0>

In [7]:
testpreds = xg_model.predict(features_df)
testpreds.show()

---------------------------------------------------------------------------------------------------------------------------------------
|"X1"                 |"X2"                   |"X3"                   |"TARGET"             |"C1"  |"C2"  |"C3"  |"TARGET_FORECAST"   |
---------------------------------------------------------------------------------------------------------------------------------------
|-1.1802856063511906  |1.012168295174788      |-0.28044778146745414   |-6.685136199898421   |ZJ    |VW    |ZI    |3.5716018676757812  |
|0.7335948682293696   |2.0118642631265615     |0.30301105045615745    |261.4724299705983    |DP    |VM    |RM    |255.3582763671875   |
|-0.5139029502799155  |-0.7688491596748099    |0.9882405737426969     |-52.65125274455872   |EB    |FJ    |AN    |-43.4571533203125   |
|1.037585667050634    |0.018791791774257802   |1.3925184494342724     |158.52237515962273   |YZ    |JK    |IH    |152.28465270996094  |
|-0.3550287310553741  |-1.8923618933173414    |-

## Train a Model with Snowpark ML and SKLearn

In [8]:
#from sklearn.ensemble import IsolationForest
from snowflake.ml.modeling.ensemble import IsolationForest
from snowflake.ml.modeling.ensemble import RandomForestRegressor

from sklearn.metrics import make_scorer, roc_auc_score

# Define a custom scoring function
scoring = make_scorer(roc_auc_score, needs_proba=True)

pipeline = Pipeline(steps=[('preprocessor', preprocessor),('model', IsolationForest())])
#pipeline = Pipeline(steps=[('preprocessor', preprocessor),('model', RandomForestRegressor())])


 ## Distributed HyperParameter Optimization
hyper_param = dict(
        model__n_estimators = [20,50,200]
       # model__learning_rate=[0.1,0.3],
    )

iso_model = GridSearchCV(
    estimator=pipeline,
    param_grid=hyper_param,
    #cv=5,
    scoring=scoring,
    input_cols= numeric_features + categorical_cols,    
  #  label_cols=['TARGET'],
    output_cols=["TARGET_FORECAST"],
)

# Fit and Score
iso_model.fit(features_df)
##Takes 25 seconds

<snowflake.ml.modeling.model_selection.grid_search_cv.GridSearchCV at 0x16c0bbe50>

In [9]:
testpreds = iso_model.predict(features_df)
testpreds.show()

--------------------------------------------------------------------------------------------------------------------------------------
|"X1"                 |"X2"                   |"X3"                   |"TARGET"             |"C1"  |"C2"  |"C3"  |"TARGET_FORECAST"  |
--------------------------------------------------------------------------------------------------------------------------------------
|-1.1802856063511906  |1.012168295174788      |-0.28044778146745414   |-6.685136199898421   |ZJ    |VW    |ZI    |-1                 |
|0.7335948682293696   |2.0118642631265615     |0.30301105045615745    |261.4724299705983    |DP    |VM    |RM    |-1                 |
|-0.5139029502799155  |-0.7688491596748099    |0.9882405737426969     |-52.65125274455872   |EB    |FJ    |AN    |-1                 |
|1.037585667050634    |0.018791791774257802   |1.3925184494342724     |158.52237515962273   |YZ    |JK    |IH    |1                  |
|-0.3550287310553741  |-1.8923618933173414    |-0.30047

## Housing Dataset for Stored Procedures

In [10]:
from sklearn.datasets import fetch_california_housing
import pandas as pd
# Fetch the California housing dataset
california_housing = fetch_california_housing()
california_housing_df = pd.DataFrame(california_housing.data, columns=california_housing.feature_names)
california_housing_df['MEDHOUSEVAL'] = california_housing.target
california_housing_df.columns = california_housing_df.columns.str.upper()


In [11]:
input_df = session.create_dataframe(california_housing_df)
schema = input_df.schema
print(schema)

StructType([StructField('MEDINC', DoubleType(), nullable=True), StructField('HOUSEAGE', DoubleType(), nullable=True), StructField('AVEROOMS', DoubleType(), nullable=True), StructField('AVEBEDRMS', DoubleType(), nullable=True), StructField('POPULATION', DoubleType(), nullable=True), StructField('AVEOCCUP', DoubleType(), nullable=True), StructField('LATITUDE', DoubleType(), nullable=True), StructField('LONGITUDE', DoubleType(), nullable=True), StructField('MEDHOUSEVAL', DoubleType(), nullable=True)])


In [12]:
input_df.write.mode('overwrite').save_as_table('TRAIN')

In [13]:
df = session.table('TRAIN')
df.show()

--------------------------------------------------------------------------------------------------------------------------------------------------
|"MEDINC"  |"HOUSEAGE"  |"AVEROOMS"          |"AVEBEDRMS"         |"POPULATION"  |"AVEOCCUP"          |"LATITUDE"  |"LONGITUDE"  |"MEDHOUSEVAL"  |
--------------------------------------------------------------------------------------------------------------------------------------------------
|8.3252    |41.0        |6.984126984126984   |1.0238095238095237  |322.0         |2.5555555555555554  |37.88       |-122.23      |4.526          |
|8.3014    |21.0        |6.238137082601054   |0.9718804920913884  |2401.0        |2.109841827768014   |37.86       |-122.22      |3.585          |
|7.2574    |52.0        |8.288135593220339   |1.073446327683616   |496.0         |2.8022598870056497  |37.85       |-122.24      |3.521          |
|5.6431    |52.0        |5.8173515981735155  |1.0730593607305936  |558.0         |2.547945205479452   |37.85       |-1

## Stored Procedure for Training a Model with Scikit

In [14]:
def housing_model(
    session: Session, 
    features_table: str, 
    number_of_folds: int, 
    train_accuracy_threshold: float, 
    test_accuracy_threshold: float, 
    save_model: bool) -> Variant:
    
    import os

    from joblib import dump
    from sklearn.compose import ColumnTransformer
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import GridSearchCV, train_test_split
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import PolynomialFeatures, StandardScaler


    from snowflake.ml.modeling.pipeline import Pipeline
    from snowflake.ml.modeling.preprocessing import StandardScaler, OrdinalEncoder
    from snowflake.ml.modeling.impute import SimpleImputer
    from snowflake.ml.modeling.compose import ColumnTransformer

    from sklearn.compose import ColumnTransformer
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import GridSearchCV, train_test_split
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import PolynomialFeatures, StandardScaler

    # Load features
    df = session.table('TRAIN').to_pandas()

    # Preprocess the Numeric columns
    # We apply PolynomialFeatures and StandardScaler preprocessing steps to the numeric columns
    # NOTE: High degrees can cause overfitting.
    numeric_features = ['MEDINC','HOUSEAGE','AVEROOMS','AVEBEDRMS','AVEOCCUP','POPULATION','LATITUDE','LONGITUDE']
    numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

    # Combine the preprocessed step together using the Column Transformer module
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features)])

    # The next step is the integrate the features we just preprocessed with our Machine Learning algorithm to enable us to build a model
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),('classifier', LinearRegression())])
    parameteres = {}

    X = df.drop('MEDHOUSEVAL', axis = 1)
    y = df['MEDHOUSEVAL']

    # Split dataset into training and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

    # Use GridSearch to find the best fitting model based on number_of_folds folds
    model = GridSearchCV(pipeline, param_grid=parameteres, cv=number_of_folds)

    model.fit(X_train, y_train)
    train_r2_score = model.score(X_train, y_train)
    test_r2_score = model.score(X_test, y_test)

    model_saved = False

    if save_model:
        if train_r2_score >= train_accuracy_threshold and test_r2_score >= test_accuracy_threshold:
            # Upload trained model to a stage
            model_output_dir = '/tmp'
            model_file = os.path.join(model_output_dir, 'model.joblib')
            dump(model, model_file)
            session.file.put(model_file,"@PYTHON_MODELS",overwrite=True)
            model_saved = True

    # Return model R2 score on train and test data
    return {"R2 score on Train": train_r2_score,
            "R2 threshold on Train": train_accuracy_threshold,
            "R2 score on Test": test_r2_score,
            "R2 threshold on Test": test_accuracy_threshold,
            "Model saved": model_saved}

In [15]:
cross_validaton_folds = 3
train_accuracy_threshold = 0.85
test_accuracy_threshold = 0.85
save_model = False

housing_model(
    session,
    "TRAIN",
    cross_validaton_folds,
    train_accuracy_threshold,
    test_accuracy_threshold,
    save_model)

{'R2 score on Train': 0.6125511913966952,
 'R2 threshold on Train': 0.85,
 'R2 score on Test': 0.5757877060324508,
 'R2 threshold on Test': 0.85,
 'Model saved': False}

In [16]:
session.sproc.register(
    func=housing_model,
    name="Train_housing_model",
    packages=['snowflake-ml-python','scikit-learn','joblib'],
    is_permanent=True,
    stage_location="@MODELS", ## I created this stage
    replace=True)

<snowflake.snowpark.stored_procedure.StoredProcedure at 0x1686e4c10>

In [17]:
cross_validaton_folds = 3
train_accuracy_threshold = 0.85
test_accuracy_threshold = 0.85
save_model = False

print(session.call('Train_housing_model',
                    'TRAIN',
                    cross_validaton_folds,
                    train_accuracy_threshold,
                    test_accuracy_threshold,
                    save_model))

{
  "Model saved": false,
  "R2 score on Test": 5.757877060324507e-01,
  "R2 score on Train": 6.125511913966952e-01,
  "R2 threshold on Test": 8.500000000000000e-01,
  "R2 threshold on Train": 8.500000000000000e-01
}


## Stored Procedure for Training a Model with Pytorch

In [18]:
def torch_housing_model(
    session: Session, 
    features_table: str, 
    number_of_folds: int, 
    save_model: bool) -> Variant:
    
    import os

    from joblib import dump
    from sklearn.compose import ColumnTransformer
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import GridSearchCV, train_test_split
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import PolynomialFeatures, StandardScaler

    from snowflake.ml.modeling.pipeline import Pipeline
    from snowflake.ml.modeling.preprocessing import StandardScaler, OrdinalEncoder
    from snowflake.ml.modeling.impute import SimpleImputer
    from snowflake.ml.modeling.compose import ColumnTransformer

    from sklearn.compose import ColumnTransformer
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import GridSearchCV, train_test_split
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import PolynomialFeatures, StandardScaler

    from sklearn.preprocessing import StandardScaler
    import torch
    import torch.nn as nn
    import torch.optim as optim

    # Load features
    df = session.table('TRAIN').to_pandas()
    df = df.head(100) 

    # Preprocess the Numeric columns
    # We apply PolynomialFeatures and StandardScaler preprocessing steps to the numeric columns
    # NOTE: High degrees can cause overfitting.
    numeric_features = ['MEDINC','HOUSEAGE','AVEROOMS','AVEBEDRMS','AVEOCCUP','POPULATION','LATITUDE','LONGITUDE']
    numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

    # Combine the preprocessed step together using the Column Transformer module
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features)])

    # The next step is the integrate the features we just preprocessed with our Machine Learning algorithm to enable us to build a model
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),('classifier', LinearRegression())])
    parameteres = {}

    X = df.drop('MEDHOUSEVAL', axis = 1)
    y = df['MEDHOUSEVAL']

    # Split dataset into training and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

    # Standardize the data
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Convert to PyTorch tensors
    X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.float32).view(-1, 1)
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
    y_test_tensor = torch.tensor(y_test.to_numpy(), dtype=torch.float32).view(-1, 1)

    # Define the smaller neural network model
    class SmallerHousingModel(nn.Module):
        def __init__(self):
            super(SmallerHousingModel, self).__init__()
            self.fc1 = nn.Linear(8, 64)
            self.fc2 = nn.Linear(64, 32)
            self.fc3 = nn.Linear(32, 1)
            self.dropout = nn.Dropout(0.3)
            self.relu = nn.ReLU()
        
        def forward(self, x):
            x = self.relu(self.fc1(x))
            x = self.dropout(x)
            x = self.relu(self.fc2(x))
            x = self.dropout(x)
            x = self.fc3(x)
            return x

    # Instantiate the model, define the loss function and the optimizer
    model = SmallerHousingModel()
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Training loop
    num_epochs = 10
    for epoch in range(num_epochs):
        model.train()
        
        optimizer.zero_grad()
        outputs = model(X_train_tensor)
        loss = criterion(outputs, y_train_tensor)
        
        loss.backward()
        optimizer.step()
        
        if (epoch+1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

    model.eval()
    with torch.no_grad():
        test_outputs = model(X_test_tensor)
        test_loss = criterion(test_outputs, y_test_tensor)
        test_loss_float = test_loss.item()  #
        print(f'Test Loss: {test_loss_float:.4f}')
        
    model_saved = False
    test_loss = 0.2 #   if save_model:
 #         # Upload trained model to a stage
 #           model_output_dir = '/tmp'
 #           model_file = os.path.join(model_output_dir, 'model.joblib')
 #           dump(model, model_file)
 #           session.file.put(model_file,"@PYTHON_MODELS",overwrite=True)
  #          model_saved = True

    # Return model R2 score on train and test data
    return {"R2 score on Train": test_loss_float,
        "Model saved": model_saved}

In [19]:
cross_validaton_folds = 3
save_model = False

torch_housing_model(
    session,
    "TRAIN",
    cross_validaton_folds,
    save_model)

Epoch [10/10], Loss: 2.9312
Test Loss: 3.8128


{'R2 score on Train': 3.8128409385681152, 'Model saved': False}

In [20]:
session.sproc.register(
    func=torch_housing_model,
    name="torch_housing_model",
    packages=['snowflake-ml-python','scikit-learn','joblib','pytorch'],
    is_permanent=True,
    stage_location="@MODELS", ## I created this stage
    replace=True)

<snowflake.snowpark.stored_procedure.StoredProcedure at 0x3041a88d0>

In [21]:
cross_validaton_folds = 3
save_model = False

print(session.call('torch_housing_model',
                    'TRAIN',
                    cross_validaton_folds,
                    save_model))

{
  "Model saved": false,
  "R2 score on Train": 2.532326459884644e+00
}
