In [1]:
import json
import numpy as np
import pandas as pd
import os
import sys
import time

from snowflake.snowpark.session import Session
from snowflake.snowpark.functions import sproc, col
import snowflake.snowpark.functions as F
import snowflake.snowpark.types as T

from snowflake.snowpark.types import PandasDataFrameType, IntegerType, StringType, FloatType, Variant
from snowflake.snowpark.exceptions import SnowparkSQLException

import torch
import torch.distributed as dist
from torch import nn
import torch.optim as optim
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader
import os
import torch.distributed as dist
from snowflake.ml.fileset import fileset



In [2]:
# Reading Snowflake Connection Details
# The credentials file can be relocated by setting SNOWFLAKE_CREDS_PATH; the
# original location is kept as the default so existing setups keep working.
creds_path = os.environ.get("SNOWFLAKE_CREDS_PATH", "/Users/mitaylor/Documents/creds/creds.json")
# A context manager closes the file handle as soon as the JSON is parsed.
with open(creds_path) as creds_file:
    snowflake_connection_cfg = json.load(creds_file)

# Creating Snowpark Session
session = Session.builder.configs(snowflake_connection_cfg).create()

# Create a fresh database, then the stages and warehouse used by the rest of the notebook.
# CREATE OR REPLACE drops any existing object of the same name.
session.sql("CREATE OR REPLACE DATABASE PYTORCH_DEMO").collect()
session.sql('''CREATE OR REPLACE STAGE UDF_STAGE''').collect()
session.sql('''CREATE OR REPLACE STAGE FILESET_DEMO
  DIRECTORY = ( ENABLE = true )
  encryption=(type='SNOWFLAKE_SSE')''').collect()

session.sql("CREATE OR REPLACE WAREHOUSE ASYNC_WH WITH WAREHOUSE_SIZE='X-SMALL'").collect()

[Row(status='Warehouse ASYNC_WH successfully created.')]

# 1. Create a Data Set

## 1.1 Load some arbitrary data into Snowflake

In [3]:
from sklearn.datasets import make_classification

# Synthetic binary-classification data: 100,000 rows with 10 float32 features.
# random_state pins the generator so a fresh kernel run uploads the same table.
columns = [str(i) for i in range(10)]
feature_cols = ["COL" + name for name in columns]
X, y = make_classification(n_samples=100000, n_features=10, n_classes=2, random_state=0)
X = np.array(X, dtype=np.float32)

# Build the frame with its final column names instead of renaming it afterwards.
df = pd.DataFrame(X, columns=feature_cols)
df['Y'] = y
session.write_pandas(df, table_name='DUMMY_DATASET', auto_create_table=True, overwrite=True)

<snowflake.snowpark.table.Table at 0x7fc260f2f6d0>

## 1.2 Create a Fileset Snapshot

In [4]:
# Split the uploaded table 80/20 with a fixed seed and persist each part as its own table.
sdf = session.table('DUMMY_DATASET')
train_sdf, test_sdf = sdf.random_split(weights=[0.8, 0.2], seed=0)
for split_sdf, split_table in (
    (train_sdf, 'DUMMY_DATASET_TRAIN'),
    (test_sdf, 'DUMMY_DATASET_TEST'),
):
    split_sdf.write.mode('overwrite').save_as_table(split_table)

In [5]:
FS_STAGE_NAME = "FILESET_DEMO"


def _snapshot_to_fileset(fileset_name, snowpark_df):
    """Materialize `snowpark_df` as a shuffled FileSet named `fileset_name` on the demo stage."""
    # The stage location is resolved on every call from the session's current database/schema.
    stage_loc = f"@{session.get_current_database()}.{session.get_current_schema()}.{FS_STAGE_NAME}/"
    return fileset.FileSet.make(
        target_stage_loc=stage_loc,
        name=fileset_name,
        snowpark_dataframe=snowpark_df,
        shuffle=True,
    )


fileset_train_sdf = _snapshot_to_fileset("DUMMY_FILESET_TRAIN", train_sdf)

fileset_test_sdf = _snapshot_to_fileset("DUMMY_FILESET_TEST", test_sdf)

FileSet.files() is in private preview since 0.2.0. Do not use it in production. 
SFFileSystem.ls() is in private preview since 0.2.0. Do not use it in production. 


## 1.3 Get the FileSet locally

In [6]:
# Download both FileSet snapshots from the stage into the local data directory.
LOCAL_DATA_URL = "file:///Users/mitaylor/Documents/GitHub/AA Cleaned Repos/simple-pytorch-example/data"
session.sql(f"GET @FILESET_DEMO/DUMMY_FILESET_TRAIN '{LOCAL_DATA_URL}/train' ").collect()
session.sql(f"GET @FILESET_DEMO/DUMMY_FILESET_TEST '{LOCAL_DATA_URL}/test' ").collect()

[Row(file='data_01b403f2-0000-e099-0000-f14900b1365e_016_1_0.snappy.parquet', size=1187446, status='DOWNLOADED', message='')]

# 2. Build Neural Net in PyTorch

## 2.1 Define, Train, and Register the Model in a Stored Procedure

In [35]:
def sproc_training(session: Session) -> Variant:
    """Train a small PyTorch network on the DUMMY_FILESET_TRAIN FileSet and log it to the model registry.

    The function is meant to be registered as a Snowpark stored procedure, so
    every module it uses is imported inside the body rather than relying on
    notebook-level imports.

    Args:
        session: Snowpark session used to open the FileSet and the model registry.

    Returns:
        A status string that embeds the summed per-batch loss of the final epoch.

    Raises:
        ValueError: If the training data loader yields no batches.
    """
    import os
    import time

    import torch
    import torch.distributed as dist
    from torch import nn
    import torch.optim as optim
    from torch.nn.parallel import DistributedDataParallel as DDP
    from torch.utils.data import DataLoader
    from snowflake.ml.fileset import fileset

    # Feature columns COL0..COL9, in the order they are stacked into the model input.
    feature_names = [f"COL{i}" for i in range(10)]

    def get_batch(batch):
        """Stack the feature columns of `batch` side by side into one input tensor."""
        return torch.column_stack([batch[name] for name in feature_names])

    class MyModel(nn.Module):
        """Two-layer fully connected network: 10 inputs -> 10 hidden units -> 1 output."""

        def __init__(self):
            super().__init__()
            # NOTE(review): the trailing ReLU clamps predictions to be non-negative;
            # for a 0/1 target a sigmoid output is more typical - confirm intent.
            self.model = nn.Sequential(
                nn.Linear(10, 10),
                nn.ReLU(),
                nn.Linear(10, 1),
                nn.ReLU(),
            )

        def forward(self, tensor: torch.Tensor):
            return self.model(tensor)

    def train_model(loader):
        """Run the SGD training loop over `loader`.

        Returns:
            A tuple of (trained model, last input batch seen, summed per-batch
            loss of the final epoch).

        Raises:
            ValueError: If `loader` yields no batches.
        """
        n_epochs = 5
        device = 'cpu'  # only referenced by the disabled DDP setup below

        # Define model & training params
        model = MyModel()

        #########
        # # Distributed Data Parallel wrapper which will take care of model weights averaging and syncing
        # # This works for the case where the model weights can fit in a single CPU/GPU but the data is too large and can be split
        # os.environ['MASTER_ADDR'] = 'localhost'
        # os.environ['MASTER_PORT'] = '12355'
        # dist.init_process_group("gloo", rank=1, world_size=4) # Use NCCL backend for distributed GPU training
        # model = model.to(device)
        # model = DDP(model, device_ids=[device], output_device=device)
        #########

        loss_fn = nn.MSELoss()
        optimizer = optim.SGD(model.parameters(), lr=0.1)
        start_time = time.time()

        # Stays None until the first batch arrives, which lets us detect an empty loader.
        X_batch = None

        # Training step
        for epoch in range(n_epochs):
            # Accumulates the loss of every batch in this epoch (a sum, not a mean).
            current_loss = 0.0
            for batch in loader:
                X_batch = get_batch(batch)
                y_batch = torch.column_stack((batch["Y"],))

                # forward pass
                y_pred = model(X_batch)

                # compute loss
                loss = loss_fn(y_pred.float(), y_batch.float())

                # backward pass
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                current_loss += loss.item()

            if X_batch is None:
                # Without this guard the return statement would fail with an
                # UnboundLocalError that hides the real problem.
                raise ValueError("Training data loader produced no batches.")

            # With n_epochs = 5 this condition is only true for epoch 0.
            if epoch % 10 == 0:
                print(f"Loss after epoch {epoch}: {current_loss}")
                for param in model.parameters():
                    print(param.data)

        end_time = time.time()
        print('Model training complete.')
        print(f'Training time: {end_time-start_time}')
        return model, X_batch, current_loss

    # Use FileSet to get data from a Snowflake table in the form of files in an internal server-side encrypted stage
    fileset_train_df = fileset.FileSet(
        target_stage_loc="@PYTORCH_DEMO.PUBLIC.FILESET_DEMO/",
        name="DUMMY_FILESET_TRAIN",
        snowpark_session=session,
    )

    pipe = fileset_train_df.to_torch_datapipe(
        batch_size=16,
        shuffle=True,
        drop_last_batch=True)
    # batch_size=None: the datapipe already yields whole batches, so the DataLoader must not re-batch.
    loader = DataLoader(pipe, batch_size=None, num_workers=0)

    model, X_batch, current_loss = train_model(loader)

    # Register the Model
    from snowflake.ml.registry import registry
    REGISTRY_DATABASE_NAME = "PYTORCH_DEMO"
    REGISTRY_SCHEMA_NAME = "PUBLIC"
    native_registry = registry.Registry(
        session=session,
        database_name=REGISTRY_DATABASE_NAME,
        schema_name=REGISTRY_SCHEMA_NAME)
    # The last training batch is passed as the sample input for the logged model.
    native_registry.log_model(
        model,
        model_name="torchModelSProc",
        version_name="v3",
        sample_input_data=[X_batch],)

    result = f"training complete, model loss: {current_loss}"
    return result

# Register the training function as a permanent stored procedure, then invoke it.
SPROC_PACKAGES = [
    'snowflake-snowpark-python',
    'pytorch',
    'snowflake-ml-python',
    'cryptography',
    'torchdata',
]
# The name is rebound from the local Python function to the registered procedure handle.
sproc_training = session.sproc.register(
    func=sproc_training,
    name='YOUR_SPROC_NAME',
    is_permanent=True,
    replace=True,
    stage_location='@UDF_STAGE',
    packages=SPROC_PACKAGES,
)

sproc_training()



'"training complete, model loss: 283.8899168477219"'