In [1]:
import json
import numpy as np
import pandas as pd
import os
import sys
import time

from snowflake.snowpark.session import Session
from snowflake.snowpark.functions import sproc, col
import snowflake.snowpark.functions as F
import snowflake.snowpark.types as T

from snowflake.snowpark.types import PandasDataFrameType, IntegerType, StringType, FloatType, Variant
from snowflake.snowpark.exceptions import SnowparkSQLException

import torch
import torch.distributed as dist
from torch import nn
import torch.optim as optim
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader
import os
import torch.distributed as dist
from snowflake.ml.fileset import fileset



In [2]:
# Reading Snowflake Connection Details
# The location of the credentials JSON can be overridden with the
# SNOWFLAKE_CREDS_PATH environment variable; the original path is the default.
creds_path = os.environ.get("SNOWFLAKE_CREDS_PATH", "/Users/mitaylor/Documents/creds/creds.json")
# A context manager guarantees the file handle is closed after parsing.
with open(creds_path) as creds_file:
    snowflake_connection_cfg = json.load(creds_file)

# Creating Snowpark Session
session = Session.builder.configs(snowflake_connection_cfg).create()

# Create a fresh database, the two stages and the warehouse used by this demo.
# NOTE: every statement is CREATE OR REPLACE, so re-running this cell replaces
# any existing object with the same name.
session.sql("CREATE OR REPLACE DATABASE PYTORCH_DEMO").collect()
session.sql('''CREATE OR REPLACE STAGE UDF_STAGE''').collect()
session.sql('''CREATE OR REPLACE STAGE FILESET_DEMO
  DIRECTORY = ( ENABLE = true )
  encryption=(type='SNOWFLAKE_SSE')''').collect()

session.sql("CREATE OR REPLACE WAREHOUSE ASYNC_WH WITH WAREHOUSE_SIZE='X-SMALL'").collect()

[Row(status='Warehouse ASYNC_WH successfully created.')]

# 1. Create a Data Set

## 1.1 Load some arbitrary data into Snowflake

In [3]:
from sklearn.datasets import make_classification
import pandas as pd

# Fixed seed so the synthetic dataset (and everything trained on it) is
# reproducible across kernel restarts.
DATASET_SEED = 0

# Ten feature columns named "0".."9", later prefixed to COL0..COL9.
columns = [str(i) for i in range(0,10)]
X,y = make_classification(
    n_samples=100000,
    n_features=10,
    n_classes=2,
    random_state=DATASET_SEED,
)
# float32 matches the default dtype of the PyTorch layers used later on.
X = np.array(X, dtype=np.float32)
df = pd.DataFrame(X, columns=columns)
feature_cols = ["COL" + i for i in df.columns]
df.columns = feature_cols
df['Y'] = y
# Upload the frame (features + label column Y), overwriting any previous table.
session.write_pandas(df, table_name='DUMMY_DATASET', auto_create_table=True, overwrite=True)

<snowflake.snowpark.table.Table at 0x7fe578171780>

## 1.2 Create a Fileset Snapshot

In [4]:
# Split the uploaded table 80/20 with a fixed seed and persist both parts.
sdf = session.table('DUMMY_DATASET')
train_sdf, test_sdf = sdf.random_split(weights=[0.8, 0.2], seed=0)

for split_sdf, split_table in (
    (train_sdf, 'DUMMY_DATASET_TRAIN'),
    (test_sdf, 'DUMMY_DATASET_TEST'),
):
    split_sdf.write.mode('overwrite').save_as_table(split_table)

In [5]:
FS_STAGE_NAME = "FILESET_DEMO"

# Both FileSets are written below the same stage in the current database/schema.
fs_database = session.get_current_database()
fs_schema = session.get_current_schema()
fs_stage_loc = f"@{fs_database}.{fs_schema}.{FS_STAGE_NAME}/"

# Snapshot of the training split.
fileset_train_sdf = fileset.FileSet.make(
    target_stage_loc=fs_stage_loc,
    name="DUMMY_FILESET_TRAIN",
    snowpark_dataframe=train_sdf,
    shuffle=True,
)

# Snapshot of the test split.
fileset_test_sdf = fileset.FileSet.make(
    target_stage_loc=fs_stage_loc,
    name="DUMMY_FILESET_TEST",
    snowpark_dataframe=test_sdf,
    shuffle=True,
)

FileSet.files() is in private preview since 0.2.0. Do not use it in production. 
SFFileSystem.ls() is in private preview since 0.2.0. Do not use it in production. 


## 1.3 Get the FileSet locally

In [6]:
# Local directory that receives the downloaded FileSet files. It can be
# overridden with the PYTORCH_DEMO_DATA_DIR environment variable; the default is
# the original hard-coded location, so the generated SQL is unchanged by default.
LOCAL_DATA_DIR = os.environ.get(
    "PYTORCH_DEMO_DATA_DIR",
    "/Users/mitaylor/Documents/GitHub/AA Cleaned Repos/simple-pytorch-example/data",
)
# The path is embedded in a single-quoted SQL literal, so a quote would break it.
if "'" in LOCAL_DATA_DIR:
    raise ValueError(f"LOCAL_DATA_DIR must not contain a single quote: {LOCAL_DATA_DIR!r}")

# Download the train and test FileSet files from the stage into train/ and test/.
session.sql(f"GET @FILESET_DEMO/DUMMY_FILESET_TRAIN 'file://{LOCAL_DATA_DIR}/train' ").collect()
session.sql(f"GET @FILESET_DEMO/DUMMY_FILESET_TEST 'file://{LOCAL_DATA_DIR}/test' ").collect()

[Row(file='data_01b3fede-0000-e048-0000-f14900afc156_016_1_0.snappy.parquet', size=1187494, status='DOWNLOADED', message='')]

# 2. Build Neural Net In PyTorch

## 2.1 Prep the Data

In [7]:
# Wrap the NumPy feature matrix and the label vector as PyTorch tensors.
X_tens, y_tens = torch.tensor(X), torch.tensor(y)

In [8]:
# X_tens / y_tens are already tensors: take independent copies and reshape the
# labels into a column so they line up with the model's (N, 1) output.
X_tens = X_tens.clone()
y_tens = y_tens.clone().reshape(-1, 1)

# One (features, label) pair per sample, served in shuffled mini-batches of 16.
training_pairs = list(zip(X_tens, y_tens))
loader = DataLoader(training_pairs, shuffle=True, batch_size=16)


In [9]:
class MyModel(nn.Module):
    """Small feed-forward network: 10 inputs -> 10 hidden units -> 1 output.

    A ReLU follows each of the two linear layers, so outputs are never negative.
    """

    def __init__(self):
        super().__init__()
        # Layers are created in this order so parameter names stay
        # model.0.* (hidden layer) and model.2.* (output layer).
        layers = [
            nn.Linear(10, 10),
            nn.ReLU(),
            nn.Linear(10, 1),
            nn.ReLU(),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, tensor:torch.Tensor):
        """Apply the sequential stack to ``tensor`` and return the result."""
        network = self.model
        return network(tensor)

# 3. Train the Neural Network

In [10]:
def train_model(n_epochs=5, lr=0.1, device='cpu', log_every=10, data_loader=None, model=None):
    """Train a model with SGD and MSE loss and return it.

    Called without arguments this behaves like the original cell: it trains a
    fresh ``MyModel`` on the notebook-level ``loader`` for 5 epochs on the CPU.

    Args:
        n_epochs: Number of passes over ``data_loader``.
        lr: Learning rate handed to ``optim.SGD``.
        device: Device the model and each batch are moved to.
        log_every: Print the summed epoch loss and all parameters whenever
            ``epoch % log_every == 0``. ``0`` or ``None`` disables the logging.
        data_loader: Iterable yielding ``(features, targets)`` batches.
            Defaults to the notebook-level ``loader``.
        model: Module to train. Defaults to a new ``MyModel()``.

    Returns:
        The trained model (moved to ``device``).
    """
    # Fall back to the notebook globals only when the caller supplied nothing.
    if data_loader is None:
        data_loader = loader
    if model is None:
        model = MyModel()

    model = model.to(device)
    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(model.parameters(), lr=lr)
    start_time = time.time()

    # Training step
    for epoch in range(n_epochs):
        # Sum (not mean) of the per-batch losses for this epoch.
        current_loss = 0.0

        for features, targets in data_loader:
            X_batch, y_batch = features.to(device), targets.to(device)

            # forward pass
            y_pred = model(X_batch)

            # compute loss; both sides are cast to float because the labels
            # may arrive as integers
            loss = loss_fn(y_pred.float(), y_batch.float())

            # backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            current_loss += loss.item()

        # The truthiness check prevents a ZeroDivisionError when logging is off.
        if log_every and epoch % log_every == 0:
            print(f"Loss after epoch {epoch}: {current_loss}")
            for param in model.parameters():
                print(param.data)

    end_time = time.time()
    print('Model training complete.')
    print(f'Training time: {end_time-start_time}')
    return model

In [11]:
# Train with the settings defined inside train_model and keep the returned
# model; it is logged to the registry further down.
model = train_model()

Loss after epoch 0: 485.9050761987455
tensor([[-1.5332e-02, -4.2688e-02, -4.2648e-01, -2.4796e-02,  2.5162e-02,
          2.4555e-02,  5.9491e-01, -3.6272e-02,  3.9479e-02,  1.0484e-01],
        [-4.0024e-01, -8.4230e-03, -4.1975e-01, -3.4622e-02,  2.0299e-02,
          3.9257e-02,  1.6368e-02,  2.1771e-02, -2.3260e-02,  1.4093e-01],
        [-4.3346e-01, -8.6338e-02,  2.0438e-01, -5.8506e-03,  1.2677e-01,
         -7.5516e-02, -1.7939e-01,  1.4602e-01,  6.1440e-02,  1.5228e-01],
        [ 3.0335e-01,  4.6633e-02, -1.3300e-01, -5.7427e-02,  7.2116e-04,
          3.3467e-02, -2.2827e-02, -1.4657e-02, -1.9141e-02, -3.0798e-01],
        [ 1.0250e-01,  1.3639e-01,  4.7008e-02,  2.1274e-03, -6.2882e-02,
          8.1276e-02, -2.9430e-01, -9.4658e-02, -7.4674e-02, -8.9625e-02],
        [ 1.2783e-01,  7.4283e-03,  5.1467e-01, -1.3684e-02,  1.3764e-02,
         -1.7808e-02, -8.1767e-01,  1.2480e-02,  4.6685e-03,  1.0464e-01],
        [ 5.3483e-01,  1.7646e-02,  2.4456e-01, -7.6937e-03, -2.2919

# 4. Deploy model (into Registry, then into a UDF)

In [12]:
from snowflake.ml.registry import registry

# Database and schema in which the model registry lives.
REGISTRY_DATABASE_NAME = "PYTORCH_DEMO"
REGISTRY_SCHEMA_NAME = "PUBLIC"

# Registry handle bound to the current Snowpark session.
native_registry = registry.Registry(
    session=session,
    database_name=REGISTRY_DATABASE_NAME,
    schema_name=REGISTRY_SCHEMA_NAME,
)

In [13]:
# Log the trained network to the registry as version "v1" of "torchModel".
# NOTE(review): sample_input_data is presumably what the registry uses to infer
# the model signature shown by show_functions() — confirm against the
# snowflake-ml documentation.
model_ref = native_registry.log_model(
    model,
    model_name="torchModel",
    version_name="v1",
    sample_input_data=[X_tens],
)

  return next(self.gen)


In [14]:
# List the functions (name, target method, signature) of the logged model version.
model_ref.show_functions()

[{'name': 'FORWARD',
  'target_method': 'forward',
  'signature': ModelSignature(
                      inputs=[
                          FeatureSpec(dtype=DataType.FLOAT, name='input_feature_0', shape=(10,))
                      ],
                      outputs=[
                          FeatureSpec(dtype=DataType.FLOAT, name='output_feature_0', shape=(1,))
                      ]
                  )}]

In [15]:
# Score the in-memory feature tensor through the logged model version; the
# result is displayed as a frame with an output_feature_0 column.
model_ref.run([X_tens])

Unnamed: 0,output_feature_0
0,[0.9778361320495605]
1,[0.9840174913406372]
2,[0.5009428262710571]
3,[0]
4,[0]
...,...
99995,[0.8472687005996704]
99996,[1.0193544626235962]
99997,[0]
99998,[0]


In [23]:
# Pull the ten feature columns and pack them into one array column whose name
# matches the model signature's input ("input_feature_0").
feature_query = "select COL0, COL1, COL2, COL3, COL4, COL5, COL6, COL7, COL8, COL9 FROM DUMMY_DATASET"
input_data_df = session.sql(feature_query).with_column(
    '"input_feature_0"',
    F.array_construct('*'),
)

# Preview a single row.
input_data_df.limit(1).to_pandas()

Unnamed: 0,COL0,COL1,COL2,COL3,COL4,COL5,COL6,COL7,COL8,COL9,input_feature_0
0,1.393693,0.851332,0.751124,-0.400523,0.545574,0.103497,1.151555,-0.023779,0.024589,-0.319839,"[\n 1.393693327903748e+00,\n 8.5133218765258..."


In [25]:
# Score the Snowpark DataFrame with the logged model; the result keeps the
# input columns and adds an output_feature_0 column (see the preview below).
predictions_df = model_ref.run(input_data_df)
# Preview the first three scored rows as a pandas frame.
predictions_df.limit(3).to_pandas()

Unnamed: 0,COL0,COL1,COL2,COL3,COL4,COL5,COL6,COL7,COL8,COL9,input_feature_0,output_feature_0
0,1.393693,0.851332,0.751124,-0.400523,0.545574,0.103497,1.151555,-0.023779,0.024589,-0.319839,"[\n 1.393693327903748e+00,\n 8.5133218765258...",[\n 0.9778361320495605\n]
1,2.148875,2.221024,1.557,0.29507,-0.297229,0.200594,1.314233,0.361357,-0.862365,-0.578818,"[\n 2.148874759674072e+00,\n 2.2210237979888...",[\n 0.9840174913406372\n]
2,0.445368,1.203671,0.711755,1.543558,-0.279149,0.710193,-0.17756,0.044377,-1.993368,-0.203527,"[\n 4.453682005405426e-01,\n 1.2036712169647...",[\n 0.5009428262710571\n]


# 5. Run it on a Fileset in Snowflake

# Run the next cells in a stored procedure (sproc) or UDF for server-side inference

In [32]:
# Use FileSet to get data from a Snowflake table in the form of files in an
# internal, server-side encrypted stage. This re-opens the existing
# DUMMY_FILESET_TEST by name rather than creating a new snapshot.
from torch.utils.data import DataLoader

STAGE_NAME = "FILESET_DEMO"
current_database = session.get_current_database()
current_schema = session.get_current_schema()

fileset_test_df = fileset.FileSet(
    target_stage_loc=f"@{current_database}.{current_schema}.{STAGE_NAME}/",
    name="DUMMY_FILESET_TEST",
    snowpark_session=session,
)




In [34]:
def get_batch(batch, columns=None):
    """Stack per-column tensors from ``batch`` into one 2-D feature tensor.

    Args:
        batch: Mapping from column name to a tensor holding that column's
            values for the batch.
        columns: Names of the columns to stack, in output order. Defaults to
            ``COL0`` .. ``COL9``, the ten feature columns of the demo dataset.

    Returns:
        Tensor produced by ``torch.column_stack`` with one column per entry in
        ``columns``.

    Raises:
        KeyError: If a requested column is missing from ``batch``.
    """
    if columns is None:
        columns = [f"COL{i}" for i in range(10)]

    X_batch = torch.column_stack(tuple(batch[name] for name in columns))

    return X_batch

In [38]:
# Stream the test FileSet as shuffled batches of 16 rows, dropping a short last batch.
pipe = fileset_test_df.to_torch_datapipe(batch_size=16, shuffle=True, drop_last_batch=True)

# The datapipe already yields whole batches, hence batch_size=None on the loader.
test_loader = DataLoader(pipe, batch_size=None, num_workers=0)

# Score only the first batch, then stop.
for batch in test_loader:
    X_batch = get_batch(batch)
    model_ref.run([X_batch])
    break

In [39]:
# Score the batch kept from the loop above again so that the predictions are
# displayed as this cell's output.
model_ref.run([X_batch])

Unnamed: 0,output_feature_0
0,[0.9835563898086548]
1,[0]
2,[0.840322732925415]
3,[1.0599533319473267]
4,[0.9055103063583374]
5,[1.0938479900360107]
6,[0]
7,[0.9257052540779114]
8,[0]
9,[0.9871610403060913]
