In [1]:
import pickle
import random
import string
from typing import List

from snowflake.ml.transformers.preprocessing import standard_scaler, ordinal_encoder
from snowflake.ml.transformers import pipeline
from snowflake.ml.utils import connection_params
#from snowflake.ml.fileset import fileset
from snowflake import snowpark
from snowflake.snowpark import DataFrame
from snowflake.snowpark import functions as F
from snowflake.snowpark.functions import col, iff, cast, row_number
import pandas as pd
import pyarrow as pa
import os

In [2]:
# Snowflake connection settings. The password is never stored in the notebook:
# it is read from the SNOWFLAKE_TEMP_PASSWORD environment variable, so this
# cell raises KeyError if that variable is not set.
connection_parameters = {
    "account": "VUA92284",
    "user": "snowflake_nvidia",
    "password": os.environ['SNOWFLAKE_TEMP_PASSWORD'],
    # optional
    "role": "SNOWFLAKE_NVIDIA",
    # medium snowpark-optimized
    "warehouse": "SNOWFLAKE_NVIDIA",
    "database": "SNOWFLAKE_NVIDIA",
    "schema": "PUBLIC",
}

# Open the Snowpark session used by every later cell.
session = (
    snowpark.Session.builder
    .configs(connection_parameters)
    .create()
)

In [25]:
# Criteo dataset column names.

_NUMERICAL_FEATURES = [f"I{i}" for i in range(1, 13 + 1)]  # I1 .. I13
_CATEGORICAL_FEATURES = [f"C{i}" for i in range(1, 26 + 1)]  # C1 .. C26
_LABEL = "LABEL"

# Work on a sample of the data until training is known to work end to end.
_SAMPLE_SIZE = 1_000_000

In [37]:
# "Featurization"
# In reality, the SQL below would be much more complicated and would involve joins.

# Fixed seed so that the train/eval/test assignment does not change between runs.
_SPLIT_SEED = 42

# Columns to read, in order: label first, then numerical, then categorical features.
select_columns = [_LABEL] + _NUMERICAL_FEATURES + _CATEGORICAL_FEATURES

# A falsy _SAMPLE_SIZE (None or 0) drops the LIMIT clause and reads the whole table.
# NOTE: LIMIT without ORDER BY does not pin down which rows are returned, so the
# sample itself may still differ between runs even though the split is seeded.
limit = f"limit {_SAMPLE_SIZE}" if _SAMPLE_SIZE else ""
all_data = session.sql(f'''select {','.join(select_columns)} from "criteo_day_0" {limit}''')

# 80% train / 10% eval / 10% test.
train_data, eval_data, test_data = all_data.random_split(
    [0.8, 0.1, 0.1], seed=_SPLIT_SEED
)

In [35]:
# Preview the first five rows; limiting before to_pandas() keeps the conversion small.
all_data.limit(5).to_pandas()


Unnamed: 0,LABEL,I1,I2,I3,I4,I5,I6,I7,I8,I9,I10,I11,I12,I13
0,0,1.0,280.0,2.0,20.0,10.0,0.0,0.0,0,2,0.0,4.0,4036.0,2.0
1,0,,17.0,,5.0,1.0,,0.0,-1,0,,1.0,20314.0,
2,0,4.0,315.0,7.0,8.0,10.0,5.0,0.0,90,8,1.0,4.0,276.0,8.0
3,0,2.0,1596.0,7.0,99.0,12.0,0.0,0.0,0,19,0.0,8.0,2764.0,7.0
4,0,1.0,3.0,,,,,,2,4,,,1234.0,


In [40]:
# Feature engineering functions
def fit_transformers(data_df: DataFrame) -> pipeline.Pipeline:
    """Build the two-step preprocessing pipeline and fit it on ``data_df``.

    Steps, in order:

    1. ``scale_to_std`` -- a ``StandardScaler`` over ``_NUMERICAL_FEATURES``
       that writes its results to ``scaled_<feature>`` columns.
    2. ``vocab`` -- an ``OrdinalEncoder`` over ``_CATEGORICAL_FEATURES`` that
       writes its results to ``vocab_<feature>`` columns. Missing values and
       unknown categories are both encoded as ``-1``.

    Args:
        data_df: Snowpark DataFrame to fit the pipeline on.

    Returns:
        The result of ``Pipeline.fit(data_df)``.
    """
    scaler = standard_scaler.StandardScaler(
        input_cols=_NUMERICAL_FEATURES,
        output_cols=[f"scaled_{f}" for f in _NUMERICAL_FEATURES],
    )
    encoder = ordinal_encoder.OrdinalEncoder(
        input_cols=_CATEGORICAL_FEATURES,
        output_cols=[f"vocab_{f}" for f in _CATEGORICAL_FEATURES],
        encoded_missing_value=-1,
        unknown_value=-1,
        handle_unknown="use_encoded_value",
    )

    steps = [("scale_to_std", scaler), ("vocab", encoder)]
    return pipeline.Pipeline(steps).fit(data_df)


def preprocess_with_transformers(
    fitted_pipeline: pipeline.Pipeline, data_df: DataFrame
) -> DataFrame:
    """Apply a fitted pipeline to ``data_df`` and tidy up the result.

    The transformed ``scaled_<feature>`` / ``vocab_<feature>`` columns are
    renamed back to the original feature names, nulls are filled with 0,
    the ``-1`` code in the categorical columns is replaced by the size of
    that feature's vocabulary, categorical columns are cast to int and the
    label is cast to float.

    Args:
        fitted_pipeline: Pipeline whose second step (index 1) exposes a
            ``categories_`` mapping keyed by categorical feature name.
        data_df: Snowpark DataFrame to transform.

    Returns:
        A DataFrame with the categorical features first, then the numerical
        features, then the label.
    """
    transformed = fitted_pipeline.transform(data_df)

    # Keep only the transformed columns, under the original feature names.
    scaled_columns = [col(f"scaled_{f}").alias(f) for f in _NUMERICAL_FEATURES]
    vocab_columns = [col(f"vocab_{f}").alias(f) for f in _CATEGORICAL_FEATURES]
    renamed = transformed.select(scaled_columns + vocab_columns + [_LABEL])

    # The steps below are either not implemented by the transformers or have
    # to be worked around by hand.
    filled = renamed.na.fill(0)

    encoder = fitted_pipeline.steps[1][1]
    categorical_columns = []
    for f in _CATEGORICAL_FEATURES:
        # 1. Replace the -1 code with n_vocab, since 0..n_vocab-1 are already
        #    taken by real categories.
        n_vocab = len(encoder.categories_[f])
        remapped = iff(col(f) == -1.0, n_vocab, col(f))
        # 2. Cast the vocab value to int (it is a float coming out of the encoder).
        categorical_columns.append(cast(remapped, "int").alias(f))

    label_column = cast(_LABEL, "float").alias(_LABEL)
    return filled.select(
        categorical_columns + _NUMERICAL_FEATURES + [label_column]
    )

In [41]:
# Fit using only the training split; the same fitted pipeline is then meant to
# transform all three splits.
p = fit_transformers(train_data)

# NOTE(review): the recorded output of this cell is a SnowparkSQLException
# ("invalid identifier") raised during the fit, which is presumably why the
# transform calls below are commented out -- re-enable them once the fit succeeds.
#preprocessed_train_data = preprocess_with_transformers(p, train_data)
#preprocessed_eval_data = preprocess_with_transformers(p, eval_data)
#preprocessed_test_data = preprocess_with_transformers(p, test_data)

Failed to execute query [queryID: 01aa9f79-0402-9b75-002a-c9030e12f396] ( SELECT "'C1'" AS "_COLUMN_NAME", "C1" AS "'_CATEGORY_vludloz4cs'", " CAST ((row_number() OVER (  ORDER BY ""C1"" ASC NULLS FIRST ) - 1 :: INT) AS FLOAT)" AS "_INDEX" FROM ( SELECT 'C1', "C1",  CAST ((row_number() OVER (  ORDER BY "C1" ASC NULLS FIRST ) - 1 :: INT) AS FLOAT) FROM ( SELECT  *  FROM ( SELECT "C1" FROM ( SELECT "C1" FROM ( SELECT  *  FROM SNOWPARK_TEMP_TABLE_9LRUMHHV73 WHERE (("SNOWPARK_TEMP_COLUMN_1TVRW0X3UZ" >= 0 :: INT) AND ("SNOWPARK_TEMP_COLUMN_1TVRW0X3UZ" < 800000 :: INT)))) GROUP BY "C1") WHERE "C1" IS NOT NULL ORDER BY "C1" ASC NULLS FIRST))) UNION ( SELECT "'C1'" AS "_COLUMN_NAME", "C1" AS "'_CATEGORY_vludloz4cs'", "-1 :: INT" AS "_INDEX" FROM ( SELECT 'C1', "C1", -1 :: INT FROM ( SELECT  *  FROM ( SELECT "C1" FROM ( SELECT "C1" FROM ( SELECT  *  FROM SNOWPARK_TEMP_TABLE_9LRUMHHV73 WHERE (("SNOWPARK_TEMP_COLUMN_1TVRW0X3UZ" >= 0 :: INT) AND ("SNOWPARK_TEMP_COLUMN_1TVRW0X3UZ" < 800000 :: INT))

SnowparkSQLException: (1304): 01aa9f79-0402-9b75-002a-c9030e12f396: 000904 (42000): SQL compilation error: error line 1 at position 69
invalid identifier '" CAST ((row_number() OVER (  ORDER BY "C1" ASC NULLS FIRST ) - 1 :: INT) AS FLOAT)"'