In [16]:
# Snowpark for Python
import snowflake.snowpark.functions as F
from snowflake.snowpark.types import DecimalType

# Snowflake ML
import snowflake.ml.modeling.preprocessing as snowml
from snowflake.ml.modeling.pipeline import *
from snowflake.ml.modeling.metrics.correlation import correlation

In [6]:
# Data Science Libs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Misc
import json
import joblib

# warning suppression
import warnings; warnings.simplefilter('ignore')

In [None]:
from snowflake.snowpark import Session
#import snowflake.snowpark

def initiateSession(creds_path='creds.json'):
    """Create a Snowpark session from a JSON file of connection parameters.

    Args:
        creds_path: Path of the JSON file to read. Its decoded content is
            handed unchanged to ``Session.builder.configs``. Defaults to
            ``'creds.json'`` (resolved against the current working
            directory), which is what the original hard-coded.

    Returns:
        The object returned by ``Session.builder.configs(...).create()``.

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Read as UTF-8 explicitly: without it, open() falls back to the platform
    # default encoding, which is not UTF-8 on every system.
    with open(creds_path, encoding='utf-8') as f:
        connection_parameters = json.load(f)

    return Session.builder.configs(connection_parameters).create()


session = initiateSession()

In [8]:
from snowflake.snowpark.types import DoubleType
# Attach an origin/name tag to the session.
session.query_tag = {"origin": "sf_sit-is", "name": "seunj"}

In [9]:
# Reader options for the CSV files in the stage: comma-delimited, fields may be
# wrapped in double quotes, and both the header row and the column types are
# taken from the files themselves (schema inference).
csv_reader_options = {
    "field_delimiter": ",",
    "field_optionally_enclosed_by": '"',
    "infer_schema": True,
    "parse_header": True,
}

# Build a Snowpark DataFrame configured to read from the @DIAMONDS_ASSETS stage.
diamonds_df = session.read.options(csv_reader_options).csv("@DIAMONDS_ASSETS")

diamonds_df

<snowflake.snowpark.dataframe.DataFrame at 0x1d16238a980>

In [12]:
# Upper-case every column name. The one exception is the quoted lower-case
# "table" header, which is renamed to TABLE_PCT instead
# (presumably to avoid clashing with the TABLE keyword -- TODO confirm).
for colname in diamonds_df.columns:
    target_name = "TABLE_PCT" if colname == '"table"' else colname.upper()
    diamonds_df = diamonds_df.with_column_renamed(colname, target_name)

diamonds_df.columns

['CARAT',
 'CUT',
 'COLOR',
 'CLARITY',
 'DEPTH',
 'TABLE_PCT',
 'PRICE',
 'X',
 'Y',
 'Z']

In [13]:
def fix_values(columnn):
    """Build a column expression that normalises the text in ``columnn``.

    Each run of one or more characters that are not ASCII letters or digits
    is replaced by a single underscore, and the result is upper-cased.

    Args:
        columnn: Name of the column to clean.

    Returns:
        The expression ``upper(regexp_replace(col(columnn), ...))`` built
        with the Snowpark functions module ``F``.
    """
    underscored = F.regexp_replace(F.col(columnn), '[^a-zA-Z0-9]+', '_')
    return F.upper(underscored)

# Columns whose values get the fix_values clean-up (only CUT at the moment).
columns_to_fix = ["CUT"]
for column_name in columns_to_fix:
    cleaned_column = fix_values(column_name)
    diamonds_df = diamonds_df.with_column(column_name, cleaned_column)

# Show the resulting schema as a plain list of fields.
list(diamonds_df.schema)

[StructField('CARAT', DecimalType(3, 2), nullable=True),
 StructField('COLOR', StringType(), nullable=True),
 StructField('CLARITY', StringType(), nullable=True),
 StructField('DEPTH', DecimalType(3, 1), nullable=True),
 StructField('TABLE_PCT', DecimalType(3, 1), nullable=True),
 StructField('PRICE', LongType(), nullable=True),
 StructField('X', DecimalType(4, 2), nullable=True),
 StructField('Y', DecimalType(4, 2), nullable=True),
 StructField('Z', DecimalType(4, 2), nullable=True),
 StructField('CUT', StringType(), nullable=True)]

In [14]:
# Re-type the listed columns as doubles, replacing each column in place.
for colname in ("CARAT", "X", "Y", "Z", "DEPTH", "TABLE_PCT"):
    as_double = diamonds_df[colname].cast(DoubleType())
    diamonds_df = diamonds_df.with_column(colname, as_double)

# Write the prepared frame to the 'diamonds' table in overwrite mode.
table_writer = diamonds_df.write.mode('overwrite')
table_writer.save_as_table('diamonds')

Let's use the MinMaxScaler to normalize the CARAT column.

In [17]:
from snowflake.ml.modeling.preprocessing import MinMaxScaler, OrdinalEncoder

ImportError: cannot import name 'MinMaxScaler' from 'snowflake.ml.modeling.preprocessing' (c:\Users\seunj\source\snowpark_things\venv\lib\site-packages\snowflake\ml\modeling\preprocessing\__init__.py)

In [None]:
# Normalize the CARAT column into a new CARAT_NORM column.
# The scaler is fitted on diamonds_df and then applied to that same frame.
# NOTE(review): the output saved with this cell is an AttributeError reporting
# that snowflake.ml.modeling.preprocessing has no attribute 'MinMaxScaler' --
# confirm the installed snowflake-ml-python exposes it before re-running.
snowml_mms = snowml.MinMaxScaler(input_cols=["CARAT"], output_cols=["CARAT_NORM"])
normalized_diamonds_df = snowml_mms.fit(diamonds_df).transform(diamonds_df)

# Reduce the number of decimals: cast CARAT_NORM to DecimalType(7, 6).
new_col = normalized_diamonds_df.col("CARAT_NORM").cast(DecimalType(7, 6))
normalized_diamonds_df = normalized_diamonds_df.with_column("CARAT_NORM", new_col)

normalized_diamonds_df

AttributeError: module 'snowflake.ml.modeling.preprocessing' has no attribute 'MinMaxScaler'