In [155]:
!pip install snowflake-ml-python
!pip install snowflake-snowpark-python
!pip install snowflake-connector-python



In [187]:
import snowflake.connector
from snowflake.snowpark import Session
from credentials import params
from snowflake.snowpark.functions import col
from snowflake.ml.modeling import preprocessing
from snowflake.ml.modeling.preprocessing import LabelEncoder
from snowflake.ml.modeling.xgboost import XGBClassifier


In [188]:
# Inspect installed snowflake-telemetry-python version and optionally install a required version
def installed_pkg_version(pkg_name):
    """Return the installed version string of ``pkg_name``.

    The standard-library ``importlib.metadata`` lookup is tried first; if it
    fails for any reason, ``pkg_resources`` is tried as a fallback.

    Returns:
        The version string, or ``None`` when neither lookup succeeds.
    """
    # First choice: standard-library metadata lookup.
    try:
        import importlib.metadata as _metadata
        return _metadata.version(pkg_name)
    except Exception:
        pass

    # Fallback: setuptools' pkg_resources.
    try:
        from pkg_resources import get_distribution
        return get_distribution(pkg_name).version
    except Exception:
        return None

# Report which version of the telemetry package is installed locally
# (installed_pkg_version returns None when the package cannot be found).
pkg = 'snowflake-telemetry-python'
ver = installed_pkg_version(pkg)
print(f'Installed {pkg} version: {ver}')
# Optional recipe (kept commented out): pin the package to one specific version.
# If you need to install a specific version, uncomment and set REQUIRED_VERSION below
# REQUIRED_VERSION = '0.8.0'  # <-- replace with required version from server/UDF error message
# if REQUIRED_VERSION and ver != REQUIRED_VERSION:
#     import sys; __import__('subprocess').check_call([sys.executable, '-m', 'pip', 'install', f'snowflake-telemetry-python=={REQUIRED_VERSION}'])
#     print('Re-installed, please restart the kernel if necessary')

Installed snowflake-telemetry-python version: 0.7.1


In [189]:
# Create a Snowflake (Snowpark) session from the connection settings in `params`,
# which is imported from the local `credentials` module.
session = Session.builder.configs(params).create()

In [190]:
# Number of days in each month, January..December (February has 28, i.e. a non-leap year).
month_days = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
# Schedule lookup filled in below: maps (month, day) -> neighborhood number.
pre = {}

In [191]:
# Fill `pre` with one (month, day) -> neighborhood entry for every day of the year.
for i, month_length in enumerate(month_days):
    month = i + 1  # enumerate is 0-based, calendar months are 1-based

    for day in range(1, month_length + 1):
        if month == 12:
            # Every December, it only goes to neighborhood 8.
            pre[(month, day)] = 8
        elif month == 1:
            # In January, it goes to neighborhood 1 on "Mondays" and neighborhood 2 otherwise.
            # "Monday" is implemented as day % 7 == 1 (the 1st, 8th, 15th, 22nd and 29th),
            # i.e. it assumes January 1st falls on a Monday.
            pre[(month, day)] = 1 if day % 7 == 1 else 2
        elif month <= 11:
            # From February through November the neighborhoods cycle 1..7 with the
            # day of the month: 1 on the 1st, ..., 7 on the 7th, 1 again on the 8th, etc.
            pre[(month, day)] = ((day - 1) % 7) + 1
print(pre)

{(1, 1): 1, (1, 2): 2, (1, 3): 2, (1, 4): 2, (1, 5): 2, (1, 6): 2, (1, 7): 2, (1, 8): 1, (1, 9): 2, (1, 10): 2, (1, 11): 2, (1, 12): 2, (1, 13): 2, (1, 14): 2, (1, 15): 1, (1, 16): 2, (1, 17): 2, (1, 18): 2, (1, 19): 2, (1, 20): 2, (1, 21): 2, (1, 22): 1, (1, 23): 2, (1, 24): 2, (1, 25): 2, (1, 26): 2, (1, 27): 2, (1, 28): 2, (1, 29): 1, (1, 30): 2, (1, 31): 2, (2, 1): 1, (2, 2): 2, (2, 3): 3, (2, 4): 4, (2, 5): 5, (2, 6): 6, (2, 7): 7, (2, 8): 1, (2, 9): 2, (2, 10): 3, (2, 11): 4, (2, 12): 5, (2, 13): 6, (2, 14): 7, (2, 15): 1, (2, 16): 2, (2, 17): 3, (2, 18): 4, (2, 19): 5, (2, 20): 6, (2, 21): 7, (2, 22): 1, (2, 23): 2, (2, 24): 3, (2, 25): 4, (2, 26): 5, (2, 27): 6, (2, 28): 7, (3, 1): 1, (3, 2): 2, (3, 3): 3, (3, 4): 4, (3, 5): 5, (3, 6): 6, (3, 7): 7, (3, 8): 1, (3, 9): 2, (3, 10): 3, (3, 11): 4, (3, 12): 5, (3, 13): 6, (3, 14): 7, (3, 15): 1, (3, 16): 2, (3, 17): 3, (3, 18): 4, (3, 19): 5, (3, 20): 6, (3, 21): 7, (3, 22): 1, (3, 23): 2, (3, 24): 3, (3, 25): 4, (3, 26): 5, (3, 27

In [192]:
import pandas as pd
# One row per schedule entry: 'MD' holds the (month, day) tuple key, 'N' the neighborhood.
df = pd.DataFrame(list(pre.items()), columns = ['MD', 'N']) 

In [193]:
# Unpack each (month, day) tuple in 'MD' into separate columns and give the frame
# its final layout: MONTH, DAY, NEIGHBORHOOD (same row index as before).
df = pd.DataFrame(
    [(month_part, day_part, hood) for (month_part, day_part), hood in zip(df['MD'], df['N'])],
    columns=['MONTH', 'DAY', 'NEIGHBORHOOD'],
    index=df.index,
)
print(df)

     MONTH  DAY  NEIGHBORHOOD
0        1    1             1
1        1    2             2
2        1    3             2
3        1    4             2
4        1    5             2
..     ...  ...           ...
360     12   27             8
361     12   28             8
362     12   29             8
363     12   30             8
364     12   31             8

[365 rows x 3 columns]


In [194]:
# A session was already opened earlier in this notebook. Calling create() again would
# open a second connection and leave the first one dangling, so reuse the active
# session; getOrCreate() only creates a new one when no session is active.
session = Session.builder.configs(params).getOrCreate()

In [195]:
# Select the working database and schema through the Session API
# (replaces hand-written USE statements wrapped in placeholder-free f-strings).
session.use_database("TEST_DATABASE")
session.use_schema("TEST_SCHEMA")
# Create a Snowpark DataFrame from the pandas DataFrame using the session instance
snowpark_df = session.create_dataframe(df)

In [None]:
# Ensure MONTH and DAY are integers and NEIGHBORHOOD is numeric for modeling
from snowflake.snowpark.types import IntegerType
# If columns are strings (with quotes), cast them cleanly
# Use with_column to cast; replace column names if they include quotes
def naked(col_name):
    """Return ``col_name`` without surrounding whitespace or quote characters.

    Outer whitespace is removed first, then any run of double quotes, single
    quotes or backticks at either end. (The previous ``.strip('')`` call was a
    no-op, so single quotes were never removed.)

    Args:
        col_name: Column name string, possibly quoted.

    Returns:
        The bare column name.
    """
    return col_name.strip().strip('"\'`')
# Build a mapping from cleaned->actual column names (cleaned = quotes/whitespace removed)
col_map = {naked(c): c for c in snowpark_df.columns}
# Locate the three required columns by case-insensitive substring match on the column name.
month_col = next((c for c in snowpark_df.columns if 'MONTH' in c.upper()), None)
day_col = next((c for c in snowpark_df.columns if 'DAY' in c.upper()), None)
label_col = next((c for c in snowpark_df.columns if 'NEIGHBORHOOD' in c.upper()), None)
print('Detected columns ->', month_col, day_col, label_col)
if month_col is None or day_col is None or label_col is None:
    raise RuntimeError('Required columns not found for casting: MONTH, DAY, NEIGHBORHOOD')
# Cast columns to Integer if needed
snowpark_df = snowpark_df.with_column(month_col, snowpark_df[month_col].cast(IntegerType()))
snowpark_df = snowpark_df.with_column(day_col, snowpark_df[day_col].cast(IntegerType()))
# Ensure label is integer (or numeric); if it's string, you can label-encode later
try:
    snowpark_df = snowpark_df.with_column(label_col, snowpark_df[label_col].cast(IntegerType()))
except Exception as exc:
    # Best effort: keep the column as-is (LabelEncoder can handle categorical values),
    # but report the reason instead of hiding it.
    print(f'Could not cast {label_col} to integer, leaving it unchanged: {exc}')
# Show a sample
snowpark_df.show(10)
# Create train/test splits; a fixed seed makes the 90/10 split repeatable between runs
train_snowpark_df, test_snowpark_df = snowpark_df.random_split([0.9, 0.1], seed=42)
print('Train rows:', train_snowpark_df.count(), 'Test rows:', test_snowpark_df.count())

In [196]:
# Persist the Snowpark DataFrame as table df_clean in TEST_DATABASE.TEST_SCHEMA,
# replacing any existing table of that name (mode='overwrite').
snowpark_df.write.save_as_table('TEST_DATABASE.TEST_SCHEMA.df_clean', mode='overwrite')

In [198]:
# Rebind snowpark_df to the stored table so later cells read from df_clean.
# NOTE(review): the name is written in lower case here and upper case when saving;
# presumably both resolve to the same table — confirm.
snowpark_df = session.table("test_database.test_schema.df_clean")

In [199]:
# Sanity check: show the first forty rows of the dataframe
snowpark_df.show(n=40)

------------------------------------
|"MONTH"  |"DAY"  |"NEIGHBORHOOD"  |
------------------------------------
|1        |1      |1               |
|1        |2      |2               |
|1        |3      |2               |
|1        |4      |2               |
|1        |5      |2               |
|1        |6      |2               |
|1        |7      |2               |
|1        |8      |1               |
|1        |9      |2               |
|1        |10     |2               |
|1        |11     |2               |
|1        |12     |2               |
|1        |13     |2               |
|1        |14     |2               |
|1        |15     |1               |
|1        |16     |2               |
|1        |17     |2               |
|1        |18     |2               |
|1        |19     |2               |
|1        |20     |2               |
|1        |21     |2               |
|1        |22     |1               |
|1        |23     |2               |
|1        |24     |2               |
|

In [200]:
# Count the rows in the dataframe (month_days sums to 365, so 365 rows are expected)
snowpark_df.count()


365

In [201]:
# Describe the dataframe: print summary statistics for its columns
snowpark_df.describe().show()

--------------------------------------------------------------------------
|"SUMMARY"  |"MONTH"             |"DAY"              |"NEIGHBORHOOD"     |
--------------------------------------------------------------------------
|mean       |6.526027            |15.720548          |4.019178           |
|min        |1.0                 |1.0                |1.0                |
|count      |365.0               |365.0              |365.0              |
|stddev     |3.4525841046960752  |8.808321463252803  |2.276773374756478  |
|max        |12.0                |31.0               |8.0                |
--------------------------------------------------------------------------



In [202]:
# Group by neighborhood and show how many days fall in each one (class balance of the target)
snowpark_df.group_by("Neighborhood").count().show()

----------------------------
|"NEIGHBORHOOD"  |"COUNT"  |
----------------------------
|1               |54       |
|2               |75       |
|3               |45       |
|5               |40       |
|6               |40       |
|8               |31       |
|4               |40       |
|7               |40       |
----------------------------



In [203]:
# One way to scale your target (neighborhood) so you can use it in the XGBClassifier model:
# subtract 1 from every label and store it as NEIGHBORHOOD2, dropping the original column.
# NOTE(review): `test` is not referenced by any later cell shown in this notebook.
test = snowpark_df.withColumn('NEIGHBORHOOD2', snowpark_df.neighborhood - 1).drop("Neighborhood")

In [204]:
# Now use scikit-learn's LabelEncoder -- a more general solution -- through Snowpark ML.
# It reads NEIGHBORHOOD, writes the encoded value to NEIGHBORHOOD2 and drops the input column.
le = LabelEncoder(input_cols=['NEIGHBORHOOD'], output_cols= ['NEIGHBORHOOD2'], drop_input_cols=True)

In [205]:
# Fit the LabelEncoder on the NEIGHBORHOOD column only
fitted = le.fit(snowpark_df.select("NEIGHBORHOOD"))

In [206]:
# Apply the fitted encoder to the full dataframe and preview the encoded result.
snowpark_df_prepared = fitted.transform(snowpark_df)

snowpark_df_prepared.show()


-------------------------------------
|"NEIGHBORHOOD2"  |"MONTH"  |"DAY"  |
-------------------------------------
|0.0              |1        |1      |
|1.0              |1        |2      |
|1.0              |1        |3      |
|1.0              |1        |4      |
|1.0              |1        |5      |
|1.0              |1        |6      |
|1.0              |1        |7      |
|0.0              |1        |8      |
|1.0              |1        |9      |
|1.0              |1        |10     |
-------------------------------------



In [207]:
train_snowpark_df, test_snowpark_df = snowpark_df.randomSplit([0.9, 0.1])

In [208]:
# save training data
train_snowpark_df.write.mode("overwrite").save_as_table("df_clean_train")

In [209]:
# save test data
test_snowpark_df.write.mode("overwrite").save_as_table("df_clean_test")

In [210]:
# create and train the XGBClassifier model
FEATURE_COLS = ["MONTH", "DAY"]
LABEL_COLS = ["NEIGHBORHOOD"]

In [211]:
# Configure the XGBoost classifier that will be trained on Snowflake.
# This cell only constructs the estimator; fitting happens in a later cell.
xgboost_model = XGBClassifier(
    
    input_cols=FEATURE_COLS,
    label_cols=LABEL_COLS
)


In [212]:
pip install snowflake-telemetry-python

Note: you may need to restart the kernel to use updated packages.


In [213]:
import snowflake.telemetry 

In [214]:
# Fit the classifier on the training split.
# NOTE(review): in the recorded run this cell failed with a Snowflake package-conflict
# error. The temporary procedure was created with RUNTIME_VERSION=3.11, while the error
# hint lists numpy==2.3.3 as available only for Python 3.12/3.13. Presumably the local
# Python/package versions must match a combination Snowflake supports — confirm.
xgboost_model.fit(train_snowpark_df)

The version of package 'snowflake-telemetry-python' in the local environment is 0.7.1, which does not fit the criteria for the requirement 'snowflake-telemetry-python'. Your UDF might not work when the package version is different between the server and your local environment.
Failed to execute query [queryID: 01bf81f0-0002-29a6-0002-57760001666e] 
CREATE OR REPLACE 
TEMPORARY  PROCEDURE  SNOWPARK_TEMP_PROCEDURE_VT112DM9FZ(arg1 ARRAY,arg2 STRING,arg3 ARRAY,arg4 ARRAY,arg5 STRING,arg6 OBJECT)

RETURNS STRING
LANGUAGE PYTHON 
VOLATILE
RUNTIME_VERSION=3.11
IMPORTS=('@"TEST_DATABASE"."TEST_SCHEMA".SNOWPARK_TEMP_STAGE_FEGB27N13B/SNOWPARK_TEMP_PROCEDURE_VT112DM9FZ/udf_py_447585733.zip')

PACKAGES=('snowflake-snowpark-python','snowflake-telemetry-python','numpy==2.3.3','scikit-learn==1.6.1','xgboost==2.1.4','cloudpickle==3.1.1')



HANDLER='udf_py_447585733.compute'
EXECUTE AS OWNER



391546 (XX000): SQL compilation error: Cannot create a Python function with the specified packages. Please c

SnowparkSQLException: (1304): 01bf81f0-0002-29a6-0002-57760001666e: 391546 (XX000): SQL compilation error: Cannot create a Python function with the specified packages. Please check your packages specification and try again. 'One or more package conflicts were detected.'. Hint: These packages are available in other python runtimes:["snowflake-snowpark-python"->[3.8, 3.9, 3.10, 3.12, 3.13], "cloudpickle==3.1.1"->[3.9, 3.10, 3.12, 3.13], "snowflake-telemetry-python"->[3.8, 3.9, 3.10, 3.12], "numpy==2.3.3"->[3.12, 3.13], "xgboost==2.1.4"->[3.9, 3.10, 3.12, 3.13], "scikit-learn==1.6.1"->[3.9, 3.10, 3.12, 3.13]].

In [215]:

# Check the accuracy using scikit-learn's score functionality through Snowpark ML.
# The model must have been fitted successfully first; in the recorded run the fit
# failed, so this cell raised "Estimator XGBClassifier not fitted".
accuracy = xgboost_model.score(test_snowpark_df)

# score() result is multiplied by 100 to print it as a percentage
print("Accuracy: %.2f%%" % (accuracy * 100.0))

RuntimeError: (2102) Estimator XGBClassifier not fitted before calling score method.