In [None]:
# Import python packages
import streamlit as st
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from snowflake.ml.registry import Registry

# Snowpark: fetch the currently active Snowflake session.
# We can also use Snowpark for our analyses!
from snowflake.snowpark.context import get_active_session
# `session` is reused by later cells to write/read tables and to open the model registry.
session = get_active_session()



In [None]:
# Load the Titanic CSV and discard the columns that are not used further on.
unused_columns = [
    "AGE",
    "DECK",
    "ALIVE",
    "ADULT_MALE",
    "EMBARKED",
    "PCLASS",
    "ALONE",
    "SEX",
]
titanic = pd.read_csv('data/titanic_snowflake.csv').drop(columns=unused_columns)
titanic.head()
     

Usually your data will already be in Snowflake. The next step shows how to write the pandas dataframe to Snowflake as a table; the step after it shows how to read a Snowflake table back into a pandas dataframe.

In [None]:
# Convert the pandas dataframe to a Snowpark dataframe, then persist it as the
# "titanic_raw" table (any existing table of that name is overwritten).
titanic_sf = session.create_dataframe(titanic)
(
    titanic_sf.write
    .mode("overwrite")
    .save_as_table("titanic_raw")
)

In [None]:
# Read the table back from Snowflake: session.table() references it as a
# Snowpark dataframe, and to_pandas() pulls it down into a pandas dataframe.
titanic_raw_sf = session.table('titanic_raw')
titanic_raw = titanic_raw_sf.to_pandas()
titanic_raw.head()

In [None]:
# Build and tune the survival classifier: split features/target, preprocess each
# column type, then grid-search an XGBoost model with 5-fold cross-validation.

# Define X and y
X = titanic.drop('SURVIVED', axis=1)
y = titanic['SURVIVED']

# Identify column types
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
boolean_cols = X.select_dtypes(include=['bool']).columns.tolist()
# Index.difference() also sorts the names, so the numeric columns reach the model
# in alphabetical order.
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.difference(boolean_cols).tolist()

# Boolean to int transformer
bool_to_int = FunctionTransformer(lambda df: df.astype(int), validate=False)

# Preprocessing for numeric, categorical, and boolean
numeric_transformer = SimpleImputer(strategy='mean')

# NOTE(review): combining drop='first' with handle_unknown='ignore' is only accepted
# by newer scikit-learn releases - confirm the version available in the runtime.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
])

# Cast booleans to integers BEFORE imputing: SimpleImputer only accepts numeric or
# object data and raises on bool-dtype input, so the previous imputer-first order
# would fail as soon as a boolean column was present.
boolean_transformer = Pipeline(steps=[
    ('to_int', bool_to_int),
    ('imputer', SimpleImputer(strategy='most_frequent'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
    ('bool', boolean_transformer, boolean_cols)
])

# Final pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(objective='binary:logistic', eval_metric='logloss'))
])

# Parameter grid for GridSearchCV (the "classifier__" prefix routes each value to
# the XGBClassifier step of the pipeline)
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__learning_rate': [0.1, 0.5],
    'classifier__max_depth': [1, 2, 3, 4, 5, 6],
    'classifier__min_child_weight': [1, 6]
}

# Split data (70% train / 30% test, seeded so the split is reproducible)
xtrain, xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, random_state=1234)

# Grid search over every parameter combination on the training split only
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5)
grid_search.fit(xtrain, ytrain)

In [None]:
# Pull the winning configuration out of the finished grid search
best_model = grid_search.best_estimator_
best_params, best_score = grid_search.best_params_, grid_search.best_score_

# Report the cross-validation result
print("Best Parameters:", best_params)
print("Best Score:", best_score)

# Score the best pipeline on the held-out test split
test_score = best_model.score(xtest, ytest)
print("Test Score:", test_score)

In [None]:
# Metrics attached to the model when it is logged to the registry.
# "Accuracy" carries grid_search.best_score_ (the cross-validation score),
# not the held-out test score.
metrics = dict(Accuracy=best_score, Params=best_params)

metrics

In [None]:
# Get sample input data to pass into the registry logging function.
# The draw is seeded (same seed as the train/test split) so every run logs and
# later scores the same row, and it is taken from complete rows only.
# NOTE(review): a one-row sample containing a missing value leaves that column
# without any concrete value to infer the model signature from - confirm this
# matches the registry's requirements.
X = xtrain.dropna().sample(n=1, random_state=1234)

# Create a registry and log the model
# You can specify a different DB and Schema if you'd like
# otherwise it uses the session context
# If a registry does not exist it will create one
reg = Registry(session=session)

# Define model name and version (use uppercase for name)
model_name = "TITANIC_SERVICE_PIPE"

titanic_model = reg.log_model(
    model_name=model_name,
    target_platforms=["SNOWPARK_CONTAINER_SERVICES"],# Deploying online so SPCS
    #version_name="V_1", # If you leave version_name off SF creates one
    model=best_model,
    sample_input_data=X,
    metrics=metrics,
)

In [None]:
# List the registered models and keep only the row(s) for our model
models_df = reg.show_models()
is_our_model = models_df['name'] == model_name
models_df.loc[is_our_model]

In [None]:
# Show every version of the model, newest first
registered_model = reg.get_model(model_name)
models = registered_model.show_versions()
models.sort_values('created_on', ascending=False)

In [None]:
# Fetch the last version of the model from the registry
registered_model = reg.get_model(model_name)
recent_model = registered_model.last()
recent_model

In [None]:
# Make the most recently logged version the model's default version.
# `default` has to be assigned on the Model object: assigning it on the
# ModelVersion returned by .last() (as before) only sets an ad-hoc attribute on
# that version object and leaves the registry's default unchanged.
model = reg.get_model(model_name)
m = model.last()
model.default = m

# `mv` is the version that the service below is created from
mv = model.default
mv.version_name

In [None]:
-- If you do not have an image repository, create one
-- (IF NOT EXISTS makes this a no-op when it is already there)
CREATE IMAGE REPOSITORY IF NOT EXISTS tutorial_repository;

### Deploying a Model to Snowpark Container Services as a Long-Running Service

This section explains how to deploy a machine learning model to Snowpark Container Services (SPCS) using Model Serving. The deployed service will run continuously and expose a REST API endpoint for prediction.

If you're currently using `system_compute_pool_CPU`, you will need to create a separate compute pool to host the service. 

> ### You may need `SYSADMIN` privileges to create a compute pool.

```sql
CREATE COMPUTE POOL compute_pool_name
  MIN_NODES = 1
  MAX_NODES = 1
  INSTANCE_FAMILY = CPU_X64_XS;
```

In [None]:
-- If you do not have a compute pool, create one
-- (IF NOT EXISTS makes this a no-op when the pool is already there).
-- The pool runs on CPU_X64_M instances and may scale between 1 and 2 nodes.
CREATE COMPUTE POOL IF NOT EXISTS titanic_compute_pool
  MIN_NODES = 1
  MAX_NODES = 2
  INSTANCE_FAMILY = CPU_X64_M;

In [None]:
# Deployment settings for the prediction service
image_repo_name = "tutorial_repository"
cp_name = "titanic_compute_pool"
num_spcs_nodes = '1'
service_name = 'TITANIC_PIPE_PREDICTION_SERVICE'

# Current database/schema of the session, with any double quotes removed
current_database = session.get_current_database().replace('"', '')
current_schema = session.get_current_schema().replace('"', '')

# Fully qualified (database.schema.object) names for the image repo and the service
qualified_prefix = f"{current_database}.{current_schema}"
extended_image_repo_name = f"{qualified_prefix}.{image_repo_name}"
extended_service_name = f"{qualified_prefix}.{service_name}"

In [None]:
-- Remove any earlier deployment before the service is created in the next cell.
-- NOTE(review): {{service_name}} is presumably filled in from the Python variable
-- of the same name defined above - confirm for your notebook runtime.
DROP SERVICE IF EXISTS {{service_name}};

In [None]:
# Deploy the chosen model version as a service on the compute pool.
# This step may take a few minutes
service_options = {
    "service_name": extended_service_name,
    "service_compute_pool": cp_name,
    "image_repo": extended_image_repo_name,
    "ingress_enabled": True,
    "max_instances": int(num_spcs_nodes),  # stored as a string above
    "build_external_access_integration": "ALLOW_ALL_INTEGRATION",
}
mv.create_service(**service_options)

In [None]:
-- Describe the compute pool to confirm that it now hosts a service
describe compute pool titanic_compute_pool;

In [None]:
-- List the services whose name contains TITANIC_PIPE_PREDICTION_SERVICE
SHOW SERVICES LIKE '%TITANIC_PIPE_PREDICTION_SERVICE%';

In [None]:
# List the services attached to this model version.
# The same information can also be viewed in the Model Registry UI.
# Give this a minute for the inference endpoint to populate.
mv.list_services()

In [None]:
# Upload X (the sample row taken when the model was logged) as a Snowpark
# dataframe for inference. reset_index(drop=True) discards the pandas index up
# front, so no throwaway "index" column is sent to Snowflake and dropped again.
test_sf = session.create_dataframe(X.reset_index(drop=True))

In [None]:
# Score the sample through the deployed service. The service name comes from the
# `service_name` variable defined with the other deployment settings, so it is
# not hard-coded a second time and cannot drift from the service that was created.
mv.run(
    test_sf,
    function_name="PREDICT",
    service_name=service_name,
)

## Take streamlit_app.py, copy and paste it into a Streamlit in Snowflake (SiS) app, and watch the model run in milliseconds

Since we created a REST API above, this service will run continuously. It is a good idea to drop or suspend the service if you do not need it. The compute pool will automatically suspend if no service is running.

## Make sure to stop the service at the end of the demo so it does not stay on

In [None]:
-- Left commented out on purpose: uncomment and run once the demo is finished
-- to suspend the service so that it does not keep running.
-- ALTER SERVICE {{service_name}} SUSPEND;