# Snowflake Model Registry

Adapted from https://quickstarts.snowflake.com/guide/intro-to-feature-store/index.html#2

Uses the dataset generated by the feature store notebook, builds a predictive model, and registers
the model in the Model Registry.

## Prepare Snowpark Session

Create a session and set metadata

In [None]:
import pandas as pd

from snowflake.ml import dataset
from snowflake.ml.feature_store.examples.example_helper import ExampleHelper
from snowflake.snowpark.context import get_active_session
from snowflake.ml.registry import Registry
from snowflake.ml.model import task, type_hints

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
import sklearn

# Reuse the Snowpark session that is currently active.
session = get_active_session()

# Tag queries issued from this session; this helps with debugging and
# performance monitoring.
session.query_tag = {
    "origin": "sf_sit-is",
    "name": "aiml_notebooks_fs_overview",
    "version": {"major": 1, "minor": 0},
    "attributes": {"is_quickstart": 0, "source": "notebook"},
}

# Run the remaining cells under the lab role.
session.use_role("FEATURE_STORE_LAB_USER")

# Report the role, warehouse and database/schema now in effect.
active_role = session.get_current_role()
active_warehouse = session.get_current_warehouse()
active_schema = session.get_fully_qualified_current_schema()
print(f"role: {active_role} | WH: {active_warehouse} | DB.SCHEMA: {active_schema}")

In [None]:
-- Switch the session to the database and schema that later cells read from.
USE DATABASE FEATURE_STORE_DATABASE;
USE SCHEMA LIVE_DEMO_SCHEMA;

In [None]:
# Re-check the session context: role, warehouse and fully qualified schema.
context_summary = " | ".join([
    f"role: {session.get_current_role()}",
    f"WH: {session.get_current_warehouse()}",
    f"DB.SCHEMA: {session.get_fully_qualified_current_schema()}",
])
print(context_summary)

## Retrieve the generated dataset

In [None]:
import ast


def _latest_version(raw_versions):
    """Return the last entry of a serialized version list, or None if the list is empty."""
    # literal_eval only accepts Python literals, so nothing embedded in the
    # metadata string can be executed the way it could with eval().
    versions = ast.literal_eval(raw_versions)
    return versions[-1] if versions else None


# List the datasets returned by SHOW DATASETS and derive the most recent
# version of each one from its serialized "versions" column.
df_listed_datasets = session.sql("SHOW DATASETS").to_pandas()
df_listed_datasets['last_version'] = df_listed_datasets['"versions"'].apply(_latest_version)
df_listed_datasets

In [None]:
# Narrow the listing to the identifying columns plus the derived last version.
summary_columns = ['"name"', '"database_name"', '"schema_name"', 'last_version']
df_listed_datasets[summary_columns]

In [None]:
# Load version V_001 of the TRIP_DURATION_DS dataset and pull it into pandas.
ds = dataset.load_dataset(session, "TRIP_DURATION_DS", "V_001")
dataset_reader = ds.read
df_ds = dataset_reader.to_pandas()

# Summarize the columns and dtypes that were loaded.
df_ds.info()

In [None]:
# Fetch the merge keys and the DEPARTING_DELAY label from the flight schedule table.
target_query = (
    "select airport_zip_code, tail_number, scheduled_departure_utc, departing_delay "
    "from us_flight_schedules"
)
df_target = session.sql(target_query).to_pandas()

# The departure timestamp must be UTC-aware for the merge with the dataset.
# Localize naive timestamps; convert ones that are already tz-aware, because
# tz_localize raises on tz-aware data. Bracket assignment is used because
# attribute-style column assignment is fragile in pandas.
departure_utc = df_target['SCHEDULED_DEPARTURE_UTC']
if departure_utc.dt.tz is None:
    departure_utc = departure_utc.dt.tz_localize('UTC')
else:
    departure_utc = departure_utc.dt.tz_convert('UTC')
df_target['SCHEDULED_DEPARTURE_UTC'] = departure_utc

df_target.info()

In [None]:
# Attach the label to the feature rows, then order chronologically.
# Sorting is critical: the train/valid split is positional, so earlier
# flights must come first. A stable sort keeps rows with identical
# timestamps in their pre-sort order, which makes the split boundary
# reproducible instead of depending on how ties happen to be broken.
merge_keys = ['AIRPORT_ZIP_CODE', 'TAIL_NUMBER', 'SCHEDULED_DEPARTURE_UTC']
df_ds = (
    df_ds.merge(df_target, how='inner', on=merge_keys)
    .sort_values(by='SCHEDULED_DEPARTURE_UTC', kind='stable')
)
df_ds.info()

## Build the model

In [None]:
# Positional hold-out: the first 80% of rows (the earliest flights, given
# the chronological sort) train the model and the rest validate it.
n_rows = len(df_ds)
split_pos = int(n_rows * 0.80)
df_train, df_valid = df_ds.iloc[:split_pos], df_ds.iloc[split_pos:]

In [None]:
# Model input columns, grouped by the preprocessing each group receives.
features_numeric = [
    'TICKETS_SOLD',
    'SEATING_CAPACITY',
    'MAX_RANGE_KM',
    'RAIN_SUM_30M',
    'RAIN_SUM_60M',
]
features_categorical = [
    'PLANE_MODEL',
    'AIRPORT_ZIP_CODE',
]

# Full inference column list: numeric columns first, then categorical.
features_inference = [*features_numeric, *features_categorical]

In [None]:
# Numeric columns are standardized.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler()),
])

# Categorical columns are mapped to integer codes. Categories that were not
# present at fit time are encoded as -1 instead of raising, so validation or
# inference rows with a new value in a categorical column can still be scored.
categorical_transformer = Pipeline(steps=[
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])

# Route each feature group to its transformer; output order is numeric
# columns followed by categorical columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, features_numeric),
        ('cat', categorical_transformer, features_categorical),
    ]
)

# Full model: preprocessing followed by a random forest with a fixed seed.
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=99, n_estimators=100)),
])

In [None]:
# Train on the inference features only; passing just these columns matters
# because the fitted pipeline records its input columns.
X_train = df_train[features_inference]
y_train = df_train['DEPARTING_DELAY']
clf.fit(X_train, y_train)

## Score predictions on the validation set

In [None]:
# Score the validation rows with exactly the columns the pipeline was fitted
# on, rather than every non-target column in the frame, so training,
# validation and registry scoring all see the same inputs.
# Column 1 of predict_proba is the probability of the second class.
predictions = clf.predict_proba(df_valid[features_inference])[:, 1]

In [None]:
# ROC AUC of the predicted probabilities against the validation labels.
valid_auc_score = roc_auc_score(
    y_true=df_valid['DEPARTING_DELAY'],
    y_score=predictions,
)
valid_auc_score

## Show feature importance

In [None]:
# Pull the fitted preprocessing step and the forest back out of the pipeline.
preprocessor = clf.named_steps['preprocessor']
rf_model = clf.named_steps['classifier']

# Rebuild the column order the forest was trained on. Numeric columns keep
# their names; the categorical names come from the ordinal encoder.
cat_encoder = preprocessor.named_transformers_['cat'].named_steps['ordinal']
categorical_feature_names = cat_encoder.get_feature_names_out(features_categorical)
numeric_features_names = features_numeric
all_feature_names = [*numeric_features_names, *categorical_feature_names]

# Pair every feature name with its importance, most important first.
feature_importances = rf_model.feature_importances_
importance_df = pd.DataFrame(
    {'feature': all_feature_names, 'importance': feature_importances}
).sort_values(by='importance', ascending=False)

In [None]:
# Display the feature-importance table (sorted by descending importance).
importance_df

## Register the model

In [None]:
# Accesses Model Registry
reg = Registry(session=session,
               database_name=session.get_current_database(),
               schema_name=session.get_current_schema())

In [None]:
# Show the input column names the fitted pipeline recorded during fit.
clf.feature_names_in_

In [None]:
# Sample rows passed to the registry alongside the model; only the features
# required for inference are included.
sample_input = df_train[features_inference].head(100)

# Log the fitted pipeline as version "v2" of DELAY_PREDICTION, pinning
# scikit-learn to the version used for training and recording the validation AUC.
mv = reg.log_model(
    clf,
    model_name="DELAY_PREDICTION",
    version_name="v2",
    conda_dependencies=[f"scikit-learn=={sklearn.__version__}"],
    comment="My first airline delay model",
    metrics={"auc_score": valid_auc_score},
    sample_input_data=sample_input,
    task=task.Task.TABULAR_BINARY_CLASSIFICATION,
)

## Retrieve the model from the Model Registry and score the dataset directly
### Shows how little work is needed between the dataset and model inference

In [None]:
# Look up the registered model by name and list its versions.
registered_model_name = 'DELAY_PREDICTION'
model = reg.get_model(registered_model_name)
model.versions()

In [None]:
# Select the version to score with.
# NOTE(review): the model was logged with version_name "v2"; presumably the
# registry resolves 'V2' case-insensitively - confirm.
target_version = 'V2'
model_version = model.version(target_version)
model_version

In [None]:
# Re-read the dataset and take the same chronological tail used for validation.
# A stable sort keeps rows with identical timestamps in dataset order, so the
# positional slice is reproducible.
df_from_ds = ds.read.to_pandas().sort_values(by='SCHEDULED_DEPARTURE_UTC', kind='stable')
valid_directly_from_ds = df_from_ds.iloc[split_pos:]

# split_pos was computed on the merged frame. If the inner merge changed the
# row count, this slice covers different flights than df_valid and any metric
# computed against df_valid's labels would be meaningless, so fail loudly.
if len(valid_directly_from_ds) != len(df_valid):
    raise ValueError(
        f"Validation slice from the dataset has {len(valid_directly_from_ds)} rows but "
        f"df_valid has {len(df_valid)}; the inner merge changed the row count, so "
        "split_pos no longer selects the same flights."
    )

# Score through the registered model version; 'output_feature_1' is the
# second predict_proba output column.
predictions_from_mr = model_version.run(
    valid_directly_from_ds,
    function_name='predict_proba',
)['output_feature_1']

In [None]:
# ROC AUC of the registry predictions against the validation labels.
y_valid = df_valid['DEPARTING_DELAY']
roc_auc_score(y_valid, predictions_from_mr)