# Diamonds Model
This notebook builds a pipeline of feature transformations followed by a hyperparameter grid search over an `XGBRegressor` estimator. The fitted pipeline is then logged to the Snowflake Model Registry.

In [None]:
# Snowpark ML
import snowflake.ml.modeling.preprocessing as snowml
from snowflake.ml.modeling.pipeline import Pipeline

from snowflake.ml.modeling.xgboost import XGBRegressor
from snowflake.ml.modeling.model_selection import GridSearchCV
from snowflake.ml.registry import Registry
from snowflake.ml._internal.utils import identifier
from snowflake.ml.modeling.metrics import mean_absolute_percentage_error

# Data Science Libs
import numpy as np

# warning suppression
import warnings; warnings.simplefilter('ignore')

# We can also use Snowpark for our analyses!
from snowflake.snowpark.context import get_active_session

In [None]:
# Obtain the Snowpark session through get_active_session() rather than
# opening a new connection in this notebook.
session = get_active_session()
# Switch on the session's SQL simplifier flag for the rest of the notebook.
session.sql_simplifier_enabled = True

In [None]:
# Make DATA the session's current schema, then reference the DIAMONDS table
# by its unqualified name (the order of these two statements matters).
session.use_schema('DATA')
diamonds_df = session.table('DIAMONDS')

In [None]:
# train test split
# Weights 0.9 / 0.1 give the train / test portions; the seed is fixed at 0
# (presumably so that reruns produce the same split — confirm).
diamonds_train_df, diamonds_test_df = diamonds_df.random_split(weights=[0.9, 0.1], seed=0)

In [None]:
# Column roles used throughout the notebook.
# Features are split by type because each type gets its own preprocessing step.
CATEGORICAL_COLUMNS = [
    "CUT",
    "COLOR",
    "CLARITY",
]
NUMERICAL_COLUMNS = [
    "CARAT",
    "DEPTH",
    "TABLE_PCT",
    "X",
    "Y",
    "Z",
]

# Target column and the column that will hold the model's prediction.
LABEL_COLUMNS = ['PRICE']
OUTPUT_COLUMNS = ['PREDICTED_PRICE']

# Explicit level arrays per categorical column, handed to the OrdinalEncoder
# in the pipeline. NOTE(review): the array order presumably defines the
# integer assigned to each level — confirm against the encoder docs.
categories = dict(
    CUT=np.array(["Ideal", "Premium", "Very Good", "Good", "Fair"]),
    CLARITY=np.array(
        ["IF", "VVS1", "VVS2", "VS1", "VS2", "SI1", "SI2", "I1", "I2", "I3"]
    ),
    COLOR=np.array(['D', 'E', 'F', 'G', 'H', 'I', 'J']),
)

In [None]:
# Grid search over an XGBoost regressor.
# The grid holds 5 x 5 = 25 (n_estimators, learning_rate) combinations, each
# scored by negated mean absolute percentage error.
grid_search = GridSearchCV(
    estimator=XGBRegressor(),
    param_grid=dict(
        n_estimators=list(range(100, 501, 100)),
        learning_rate=[0.1, 0.2, 0.3, 0.4, 0.5],
    ),
    scoring="neg_mean_absolute_percentage_error",
    # NOTE(review): -1 presumably means "use all available workers" — confirm.
    n_jobs=-1,
    # Features are every categorical column followed by every numerical column.
    input_cols=[*CATEGORICAL_COLUMNS, *NUMERICAL_COLUMNS],
    label_cols=LABEL_COLUMNS,
    output_cols=OUTPUT_COLUMNS,
)


In [None]:
# Assemble the training pipeline: encode categoricals -> scale numericals ->
# grid-searched regressor. Both preprocessing steps write back to the same
# column names they read, so later steps see the transformed values.
training_pipeline = Pipeline(steps=[
    # Ordinal-encode the categorical columns using the explicit level arrays.
    ("OE", snowml.OrdinalEncoder(
        categories=categories,
        input_cols=CATEGORICAL_COLUMNS,
        output_cols=CATEGORICAL_COLUMNS,
    )),
    # Min-max scale the numerical columns, with clipping enabled.
    ("MMS", snowml.MinMaxScaler(
        input_cols=NUMERICAL_COLUMNS,
        output_cols=NUMERICAL_COLUMNS,
        clip=True,
    )),
    # Final step: the hyperparameter search defined earlier.
    ("GRID_SEARCH", grid_search),
])

In [None]:
-- Resize the session's current warehouse to LARGE ahead of the pipeline fit.
ALTER WAREHOUSE {{session.get_current_warehouse()}} SET WAREHOUSE_SIZE = LARGE;

In [None]:
# Fit the whole pipeline (encoder, scaler, grid search) on the training split.
# The result is bound to `_`, presumably so the notebook cell does not echo it.
_ = training_pipeline.fit(diamonds_train_df)

In [None]:
-- Resize the session's current warehouse back down to XSMALL after training.
ALTER WAREHOUSE {{session.get_current_warehouse()}} SET WAREHOUSE_SIZE = XSMALL;

In [None]:
# Display the GRID_SEARCH step taken from the pipeline's to_sklearn()
# conversion; as the cell's last expression, its value is the cell output.
training_pipeline.to_sklearn()['GRID_SEARCH']

In [None]:
# Take the grid-search step from the pipeline's to_sklearn() conversion and
# keep its best estimator.
grid_search_step = training_pipeline.to_sklearn()['GRID_SEARCH']
optimal_model = grid_search_step.best_estimator_

# Keep the winning hyperparameters; they are recorded as registry metrics later.
optimal_n_estimators, optimal_learning_rate = (
    optimal_model.n_estimators,
    optimal_model.learning_rate,
)

In [None]:
# Score the held-out split with the fitted pipeline.
result = training_pipeline.predict(diamonds_test_df)

# Compare actual and predicted prices using mean absolute percentage error.
mape = mean_absolute_percentage_error(
    df=result,
    y_true_col_names="PRICE",
    y_pred_col_names="PREDICTED_PRICE",
)

# Show a sample of actual vs. predicted values, then the aggregate error.
result.select("PRICE", "PREDICTED_PRICE").show()
print(f"Mean absolute percentage error: {mape}")

## Log model to model registry

In [None]:
# Name under which the model is registered, and the schema holding the registry.
model_name = "DIAMONDS_PRICE_PREDICTION"
schema = 'MODEL_REGISTRY'

# The registry lives in the session's current database.
# NOTE(review): `_get_unescaped_name` is a private helper from
# snowflake.ml._internal and may change between releases — confirm whether
# Registry accepts the raw value of session.get_current_database().
db = identifier._get_unescaped_name(session.get_current_database())

native_registry = Registry(
    session=session,
    database_name=db,
    schema_name=schema,
)

In [None]:
# Create sample input for the registry logging call: up to 100 training rows
# with the label column(s) dropped, materialised as a pandas DataFrame.
X = diamonds_train_df.drop(LABEL_COLUMNS).limit(100).to_pandas()

# Cast the categorical features to pandas' `category` dtype. The column list
# comes from the shared CATEGORICAL_COLUMNS constant (instead of a duplicated
# literal) so the sample cannot drift from the columns the pipeline encodes,
# and no loop variable is left behind in the notebook namespace.
X = X.astype({column: 'category' for column in CATEGORICAL_COLUMNS})

In [None]:
# Log the complete fitted pipeline (preprocessing + grid-searched model) as a
# new version of the model and keep the returned version handle.
model_ver = native_registry.log_model(
    model=training_pipeline,
    model_name=model_name,
    # sample rows provide the feature schema
    sample_input_data=X,
)

In [None]:
# Describe the logged version.
model_ver.comment = "Diamonds prediction model. Full pipeline logged."

# Attach the evaluation score and the selected hyperparameters as metrics,
# in the same order as before.
for metric_key, metric_value in (
    ("mean_abs_pct_err", mape),
    ("n_estimators", optimal_n_estimators),
    ("learning_rate", optimal_learning_rate),
):
    model_ver.set_metric(metric_name=metric_key, value=metric_value)

In [None]:
# Look the model up by name and display its versions (this is the cell output).
native_registry.get_model(model_name).show_versions()

In [None]:
# optional: set the new version of the model as default
# Assigns the just-logged version's name to the model's `default` attribute.
native_registry.get_model(model_name).default = model_ver.version_name

In [None]:
# Display the versions again, after the default-version assignment.
native_registry.get_model(model_name).show_versions()