### End to End ML In Snowflake

1. Data Generation
2. EDA
3. Feature Engineering
4. Use Feature Store to track engineered features
    - Store feature definitions in feature store for reproducible computation of ML features
5. Train two SnowML models and multiple scikit-learn models
    - Xgboost with tree booster
    - Xgboost with linear booster
6. Register both models in Snowflake model registry
    - Explore model registry capabilities such as metadata tracking, inference, and explainability
7. Set up Model Monitor to track 1 year of predicted and actual loan repayments
    - Compute performance metrics such as F1, Precision, Recall
    - Inspect model drift (i.e. how much has the average predicted repayment rate changed day-to-day)
    - Compare models side-by-side to understand which model should be used in production
    - Identify and understand data issues
8. Track data and model lineage throughout
    - View and understand
        * The origin of the data used for computed features
        * The data used for model training
        * The available model versions being monitored
9. Create an App leveraging the predictions to find answers for business users




In [None]:
-- CREATE OR REPLACE NETWORK RULE allow_all_rule
-- MODE = 'EGRESS'
-- TYPE = 'HOST_PORT'
-- VALUE_LIST = ('0.0.0.0:443','0.0.0.0:80');

-- CREATE OR REPLACE EXTERNAL ACCESS INTEGRATION allow_all_integration
-- ALLOWED_NETWORK_RULES = (allow_all_rule)
-- ENABLED = true;

-- GRANT USAGE ON INTEGRATION allow_all_integration TO ROLE PUBLIC;

In [None]:
#!pip install shap

In [None]:
# Standard Python Libraries
import sys
import json
import warnings
from datetime import timedelta

# Data Manipulation and Analysis
import pandas as pd
import numpy as np

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from snowflake.ml.modeling.xgboost import XGBRegressor, XGBClassifier

# Snowpark Core
from snowflake.snowpark import Session, DataFrame
from snowflake.snowpark.context import get_active_session
from snowflake.snowpark.version import VERSION
import snowflake.snowpark.functions as F
from snowflake.snowpark.exceptions import SnowparkSessionException
from snowflake.snowpark.functions import (sproc, col, dayname, 
                              to_timestamp,min, max,split
)


from snowflake.snowpark import types as T
from snowflake.snowpark.window import Window

# Snowpark ML
from snowflake.ml.modeling.impute import SimpleImputer
from snowflake.ml.modeling.preprocessing import OrdinalEncoder, OneHotEncoder
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.xgboost import XGBRegressor
from snowflake.ml.modeling.model_selection import GridSearchCV
from snowflake.ml.modeling.metrics import mean_absolute_percentage_error
from snowflake.ml.registry import Registry

# Snowflake Feature Store
from snowflake.ml.feature_store import (
    FeatureStore, FeatureView, Entity, CreationMode, setup_feature_store
)

# Snowflake Task API
from snowflake.core import Root
from snowflake.core.database import Database
from snowflake.core.schema import Schema
from snowflake.core.warehouse import Warehouse
from snowflake.core.task import StoredProcedureCall
from snowflake.core.task.dagv1 import DAG, DAGTask, DAGOperation
from snowflake.core._common import CreateMode

# Streamlit
import streamlit as st

# Suppress warnings
warnings.filterwarnings("ignore")


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Create the Model Registry and register your initial model
from snowflake.ml.registry import Registry


In [None]:
# Attach to the session already opened by the notebook runtime
session = get_active_session()
session.sql_simplifier_enabled = True

# A single query returns both the current user and the Snowflake server version
snowflake_environment = session.sql('SELECT current_user(), current_version()').collect()
snowpark_version = VERSION
current_user_name = snowflake_environment[0][0]
snowflake_version = snowflake_environment[0][1]
snowpark_major, snowpark_minor, snowpark_patch = (
    snowpark_version[0],
    snowpark_version[1],
    snowpark_version[2],
)

# Report the connection context so the reader can confirm where the notebook runs
print('\nConnection Established with the following parameters:')
print(f'User                        : {current_user_name}')
print(f'Role                        : {session.get_current_role()}')
print(f'Database                    : {session.get_current_database()}')
print(f'Schema                      : {session.get_current_schema()}')
print(f'Warehouse                   : {session.get_current_warehouse()}')
print(f'Snowflake version           : {snowflake_version}')
print(f'Snowpark for Python version : {snowpark_major}.{snowpark_minor}.{snowpark_patch}')

Data Generation Script TBD

- Do some basic EDA

In [None]:
select * from transactions limit 2;

In [None]:
select * from customer_complaints limit 2;

In [None]:
--drop table fraud_analysis;

Merge two datasets

In [None]:
# Statement that rebuilds FRAUD_ANALYSIS by left-joining every transaction
# to the complaints filed by the same customer_id.
fraud_analysis_ctas = """
CREATE OR REPLACE TABLE fraud_analysis AS
SELECT 
    t.transaction_id, 
    t.customer_id, 
    t.transaction_amount, 
    t.is_fraud, 
    t.merchant_category,
    t.device_type,
    t.location,
    t.transaction_time,
    c.complaint_text, 
    c.keywords,
    c.complaint_time
FROM transactions t
LEFT JOIN customer_complaints c
ON t.customer_id = c.customer_id
"""
session.sql(fraud_analysis_ctas).collect()

# Lazy Snowpark handle on the table that was just (re)created
df = session.table("fraud_analysis")


In [None]:
select * from fraud_analysis limit 2;

In [None]:
df.show(2)

- Let's do feature engineering

In [None]:
-- ALTER TABLE fraud_analysis 
-- ADD COLUMN computed_sentiment STRING;
-- UPDATE fraud_analysis 
-- SET computed_sentiment = SNOWFLAKE.CORTEX.SENTIMENT(complaint_text);
-- SELECT complaint_text, computed_sentiment 
-- FROM fraud_analysis 
-- LIMIT 10;

### Create features with Feature Store

Initialize Feature Store
Let's first create a feature store client. With CREATE_IF_NOT_EXIST mode, it will try to create a new Feature Store schema and all necessary feature store metadata if they don't exist already. This mode is required the first time you set up a Feature Store. Afterwards, you can use FAIL_IF_NOT_EXIST mode to connect to an existing Feature Store.

Note that the database being used must already exist. Feature Store will NOT try to create the database even in CREATE_IF_NOT_EXIST mode.
Generate cumulative behavioral metrics for users based on their transaction data, such as cumulative clicks and cumulative logins per hour. It involves the use of window functions and joins to combine and transform data from the CREDITCARD_TRANSACTIONS table.

In [None]:
df.select(min('TRANSACTION_TIME'), max('TRANSACTION_TIME'))


In [None]:
# Map each engineered feature name to the Snowpark column expression that computes it
from snowflake.snowpark.functions import call_udf

feature_eng_dict = {
    # Re-cast the raw transaction time to a timestamp
    "TRANSACTION_TIME": to_timestamp("TRANSACTION_TIME"),
    # Score the complaint text with the Cortex sentiment function
    "SENTIMENT_SCORE": call_udf("SNOWFLAKE.CORTEX.SENTIMENT", col("complaint_text")),
    # Day-of-week name derived from the transaction time
    "TRANSACTION_DAY": dayname(col("transaction_time")),
}

# Add (or replace) all engineered columns in one call
engineered_names = feature_eng_dict.keys()
engineered_exprs = feature_eng_dict.values()
df = df.with_columns(engineered_names, engineered_exprs)

In [None]:
df.show(2)

In [None]:
df.explain()

Creating Entities

An entity is an abstraction over a set of primary keys used for looking up feature data. An Entity represents a real-world "thing" that has data associated with it. The cell below registers a Customer entity (keyed on CUSTOMER_ID) in the Feature Store.

In [None]:
# Feature Store client bound to the session's current database, schema and warehouse.
# CREATE_IF_NOT_EXIST asks the client to set up the feature store in that schema on first use.
fs = FeatureStore(
    session=session, 
    database=session.get_current_database(), 
    name=session.get_current_schema(), 
    default_warehouse=session.get_current_warehouse(),
    creation_mode=CreationMode.CREATE_IF_NOT_EXIST
)

In [None]:
fs.list_entities()

In [None]:
# Get-or-create the customer entity: reuse the registered definition when it
# exists, otherwise define it and register it with the feature store.
try:
    # Retrieve existing entity
    customer_entity = fs.get_entity('CUSTOMER_ID_ENTITY')
    print('Retrieved existing entity')
except Exception:
    # `except Exception` (instead of a bare `except`) keeps KeyboardInterrupt and
    # SystemExit from being silently turned into a registration attempt.
    # NOTE(review): the concrete exception raised for a missing entity depends on
    # the installed snowflake-ml version — narrow this further once confirmed.
    customer_entity = Entity(
        name="CUSTOMER_ID_ENTITY",
        join_keys=["CUSTOMER_ID"],
        desc="Features defined on a per customer level")

    # Register
    fs.register_entity(customer_entity)
    print("Registered new entity")


In [None]:
fs.list_entities()

In [None]:
# Keep only the entity key plus the engineered feature columns; this frame
# is the input used to define the feature view.
feature_view_columns = ["CUSTOMER_ID", *feature_eng_dict]
feature_df = df.select(feature_view_columns)
feature_df.show(5)

# Using Feature Views

A feature view is a group of logically-related features that are refreshed on the same schedule. The FeatureView constructor accepts a Snowpark DataFrame that contains the feature generation logic. The provided DataFrame must contain the join_keys columns specified in the entities associated with the feature view. In this example we are using time-series data, so we will also specify the timestamp column name.

Below cell creates a feature view for the customer features


In [None]:
# Define the feature view over feature_df, keyed by the customer entity and
# using TRANSACTION_TIME as its timestamp column.
fraud_fv = FeatureView(
    name="FRAUD_FEATURES",
    entities=[customer_entity],
    feature_df=feature_df,
    timestamp_col="TRANSACTION_TIME")

# Register as version "1"; overwrite=True replaces an existing registration of that version.
fraud_fv = fs.register_feature_view(fraud_fv, version="1", overwrite=True)

In [None]:
fraud_fv

In [None]:
fs.list_feature_views()

This completes the setup for the Database objects and Feature Store Producer workflow. The data and the features which have been generated are available to consumers with appropriate privileges. Time to head on to the next notebook!
Generating Datasets for Training
We are now ready to generate our training set. We'll define a spine DataFrame to form the backbone of our generated dataset and pass it into FeatureStore.generate_dataset() along with our Feature Views.

NOTE: The spine serves as a request template and specifies the entities, labels and timestamps (when applicable). The feature store then attaches feature values along the spine using an AS-OF join to efficiently combine and serve the relevant, point-in-time correct feature data.

In [None]:
df.show(2)

In [None]:
# The spine carries the keys, label and timestamp; columns that the feature
# view supplies (or that are raw complaint fields) are removed from it first.
spine_excluded_cols = [
    "SENTIMENT_SCORE",
    "complaint_text",
    "TRANSACTION_DAY",
    "KEYWORDS",
    "COMPLAINT_TIME",
]
training_spine_df = df.drop(*spine_excluded_cols)

ds = fs.generate_dataset(
    name="FRAUD_DETECTION_DATASET_V1",
    spine_df=training_spine_df,
    features=[fraud_fv],
    spine_timestamp_col="TRANSACTION_TIME",
    spine_label_cols=["IS_FRAUD"],
)

In [None]:
ds_sp = ds.read.to_snowpark_dataframe()
ds_sp.show(5)

In [None]:
from snowflake.snowpark.types import StringType
import snowflake.ml.modeling.preprocessing as snowml

# String-typed columns are treated as categorical, except the two identifier columns
OHE_COLS = []
for schema_field in ds_sp.schema.fields:
    if schema_field.name in ('CUSTOMER_ID', 'TRANSACTION_ID'):
        continue
    if isinstance(schema_field.datatype, StringType):
        OHE_COLS.append(schema_field.name)

# Output column name for each encoded input column
OHE_POST_COLS = [f"{column_name}_OHE" for column_name in OHE_COLS]

# One-hot encoder that replaces the categorical inputs with their encoded columns
snowml_ohe = snowml.OneHotEncoder(
    input_cols=OHE_COLS,
    output_cols=OHE_POST_COLS,
    drop_input_cols=True,
)

# Fit on the dataset, then transform that same dataset
fitted_ohe = snowml_ohe.fit(ds_sp)
ds_sp_ohe = fitted_ohe.transform(ds_sp)

# Show the resulting column names
ds_sp_ohe.columns


### Training the model with SnowML


In [None]:
train, test = ds_sp_ohe.random_split(weights=[0.70, 0.30], seed=216)

In [None]:
train = train.fillna(0)
test = test.fillna(0)

In [None]:
train.show(2)

In [None]:
test.show(2)

In [None]:
from snowflake.ml.modeling.xgboost import XGBRegressor, XGBClassifier

# Columns that are identifiers, the timestamp or the label — never model inputs
non_feature_cols = ["IS_FRAUD", "TRANSACTION_TIME", "CUSTOMER_ID", "TRANSACTION_ID"]
model_input_cols = train.drop(non_feature_cols).columns
model_label_cols = train.select("IS_FRAUD").columns

# Tree booster variant (tree_method / n_estimators are left at library defaults)
snow_xgb_tree = XGBClassifier(
    input_cols=list(model_input_cols),
    label_cols=list(model_label_cols),
    output_cols="FRAUD_PREDICTION",
    learning_rate=0.75,
    booster='gbtree',
)

# Linear booster variant, same inputs/label/output as the tree model
snow_xgb_linear = XGBClassifier(
    input_cols=list(model_input_cols),
    label_cols=list(model_label_cols),
    output_cols="FRAUD_PREDICTION",
    booster='gblinear',
)

In [None]:

snow_xgb_tree.fit(train)

In [None]:
snow_xgb_linear.fit(train)

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Score the held-out split with each model and pull label + prediction into pandas
evaluation_cols = ["IS_FRAUD", "FRAUD_PREDICTION"]
test_preds_tree = snow_xgb_tree.predict(test).select(evaluation_cols).to_pandas()
test_preds_linear = snow_xgb_linear.predict(test).select(evaluation_cols).to_pandas()

actual_tree, predicted_tree = test_preds_tree["IS_FRAUD"], test_preds_tree["FRAUD_PREDICTION"]
actual_linear, predicted_linear = test_preds_linear["IS_FRAUD"], test_preds_linear["FRAUD_PREDICTION"]

# Tree booster metrics
f1_tree = f1_score(actual_tree, predicted_tree)
precision_tree = precision_score(actual_tree, predicted_tree)
recall_tree = recall_score(actual_tree, predicted_tree)

# Linear booster metrics
f1_linear = f1_score(actual_linear, predicted_linear)
precision_linear = precision_score(actual_linear, predicted_linear)
recall_linear = recall_score(actual_linear, predicted_linear)

print(f'GB Tree: \n f1: {f1_tree} \n precision {precision_tree} \n recall: {recall_tree}')
print(f'GB Linear: \n f1: {f1_linear} \n precision {precision_linear} \n recall: {recall_linear}')


### Sklearn code 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
# Create the Model Registry and register your initial model
from snowflake.ml.registry import Registry


In [None]:
from sklearn.model_selection import train_test_split
# ds_sp is a Snowpark DataFrame; to_pandas() brings it into a local pandas DataFrame
ds_sp_pandas = ds_sp.to_pandas()

# Model inputs: three categorical columns plus two numeric columns
X = ds_sp_pandas[['DEVICE_TYPE', 'MERCHANT_CATEGORY', 'TRANSACTION_DAY', 
                  'TRANSACTION_AMOUNT', 'SENTIMENT_SCORE']]
# Target label
y = ds_sp_pandas['IS_FRAUD']

# 80/20 train/test split with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Optional: Display shapes to verify the split
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


In [None]:
# These cell-local imports rebind OneHotEncoder / SimpleImputer / Pipeline to the
# scikit-learn classes; the notebook also imports same-named Snowpark ML classes.
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Define categorical and numerical columns
categorical_features = ['DEVICE_TYPE', 'MERCHANT_CATEGORY', 'TRANSACTION_DAY']
numerical_features = ['TRANSACTION_AMOUNT', 'SENTIMENT_SCORE']

# Preprocessing: median-impute + scale numeric columns; mode-impute + one-hot
# encode categorical columns (categories unseen at fit time are ignored).
# NOTE: the three pipelines below share this single preprocessor instance, so
# every fit() refits it on the same X_train.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), numerical_features),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_features)
    ]
)

# Candidate models, each wrapped with the shared preprocessing step
models = {
    "XGBoost": Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', XGBClassifier(use_label_encoder=False, eval_metric='logloss'))
    ]),
    "RandomForest": Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier())
    ]),
    "LogisticRegression": Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression())
    ])
}

# Start below any attainable accuracy so the first trained model is always
# selected; starting at 0 left best_model as None when every score was 0.0,
# which made the summary print below fail.
best_model = None
best_score = float("-inf")

for name, model in models.items():
    print(f"Training {name} model...")
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    score = accuracy_score(y_test, preds)
    print(f"{name} Accuracy: {score:.4f}")

    # Strict comparison: on a tie the earlier model in `models` is kept
    if score > best_score:
        best_model = model
        best_score = score

print(f"Best model: {type(best_model.named_steps['classifier']).__name__} with accuracy {best_score:.4f}")

# Logging the model to Model Registry
   - We will log the XGBoost tree model and the best sklearn model

In [None]:
from snowflake.ml._internal.utils import identifier
# Create a Snowflake model registry object.
# NOTE(review): `identifier._get_unescaped_name` is a private helper (leading
# underscore, `_internal` package). `db` is not used anywhere in this cell, and
# the Registry below is given the raw session values rather than db/schema.
db = identifier._get_unescaped_name(session.get_current_database())
schema = identifier._get_unescaped_name(session.get_current_schema())

# Registry scoped to the session's current database and schema, with the
# "enable_monitoring" option switched on.
model_registry = Registry(session=session, 
                    database_name=session.get_current_database(), 
                    schema_name=session.get_current_schema(),
                    options={"enable_monitoring": True})

In [None]:
# Log the best scikit-learn pipeline to the model registry, reusing the
# registered version when it already exists.
model_name = "FRAUD_ANALYSIS"
version_name = 'V1'

try:
    mv = model_registry.get_model(model_name).version(version_name)
    print("Found existing model version!")
except Exception:
    # `except Exception` (instead of a bare `except`) lets KeyboardInterrupt and
    # SystemExit propagate rather than triggering a model upload.
    # NOTE(review): the lookup failure type depends on the snowflake-ml version;
    # narrow the handler once the concrete exception is confirmed.
    print("Logging new model version...")
    mv = model_registry.log_model(
        model_name=model_name,
        model=best_model,
        version_name=version_name,
        # Sample rows from which the registry derives the model's input signature
        sample_input_data=X_train,
        comment="sklearn model build",
    )

In [None]:
# Log the SnowML tree-booster model to the model registry, reusing the
# registered version when it already exists.
model_name = "FRAUD_ANALYSIS_XGB"
tree_version_name = 'V1'

try:
    mv_tree = model_registry.get_model(model_name).version(tree_version_name)
    print("Found existing model version!")
except Exception:
    # `except Exception` (instead of a bare `except`) lets KeyboardInterrupt and
    # SystemExit propagate rather than triggering a model upload.
    # NOTE(review): the lookup failure type depends on the snowflake-ml version;
    # narrow the handler once the concrete exception is confirmed.
    print("Logging new model version...")
    mv_tree = model_registry.log_model(
        model_name=model_name,
        model=snow_xgb_tree,
        version_name=tree_version_name,
        comment="snow ml model built off feature store using tree booster",
    )
    # Metrics are attached only when a new version is logged; an existing
    # version keeps whatever metrics it already has.
    mv_tree.set_metric(metric_name="F1_score", value=f1_tree)
    mv_tree.set_metric(metric_name="Precision_score", value=precision_tree)
    mv_tree.set_metric(metric_name="Recall_score", value=recall_tree)

In [None]:
model_registry.show_models()

In [None]:
## model_registry.delete_model("FRAUD_ANALYSIS")

In [None]:
model_registry.get_model("FRAUD_ANALYSIS").show_versions()

In [None]:
# Retrieve version v1 of the sklearn fraud model (FRAUD_ANALYSIS)
reg_model = model_registry.get_model("FRAUD_ANALYSIS").version("v1")
# Retrieve version v1 of the SnowML tree-booster fraud model (FRAUD_ANALYSIS_XGB)
reg_model_tree = model_registry.get_model("FRAUD_ANALYSIS_XGB").version("v1")

print(reg_model)
print(reg_model_tree)

In [None]:
print(mv)
print(mv.show_metrics())

print(mv_tree)
print(mv_tree.show_metrics())

# Batch Inferencing on warehouse

#### We have trained two models (sklearn and SnowML); let's predict using both

In [None]:
reg_preds_tree = mv_tree.run(test, function_name = "predict")
reg_probs_tree_prob = mv_tree.run(test, function_name="predict_proba") 
reg_preds_tree.show(2)
reg_probs_tree_prob.show(2)

In [None]:
# Score the full encoded dataset, then keep only the identifiers and the two
# class-probability columns.
scored_full_dataset = mv_tree.run(ds_sp_ohe, function_name="predict_proba")
reg_probs_tree_prob_complete = scored_full_dataset.select(
    "CUSTOMER_ID",
    "TRANSACTION_ID",
    "PREDICT_PROBA_0",
    "PREDICT_PROBA_1",
)

reg_probs_tree_prob_complete.show(3)

In [None]:
### Earlier data set before split 
ds_sp.show(5)

In [None]:
schema = ds_sp.schema  # Correct: Access the schema as an attribute
print(schema)




In [None]:
schema = reg_probs_tree_prob_complete.schema  # Correct: Access the schema as an attribute
print(schema)


In [None]:
print("ds_sp columns:", ds_sp.columns)
print("reg_probs_tree_prob_complete columns:", reg_probs_tree_prob_complete.columns)

In [None]:
# Attach the predicted probabilities to the pre-split dataset, matching rows
# on both identifier columns; dataset rows without a prediction are kept.
prediction_join_keys = ["CUSTOMER_ID", "TRANSACTION_ID"]
joined_df = ds_sp.join(
    reg_probs_tree_prob_complete,
    on=prediction_join_keys,
    join_type="left",
)

joined_df.show(2)
# # Save the dataframe
#joined_df.write.mode("overwrite").save_as_table("FT_PREDICTION_FINAL")

In [None]:
# # left join master dataset
# # This dataset will be used for apps and other metrics 

# joined_df = ds_sp.join(
#     reg_probs_tree_prob_complete,
#     (ds_sp["CUSTOMER_ID"] == reg_probs_tree_prob_complete["CUSTOMER_ID"]) & 
#     (ds_sp["TRANSACTION_ID"] == reg_probs_tree_prob_complete["TRANSACTION_ID"]),
#     join_type="left",
# )

# # Prioritize columns from ds_sp and select all columns without renaming
# selected_columns = [ds_sp[col] for col in ds_sp.columns] + \
#                    [reg_probs_tree_prob_complete["predict_proba_0"], reg_probs_tree_prob_complete["predict_proba_1"]]

# # Use select to build dataframe with those specified columns and save the table
# final_df = joined_df.select(*selected_columns)

# #final_df.show(2)
# # Save the dataframe
# #final_df.write.mode("overwrite").save_as_table("FT_Prediction")


In [None]:
select * from FT_Prediction limit 2; 

In [None]:
# Compare the column sets of the two prediction tables.
# NOTE(review): the cells that would write these tables are commented out in
# this notebook — confirm both tables exist before running this cell.
cols_1 = session.table("FT_Prediction").columns
cols_2 = session.table("FT_Prediction_v1").columns

# Sets give constant-time membership checks; the lists keep the table order
cols_1_lookup = set(cols_1)
cols_2_lookup = set(cols_2)

# Print columns in first table but not in second
print("Columns in FT_Prediction but not in FT_Prediction_v1:")
print([column_name for column_name in cols_1 if column_name not in cols_2_lookup])

# Print columns in second table but not in first
# (the label previously named a non-existent table, "FT_Prediction_1")
print("Columns in FT_Prediction_v1 but not in FT_Prediction:")
print([column_name for column_name in cols_2 if column_name not in cols_1_lookup])

# Print common columns
print("Common columns:")
print([column_name for column_name in cols_1 if column_name in cols_2_lookup])

## Batch prediction using sk learn model 

In [None]:
reg_probs_sk = mv.run(X_test, function_name="predict_proba") 
reg_preds_sk = mv.run(X_test, function_name = "predict")


In [None]:
print(reg_probs_sk.head(2))
print(reg_preds_sk.head(2))

# Real time inferencing deployment on Snowpark Container Services

In [None]:
# Define model name
model_name = "FRAUD_ANALYSIS_SPCS"
image_repo_name = "AI_ML_REPO"
cp_name = "AI_ML_CP"
num_spcs_nodes = '3'
spcs_instance_family = 'CPU_X64_L'
service_name = 'FRAUD_DETECTION_SERVICE'

In [None]:
current_database = session.get_current_database().replace('"', '')
current_schema = session.get_current_schema().replace('"', '')
extended_image_repo_name = f"{current_database}.{current_schema}.{image_repo_name}"
extended_service_name = f'{current_database}.{current_schema}.{service_name}'

In [None]:
# Rebuild the compute pool from scratch on every run: stop everything running
# on it, drop it, recreate it with a fixed node count, then show its description.
session.sql(f"alter compute pool if exists {cp_name} stop all").collect()
session.sql(f"drop compute pool if exists {cp_name}").collect()
# min_nodes == max_nodes, so the pool does not scale; it auto-suspends after 300 seconds
session.sql(f"create compute pool {cp_name} min_nodes={num_spcs_nodes} max_nodes={num_spcs_nodes} instance_family={spcs_instance_family} auto_resume=True auto_suspend_secs=300").collect()
session.sql(f"describe compute pool {cp_name}").show()

In [None]:
session.sql(f"create image repository if not exists {extended_image_repo_name}").collect()

In [None]:
mv_tree.create_service(
    service_name=extended_service_name,
    service_compute_pool=cp_name,
    image_repo=extended_image_repo_name,
    ingress_enabled=True,
    max_instances=int(num_spcs_nodes),
    build_external_access_integration="ALLOW_ALL_INTEGRATION"
)




In [None]:
mv_tree.list_services()

In [None]:
session.sql(f"SELECT VALUE:status::VARCHAR as SERVICESTATUS, VALUE:message::VARCHAR as SERVICEMESSAGE FROM TABLE(FLATTEN(input => parse_json(system$get_service_status('{service_name}')), outer => true)) f").show(100)

In [None]:
session.sql(f"show endpoints in service {service_name}").collect()[0]["ingress_url"]

In [None]:
test.limit(1).show()

## Batch  inferencing using SPCS

In [None]:
mv_tree.run(test.limit(4) , service_name=service_name, function_name="predict")


In [None]:
mv_tree.run(test.limit(4) , service_name=service_name, function_name="predict_proba")

### Feature importance and score 

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Read BOTH the feature names and the importances from the fitted RandomForest
# pipeline. Taking the importances from `best_model` was wrong: it may be the
# LogisticRegression pipeline (which has no `feature_importances_`) or XGBoost,
# while the names and the plot title refer to RandomForest.
rf_pipeline = models["RandomForest"]
rf_preprocessor = rf_pipeline.named_steps["preprocessor"]

# transformers_[0] is the numeric branch; its third element is the column list
numeric_feature_names = list(rf_preprocessor.transformers_[0][2])
# transformers_[1] is the categorical branch; expand names through its encoder
encoded_feature_names = list(
    rf_preprocessor.transformers_[1][1]
    .named_steps["encoder"]
    .get_feature_names_out(categorical_features)
)
feature_names = numeric_feature_names + encoded_feature_names

# Get feature importance from the RandomForestClassifier
importances = rf_pipeline.named_steps["classifier"].feature_importances_

# Convert to DataFrame for better readability, most important feature first
feature_importance_df = pd.DataFrame(
    {"FEATURE": feature_names, "IMPORTANCE": importances}
).sort_values(by="IMPORTANCE", ascending=False)

# Print feature importances
#print(feature_importance_df)

snowpark_df = session.create_dataframe(feature_importance_df)

#snowpark_df
#snowpark_df.write.mode("overwrite").save_as_table("feature_importance_df")



In [None]:
select Feature from feature_importance_df;

In [None]:
# Plot feature importance as a horizontal bar chart.
# The DataFrame columns are named "FEATURE" and "IMPORTANCE" (upper case);
# the previous "Feature"/"Importance" lookups raised KeyError.
plt.figure(figsize=(10, 5))
plt.barh(feature_importance_df["FEATURE"], feature_importance_df["IMPORTANCE"])
plt.xlabel("Importance Score")
plt.ylabel("Features")
plt.title("Feature Importance for RandomForestClassifier Model")
# First row (drawn at the bottom by default) is moved to the top of the chart
plt.gca().invert_yaxis()
plt.show()

# Model explainability

In [None]:
shap_vals = mv_tree.run(test.sample(n=1000), function_name="explain")
shap_vals

In [None]:
shap_pd = shap_vals.to_pandas()
shap_pd.head(2)

In [None]:
# !pip install shap

In [None]:
import shap

# Split the explain() output by column NAME instead of by a fixed position.
# The hard-coded `iloc[:, 10:]` split and drop list did not match this
# dataset's columns, so the SHAP matrix and the input frame could not line up.
# NOTE(review): assumes each explanation column is named "<feature>_explanation"
# (the naming used by the scatterplot cells below) — confirm against shap_pd.columns.
explanation_suffix = "_explanation"
feature_explanation_pairs = [
    (column_name[: -len(explanation_suffix)], column_name)
    for column_name in shap_pd.columns
    if column_name.lower().endswith(explanation_suffix)
]
# Keep only explanations whose input feature column is also present, so both
# frames have the same columns in the same order.
feature_explanation_pairs = [
    (feature_col, explanation_col)
    for feature_col, explanation_col in feature_explanation_pairs
    if feature_col in shap_pd.columns
]
if not feature_explanation_pairs:
    raise ValueError(
        "No '<feature>_explanation' columns with a matching input column were found in shap_pd"
    )

just_shap = shap_pd[[explanation_col for _, explanation_col in feature_explanation_pairs]]
just_input_vals = shap_pd[[feature_col for feature_col, _ in feature_explanation_pairs]]

shap.summary_plot(np.array(just_shap), just_input_vals, feature_names=just_input_vals.columns)

In [None]:
# Mean SHAP value per feature, largest first. Columns are selected by their
# "_explanation" suffix rather than by position, because a fixed `iloc[:, 10:]`
# offset does not match this dataset's column layout.
# NOTE(review): assumes explain() names its output columns "<feature>_explanation" — confirm.
shap_pd.loc[:, shap_pd.columns.str.lower().str.endswith("_explanation")].mean(axis=0).sort_values(ascending=False)

In [None]:
import seaborn as sns

# Feature value vs. its SHAP contribution for one fraud-model input.
# The previous columns ("LOAN_PURPOSE_NAME_Home purchase...") belong to a loan
# dataset and are not in this notebook's data; SENTIMENT_SCORE is one of its inputs.
# NOTE(review): assumes the explanation column is "SENTIMENT_SCORE_explanation" — confirm.
sns.scatterplot(data=shap_pd, x="SENTIMENT_SCORE", y="SENTIMENT_SCORE_explanation")

In [None]:
import seaborn as sns

# Feature value vs. SHAP contribution for the transaction amount, restricted to
# amounts strictly between 0 and 1,000,000. The previous INCOME column belongs
# to a loan dataset and is not in this notebook's data.
# NOTE(review): assumes the explanation column is "TRANSACTION_AMOUNT_explanation" — confirm.
amount_0_to_1M = shap_pd[(shap_pd.TRANSACTION_AMOUNT > 0) & (shap_pd.TRANSACTION_AMOUNT < 1000000)]
sns.scatterplot(data=amount_0_to_1M, x="TRANSACTION_AMOUNT", y="TRANSACTION_AMOUNT_explanation")

## Model Monitoring setup

#### do model drift 
#### observability 
### Explainability

In [None]:
-- Drift of one monitored column over time.
-- The first argument is the MODEL MONITOR name (not the model name) and the
-- second is a drift metric; 'accuracy_score' is a performance metric, not a drift metric.
-- NOTE(review): requires the FRAUD_ANALYSIS_MONITOR monitor to exist first.
SELECT *
FROM TABLE(MODEL_MONITOR_DRIFT_METRIC (
    'FRAUD_ANALYSIS_MONITOR',          -- The name of the model monitor to query
    'DIFFERENCE_OF_MEANS',             -- The drift metric (e.g., DIFFERENCE_OF_MEANS, JENSEN_SHANNON)
    'TRANSACTION_AMOUNT',              -- The column you want to monitor for drift
    '1 DAY',                           -- Granularity (e.g., '1 DAY', '1 WEEK', '1 MONTH')
    TO_TIMESTAMP_TZ('2025-01-01'),     -- Start time
    TO_TIMESTAMP_TZ('2025-03-01')      -- End time
));




In [None]:
-- Scheduled task that calls the monitoring procedure on the cron schedule below
-- (minute 0, hour 0, every day, UTC).
-- NOTE(review): MONITOR_MODEL_PERFORMANCE is not defined anywhere in this
-- notebook — confirm the procedure exists before relying on this task.
CREATE OR REPLACE TASK MY_MODEL_TASK
    WAREHOUSE = 'FRAUD_WH'  -- Specify your warehouse
    SCHEDULE = 'USING CRON 0 0 * * * UTC'  -- Schedule it daily (adjust cron as needed)
AS
    CALL MONITOR_MODEL_PERFORMANCE();  -- Replace with your actual monitoring logic or stored procedure


In [None]:
-- Model monitor for the FRAUD_ANALYSIS model: refreshed daily, aggregated over
-- 30-day windows, keyed by CUSTOMER_ID and timestamped by TRANSACTION_TIME.
-- NOTE(review): this notebook only logs version 'V1' of FRAUD_ANALYSIS — confirm 'V3' exists.
-- NOTE(review): 'MY_WAREHOUSE' looks like a placeholder (the task cell uses 'FRAUD_WH') — confirm.
-- NOTE(review): SOURCE must expose PREDICTED_IS_FRAUD and IS_FRAUD; the
-- fraud_analysis query in this notebook does not select a PREDICTED_IS_FRAUD
-- column from TRANSACTIONS — verify the source and the baseline table.
CREATE OR REPLACE MODEL MONITOR FRAUD_ANALYSIS_MONITOR
    WITH
        MODEL = 'FRAUD_ANALYSIS'
        VERSION = 'V3'
        FUNCTION = 'PREDICT'
        SOURCE = 'TRANSACTIONS'
        WAREHOUSE = 'MY_WAREHOUSE'
        REFRESH_INTERVAL = '1 DAY'
        AGGREGATION_WINDOW = '30 DAY'
        TIMESTAMP_COLUMN = 'TRANSACTION_TIME'
        BASELINE = 'initial_model_baseline'
        ID_COLUMNS = ('CUSTOMER_ID')
        PREDICTION_CLASS_COLUMNS = ('PREDICTED_IS_FRAUD')
        ACTUAL_CLASS_COLUMNS = ('IS_FRAUD');
