# Model and Feature Data Drift in Snowflake


In [None]:
!pip install -q snowflake-ml-python==1.5.0
!pip install -q matplotlib
!pip install -q seaborn
!pip install -q evidently

In [1]:
# Import python packages
#import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# We can also use Snowpark for our analyses!
# `session` is obtained from get_active_session() and is the handle the later cells use
# for SQL, table reads, the model registry and the feature store.
from snowflake.snowpark.context import get_active_session
session = get_active_session()

In [None]:
from snowflake.snowpark.version import VERSION
from snowflake.ml import version

# One query returns both values that are read through SQL (user and server version).
snowflake_environment = session.sql('select current_user(), current_version()').collect()

# Current Environment Details
print('User                        : {}'.format(snowflake_environment[0][0]))
print('Role                        : {}'.format(session.get_current_role()))
print('Database                    : {}'.format(session.get_current_database()))
print('Schema                      : {}'.format(session.get_current_schema()))
print('Warehouse                   : {}'.format(session.get_current_warehouse()))
print('Snowflake version           : {}'.format(snowflake_environment[0][1]))
print('Snowpark for Python version : {}.{}.{}'.format(VERSION[0],VERSION[1],VERSION[2]))
# snowflake.ml exposes its version as a string, so print it whole. Picking characters
# 0, 2 and 4 only works while every component is a single digit ("1.10.0" -> "1.1.").
print('Snowflake ML version        : {}'.format(version.VERSION))

In [None]:
#import warnings
#warnings.filterwarnings("ignore", message=".*The 'nopython' keyword.*")

## Save metrics in the model registry

In [None]:
from snowflake.ml.registry import Registry

# Evaluation data: read the table by its fully qualified name. `test` is scored by the
# model in a later cell.
#get data
test = session.read.table("RAJIV.DOCAI.FRAUD_TRAINING_DATA")

# Open the registry in FRAUD_FEATURE_STORE.FEATURE_STORE and pick version V1 of FRAUD_MODEL.
#get model
reg = Registry(session=session, database_name="FRAUD_FEATURE_STORE", schema_name="FEATURE_STORE")
m = reg.get_model("FRAUD_MODEL").version("V1")

In [None]:
# Preview the evaluation table before scoring it.
test.show()

Save relevant metrics to the model in the registry

In [None]:
from snowflake.ml.modeling.metrics import accuracy_score
# Score the evaluation table with the model version's `predict` function.
preds = m.run(test, function_name='predict')

# Accuracy of the prediction column against the TRANSACTION_FLAG label column.
# The prediction column name is passed wrapped in double quotes.
acc = accuracy_score(df=preds, y_true_col_names='TRANSACTION_FLAG', y_pred_col_names='"OUTPUT_TRANSACTION_FLAG"')
print("Accuracy: ",acc)

# Store the value on the model version under the metric name "Accuracy".
m.set_metric("Accuracy", value=acc)

In [None]:
# List the models held in the registry opened earlier.
reg.show_models()

## Let's evaluate Model Performance over time using our Feature Store

In [None]:
# Transactions table; used as the spine when the dataset is generated in a later cell.
spine_df = session.table('FRAUD_FEATURE_STORE.FEATURE_STORE.FR_TRANSACTIONS')
# NOTE(review): to_pandas() brings the whole table to the client just to display the
# last five rows - consider limiting on the Snowflake side if the table is large.
spine_df.to_pandas().tail()

In [None]:
from snowflake.ml.feature_store import FeatureStore, FeatureView

# Feature store FRAUD_FEATURE_STORE.FEATURE_STORE, using the session's current warehouse
# as its default warehouse.
fs = FeatureStore(
    session=session,
    database="FRAUD_FEATURE_STORE",
    name="FEATURE_STORE",
    default_warehouse=session.get_current_warehouse(),
)

# Version V2 of the two feature views that are joined onto the transactions.
# These are type annotations: the earlier `x = FeatureView = ...` chained assignment
# rebound the name `FeatureView` to a feature-view instance.
alert_fv: FeatureView = fs.get_feature_view(
    name='ALERT_FEATURES',
    version='V2'
)

account_fv: FeatureView = fs.get_feature_view(
    name='ACCOUNT_FEATURES',
    version='V2'
)

In [None]:
# Build dataset 'FRAUD_CLASSIFICATION' version 'V17' from the spine plus the alert and
# account feature views, using TRANSACTION_TIME_UTC as the spine timestamp column and
# TRANSACTION_FLAG as the label column.
# NOTE(review): name and version are hard-coded; presumably a re-run is rejected once
# version 'V17' already exists - confirm, and bump the version if so.
training_data = fs.generate_dataset(
    name='FRAUD_CLASSIFICATION',
    version='V17',
    spine_df=spine_df,
    features=[alert_fv, account_fv],
    spine_timestamp_col="TRANSACTION_TIME_UTC",
    spine_label_cols = ["TRANSACTION_FLAG"],
)

In [None]:
## Magic to make it all interesting
# Load the generated dataset into pandas and tag every row with its ISO week number.
training_data_pd = training_data.read.to_pandas()
training_data_pd['WEEK_OF_YEAR'] = training_data_pd['TRANSACTION_TIME_UTC'].dt.isocalendar().week

# Synthetic drift: resample 1000 week-21 rows (with replacement) and inflate their amounts.
# The explicit copy keeps the in-place multiply off a view of `training_data_pd`.
week_21_transactions = training_data_pd[training_data_pd['WEEK_OF_YEAR'] == 21]
fraudulent_samples = week_21_transactions.sample(n=1000, random_state=42, replace=True).copy()
# Seeded generator: the row sample was already seeded, but the multipliers were not,
# so every run used to produce a different drift.
drift_rng = np.random.default_rng(42)
fraudulent_samples['TRANSACTION_AMOUNT'] *= drift_rng.uniform(2, 5, size=len(fraudulent_samples))  # Scale amount by a factor of 2x to 5x
training_data_pd = pd.concat([training_data_pd, fraudulent_samples], ignore_index=True)

# Flag rows whose amount exceeds 2000 or whose TRANSACTION_FLAG exceeds 30, then flip the
# 0/1 value so that IS_FRAUD is 1 for the rows that were NOT flagged.
# NOTE(review): the inversion looks deliberate for the demo - confirm it matches the
# intended meaning of IS_FRAUD.
training_data_pd['IS_FRAUD'] = ((training_data_pd['TRANSACTION_AMOUNT'] > 2000) | (training_data_pd['TRANSACTION_FLAG'] > 30)).astype(int)
training_data_pd['IS_FRAUD'] = 1 - training_data_pd['IS_FRAUD']

# Keep only the two weeks that are compared in the rest of the notebook.
filtered_df = training_data_pd[training_data_pd['WEEK_OF_YEAR'].isin([17, 21])]


We have now pulled a dataset from the feature store containing the latest transactions, along with account information, alert information, and the likelihood of fraud.

In [None]:
# Show the last rows of the week 17 / week 21 subset.
filtered_df.tail()

### Let's explore the drift in the likelihood of Fraud

In [None]:
# IS_FRAUD values for each of the two weeks being compared.
fraud_week17 = filtered_df.loc[filtered_df['WEEK_OF_YEAR'] == 17, 'IS_FRAUD']
fraud_week21 = filtered_df.loc[filtered_df['WEEK_OF_YEAR'] == 21, 'IS_FRAUD']

# Overlay both kernel density estimates on a single axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.kdeplot(data=fraud_week17, label='Week 17', fill=True, color='blue', ax=ax)
sns.kdeplot(data=fraud_week21, label='Week 21', fill=True, color='red', ax=ax)
ax.set_title('Kernel Density Estimate of Fraud for Weeks 17 and 21')
ax.set_xlabel('Likelihood of Fraud')
ax.set_ylabel('Density')
ax.legend()
plt.show()

In [None]:
def calculate_psi(expected_array, actual_array, buckets=10):
    """Compute the Population Stability Index (PSI) between two samples.

    The bins are quantile bins of ``expected_array``. The two outermost bins are
    open-ended, so values in ``actual_array`` that fall below the minimum or above
    the maximum of ``expected_array`` are counted instead of being dropped by
    ``np.histogram``.

    Args:
        expected_array: Reference sample (array-like of numbers).
        actual_array: Sample to compare against the reference (array-like of numbers).
        buckets: Number of quantile bins requested. Fewer bins are used when several
            percentiles of ``expected_array`` coincide (heavily tied or discrete data).

    Returns:
        The PSI as a float: the sum over bins of
        ``(actual_pct - expected_pct) * ln(actual_pct / expected_pct)``.

    Raises:
        ValueError: If ``buckets`` is smaller than 1, or if either sample contains
            no non-NaN value.
    """
    if buckets < 1:
        raise ValueError("buckets must be a positive integer")

    # NaNs would turn every percentile into NaN, so they are removed from both samples.
    expected = np.asarray(expected_array, dtype=float).ravel()
    actual = np.asarray(actual_array, dtype=float).ravel()
    expected = expected[~np.isnan(expected)]
    actual = actual[~np.isnan(actual)]
    if expected.size == 0 or actual.size == 0:
        raise ValueError("calculate_psi needs at least one non-NaN value in each sample")

    # Interior quantile edges only; duplicates (tied percentiles) would create
    # zero-width bins, so they are collapsed.
    quantiles = np.percentile(expected, np.linspace(0, 100, buckets + 1))
    inner_edges = np.unique(quantiles[1:-1])
    # Open-ended outer bins catch values outside the expected sample's range.
    bin_edges = np.concatenate(([-np.inf], inner_edges, [np.inf]))

    expected_counts = np.histogram(expected, bin_edges)[0]
    actual_counts = np.histogram(actual, bin_edges)[0]

    # Smoothing keeps empty bins away from log(0) and division by zero.
    epsilon = 1e-10
    expected_percents = (expected_counts + epsilon) / expected_counts.sum()
    actual_percents = (actual_counts + epsilon) / actual_counts.sum()

    psi_values = (actual_percents - expected_percents) * np.log(actual_percents / expected_percents)
    return np.sum(psi_values)

In [None]:
# PSI of IS_FRAUD, with week 17 as the expected sample and week 21 as the actual sample.
psi = calculate_psi(fraud_week17,fraud_week21)
print(f"Fraud PSI: {psi}")

## Let's investigate some of the features

In [None]:
# TRANSACTION_AMOUNT values for each of the two weeks being compared.
trans_amount17 = filtered_df.loc[filtered_df['WEEK_OF_YEAR'] == 17, 'TRANSACTION_AMOUNT']
trans_amount21 = filtered_df.loc[filtered_df['WEEK_OF_YEAR'] == 21, 'TRANSACTION_AMOUNT']

# Overlay both kernel density estimates on a single axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.kdeplot(data=trans_amount17, label='Week 17', fill=True, color='blue', ax=ax)
sns.kdeplot(data=trans_amount21, label='Week 21', fill=True, color='red', ax=ax)
ax.set_title('Kernel Density Estimate of TRANSACTION_AMOUNT for Weeks 17 and 21')
ax.set_xlabel('TRANSACTION_AMOUNT')
ax.set_ylabel('Density')
ax.legend()
plt.show()

# PSI over the same two series, week 17 as expected and week 21 as actual.
psi = calculate_psi(trans_amount17, trans_amount21)
print(f"PSI: {psi}")

In [None]:
# AVG60MIN_ALERT_MM_H values for each of the two weeks being compared.
alert_week17 = filtered_df.loc[filtered_df['WEEK_OF_YEAR'] == 17, 'AVG60MIN_ALERT_MM_H']
alert_week21 = filtered_df.loc[filtered_df['WEEK_OF_YEAR'] == 21, 'AVG60MIN_ALERT_MM_H']

# Overlay both kernel density estimates on a single axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.kdeplot(data=alert_week17, label='Week 17', fill=True, color='blue', ax=ax)
sns.kdeplot(data=alert_week21, label='Week 21', fill=True, color='red', ax=ax)
ax.set_title('Kernel Density Estimate of 60 Minute Average for Alerts for Weeks 17 and 21')
ax.set_xlabel('Averaging alerts over 60 minute window for a merchant')
ax.set_ylabel('Density')
ax.legend()
plt.show()

# PSI over the same two series, week 17 as expected and week 21 as actual.
psi = calculate_psi(alert_week17, alert_week21)
print(f"PSI: {psi}")

### Let's try an open source package - evidently

In [None]:
from evidently import ColumnMapping

from evidently.report import Report
from evidently.metrics.base_metric import generate_column_metrics
from evidently.metric_preset import DataDriftPreset, TargetDriftPreset
from evidently.metrics import *

from evidently.test_suite import TestSuite
from evidently.tests.base_test import generate_column_tests
from evidently.test_preset import DataStabilityTestPreset, NoTargetPerformanceTestPreset
from evidently.tests import *

In [None]:
# Columns handed to the drift report, selected once and reused for both weeks.
drift_columns = ['TRANSACTION_AMOUNT', 'TRANSACTION_FLAG', 'AVG60MIN_ALERT_MM_H']
week17 = filtered_df.loc[filtered_df['WEEK_OF_YEAR'] == 17, drift_columns]
week21 = filtered_df.loc[filtered_df['WEEK_OF_YEAR'] == 21, drift_columns]

In [None]:
# Report configured with the data drift preset as its only metric.
report = Report(metrics=[
    DataDriftPreset(), 
])

# Week 17 is passed as the reference data, week 21 as the current data.
report.run(reference_data=week17, current_data=week21)

In [None]:
# Display the report results as a plain dictionary.
report.as_dict()