# GDS For Snowflake Demo: Entity Resolution & ML

## Setup
Make sure to install these packages using the Snowflake Packages dropdown:
- `snowflake-ml-python`
- `scikit-learn`
- `matplotlib`
- `seaborn`
- `streamlit`

In [None]:
# Snowpark for Python
from snowflake.snowpark import Session
from snowflake.snowpark.version import VERSION
from snowflake.snowpark.functions import udf
import snowflake.snowpark.functions as F

# Snowpark ML
from snowflake.ml.modeling.xgboost import XGBClassifier
from snowflake.ml.modeling.model_selection import GridSearchCV
from snowflake.ml.registry import Registry
from snowflake.ml._internal.utils import identifier

# data science libs
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from snowflake.ml.modeling.metrics import confusion_matrix

# misc
import json
import joblib
import cachetools
import streamlit as st
import plotly
import dash_cytoscape

# warning suppression
import warnings; warnings.simplefilter('ignore')


# We can also use Snowpark for our analyses!
# Grab the Snowpark session that is already active for this notebook instead of
# building a new connection; `session` is used by the cells below to run SQL.
from snowflake.snowpark.context import get_active_session
session = get_active_session()


In [None]:
# Ask Snowflake which user we are connected as and which server version is running.
# The single result row holds (current_user, current_version).
snowflake_environment = session.sql('SELECT current_user(), current_version()').collect()
snowpark_version = VERSION

# Report the connection context of this notebook run.
print('\nConnection Established with the following parameters:')
print(f'User                        : {snowflake_environment[0][0]}')
print(f'Role                        : {session.get_current_role()}')
print(f'Database                    : {session.get_current_database()}')
print(f'Schema                      : {session.get_current_schema()}')
print(f'Warehouse                   : {session.get_current_warehouse()}')
print(f'Snowflake version           : {snowflake_environment[0][1]}')
# Only the first three components of the Snowpark VERSION are printed (major.minor.patch).
print(f'Snowpark for Python version : {snowpark_version[0]}.{snowpark_version[1]}.{snowpark_version[2]}')

## Combining Graph Features & Additional Fraud Risk Labeling

We will now combine the GDS and original features into one table for downstream machine learning.  We will create a new Fraud_Risk label that is 1 for all nodes in a WCC community that contains a flagged user account.  This will help extend our known fraud labels.

In [None]:
-- Build the user_features view: every p2p_users column (nodeId exposed as
-- user_id) plus the WCC community attributes and the two PageRank scores.
-- All joins are plain (inner) JOINs, so a user missing from any of the joined
-- tables does not appear in the view.
CREATE OR REPLACE VIEW user_features AS
SELECT p2p_users.* RENAME nodeId AS user_id,
    gds_features.wcc_id,
    -- community-level has_fraud_flag is exposed as the fraud_risk label
    gds_features.has_fraud_flag AS fraud_risk,
    gds_features.user_count AS community_size,
    -- boolean "community has more than one user" converted to a number
    TO_NUMBER(gds_features.user_count > 1) AS part_of_community,
    gds_features.entity_link_pagerank,
    gds_features.transaction_pagerank
-- join users to gds features
FROM p2p_users JOIN (
    -- join resolved user view with gds algo results on wcc_id
    -- NOTE(review): presumably resolved_p2p_users has one row per wcc_id;
    -- otherwise this join would duplicate users. Confirm against its definition.
    SELECT resolved_p2p_users.wcc_id, 
        resolved_p2p_users.user_count, 
        resolved_p2p_users.has_fraud_flag,
        gds_algo_results.entity_link_pagerank,
        gds_algo_results.transaction_pagerank,
        gds_algo_results.nodeId
    FROM resolved_p2p_users JOIN (
        -- join gds output tables on node
        SELECT p2p_components.nodeId, 
            p2p_components.wcc_id, 
            pagerank.entity_link_pagerank,
            pagerank.transaction_pagerank
        FROM p2p_components 
        JOIN (
            -- pair the two PageRank scores of each node into one row
            SELECT p2p_entity_link_pagerank.nodeId, 
                p2p_entity_link_pagerank.score AS entity_link_pagerank,
                p2p_transaction_pagerank.score AS transaction_pagerank
            FROM p2p_entity_link_pagerank 
            JOIN p2p_transaction_pagerank ON p2p_transaction_pagerank.nodeId=p2p_entity_link_pagerank.nodeId
        ) pagerank ON p2p_components.nodeId = pagerank.nodeId
    ) gds_algo_results ON gds_algo_results.wcc_id = resolved_p2p_users.wcc_id
) gds_features ON user_id = gds_features.nodeId;
-- Query the freshly created view.
SELECT * FROM user_features

### Additional Fraud Risk Labels from WCC Entity Resolution

In [None]:
# Pull the feature table into pandas for local analysis and modeling.
# NOTE(review): `feature_table` is not defined in the Python shown here;
# presumably it is the result of the SQL cell above - confirm the cell name.
user_feat_df = feature_table.to_pandas()

# Compare the original fraud flags with the community-propagated fraud-risk label.
flagged_num = user_feat_df.FRAUD_TRANSFER_FLAG.sum()
fraud_risk_num = user_feat_df.FRAUD_RISK.sum()
newly_labeled_num = fraud_risk_num - flagged_num

print(f'# of original flagged accounts: {flagged_num:.0f}')
print(f'# of newly labeled fraud risk accounts from WCC: {fraud_risk_num:.0f}')
if flagged_num:
    print(f'{newly_labeled_num:.0f} new accounts identified, a {100*newly_labeled_num/flagged_num:.1f}% increase')
else:
    # With no originally flagged accounts the relative increase is undefined;
    # skip the percentage instead of printing inf/nan.
    print(f'{newly_labeled_num:.0f} new accounts identified')

## Use Graph Features to Improve ML Model
Below we compare a baseline fraud classifier (not using the new graph features) against one that uses graph features.  We will see a significant lift in AUC and PR-AUC.

In [None]:
# Categorize all the features for modeling
NUMERICAL_COLUMNS = ['IP_COUNT', 'CARD_COUNT', 'DEVICE_COUNT']
GRAPH_NUMERICAL_COLUMNS = ['COMMUNITY_SIZE', 'PART_OF_COMMUNITY', 'ENTITY_LINK_PAGERANK', 'TRANSACTION_PAGERANK']

LABEL_COLUMN = 'FRAUD_RISK'

In [None]:
from sklearn.model_selection import train_test_split

# Full feature matrix (baseline + graph features) and the target label.
# The label is selected through LABEL_COLUMN so the target is defined in one place.
X_graph = user_feat_df[NUMERICAL_COLUMNS + GRAPH_NUMERICAL_COLUMNS]
y = user_feat_df[LABEL_COLUMN]

# Single 80/20 split with a fixed seed so results are reproducible.
X_graph_train, X_graph_test, y_train, y_test = train_test_split(X_graph, y, test_size=0.2, random_state=0)

# The baseline sets are column subsets of the same split, so both models are
# trained and evaluated on exactly the same rows.
X_baseline_train = X_graph_train[NUMERICAL_COLUMNS]
X_baseline_test = X_graph_test[NUMERICAL_COLUMNS]

In [None]:
# Display the label series as the cell output for a quick visual check.
y

## Baseline Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline model: a random forest trained on the non-graph features only.
# The hyper-parameters match the graph-feature model so the comparison is fair;
# class_weight='balanced' reweights classes by inverse frequency.
clf_baseline = RandomForestClassifier(
    n_estimators=500,
    max_depth=5,
    bootstrap=True,
    class_weight='balanced',
    random_state=0,
)
clf_baseline.fit(X_baseline_train, y_train)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Mean accuracy of the baseline model on the held-out test split.
baseline_accuracy = clf_baseline.score(X_baseline_test, y_test)
print(f'Accuracy of baseline classifier on test set: {baseline_accuracy:.2f}')
print('\nConfusion Matrix: ')

# normalize='true' scales each row by the number of true samples in that class,
# so the cells read as per-class rates rather than raw counts.
y_baseline_pred = clf_baseline.predict(X_baseline_test)
disp = ConfusionMatrixDisplay.from_predictions(
    y_test,
    y_baseline_pred,
    display_labels=clf_baseline.classes_,
    normalize='true',
    cmap='Greys',
    colorbar=False,
)

In [None]:
from sklearn.metrics import RocCurveDisplay

# ROC curve of the baseline model on the held-out test split.
display = RocCurveDisplay.from_estimator(
    clf_baseline,
    X_baseline_test,
    y_test,
    name="Baseline Model",
)
# Assign to `_` so the returned title object is not echoed as cell output.
_ = display.ax_.set_title("ROC Curve")

In [None]:
from sklearn.metrics import PrecisionRecallDisplay

# Class probabilities for the test split; column 1 is the second entry of
# clf_baseline.classes_, used here as the positive-class score.
y_baseline_prob = clf_baseline.predict_proba(X_baseline_test)
display = PrecisionRecallDisplay.from_predictions(
    y_test,
    y_baseline_prob[:, 1],
    name="Baseline Model",
)
# Assign to `_` so the returned title object is not echoed as cell output.
_ = display.ax_.set_title("Precision-Recall Curve")

## Graph Feature Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Graph-feature model: same random forest configuration as the baseline, but
# trained on the baseline features plus the graph-derived features.
clf_graph = RandomForestClassifier(
    n_estimators=500,
    max_depth=5,
    bootstrap=True,
    class_weight='balanced',
    random_state=0,
)
clf_graph.fit(X_graph_train, y_train)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Mean accuracy of the graph-feature model on the held-out test split.
# (Fixes the "witn" typo in the printed message.)
print('Accuracy of classifier with graph features on test set: {:.2f}'.format(clf_graph.score(X_graph_test, y_test)))
print('\nConfusion Matrix: ')

# normalize='true' scales each row by the number of true samples in that class,
# so the cells read as per-class rates rather than raw counts.
disp = ConfusionMatrixDisplay.from_predictions(y_test,
                                               clf_graph.predict(X_graph_test),
                                               display_labels=clf_graph.classes_,
                                               normalize='true', cmap='Greys', colorbar=False)

In [None]:
from sklearn.metrics import RocCurveDisplay

# ROC curve of the graph-feature model on the held-out test split.
display = RocCurveDisplay.from_estimator(
    clf_graph,
    X_graph_test,
    y_test,
    name="Graph Feature Model",
)
# Assign to `_` so the returned title object is not echoed as cell output.
_ = display.ax_.set_title("ROC Curve")

In [None]:
from sklearn.metrics import PrecisionRecallDisplay

# Class probabilities for the test split; column 1 is the second entry of
# clf_graph.classes_, used here as the positive-class score.
y_graph_prob = clf_graph.predict_proba(X_graph_test)
# Legend name matches the one used on the ROC curve for the same model.
display = PrecisionRecallDisplay.from_predictions(y_test, y_graph_prob[:, 1], name="Graph Feature Model")
# Assign to `_` so the returned title object is not echoed as cell output.
_ = display.ax_.set_title("Precision-Recall Curve")