In [49]:
import pandas as pd
import evidently
from evidently.report import Report
from evidently.ui.dashboards import CounterAgg
from evidently.ui.dashboards import DashboardPanelCounter
from evidently.ui.dashboards import DashboardPanelPlot
from evidently.metric_preset import DataDriftPreset
from evidently.metric_preset import DataQualityPreset
from evidently.metric_preset import TargetDriftPreset
from evidently.metric_preset import RegressionPreset
from evidently.metric_preset import ClassificationPreset
from evidently.ui.dashboards import PanelValue

from evidently.ui.dashboards import ReportFilter
from evidently.ui.remote import RemoteWorkspace
from evidently.ui.workspace import Workspace
from evidently.ui.workspace import WorkspaceBase
from fraud_detection.data import load_data
from fraud_detection.models import train_model
from fraud_detection.utils import setup_logger
import joblib

In [12]:
# Setting up the logger
logger = setup_logger()

In [13]:
# Load the data
db_path = 'company_database.db'
query_file = 'data.sql'
logger.info("Loading data...")
data = load_data(db_path, query_file, winsorize_data=False)


2024-10-16 00:24:02,240 - fraud_detection - INFO - Loading data...
2024-10-16 00:24:02,240 - fraud_detection - INFO - Loading data...
2024-10-16 00:24:02,240 - fraud_detection - INFO - Loading data...
2024-10-16 00:24:02,285 - fraud_detection - INFO - Loading data from company_database.db with query: with decisions as (
  select
    txn_id,
    accounts_id,
    max(case when decision = 'Fraud' then 1 else 0 end) as is_fraud,
    max(case when decision = 'Legitimate' then 1 else 0 end) as is_legitimate,
    max(is_false_positive) as is_false_positive,
    max(created_at) as max_created_at,
    min(created_at) as min_created_at,
    count(*) as total_decisions
  from
    fraud_decisions
  group by
    txn_id,
    accounts_id
),
txns as (
select
  txn.id as txn_id,
  txn.accounts_id,
  accts.organization_id as org_id,
  txn.created_at as txn_created_at,
   txn.settled_at as txn_settled_at,
   strftime('%H', txn.created_at)::int as txn_created_at_hr,
   (julian(txn.settled_at) - julian(txn

In [6]:
data.head()

Unnamed: 0,txn_id,accounts_id,org_id,txn_created_at,txn_settled_at,txn_created_at_hr,settlement_time_mins,acct_tenure_days,org_tenure_days,txn_status,...,txn_count_30d,avg_txn_amount_30d,high_value_txn_count_30d,org_txn_failure_rate,is_fraud,is_legitimate,is_false_positive,frd_max_created_at,frd_min_created_at,total_decisions
0,txn_584e8e1d-ca48-49a4-b177-06b371950c0a,accts_49438ca4-9618-44a3-889f-bba3f74c361c,org_b71e1a88-24ae-41d2-b46c-cdf5ae011d78,2023-02-05 08:45:00,2023-02-05 08:59:00,8,14.0,11.465134,-160.423295,failed,...,1,2227.0,0.0,0.5,0.0,1.0,0.0,NaT,NaT,4.0
1,txn_798033f4-8081-433a-8cf0-b8ab058cf7bb,accts_c419f92d-af52-4ed7-93f0-4db9ee5ab491,org_c60eddaa-a30f-4c30-b8d2-5fb3d4fc5cec,2023-01-29 14:36:00,2023-01-29 15:34:00,14,58.0,-133.771357,-222.331061,completed,...,0,,,,1.0,0.0,0.0,NaT,NaT,4.0
2,txn_7efd21c2-9c3b-4e1f-8d58-fc390ff1e6c5,accts_164372ba-86ab-4878-aef0-769ab783f40f,org_5fbab4f4-c4f7-47ab-a12e-32f285af126c,2023-09-05 04:10:00,2023-09-05 04:16:00,4,6.0,167.827438,156.173611,failed,...,3,-2829.0,0.0,0.0,0.0,1.0,0.0,2023-09-22 04:08:32.957684,2023-06-26 15:27:46.149586,6.0
3,txn_42987531-2c95-4b47-829a-fb0d005c5254,accts_792fafa0-a4b3-4dc7-bd00-231abebaa186,org_8ee6d2dd-40a0-40e5-99e2-0e751d2b4917,2023-03-30 02:26:00,2023-03-30 02:41:00,2,15.0,68.025851,-0.141035,completed,...,0,,,0.333333,1.0,0.0,0.0,NaT,NaT,5.0
4,txn_4909a774-0d34-4ac4-94fd-1310d7c20ad9,accts_bb486fcf-1c00-46f3-ad97-5a4fef5e7ea6,org_f156a09c-59be-40a8-8b1b-96602e292677,2023-03-05 21:36:00,2023-03-05 22:30:00,21,54.0,48.331216,-123.615152,completed,...,3,-2133.333333,0.0,0.25,0.0,1.0,0.0,NaT,NaT,6.0


In [32]:
# Splitting the data into training and testing sets
from sklearn.model_selection import train_test_split

X = data.select_dtypes((int, float))
X['target'] = X['is_fraud']
X.drop(columns=['is_fraud'], inplace=True)
y = data['is_fraud'].fillna(0).astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
logger.info("Training the model...")
model = train_model('logistic')
model.fit(X_train.drop(columns=['target']), y_train)

# Save the model
joblib.dump(model, 'fraud_detection_model.pkl')
logger.info("Model saved as fraud_detection_model.pkl")

# Generate predictions
y_pred = model.predict(X_test.drop(columns=['target']))
X_train['prediction'] = model.predict(X_train.drop(columns=['target']))
X_test['prediction'] = model.predict(X_test.drop(columns=['target']))


2024-10-16 00:46:29,909 - fraud_detection - INFO - Training the model...
2024-10-16 00:46:29,909 - fraud_detection - INFO - Training the model...
2024-10-16 00:46:29,909 - fraud_detection - INFO - Training the model...
INFO:fraud_detection:Training the model...
2024-10-16 00:46:29,938 - fraud_detection - INFO - Model saved as fraud_detection_model.pkl
2024-10-16 00:46:29,938 - fraud_detection - INFO - Model saved as fraud_detection_model.pkl
2024-10-16 00:46:29,938 - fraud_detection - INFO - Model saved as fraud_detection_model.pkl
INFO:fraud_detection:Model saved as fraud_detection_model.pkl


In [19]:
X_train.head()

Unnamed: 0,txn_created_at_hr,settlement_time_mins,acct_tenure_days,org_tenure_days,txn_amount,benford_ones_digit,accts_is_closed,org_has_website,org_is_active,org_num_users,...,org_num_users_with_email,ratio_users_with_email,users_added_previous_day,txn_count_30d,avg_txn_amount_30d,high_value_txn_count_30d,org_txn_failure_rate,is_legitimate,is_false_positive,total_decisions
968,4,56.0,-41.178065,-62.947601,2202.0,2,0,1,1,1.0,...,1.0,1.0,0.0,2,94.0,1.0,0.0,,,
240,16,16.0,124.56279,-70.173927,-8848.0,8,0,0,1,2.0,...,2.0,1.0,0.0,1,7377.0,1.0,0.0,1.0,0.0,4.0
819,14,36.000001,-98.045829,-110.963258,8958.0,8,1,0,1,2.0,...,1.0,0.5,0.0,2,-4919.0,0.0,0.125,,,
692,15,49.0,140.618537,195.655556,-1182.0,2,0,1,1,2.0,...,2.0,1.0,0.0,4,-1674.25,1.0,0.142857,,,
420,4,36.0,40.246559,64.699179,-1810.0,0,0,1,0,4.0,...,4.0,1.0,0.0,0,,,0.0,1.0,0.0,6.0


In [37]:
report = Report(metrics=[
    DataDriftPreset(),
    DataQualityPreset(),
    TargetDriftPreset(),
    RegressionPreset(),
    ClassificationPreset(),

])

report.run(reference_data=X_train, current_data=X_test)
report.save_html("model_report.html")
logger.info("Model report saved as model_report.html")

# Generate model card
model_card = Report(metrics=[
    DataDriftPreset(),
  ClassificationPreset(),
])
model_card.run(reference_data=X_train, current_data=X_test)
model_card.save_html("model_card.html")
logger.info("Model card saved as model_card.html")


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calcu

In [35]:
from evidently.ui.remote import RemoteWorkspace

In [36]:
workspace = RemoteWorkspace("https://fraud-detection-4q3f.onrender.com")


In [41]:
project = workspace.create_project("fraud_detection_monitoring")
project.description = "example monitoring project"

In [42]:
workspace.add_report(project.id, report)
workspace.add_report(project.id, model_card)

In [46]:
project.dashboard.add_panel(
        DashboardPanelCounter(
            title="Share of Drifted Features",
            filter=ReportFilter(metadata_values={}, tag_values=[]),
            value=PanelValue(
                metric_id="DatasetDriftMetric",
                field_path="share_of_drifted_columns",
                legend="share",
            ),
            text="share",
            agg=CounterAgg.LAST,
            size=1,
        )
    )


In [50]:
project.dashboard.add_panel(
        DashboardPanelPlot(
            title="Data Drift",
            filter=ReportFilter(metadata_values={}, tag_values=[]),
            values=[
                PanelValue(metric_id="DatasetDriftMetric", field_path="share_of_drifted_columns", legend="share"),
            ],
            plot_type="bar",
        )
    )




In [51]:
project.dashboard.add_panel(
        DashboardPanelPlot(
            title="Feature Importance",
            filter=ReportFilter(metadata_values={}, tag_values=[]),
            values=[
                PanelValue(metric_id="ClassificationQualityMetric", field_path="feature_importance", legend="importance"),
            ],
            plot_type="bar",
        )
    )

In [54]:
project.dashboard.add_panel(
        DashboardPanelCounter(
            title="Classification Quality Score",
            filter=ReportFilter(metadata_values={}, tag_values=[]),
            value=PanelValue(
                metric_id="ClassificationQualityMetric",
                field_path="accuracy",
                legend="accuracy",
            ),
            text="accuracy",
            agg=CounterAgg.LAST,
            size=1,
        )
    )

In [56]:
project.dashboard.add_panel(
        DashboardPanelCounter(
            title="Share of Drifted Features",
            filter=ReportFilter(metadata_values={}, tag_values=[]),
            value=PanelValue(
                metric_id="DatasetDriftMetric",
                field_path="share_of_drifted_columns",
                legend="share",
            ),
            text="share",
            agg=CounterAgg.LAST,
            size=1,
        )
    )

In [57]:
project.save()


Project(id=UUID('0192945b-2f05-7e4d-84dc-b175b57a0d75'), name='fraud_detection_monitoring', description='example monitoring project', dashboard=DashboardConfig(name='fraud_detection_monitoring', panels=[DashboardPanelCounter(type='evidently:dashboard_panel:DashboardPanelCounter', id=UUID('0192945f-a415-7d64-b0ba-27b498ee047f'), title='Share of Drifted Features', filter=ReportFilter(metadata_values={}, tag_values=[], include_test_suites=False), size=<WidgetSize.HALF: 1>, agg=<CounterAgg.LAST: 'last'>, value=PanelValue(field_path='share_of_drifted_columns', metric_id='DatasetDriftMetric', metric_fingerprint=None, metric_args={}, legend='share'), text='share'), DashboardPanelCounter(type='evidently:dashboard_panel:DashboardPanelCounter', id=UUID('01929461-73ad-72bb-8f62-e9d5239422a9'), title='Share of Drifted Features', filter=ReportFilter(metadata_values={}, tag_values=[], include_test_suites=False), size=<WidgetSize.HALF: 1>, agg=<CounterAgg.LAST: 'last'>, value=PanelValue(field_path='s