In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# FraudFinder - Individuals Environment Setup

<table align="left">
  <td>
    <a href="https://console.cloud.google.com/ai-platform/notebooks/deploy-notebook?download_url=https://github.com/GoogleCloudPlatform/fraudfinder/raw/main/00_environment_setup.ipynb">
       <img src="https://www.gstatic.com/cloud/images/navigation/vertex-ai.svg" alt="Google Cloud Notebooks">Open in Cloud Notebook
    </a>
  </td> 
  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/fraudfinder/blob/main/00_environment_setup.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Open in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/fraudfinder/blob/main/00_environment_setup.ipynb">
        <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
</table>

## Overview

[FraudFinder](https://github.com/googlecloudplatform/fraudfinder) is a series of labs on how to build a real-time fraud detection system on Google Cloud. Throughout the FraudFinder labs, you will learn how to read historical bank transaction data stored in a data warehouse, read from a live stream of new transactions, perform exploratory data analysis (EDA), do feature engineering, ingest features into a feature store, train a model using the feature store, register your model in a model registry, evaluate your model, deploy your model to an endpoint, do real-time inference on your model with the feature store, and monitor your model.

### Objective

In this notebook, you will set up your environment for FraudFinder to be used in subsequent labs.

This lab uses the following Google Cloud services and resources:

- [Vertex AI](https://cloud.google.com/vertex-ai/)
- [BigQuery](https://cloud.google.com/bigquery/)
- [Google Cloud Storage](https://cloud.google.com/storage)
- [Pub/Sub](https://cloud.google.com/pubsub/)

Steps performed in this notebook:

- Install required libraries
- Set up parameter files
- Set up GCS buckets

### Install additional packages

Install the following packages required to execute this notebook.

In [None]:
!pip3 install --upgrade pip

In [None]:
!pip install --upgrade -q -r 'requirements.txt'

After you install the additional packages, you need to restart the notebook kernel so it can find the packages.

In [None]:
# Restart the kernel so the packages installed above can be imported.
import os

# The restart is skipped whenever the IS_TESTING environment variable is set
# to a non-empty value.
if not os.getenv("IS_TESTING"):
    import IPython

    # do_shutdown(True) asks the kernel to shut down and restart.
    IPython.Application.instance().kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Please wait until it is finished before continuing to the next step. ⚠️</b>
</div>


### Set up your environment

Run the next cells to import libraries used in this notebook and configure some options.

Run the next cell to set your project ID and some of the other constants used in the lab.  

### Replace REGION Below ***

<div class="alert alert-block alert-info">
<b>NOTE: Please make sure REGION is set correctly, or replace it accordingly.<br>
    After the kernel restarts, you will need to run this cell and all the cells below it.
   </b>
</div>


In [None]:
import random
import string
from typing import Union

import pandas as pd
from google.cloud import bigquery

# Generate unique ID to help w/ unique naming of certain pieces
ID = "".join(random.choices(string.ascii_lowercase + string.digits, k=5))

# Replace Region here
REGION = "us-central1"

# static parameters
GCP_PROJECTS = !gcloud config get-value project
PROJECT_ID = GCP_PROJECTS[0]
BUCKET_NAME = f"{PROJECT_ID}-fraudfinder"
STAGING_BUCKET = f"{PROJECT_ID}-staging-{ID}"
TRAINING_DS_SIZE = 1000

with open("id_file.txt", "w") as f:
  f.write(ID)

print(f"ID '{ID}' has been saved to id_file.txt")

Read the ID file and set the value. This step ensures that every participant has unique buckets and configs across the labs.

In [None]:
# Load the unique ID stored in id_file.txt.
# Fail fast if the file is missing or empty: continuing with ID = None would
# silently produce resource and config names such as "fraudfinder_None".
try:
    with open("id_file.txt", "r", encoding="utf-8") as f:
        ID = f.read().strip()
except FileNotFoundError as err:
    raise FileNotFoundError(
        "id_file.txt not found. Please make sure the file exists."
    ) from err

if not ID:
    raise ValueError("id_file.txt is empty. Re-run the cell that generates the ID.")

print(f"Using ID '{ID}' from id_file.txt")

### Create a Google Cloud Storage bucket and save the config data.

Next, we will create a Google Cloud Storage bucket and will save the config data in this bucket. After the cell operation finishes, you can navigate to [Google Cloud Storage](https://console.cloud.google.com/storage/) to see the GCS bucket. 

In [None]:
config = f"""
BUCKET_NAME          = \"{BUCKET_NAME}\"
STAGING_BUCKET       = \"{STAGING_BUCKET}\"
PROJECT              = \"{PROJECT_ID}\"
REGION               = \"{REGION}\"
ID                   = \"{ID}\"
FEATURESTORE_ID      = \"fraudfinder_{ID}\"
MODEL_NAME           = \"ff_model\"
ENDPOINT_NAME        = \"ff_model_endpoint\"
TRAINING_DS_SIZE     = \"{TRAINING_DS_SIZE}\"
DATA_DIR             = "data"
TRAIN_DATA_DIR       = "train"
CUSTOMER_ENTITY      = "customer"
TERMINAL_ENTITY      = "terminal"
TARGET               = "tx_fraud"
"""

!gsutil mb -l {REGION} gs://{BUCKET_NAME}
!gsutil mb -l {REGION} gs://{STAGING_BUCKET}


!echo '{config}' | gsutil cp - gs://{BUCKET_NAME}/config/notebook_env_{ID}.py
#!echo '{config}' | gsutil cp - gs://{BUCKET_NAME}/config/notebook_env_v02.py

In [None]:
# Gets the default BUCKET_URI and SERVICE_ACCOUNT if they were not specified by the user.
shell_output = ! gcloud projects describe $PROJECT_ID
project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"
print("Using this default Service Account:", SERVICE_ACCOUNT)

In [None]:
# Render the lab configuration as YAML text. Placeholders in braces are filled
# from PROJECT_ID, STAGING_BUCKET, BUCKET_NAME, REGION, ID and SERVICE_ACCOUNT;
# everything else is a fixed literal. This rebinds `config`, replacing the
# value assigned in the earlier cell.
# NOTE(review): SUBNET hard-codes project "fraud-finder-lab" and region
# "us-central1", and IMAGE_URI hard-codes "us-central1", instead of using
# PROJECT_ID / REGION — confirm this is intended when REGION is changed.
config = f"""
PROJECT_ID: {PROJECT_ID}
STAGING_BUCKET: {STAGING_BUCKET}
BUCKET_NAME: {BUCKET_NAME}
REGION: {REGION}
ID: {ID}
CUSTOMER_ENTITY_ID: customer
CUSTOMER_ENTITY_ID_FIELD: customer_id
TERMINAL_ENTITY_ID: terminal
TERMINALS_ENTITY_ID_FIELD: terminal_id
FEATURESTORE_ID: "fraudfinder_{ID}"
FEATUREVIEW_ID: "fraudfinder_view_{ID}"
NETWORK: fraud-finder-network
SUBNET: https://www.googleapis.com/compute/v1/projects/fraud-finder-lab/regions/us-central1/subnetworks/us-central1
MODEL_REGISTRY: ff_model
RAW_BQ_TRANSACTION_TABLE_URI: "{PROJECT_ID}.tx.tx"
RAW_BQ_LABELS_TABLE_URI: "{PROJECT_ID}.tx.txlabels"
FEATURES_BQ_TABLE_URI: "{PROJECT_ID}.tx.wide_features_table"
FEATURE_TIME: feature_ts
ONLINE_STORAGE_NODES: 1
SUBSCRIPTION_NAME: ff-tx-for-feat-eng-sub
SUBSCRIPTION_PATH: "projects/{PROJECT_ID}/subscriptions/ff-tx-for-feat-eng-sub"
DROP_COLUMNS:
- timestamp
- entity_type_customer
- entity_type_terminal
FEAT_COLUMNS:
- customer_id_avg_amount_14day_window
- customer_id_avg_amount_15min_window
- customer_id_avg_amount_1day_window
- customer_id_avg_amount_30min_window
- customer_id_avg_amount_60min_window
- customer_id_avg_amount_7day_window
- customer_id_nb_tx_14day_window
- customer_id_nb_tx_15min_window
- customer_id_nb_tx_1day_window
- customer_id_nb_tx_30min_window
- customer_id_nb_tx_60min_window
- customer_id_nb_tx_7day_window
- terminal_id_avg_amount_15min_window
- terminal_id_avg_amount_30min_window
- terminal_id_avg_amount_60min_window
- terminal_id_nb_tx_14day_window
- terminal_id_nb_tx_15min_window
- terminal_id_nb_tx_1day_window
- terminal_id_nb_tx_30min_window
- terminal_id_nb_tx_60min_window
- terminal_id_nb_tx_7day_window
- terminal_id_risk_14day_window
- terminal_id_risk_1day_window
- terminal_id_risk_7day_window
- tx_amount
TARGET_COLUMN: tx_fraud
DATA_SCHEMA:
  timestamp: object
  tx_amount: float64
  tx_fraud: Int64
  entity_type_customer: Int64
  customer_id_nb_tx_1day_window: Int64
  customer_id_nb_tx_7day_window: Int64
  customer_id_nb_tx_14day_window: Int64
  customer_id_avg_amount_1day_window: float64
  customer_id_avg_amount_7day_window: float64
  customer_id_avg_amount_14day_window: float64
  customer_id_nb_tx_15min_window: Int64
  customer_id_avg_amount_15min_window: float64
  customer_id_nb_tx_30min_window: Int64
  customer_id_avg_amount_30min_window: float64
  customer_id_nb_tx_60min_window: Int64
  customer_id_avg_amount_60min_window: float64
  entity_type_terminal: Int64
  terminal_id_nb_tx_1day_window: Int64
  terminal_id_nb_tx_7day_window: Int64
  terminal_id_nb_tx_14day_window: Int64
  terminal_id_risk_1day_window: float64
  terminal_id_risk_7day_window: float64
  terminal_id_risk_14day_window: float64
  terminal_id_nb_tx_15min_window: Int64
  terminal_id_avg_amount_15min_window: float64
  terminal_id_nb_tx_30min_window: Int64
  terminal_id_avg_amount_30min_window: float64
  terminal_id_nb_tx_60min_window: Int64
  terminal_id_avg_amount_60min_window: float64
MODEL_NAME: ff_model
EXPERIMENT_NAME: ff-experiment-{ID}
DATA_URI: gs://{PROJECT_ID}-fraudfinder/data
TRAIN_DATA_URI: gs://{PROJECT_ID}-fraudfinder/data/train
READ_INSTANCES_TABLE: ground_truth_{ID}
READ_INSTANCES_URI: bq://{PROJECT_ID}.tx.ground_truth_{ID}
DATASET_NAME: fraud_finder_dataset_{ID}
JOB_NAME: fraudfinder-train-xgb-{ID}
ENDPOINT_NAME: ff_model_endpoint
MODEL_SERVING_IMAGE_URI: "us-docker.pkg.dev/vertex-ai/prediction/xgboost-cpu.1-7:latest"
IMAGE_REPOSITORY: fraudfinder-{ID}
IMAGE_NAME: dask-xgb-classificator
IMAGE_TAG: latest
IMAGE_URI: us-central1-docker.pkg.dev/{PROJECT_ID}/fraudfinder-{ID}/dask-xgb-classificator:latest
TRAIN_COMPUTE: e2-standard-4
DEPLOY_COMPUTE: n1-standard-4
BASE_IMAGE: "python:3.10"
PIPELINE_NAME: "fraud-finder-xgb-pipeline-{ID}"
PIPELINE_ROOT: "gs://{PROJECT_ID}-fraudfinder/pipelines"
BQ_DATASET: tx
METRICS_URI: "gs://{PROJECT_ID}-fraudfinder/deliverables/metrics.json"
AVG_PR_THRESHOLD: 0.2
MODEL_THRESHOLD: 0.5
AVG_PR_CONDITION: avg_pr_condition
PERSISTENT_RESOURCE_ID: ai-takeoff
REPLICA_COUNT: 1
SERVICE_ACCOUNT: "{SERVICE_ACCOUNT}"
"""

# Pipe the rendered text through stdin (`gsutil cp -`) into the main bucket as
# config/vertex_conf_{ID}.yaml. The text is wrapped in single quotes for the
# shell, so it must not itself contain a single quote.
!echo '{config}' | gsutil cp - gs://{BUCKET_NAME}/config/vertex_conf_{ID}.yaml