# DEV - PCA

- Overview: try **pca**, kmeans, autoencoder
- Idea: compare detected anomalies to the predicted class
- Thought: but these features are already principal components...

**Prerequisites:**
-  01 - BigQuery - Table Data Source

**Resources:**
-  [BigQuery ML (BQML) Overview](https://cloud.google.com/bigquery-ml/docs/introduction)
-  [Overview of BQML methods and workflows](https://cloud.google.com/bigquery-ml/docs/reference/standard-sql/bigqueryml-syntax-e2e-journey)

**Conceptual Flow & Workflow**


---
## Setup

inputs:

In [1]:
# Ask gcloud for the active project; the `!` shell capture returns the command's output lines as a list.
project = !gcloud config get-value project
# The first output line is taken as the project id.
PROJECT_ID = project[0]
PROJECT_ID

'statmike-mlops-349915'

In [2]:
# Run naming and location: REGION is passed to aiplatform.init; SERIES and
# EXPERIMENT are combined into the model name and the storage URI.
REGION = 'us-central1'
EXPERIMENT = 'pca'
SERIES = '03'

# source data: the BigQuery table read by the training and scoring queries
BQ_PROJECT = PROJECT_ID
BQ_DATASET = 'fraud'
BQ_TABLE = 'fraud_prepped'

# Resources for serving BigQuery Model Exports
# NOTE(review): neither image is referenced by the visible cells of this notebook — confirm they are still needed
TF_DEPLOY_IMAGE = 'us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-3:latest'
XGB_DEPLOY_IMAGE = 'us-docker.pkg.dev/vertex-ai/prediction/xgboost-cpu.0-82:latest'

# Model Training
VAR_TARGET = 'Class' # label column; excluded from the PCA training features
VAR_OMIT = 'transaction_id' # add more variables to the string with space delimiters

packages:

In [3]:
from google.cloud import bigquery
from google.cloud import aiplatform
from datetime import datetime
import matplotlib.pyplot as plt

clients:

In [4]:
# BigQuery client used to submit every query in this notebook
bq = bigquery.Client()

# Point the Vertex AI SDK at this project and region
aiplatform.init(
    location = REGION,
    project = PROJECT_ID,
)

parameters:

In [5]:
# Timestamp (to the second) that makes the names created by this run unique
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")
# NOTE(review): assumes a Cloud Storage bucket named after the project exists — confirm
BUCKET = PROJECT_ID
URI = f"gs://{BUCKET}/{SERIES}/{EXPERIMENT}"
RUN_NAME = f'run-{TIMESTAMP}'

# Name of the BigQuery ML model created by this run
BQ_MODEL = f'{SERIES}_{EXPERIMENT}_{TIMESTAMP}'

---
## This Run

In [6]:
# Report the fully-qualified BigQuery ML model name and the timestamp that identify this run.
# (Message typo fixed: "with create" -> "will create".)
print(f'This run will create BQML model: {BQ_PROJECT}.{BQ_DATASET}.{BQ_MODEL}')
print(f'The Timestamp Is: {TIMESTAMP}')

This run with create BQML model: statmike-mlops-349915.fraud.03_pca_20221004235851
The Timestamp Is: 20221004235851


---
## Train Model

Use BigQuery ML to train a principal component analysis (PCA) model:
- [PCA](https://cloud.google.com/bigquery-ml/docs/reference/standard-sql/bigqueryml-syntax-create-pca) with BigQuery ML (BQML)
- This uses the `splits` column that notebook `01` created to subset to the training data
    - not directly used by the `PCA` training but used to subset to the `splits = 'TRAIN'` data for training

In [7]:
# Train a BigQuery ML PCA model on the TRAIN split of the source table.
#
# Columns kept out of the features: everything listed in VAR_OMIT, the `splits`
# column (only used by the WHERE filter) and the target column. Building the
# list first keeps the EXCEPT clause valid even when VAR_OMIT is empty.
except_columns = ', '.join(VAR_OMIT.split() + ['splits', VAR_TARGET])

# OR REPLACE lets this cell be re-run within the same session (same TIMESTAMP,
# hence same model name) without failing because the model already exists.
query = f"""
CREATE OR REPLACE MODEL `{BQ_PROJECT}.{BQ_DATASET}.{BQ_MODEL}`
OPTIONS (
        model_type = 'PCA',
        pca_explained_variance_ratio = 0.90,
        scale_features = TRUE,
        pca_solver = 'AUTO'
    ) AS
SELECT * EXCEPT({except_columns})
FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE}`
WHERE splits = 'TRAIN'
"""
job = bq.query(query = query)
job.result()  # block until the training job finishes

<google.cloud.bigquery.table._EmptyRowIterator at 0x7fabe89f9d90>

In [8]:
# Seconds between the training job's start and end timestamps
training_elapsed = job.ended - job.started
training_elapsed.total_seconds()

37.384

In [14]:
# Pull the ML.FEATURE_INFO rows for the trained model into a DataFrame
feature_info_job = bq.query(
    f"""
        SELECT *
        FROM ML.FEATURE_INFO(MODEL `{BQ_PROJECT}.{BQ_DATASET}.{BQ_MODEL}`)
        """
)
feature_info = feature_info_job.to_dataframe()
feature_info

Unnamed: 0,input,min,max,mean,median,stddev,category_count,null_count,dimension
0,Time,0.0,172792.0,94811.116749,85045.0,47493.531641,,0,
1,V1,-56.40751,2.45493,-0.000215,0.023927,1.958173,,0,
2,V2,-72.715728,22.057729,0.000316,0.061607,1.649662,,0,
3,V3,-48.325589,9.382558,-0.000525,0.179859,1.520173,,0,
4,V4,-5.600607,16.875344,0.000695,-0.034889,1.415309,,0,
5,V5,-113.743307,34.801666,-0.001264,-0.059681,1.381807,,0,
6,V6,-26.160506,73.301626,0.002089,-0.281246,1.334135,,0,
7,V7,-43.557242,120.589494,-0.000721,0.034086,1.245465,,0,
8,V8,-73.216718,19.168327,-0.001064,0.024914,1.205023,,0,
9,V9,-13.434066,15.594995,0.001406,-0.045622,1.100121,,0,


In [15]:
# Pull the ML.TRAINING_INFO rows for the trained model into a DataFrame
training_info_job = bq.query(
    f"""
        SELECT *
        FROM ML.TRAINING_INFO(MODEL `{BQ_PROJECT}.{BQ_DATASET}.{BQ_MODEL}`)
        """
)
training_info = training_info_job.to_dataframe()
training_info

Unnamed: 0,training_run,iteration,duration_ms
0,0,0,4582


In [13]:
# Pull the ML.PRINCIPAL_COMPONENT_INFO rows (eigenvalue and explained-variance
# columns per component) for the trained model into a DataFrame
pc_info_job = bq.query(
    f"""
        SELECT *
        FROM ML.PRINCIPAL_COMPONENT_INFO(MODEL `{BQ_PROJECT}.{BQ_DATASET}.{BQ_MODEL}`)
        """
)
pc_info = pc_info_job.to_dataframe()
pc_info

Unnamed: 0,principal_component_id,eigenvalue,explained_variance_ratio,cumulative_explained_variance_ratio
0,0,1.96533,0.065511,0.065511
1,1,1.681695,0.056056,0.121567
2,2,1.045826,0.034861,0.156428
3,3,1.027172,0.034239,0.190667
4,4,1.018633,0.033954,0.224622
5,5,1.01315,0.033772,0.258394
6,6,1.008045,0.033602,0.291995
7,7,1.006076,0.033536,0.325531
8,8,1.004075,0.033469,0.359
9,9,1.003322,0.033444,0.392444


In [12]:
# Pull the ML.PRINCIPAL_COMPONENTS rows (one row per component/feature pair)
# for the trained model into a DataFrame
pc_job = bq.query(
    f"""
        SELECT *
        FROM ML.PRINCIPAL_COMPONENTS(MODEL `{BQ_PROJECT}.{BQ_DATASET}.{BQ_MODEL}`)
        """
)
pc = pc_job.to_dataframe()
pc

Unnamed: 0,principal_component_id,feature,numerical_value,categorical_value
0,0,Time,0.031008,[]
1,0,V1,0.174601,[]
2,0,V2,0.395097,[]
3,0,V3,0.143194,[]
4,0,V4,-0.079480,[]
...,...,...,...,...
775,25,V25,0.131626,[]
776,25,V26,-0.010290,[]
777,25,V27,0.068330,[]
778,25,V28,-0.078994,[]


In [16]:
# Run ML.EVALUATE for the trained model and load the result into a DataFrame.
# The result is stored as `evaluation` rather than `eval` so the Python
# built-in `eval` is not shadowed for the rest of the kernel session.
evaluation = bq.query(
    query = f"""
        SELECT *
        FROM ML.EVALUATE(MODEL `{BQ_PROJECT}.{BQ_DATASET}.{BQ_MODEL}`)
        """
).to_dataframe()
evaluation

Unnamed: 0,total_explained_variance_ratio
0,0.923359


In [19]:
# Apply the trained model to the TEST split with ML.PREDICT; all source
# columns are passed in via SELECT *.
query = f"""
SELECT *
FROM ML.PREDICT (MODEL `{BQ_PROJECT}.{BQ_DATASET}.{BQ_MODEL}`,(
    SELECT *
    FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE}`
    WHERE splits = 'TEST')
  )
"""
predict_job = bq.query(query)
pred = predict_job.to_dataframe()

In [20]:
# Preview the first rows of the ML.PREDICT output for the TEST split
pred.head()

Unnamed: 0,principal_component_1,principal_component_2,principal_component_3,principal_component_4,principal_component_5,principal_component_6,principal_component_7,principal_component_8,principal_component_9,principal_component_10,...,principal_component_20,principal_component_21,principal_component_22,principal_component_23,principal_component_24,principal_component_25,principal_component_26,Class,transaction_id,splits
0,0.302299,0.889004,0.123864,-0.473393,0.284043,1.742441,-0.345646,0.94686,0.215936,-0.713498,...,-0.507086,-1.276067,1.233245,0.012551,-0.816971,-0.87333,-0.701734,0,1259d8f5-8b63-49c1-96e3-a7c4acc11d0d,TEST
1,0.436857,1.283266,0.276381,0.379212,0.268451,0.221368,0.259071,-1.92376,1.301598,-1.073554,...,1.885078,-0.112686,-0.265738,0.963128,-0.41704,0.781357,-0.185404,0,3ee3058b-b63d-4e7e-9962-7355105648b2,TEST
2,0.739754,1.172268,-0.121995,-0.091609,0.690563,0.375325,-1.704535,-0.338556,2.182858,-0.216122,...,0.099748,1.587683,1.185813,-0.971713,1.437205,0.917917,-0.10337,0,e010e7c7-9d24-4eda-a30e-c1088e545017,TEST
3,0.417853,1.334406,0.161835,-0.057766,-0.112682,0.149498,1.114876,-0.586743,0.508492,0.738301,...,-0.660798,-0.077034,0.199453,-2.07877,-0.782757,-0.217211,0.41648,0,56918cff-5554-4f8f-af79-56b2e7003d4b,TEST
4,0.406308,1.65257,-0.186995,0.21227,-0.295063,-1.278854,-0.070468,4.229582,0.597276,-0.12064,...,0.002969,0.653782,-0.27652,-0.451263,0.606637,-0.247198,-0.088362,0,62f1f2cd-5728-46af-97e8-9be1af044dd3,TEST


In [21]:
# Run ML.DETECT_ANOMALIES with the trained model over the TEST split,
# using a contamination setting of 0.01.
query = f"""
SELECT *
FROM ML.DETECT_ANOMALIES (
    MODEL `{BQ_PROJECT}.{BQ_DATASET}.{BQ_MODEL}`,
    STRUCT (0.01 AS contamination),
    (SELECT *
    FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE}`
    WHERE splits = 'TEST')
  )
"""
anomalies_job = bq.query(query)
anomalies = anomalies_job.to_dataframe()

In [22]:
# Display the ML.DETECT_ANOMALIES result for the TEST split
anomalies

Unnamed: 0,is_anomaly,mean_squared_error,Time,V1,V2,V3,V4,V5,V6,V7,...,V23,V24,V25,V26,V27,V28,Amount,Class,transaction_id,splits
0,False,0.002686,62606,1.199408,0.352007,0.379645,1.372017,0.291347,0.524919,-0.117555,...,-0.022218,-0.599026,0.258188,0.928721,-0.058988,-0.008856,0.00,0,c0e53877-f271-4973-96cf-c72d63cf3b92,TEST
1,False,0.003266,52510,1.233631,-0.088081,0.643833,0.857344,-0.843014,-0.711580,-0.274563,...,0.109825,0.339983,0.262696,0.189158,-0.028728,0.011067,0.00,0,b30af7b1-82b2-4136-9b27-b763bc2bce42,TEST
2,False,0.007708,48260,1.278551,0.827627,0.019604,2.262017,0.619265,-0.283509,0.555562,...,0.038313,-0.492386,0.497694,-0.278781,-0.024605,0.013593,0.00,0,18107cd9-f423-42e9-a402-575e3bb21d3d,TEST
3,False,0.019406,160340,-2.184687,3.150965,-1.632775,2.778267,0.163796,0.345102,-1.267995,...,0.483926,0.546978,-0.296219,-0.399505,-0.924709,-0.256977,0.00,0,a4bfff59-ecb5-42b4-88f0-7128c27a790f,TEST
4,False,0.028058,142309,-0.278832,1.037789,-0.742498,-0.870529,0.669925,-1.440947,1.078175,...,-0.051528,-0.067947,-0.774851,0.275841,0.031968,0.160919,0.00,0,b48778b2-165b-4a86-993a-ef097dca0a85,TEST
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28497,False,0.002200,65924,1.262035,0.354847,-0.135267,0.825899,0.502722,0.107648,0.203793,...,-0.212467,-0.890402,0.785553,-0.240702,0.035282,0.010168,9.98,0,54ce9f2b-8e5c-4a1b-afc9-74784651b6f3,TEST
28498,False,0.004834,31863,-1.172610,1.128343,1.052509,-0.209267,-0.069754,0.289843,0.003819,...,0.074452,-0.333420,-0.413688,0.092441,-0.098158,0.023679,10.98,0,c25aaa97-db42-409b-8fb2-9f290b722527,TEST
28499,False,0.001608,162341,2.174177,-0.184777,-2.424983,-1.678341,0.553527,-1.593798,0.704362,...,-0.020073,0.686903,0.488927,-0.102902,-0.056588,-0.068442,11.98,0,72fa9754-cc59-4080-b50a-a9930b2e2f5c,TEST
28500,False,0.017358,97371,0.197165,0.566166,0.456154,-0.497820,1.352364,-0.018279,1.287845,...,-0.202740,-1.165795,-0.519229,-0.685145,-0.704695,-0.618034,14.48,0,a6408c2e-1fde-47aa-a500-806b4f96cde6,TEST


In [27]:
# Count TEST rows for each combination of the anomaly flag and the target
# column; the anomaly detection is executed inside the CTE.
query = f"""
WITH ANOMALIES AS (
        SELECT is_anomaly, {VAR_TARGET}
        FROM ML.DETECT_ANOMALIES (
            MODEL `{BQ_PROJECT}.{BQ_DATASET}.{BQ_MODEL}`,
            STRUCT (0.01 AS contamination),
            (SELECT *
            FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE}`
            WHERE splits = 'TEST')
          )
      )
SELECT is_anomaly, {VAR_TARGET}, count(*) as count
FROM ANOMALIES
GROUP BY is_anomaly, {VAR_TARGET}
"""
anomaly_class_counts = bq.query(query).to_dataframe()
anomaly_class_counts

Unnamed: 0,is_anomaly,Class,count
0,False,0,28156
1,True,0,299
2,False,1,40
3,True,1,7


In [None]:
# TODO: should contamination be set to the expected fraud rate?
# TODO: compare three labels - actual, supervised prediction, anomaly flag - does the anomaly flag enhance the supervised model?