## Import packages

In [1]:
# Mount Google Drive (Colab only) so files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install sdmetrics

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sdmetrics
  Downloading sdmetrics-0.8.0-py2.py3-none-any.whl (136 kB)
[K     |████████████████████████████████| 136 kB 5.0 MB/s 
Collecting plotly<6,>=5.10.0
  Downloading plotly-5.11.0-py2.py3-none-any.whl (15.3 MB)
[K     |████████████████████████████████| 15.3 MB 54.5 MB/s 
[?25hCollecting copulas<0.8,>=0.7.0
  Downloading copulas-0.7.0-py2.py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 1.7 MB/s 
[?25hCollecting matplotlib<4,>=3.4.0
  Downloading matplotlib-3.6.2-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (9.4 MB)
[K     |████████████████████████████████| 9.4 MB 44.3 MB/s 
Collecting fonttools>=4.22.0
  Downloading fonttools-4.38.0-py3-none-any.whl (965 kB)
[K     |████████████████████████████████| 965 kB 59.6 MB/s 
Collecting contourpy>=1.0.1
  Downloading contourpy-1.0.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

In [3]:
import pandas as pd
from sdmetrics.reports.single_table import QualityReport
from sdmetrics.timeseries import LSTMDetection

In [36]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from xgboost import cv
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb
from sklearn.metrics import log_loss



In [40]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate

In [54]:
from sklearn.linear_model import LogisticRegression

## Load data

In [87]:
# Every sample lives in the same shared-drive folder; edit SAMPLES_DIR to relocate them.
SAMPLES_DIR = '/content/drive/Shareddrives/SI 670 Final_Proj/synthetic_samples'

# Real transactions that each synthetic sample is compared against.
real_data = pd.read_csv(f'{SAMPLES_DIR}/real_data.csv')

# One synthetic sample per generator.
baseline_syn = pd.read_csv(f'{SAMPLES_DIR}/baseline_syn_data.csv')
ctgan_syn = pd.read_csv(f'{SAMPLES_DIR}/ctgan_syn_data.csv')
hma_syn = pd.read_csv(f'{SAMPLES_DIR}/hma_syn_data.csv')
par_syn = pd.read_csv(f'{SAMPLES_DIR}/par_syn_data.csv')
spop_syn = pd.read_csv(f'{SAMPLES_DIR}/spop_syn_data.csv')
tvae_syn = pd.read_csv(f'{SAMPLES_DIR}/tvae_syn_data.csv')

In [88]:
def concat(syn_data, real_data, random_state=0):
  """Build a shuffled, labelled real-vs-synthetic frame for detection tests.

  Args:
    syn_data: DataFrame of synthetic rows; must contain 'TRANSACTION_DATE'.
    real_data: DataFrame of real rows; must contain 'TRANSACTION_DATE'.
    random_state: Seed for the row shuffle. Defaults to 0 so re-running the
      notebook yields the same row order; pass None for an unseeded shuffle.

  Returns:
    A new DataFrame with all rows of both inputs in shuffled order, a 'LABEL'
    column (0 = synthetic, 1 = real) and 'TRANSACTION_DATE' parsed to
    datetime. The shuffled integer index is kept as-is.

  Neither input frame is modified: the labels are added to copies, so an error
  while parsing dates cannot leave a stray 'LABEL' column on the caller's data.
  """
  labelled_syn = syn_data.assign(LABEL=0)
  labelled_real = real_data.assign(LABEL=1)
  concat_df = pd.concat([labelled_syn, labelled_real], ignore_index=True)
  concat_df = concat_df.sample(frac=1, random_state=random_state)
  concat_df['TRANSACTION_DATE'] = pd.to_datetime(concat_df['TRANSACTION_DATE'])
  return concat_df

In [89]:
# Pair every synthetic sample with the real data, in the same order as the
# names on the left-hand side.
baseline, ctgan, hma, par, spop, tvae = (
    concat(synthetic, real_data)
    for synthetic in (baseline_syn, ctgan_syn, hma_syn, par_syn, spop_syn, tvae_syn)
)

## Detection Metrics

In [65]:
def detect_metrics(data, model_name):
  """Measure how well two classifiers can tell real rows from synthetic ones.

  Args:
    data: DataFrame with 'AGENCY', 'TRANSACTION_AMOUNT', 'TRANSACTION_DATE'
      and a binary 'LABEL' column used as the target.
    model_name: Name of the synthesizer being evaluated; returned unchanged so
      the caller can label the result.

  Returns:
    Tuple ``(model_name, acc_test_lr, acc_test_xgb)``: the hold-out accuracy of
    logistic regression and of XGBoost on a 30% test split.
  """
  # NOTE(review): TRANSACTION_DATE is selected here but is not listed in the
  # ColumnTransformer below, so with its default remainder='drop' the date
  # never reaches either model. Kept as-is to leave the feature set unchanged.
  X = data[['AGENCY', 'TRANSACTION_AMOUNT', 'TRANSACTION_DATE']]
  y = data['LABEL']
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

  numeric_features = ['TRANSACTION_AMOUNT']
  categorical_features = ['AGENCY']

  def holdout_accuracy(model):
    """Fit a fresh preprocessing + `model` pipeline and score it on the test split."""
    preprocessor = ColumnTransformer(
      transformers=[
          ("num", StandardScaler(), numeric_features),
          # An agency that only appears in the test split is encoded as all
          # zeros instead of raising at predict time.
          ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_features),
      ])
    pipe = Pipeline([('scaler', preprocessor), ('model', model)])
    # No hyper-parameters are tuned, so the pipeline is fitted directly. A grid
    # search over an empty grid would end by refitting this same pipeline on the
    # full training split after five cross-validation fits whose scores were unused.
    pipe.fit(X_train, y_train)
    return accuracy_score(y_test, pipe.predict(X_test))

  acc_test_xgb = holdout_accuracy(XGBClassifier(seed=0))
  acc_test_lr = holdout_accuracy(LogisticRegression(random_state=0, max_iter=10000))
  return model_name, acc_test_lr, acc_test_xgb

In [63]:
# Display name and labelled frame for each synthesizer, kept side by side so
# the two parallel lists cannot drift out of order.
model_names, dfs = (
    list(column)
    for column in zip(
        ('Baseline', baseline),
        ('CTGAN', ctgan),
        ('HMA', hma),
        ('PAR', par),
        ('Synthpop', spop),
        ('TVAE', tvae),
    )
)

In [67]:
# Print one line per synthesizer with both detection accuracies, rounded to 4 places.
for model_name, data in zip(model_names, dfs):
  model_name, acc_test_lr, acc_test_xgb = detect_metrics(data, model_name)
  print(
      f"Model: {model_name}, "
      f"Logistic regression accuracy: {round(acc_test_lr, 4)}, "
      f"XGBoost accuracy: {round(acc_test_xgb, 4)}"
  )


Model: Baseline, Logistic regression accuracy: 0.4939, XGBoost accuracy: 0.5906
Model: CTGAN, Logistic regression accuracy: 0.5269, XGBoost accuracy: 0.6611
Model: HMA, Logistic regression accuracy: 0.8425, XGBoost accuracy: 0.8401
Model: PAR, Logistic regression accuracy: 0.5001, XGBoost accuracy: 0.7246
Model: Synthpop, Logistic regression accuracy: 0.4994, XGBoost accuracy: 0.4859
Model: TVAE, Logistic regression accuracy: 0.7572, XGBoost accuracy: 0.756


## Statistical properties

In [68]:
# Column schema passed to the quality reports. Layout reference:
# https://docs.sdv.dev/sdmetrics/getting-started/metadata/single-table-metadata
metadata = {
    "fields": {
        "TRANSACTION_DATE": {"type": "datetime", "format": "%Y-%m-%d"},
        "TRANSACTION_AMOUNT": {"type": "numerical", "subtype": "float"},
        "AGENCY": {"type": "categorical", "subtype": "string"},
    }
}

In [85]:
# Quality report for the baseline synthetic sample, scored against the real data.
report_baseline = QualityReport()
report_baseline.generate(real_data, baseline_syn, metadata)

Creating report: 100%|██████████| 4/4 [00:00<00:00, 26.54it/s]



Overall Quality Score: 96.2%

Properties:
Column Shapes: 96.81%
Column Pair Trends: 95.59%


In [90]:
# Quality report for the CTGAN synthetic sample, scored against the real data.
report_ctgan = QualityReport()
report_ctgan.generate(real_data, ctgan_syn, metadata)

Creating report: 100%|██████████| 4/4 [00:00<00:00, 30.07it/s]



Overall Quality Score: 91.92%

Properties:
Column Shapes: 91.65%
Column Pair Trends: 92.18%


In [91]:
# Quality report for the HMA synthetic sample, scored against the real data.
report_hma = QualityReport()
report_hma.generate(real_data, hma_syn, metadata)

Creating report: 100%|██████████| 4/4 [00:00<00:00, 25.87it/s]



Overall Quality Score: 62.26%

Properties:
Column Shapes: 59.79%
Column Pair Trends: 64.72%


In [92]:
# Quality report for the PAR synthetic sample, scored against the real data.
report_par = QualityReport()
report_par.generate(real_data, par_syn, metadata)

Creating report: 100%|██████████| 4/4 [00:00<00:00, 27.25it/s]



Overall Quality Score: 94.05%

Properties:
Column Shapes: 92.81%
Column Pair Trends: 95.3%


In [93]:
# Quality report for the Synthpop synthetic sample, scored against the real data.
report_spop = QualityReport()
report_spop.generate(real_data, spop_syn, metadata)

Creating report: 100%|██████████| 4/4 [00:00<00:00, 33.60it/s]



Overall Quality Score: 98.5%

Properties:
Column Shapes: 98.85%
Column Pair Trends: 98.15%


In [94]:
# Quality report for the TVAE synthetic sample, scored against the real data.
report_tvae = QualityReport()
report_tvae.generate(real_data, tvae_syn, metadata)

Creating report: 100%|██████████| 4/4 [00:00<00:00, 33.97it/s]



Overall Quality Score: 70.3%

Properties:
Column Shapes: 66.0%
Column Pair Trends: 74.59%


In [95]:
# Per-column scores behind the baseline report's 'Column Shapes' property.
report_baseline.get_details(property_name='Column Shapes')

Unnamed: 0,Column,Metric,Quality Score
0,TRANSACTION_AMOUNT,KSComplement,0.907526
1,TRANSACTION_DATE,KSComplement,0.994581
2,AGENCY,TVComplement,0.985125


In [96]:
# Per-column scores behind the CTGAN report's 'Column Shapes' property.
report_ctgan.get_details(property_name='Column Shapes')

Unnamed: 0,Column,Metric,Quality Score
0,TRANSACTION_AMOUNT,KSComplement,0.809598
1,TRANSACTION_DATE,KSComplement,0.892793
2,AGENCY,TVComplement,0.981796


In [97]:
# Per-column scores behind the HMA report's 'Column Shapes' property.
report_hma.get_details(property_name='Column Shapes')

Unnamed: 0,Column,Metric,Quality Score
0,TRANSACTION_AMOUNT,KSComplement,0.816305
1,TRANSACTION_DATE,KSComplement,0.925453
2,AGENCY,TVComplement,0.324944


In [98]:
# Per-column scores behind the PAR report's 'Column Shapes' property.
report_par.get_details(property_name='Column Shapes')

Unnamed: 0,Column,Metric,Quality Score
0,TRANSACTION_AMOUNT,KSComplement,0.793377
1,TRANSACTION_DATE,KSComplement,0.951443
2,AGENCY,TVComplement,0.983708


In [102]:
# Per-column scores behind the Synthpop report's 'Column Shapes' property.
report_spop.get_details(property_name='Column Shapes')

Unnamed: 0,Column,Metric,Quality Score
0,TRANSACTION_AMOUNT,KSComplement,0.9932
1,TRANSACTION_DATE,KSComplement,0.997202
2,AGENCY,TVComplement,0.98176


In [103]:
# Per-column scores behind the TVAE report's 'Column Shapes' property.
report_tvae.get_details(property_name='Column Shapes')

Unnamed: 0,Column,Metric,Quality Score
0,TRANSACTION_AMOUNT,KSComplement,0.726155
1,TRANSACTION_DATE,KSComplement,0.796317
2,AGENCY,TVComplement,0.558845


In [99]:
# Visualize the baseline report's 'Column Shapes' property.
report_baseline.get_visualization(property_name='Column Shapes')

In [100]:
# Visualize the baseline report's 'Column Pair Trends' property.
report_baseline.get_visualization(property_name='Column Pair Trends')