### Imports

In [None]:
import os, sys

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import tensorflow as tf, numpy as np, pandas as pd, seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm

from tensorflow.keras import layers, losses, models as tf_models, activations, optimizers
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from imblearn.over_sampling import SMOTE
from sklearn import feature_selection as skfs

if not os.path.exists("plasma"):
    !git clone https://github.com/sequenzia/plasma.git

import plasma
from plasma import utils

### Helper Functions

In [4]:
def plot_corr_fn(data):
    """Show an annotated heatmap of the Spearman correlation matrix of `data`.

    data: DataFrame whose pairwise column correlations are plotted.
    """
    plt.figure(figsize=(25, 14))
    spearman_corr = data.corr(method='spearman')
    sns.heatmap(spearman_corr, annot=True, cmap='YlGnBu')
    plt.show()

def corr_output_fn(data, cols):
    """Plot the Spearman correlations of `data[cols]` and print the 20 highest pairs.

    data: DataFrame holding the columns of interest.
    cols: column labels to correlate.

    Each unordered column pair is listed once; pairs are ranked by signed
    correlation value, largest first.
    """
    corr_lookup = data[cols].corr(method='spearman').to_dict()
    names = list(corr_lookup)

    # Walk the upper triangle only, so (a, b) is reported and (b, a) is not.
    pair_scores = [
        ((left, right), corr_lookup[left][right])
        for pos, left in enumerate(names)
        for right in names[pos + 1:]
    ]
    pair_scores.sort(key=lambda pair_score: pair_score[1], reverse=True)

    plot_corr_fn(data[cols])

    print('\nTop 20 Correlated Features\n')
    for pair, score in pair_scores[:20]:
        print(f'{pair}: {score:.7f}')

def compare_means(data_1, data_2, alpha, test_type='t'):
    """
    One-sided t-test or z-test comparing the means of two samples.

    H0 (null): data_1.mean == data_2.mean
    HA (alternative): data_1.mean > data_2.mean

    Reject the null when p-value < alpha.

    data_1, data_2: the two samples to compare.
    alpha: significance level; only used for the "pvals_alpha" entry.
    test_type: 'z' runs the z-test, any other value runs the t-test.

    Returns a dict with keys:
        data_1_stats, data_2_stats: DescrStatsW wrappers of the inputs
        compare: the CompareMeans object
        tstats: test statistic
        pvals: p-value
        pvals_alpha: pvals - alpha
        cls: confidence level, (1 - pvals) * 100
    """
    stats_1 = sm.stats.DescrStatsW(data_1)
    stats_2 = sm.stats.DescrStatsW(data_2)
    comparison = sm.stats.CompareMeans(stats_1, stats_2)

    # The z-test yields (statistic, p-value); the t-test also yields the
    # degrees of freedom as a third item, which is not reported.
    run_test = comparison.ztest_ind if test_type == 'z' else comparison.ttest_ind
    statistic, p_value = run_test('larger')[:2]

    return {
        "data_1_stats": stats_1,
        "data_2_stats": stats_2,
        "compare": comparison,
        "tstats": statistic,
        "pvals": p_value,
        "pvals_alpha": p_value - alpha,
        "cls": (1 - p_value) * 100,
    }

def features_anova_test():
    """Run an ANOVA F-test of every feature against the target and print the results.

    Reads the module-level `data`, `x_cols` and `y_cols` (the first entry of
    `y_cols` is the target). For each feature the F statistic and p-value are
    printed together with the test decision at a 0.05 significance level.

    Features for which the null hypothesis is rejected (p-value below the
    significance level) are the ones collected as selected features; the
    collected names, statistics and p-values are printed at the end.

    Returns None.
    """
    f_stats, p_vals = skfs.f_classif(data[x_cols], data[y_cols[0]])
    alpha = .05

    selected_features = {'features': [], 'test_vals': [], 'p_vals': []}
    for col, cur_test_val, cur_p_val in zip(x_cols, f_stats, p_vals):

        print(f"Feature: {col} | test_stat = {cur_test_val:.2f} p_val = {cur_p_val:.5f}")

        if cur_p_val < alpha:
            print("Reject Null Hypothesis")
            # A rejected null means the feature differs across classes,
            # so this is the branch that selects it.
            selected_features['features'].append(col)
            selected_features['test_vals'].append(cur_test_val)
            selected_features['p_vals'].append(cur_p_val)
        else:
            print("Fail to reject Null Hypothesis")

        print("\n")

    print(selected_features)

def model_metrics(y_true, y_hat, model_type=None):
    """Print and return classification metrics for a set of predictions.

    y_true: ground-truth labels.
    y_hat: predicted labels.
    model_type: optional model name; when truthy a banner with it is printed first.

    Prints the classification report and the confusion-matrix counts, and
    displays the confusion matrix plot.

    Returns a dict with keys "accuracy", "precision", "recall", "cls_reprt"
    (the report text), "cm" (confusion matrix) and "cm_groups" (the first four
    flattened confusion-matrix cells labelled TN, FP, FN, TP).
    """
    if model_type:
        print(f"---------------------- {model_type} Model ----------------------\n")

    target_names = ["NOT FRAUD", "FRAUD"]

    scores = {
        "accuracy": accuracy_score(y_true, y_hat),
        "precision": precision_score(y_true, y_hat, zero_division=1),
        "recall": recall_score(y_true, y_hat, zero_division=1),
    }
    report_text = classification_report(
        y_true, y_hat, digits=5, target_names=target_names, output_dict=False, zero_division=1)

    cm = confusion_matrix(y_true, y_hat)
    flat_counts = cm.ravel()
    cm_groups = {label: flat_counts[pos] for pos, label in enumerate(("TN", "FP", "FN", "TP"))}

    print(f"{report_text}")
    print(cm_groups, "\n")
    ConfusionMatrixDisplay(cm, display_labels=target_names).plot()
    plt.show()
    print("\n")

    return {**scores, "cls_reprt": report_text, "cm": cm, "cm_groups": cm_groups}



### Setup

In [3]:
# Configure notebook options through the plasma helper.
# NOTE(review): the meaning of the argument 5 is not visible here — confirm against plasma.utils.
utils.set_options(5)
# Single seed reused by the data splits and the tree-based estimators below.
random_state = 42

# Load the first dataset exposed by the plasma app.
# Presumably a pandas DataFrame with the target in the last column (see the column slicing below) — verify.
app = plasma.App()
data = app.datasets[0]

### Features

In [5]:
# Every column except the last is a feature; the last column is the target.
# y_cols stays a one-element list so it can be used for DataFrame-style selection.
x_cols, y_cols = list(data.columns[:-1]), list(data.columns[-1:])

In [6]:
# ANOVA F-test selector configured to keep the top 25 percent of features.
# NOTE(review): it is constructed but never fitted or used while the
# selection expression below stays commented out.
selector_anova_tp = skfs.SelectPercentile(skfs.f_classif, percentile=25)
# All feature columns are used; this binds the same list object as x_cols, not a copy.
selected_features = x_cols
# list(selector_anova_tp.fit(data[x_cols],data[y_cols[0]]).get_feature_names_out())
print(f"Selected Features: {selected_features}")

Selected Features: ['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']


#### Unbalanced Classes

In [7]:
# Oversample the minority class with SMOTE and rebuild a single frame of
# features + target. The sampler is seeded with the notebook-wide
# random_state so the synthetic rows are reproducible across runs.
# NOTE(review): resampling is done on the full dataset, before the
# train/val/test split further down — confirm that synthetic rows derived
# from rows of another split are acceptable for the reported test metrics.
smote = SMOTE(sampling_strategy='auto', random_state=random_state)
syn_data = pd.concat(smote.fit_resample(data[selected_features],data[y_cols]), axis=1)

### Split into Train, Val, Test

In [8]:
# Split the original (imbalanced) data 80/10/10 into train/val/test.
# pos_split_config=None: no special placement of positive rows is requested.
# NOTE(review): the exact semantics of pos_split_config / pre_shuffle live in
# plasma.utils.preprocess_data and are not visible here — confirm.
print("------ Main Data ------")
train_data, val_data, test_data  = utils.preprocess_data(data,
                                                         cols=[selected_features,y_cols],
                                                         split_config=[.8, .1, .1],
                                                         pos_split_config=None,
                                                         pre_shuffle=False,
                                                         to_numpy=False,
                                                         random_state=random_state,
                                                         debug_on=True)
print("\n")

# Same 80/10/10 split, but with pos_split_config=[0, 0, .1].
# The output recorded for this cell shows all 492 positive rows in the test
# split and none in train/val.
# NOTE(review): how the three pos_split_config values are interpreted is
# defined in plasma.utils.preprocess_data — confirm.
print("------ Positives All Test Data ------")
train_data_pos, val_data_pos, test_data_pos  = utils.preprocess_data(data,
                                                                     cols=[selected_features,y_cols],
                                                                     split_config=[.8, .1, .1],
                                                                     pos_split_config=[0, 0, .1],
                                                                     pre_shuffle=False,
                                                                     to_numpy=False,
                                                                     random_state=random_state,
                                                                     debug_on=True)
print("\n")

# 80/10/10 split of the SMOTE-resampled data.
# NOTE(review): the output recorded for this cell reports 56863 positives in
# each of the validation and test splits, and 10% of the 568630 total records
# is also 56863 — i.e. both held-out splits contain only the positive class.
# Presumably the rows are split in their stored order because
# pre_shuffle=False; confirm against plasma.utils.preprocess_data and
# consider pre_shuffle=True for this dataset.
print("------ SMOTE Syn Data ------")
train_syn_data, val_syn_data, test_syn_data = utils.preprocess_data(syn_data,
                                                                    cols=[selected_features,y_cols],
                                                                    split_config=[.8, .1, .1],
                                                                    pos_split_config=None,
                                                                    pre_shuffle=False,
                                                                    to_numpy=False,
                                                                    random_state=random_state,
                                                                    debug_on=True)

# print("------ SMOTE Syn Pos Data ------")
# train_syn_data, val_syn_data, test_syn_data = utils.preprocess_data(syn_data,
#                                                                     cols=[selected_features,y_cols],
#                                                                     split_config=[.8, .1, .1],
#                                                                     pos_split_config=[0, 0, .1],
#                                                                     pre_shuffle=False,
#                                                                     to_numpy=False,
#                                                                     random_state=random_state,
#                                                                     debug_on=True)

------ Main Data ------

Train Pos: 417 | 0.85 || Val Pos: 53 | 0.11 || Test Pos: 22 | 0.04

Total Records: 284807 | Train: 0.80 | Val: 0.10 | Test: 0.10


------ Positives All Test Data ------

Train Pos: 0 | 0.00 || Val Pos: 0 | 0.00 || Test Pos: 492 | 1.00

Total Records: 284807 | Train: 0.80 | Val: 0.10 | Test: 0.10


------ SMOTE Syn Data ------

Train Pos: 170589 | 0.60 || Val Pos: 56863 | 0.20 || Test Pos: 56863 | 0.20

Total Records: 568630 | Train: 0.80 | Val: 0.10 | Test: 0.10


## Modeling

### Trees & Forests

In [None]:
# Registry of model specifications keyed by model name; filled by add_model.
models = {}


def add_model(model_name, model_class, model_args):
    """Register a model specification under `model_name`.

    model_name: key used in the `models` registry.
    model_class: name of a class available in this module's globals;
        raises KeyError when no such global exists.
    model_args: keyword arguments stored for later instantiation.
    """
    estimator_cls = globals()[model_class]
    models[model_name] = dict(model=estimator_cls, args=model_args)

def train_models(dataset):
    """Instantiate and fit every model in the `models` registry.

    dataset: object exposing the training features as `.x` and labels as `.y`.

    Returns a dict mapping each registered model name to the value returned
    by that model's `fit` call.
    """
    return {
        name: spec["model"](**spec["args"]).fit(dataset.x, dataset.y)
        for name, spec in models.items()
    }

def evaluate_models(models, dataset, metrics_fn=None):
    """Predict with each fitted model and collect its predictions and metrics.

    models: dict mapping model name to a fitted object exposing `predict`.
    dataset: object exposing the features as `.x` and true labels as `.y`.
    metrics_fn: optional callable `(y_true, y_pred, model_name)` used to score
        the predictions; defaults to `model_metrics`.

    Returns a dict mapping each model name to
    `{'preds': <predictions>, 'metrics': <metrics_fn result>}`.
    """
    if metrics_fn is None:
        metrics_fn = model_metrics

    evaluated_models = {}

    for name, model in models.items():
        preds = model.predict(dataset.x)
        evaluated_models[name] = {'preds': preds, 'metrics': metrics_fn(dataset.y, preds, name)}

    # Return the collected results (not the function object itself).
    return evaluated_models

In [None]:
# Keyword arguments for each estimator; all are seeded with the shared random_state.

# Single decision tree split on entropy.
dec_tree_args = {"criterion":"entropy",
                 "random_state":random_state}

# Random forest of 100 trees, each fitted on a bootstrap sample.
# NOTE(review): n_jobs=100 requests 100 parallel jobs regardless of the
# machine's core count — confirm this is intended (n_jobs=-1 uses all cores).
bs_forest_args = {"criterion":"entropy",
                  "bootstrap": True,
                  "n_estimators":100,
                  "n_jobs":100,
                  "verbose":0,
                  "random_state":random_state}

# Random forest of 100 trees with bootstrapping disabled.
full_forest_args = {"criterion":"entropy",
                    "bootstrap": False,
                    "n_estimators":100,
                    "n_jobs":100,
                    "verbose":0,
                    "random_state":random_state}

# Gradient boosting with 100 depth-1 trees and learning rate 1.0.
gb_args = {"n_estimators":100,
           "learning_rate":1.0,
           "max_depth":1,
           "random_state":random_state}

# Decision Tree Classifier
add_model("dec_tree", "DecisionTreeClassifier", dec_tree_args)

# Bootstrapped Random Forest
add_model("bs_forest", "RandomForestClassifier", bs_forest_args)

# Random Forest without bootstrapping (bootstrap=False)
add_model("full_forest", "RandomForestClassifier", full_forest_args)

# Gradient Boosting Classifier
add_model("gb", "GradientBoostingClassifier", gb_args)

In [None]:
# Fit every registered model twice: once on the original training split and
# once on the SMOTE-resampled training split.
trained_models = train_models(train_data)
trained_syn_models = train_models(train_syn_data)

In [None]:
# Score the models trained on the original data against the original test split.
evaluated_models = evaluate_models(trained_models, test_data)

In [None]:
# Score the models trained on the SMOTE data against the SMOTE test split.
syn_evaluated_models = evaluate_models(trained_syn_models, test_syn_data)

### Autoencoders

In [None]:
class Autoencoder(tf_models.Model):
    """Dense autoencoder that flags samples by their reconstruction error.

    The encoder narrows the input through 64-32-16-8-4-2 ReLU units; the
    decoder widens it back through 4-8-16-32 ReLU units and ends in a
    sigmoid layer with `n_features` outputs.

    NOTE(review): the sigmoid output is bounded to (0, 1); presumably the
    inputs are scaled to that range upstream — confirm.
    """

    def __init__(self, n_features):
        """Build the encoder and decoder stacks for `n_features` input columns."""
        super().__init__()

        encoder_units = (64, 32, 16, 8, 4, 2)
        decoder_units = (4, 8, 16, 32)

        self.encoder = tf.keras.Sequential(
            [layers.Dense(units, activation='relu') for units in encoder_units],
            name="encoder")

        decoder_layers = [layers.Dense(units, activation='relu') for units in decoder_units]
        decoder_layers.append(layers.Dense(n_features, activation=activations.sigmoid))
        self.decoder = tf.keras.Sequential(decoder_layers, name="decoder")

    def call(self, x):
        """Return the reconstruction of `x` (encode, then decode)."""
        return self.decoder(self.encoder(x))

    def set_threshold(self, x, loss):
        """Store `self.threshold` as mean + one standard deviation of the loss on `x`.

        x: samples to reconstruct.
        loss: callable applied as `loss(reconstructions, x)`.
        """
        reconstructions = self.predict(x)
        errors = loss(reconstructions, x)
        self.threshold = np.mean(errors) + np.std(errors)

    def rec_predict(self, x, loss, threshold_scaler=1):
        """Flag samples whose loss exceeds the scaled threshold.

        Requires `set_threshold` to have been called first, since it reads
        `self.threshold`.

        x: samples to reconstruct.
        loss: callable applied as `loss(reconstructions, x)`.
        threshold_scaler: multiplier applied to the stored threshold.

        Returns (flags, reconstructions, errors), where flags is the
        element-wise result of `errors > self.threshold * threshold_scaler`.
        """
        reconstructions = self.predict(x)
        errors = loss(reconstructions, x)
        exceeds = tf.math.greater(errors, self.threshold * threshold_scaler)
        return exceeds, reconstructions, errors

# One output unit per selected feature, so the reconstruction has the same width as the input.
ae_model = Autoencoder(len(selected_features))
# Trained to minimise mean squared reconstruction error with Adam at its default settings.
ae_model.compile(optimizer=optimizers.Adam(), loss=losses.MeanSquaredError())

In [None]:
# Train the autoencoder to reproduce its own input (features are both x and y).
# validation_data is passed as an (x_val, y_val) tuple, the form documented
# for Model.fit; the validation target is likewise the validation input.
history = ae_model.fit(train_data.x, train_data.x,
                       epochs=200,
                       batch_size=256,
                       shuffle=True,
                       validation_data=(val_data.x, val_data.x))

In [None]:
# Derive the anomaly threshold from the MAE reconstruction error on the training split.
ae_model.set_threshold(train_data.x, losses.mae)

In [None]:
# Flag test rows whose MAE reconstruction error exceeds twice the stored threshold.
ae_preds, ae_recs, ae_loss = ae_model.rec_predict(test_data.x, losses.mae, 2)

In [None]:
# Treat the over-threshold flags as fraud predictions and score them against the test labels.
ae_metrics = model_metrics(test_data.y, ae_preds, "AE")

In [None]:
# Display the reconstruction errors returned for the test split.
ae_loss

In [None]:
# Display the unscaled threshold computed from the training split.
ae_model.threshold