In [None]:
import os, sys
import tensorflow as tf, seaborn as sns
from sklearn import feature_selection as skfs
import matplotlib.pyplot as plt
import statsmodels.api as sm

# Fetch the plasma helper repo into the working directory unless a `plasma`
# path already exists there (the `!` line is an IPython shell escape, so this
# cell only runs inside IPython/Jupyter).
# NOTE(review): the clone is not pinned to a tag or commit, so the imported
# code can change between runs — consider pinning for reproducibility.
if not os.path.exists("plasma"):
    !git clone https://github.com/sequenzia/plasma.git

# These imports must come after the clone above, which is why they are not
# grouped with the other imports at the top of the cell.
import plasma
from plasma import utils

# Build the plasma application object and take its first registered dataset.
# NOTE(review): presumably a pandas DataFrame (it is later indexed by column
# lists and passed to scikit-learn) — confirm against plasma.App.
app = plasma.App()
data = app.datasets[0]

### Helper Functions

In [None]:
def plot_corr_fn(data):
    """Show an annotated heatmap of the Spearman correlation matrix of `data`.

    `data` must provide a pandas-style ``corr(method=...)``. The figure is
    displayed immediately; nothing is returned.
    """
    # 25x14 inch canvas; annot=True writes each coefficient into its cell.
    plt.figure(figsize=(25, 14))
    spearman_matrix = data.corr(method='spearman')
    sns.heatmap(spearman_matrix, annot=True, cmap='YlGnBu')
    plt.show()

def corr_output_fn(data, cols, top_n=20):
    """Plot the Spearman correlation heatmap for `cols` and print the top pairs.

    Parameters
    ----------
    data : object supporting ``data[cols]`` and a pandas-style ``corr``
        Source table.
    cols : list
        Column labels to correlate with each other.
    top_n : int, default 20
        How many column pairs to print after the heatmap.

    Returns
    -------
    None
        The heatmap is shown and the ranking is printed.

    Notes
    -----
    Pairs are ranked by the *signed* coefficient in descending order, so
    strongly negative correlations end up at the bottom of the ranking, not
    the top.
    """
    corr = data[cols].corr(method='spearman')
    labels = list(corr.columns)

    # The matrix is symmetric with a unit diagonal, so visit every unordered
    # pair exactly once (upper triangle) instead of filtering mirrored keys.
    pairs = [
        ((first, second), corr.at[second, first])
        for pos, first in enumerate(labels)
        for second in labels[pos + 1:]
    ]

    ranked = sorted(pairs, key=lambda kv: kv[1], reverse=True)

    plot_corr_fn(data[cols])

    print(f'\nTop {top_n} Correlated Features\n')
    for pair, coeff in ranked[:top_n]:
        print(f'{pair}: {coeff:.7f}')

def compare_means(data_1, data_2, alpha, test_type='t'):
    """One-sided two-sample t-test or z-test of means.

    H0 (null):        data_1.mean == data_2.mean
    HA (alternative): data_1.mean >  data_2.mean

    Reject the null when p-value < alpha.

    Parameters
    ----------
    data_1, data_2 : array-like
        Samples handed to ``sm.stats.DescrStatsW``.
    alpha : float
        Significance level. It is only used to compute ``pvals - alpha``;
        the function itself does not make the reject/accept decision.
    test_type : str, default 't'
        ``'z'`` runs the z-test; any other value runs the t-test.

    Returns
    -------
    dict with keys
        ``data_1_stats``, ``data_2_stats`` : the ``DescrStatsW`` wrappers,
        ``compare``     : the ``CompareMeans`` object built from them,
        ``tstats``      : test statistic (z statistic when ``test_type='z'``),
        ``pvals``       : one-sided p-value,
        ``pvals_alpha`` : ``pvals - alpha`` (negative means reject H0),
        ``cls``         : confidence level in percent, ``(1 - pvals) * 100``.
    """
    data_1_stats = sm.stats.DescrStatsW(data_1)
    data_2_stats = sm.stats.DescrStatsW(data_2)

    compare = sm.stats.CompareMeans(data_1_stats, data_2_stats)

    if test_type == 'z':
        tstats, pvals = compare.ztest_ind(alternative='larger')
    else:
        # ttest_ind also returns the degrees of freedom, which are not needed.
        tstats, pvals, _ = compare.ttest_ind(alternative='larger')

    return {"data_1_stats": data_1_stats,
            "data_2_stats": data_2_stats,
            "compare": compare,
            "tstats": tstats,
            "pvals": pvals,
            "pvals_alpha": pvals - alpha,
            "cls": (1 - pvals) * 100}

def features_anova_test(df=None, feature_cols=None, target_col=None, alpha=.05):
    """Run a per-feature ANOVA F-test against the target and report the results.

    For every feature the test statistic and p-value from
    ``skfs.f_classif`` are printed together with the decision at level
    `alpha`. Features for which the null hypothesis is rejected
    (p-value < alpha) are collected and printed at the end.

    Parameters
    ----------
    df : table supporting ``df[columns]``, optional
        Defaults to the notebook-level ``data``.
    feature_cols : list, optional
        Feature column labels. Defaults to the notebook-level ``x_cols``.
    target_col : label, optional
        Target column. Defaults to ``y_cols[0]`` from the notebook.
    alpha : float, default .05
        Significance level for the reject / fail-to-reject decision.

    Returns
    -------
    None
        Results are printed.
    """
    # Fall back to the notebook globals so the zero-argument call keeps
    # working; they are resolved at call time, not at definition time.
    df = data if df is None else df
    feature_cols = x_cols if feature_cols is None else feature_cols
    target_col = y_cols[0] if target_col is None else target_col

    f_stats, p_vals = skfs.f_classif(df[feature_cols], df[target_col])

    selected_features = {'features': [], 'test_vals': [], 'p_vals': []}
    for col, cur_test_val, cur_p_val in zip(feature_cols, f_stats, p_vals):

        print(f"Feature: {col} | test_stat = {cur_test_val:.2f} p_val = {cur_p_val:.5f}")

        if cur_p_val < alpha:
            print("Reject Null Hypothesis")
            # Only features whose null hypothesis is rejected are selected;
            # previously the non-significant ones were collected instead.
            selected_features['features'].append(col)
            selected_features['test_vals'].append(cur_test_val)
            selected_features['p_vals'].append(cur_p_val)
        else:
            print("Fail to reject Null Hypothesis")

        print("\n")

    print(selected_features)


### Feature Selection

In [None]:
# NOTE(review): presumably configures notebook/display options through the
# plasma helper; what the argument `5` controls is not visible here — confirm
# against plasma.utils.set_options.
utils.set_options(5)

# Candidate input columns: `Time` and `Amount` followed by V1 .. V28.
x_cols = ['Time', 'Amount'] + [f'V{idx}' for idx in range(1, 29)]

# Target column, kept as a one-element list (it is indexed as y_cols[0]).
y_cols = ['Class']

In [None]:
# Univariate filter: score every candidate column with the ANOVA F-test
# against the target and keep the top 25 percent.
selector_anova_tp = skfs.SelectPercentile(skfs.f_classif, percentile=25)
selector_anova_tp.fit(data[x_cols], data[y_cols[0]])

# Names of the columns that survived the filter, as a plain list.
selected_features = list(selector_anova_tp.get_feature_names_out())

In [None]:
# Show the column names kept by the percentile selector.
print(selected_features)

['V3', 'V7', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17']


### Split into Train, Val, Test

In [None]:
# Partition the selected feature columns and the label column with the plasma
# helper, then run plasma's preprocessing over the resulting splits.
# NOTE(review): `[0, .15]` looks like the (validation, test) fractions — the
# debug output reports Val 0.00 / Test 0.15, i.e. the validation split is
# empty — confirm against utils.split_data.
split_data = utils.split_data(
    data,
    [0, .15],
    [selected_features, y_cols],
    pre_shuffle=True,
    to_numpy=False,
    pos_dist=False,
    debug_on=True,
)
split_data = utils.preprocess_data(split_data)

# Convenience handles for the individual partitions.
train_data, val_data, test_data = (
    split_data['train'],
    split_data['val'],
    split_data['test'],
)


Train Pos: 418 | 0.85 || Val Pos: 0 | 0.00 || Test Pos: 74 | 0.15

Total Records: 284807 | Train: 0.85 | Val: 0.00 | Test: 0.15
