In [8]:
import numpy as np
import pandas as pd
from datetime import datetime

from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, RobustScaler

from keras.models import Model
from keras.layers import Input, Dense
from keras.optimizers import SGD
from tensorflow.keras.callbacks import EarlyStopping

import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('./data/creditcard.csv')
data.columns = data.columns.str.lower()
data.head()

Unnamed: 0,time,v1,v2,v3,v4,v5,v6,v7,v8,v9,...,v21,v22,v23,v24,v25,v26,v27,v28,amount,class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
# Separando as transações legítimas (fraud = 0) e fraudulentas (fraud = 1)
X = data.drop(['time', 'class'], axis=1).copy()
y = data['class'].copy()

# Dividindo em treino e teste
X_train_main, X_test_main, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = RobustScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train_main), columns=X_train_main.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test_main), columns=X_test_main.columns)

In [4]:
X_train_scaled = X_train_scaled.reset_index().drop('index', axis=1)
X_test_scaled = X_test_scaled.reset_index().drop('index', axis=1)

y_train = pd.DataFrame(y_train).reset_index().drop('index', axis=1)['class']
y_test = pd.DataFrame(y_test).reset_index().drop('index', axis=1)['class']

In [5]:
feature_importance = ['v4',
 'v14',
 'amount',
 'v17',
 'v12',
 'v7',
 'v10',
 'v8',
 'v26',
 'v3',
 'v19',
 'v16',
 'v2',
 'v22',
 'v21',
 'v25',
 'v13',
 'v6',
 'v23',
 'v20',
 'v5',
 'v24',
 'v28',
 'v15',
 'v27',
 'v11',
 'v18',
 'v9',
 'v1']

In [6]:
# Calcule o KS
def ks_metric(y_true, y_pred_proba):
    fpr, tpr, thresholds = roc_curve(y_true, y_pred_proba)
    ks = max(tpr - fpr)
    return ks

In [20]:
results = {}
hora_inicio = datetime.now()
for features in range(1, len(feature_importance)+1):
    X_train = X_train_scaled[feature_importance[:features]].copy()
    X_test = X_test_scaled[feature_importance[:features]].copy()

    # Construindo o Autoencoder
    input_dim = X_train.shape[1]  # Número de features
    encoding_dim = 4  # Tamanho da camada latente

    input_layer = Input(shape=(input_dim,))
    encoded = Dense(encoding_dim, activation='relu')(input_layer)
    decoded = Dense(input_dim, activation='sigmoid')(encoded)

    autoencoder = Model(inputs=input_layer, outputs=decoded)
    autoencoder.compile(optimizer='adam', loss='binary_crossentropy')

    early_stopping = EarlyStopping(monitor='val_loss', patience=3)

    # Treinando o Autoencoder
    history = autoencoder.fit(X_train[y_train == 0], X_train[y_train == 0], 
                            epochs=50, 
                            batch_size=32, 
                            shuffle=True, 
                            validation_data=(X_test[y_test == 0], X_test[y_test == 0]), 
                            callbacks=[early_stopping],
                            verbose=0)
    
    # Calculando o erro de reconstrução (MSE) para o conjunto de teste
    X_train_pred = autoencoder.predict(X_train, verbose=0)
    mse_train = np.mean(np.power(X_train - X_train_pred, 2), axis=1)

    X_test_pred = autoencoder.predict(X_test, verbose=0)
    mse_test = np.mean(np.power(X_test - X_test_pred, 2), axis=1)

    scaler = MinMaxScaler()
    y_train_pred_proba = scaler.fit_transform(mse_train.values.reshape(-1, 1)).ravel()
    y_test_pred_proba = scaler.transform(mse_test.values.reshape(-1, 1)).ravel()

    # Performance do modelo sem idade e score de mercado
    ks_train = ks_metric(y_train, y_train_pred_proba)
    ks_test = ks_metric(y_test, y_test_pred_proba)

    results[features] = {
    'KS Treino': ks_train,
    'KS Teste': ks_test}

    print(f'Features: {features} | KS Treino: {ks_train:.4f} | KS Teste: {ks_test:.4f} | Tempo (min): {round(((datetime.now() - hora_inicio).seconds/60), 2)}')

final_results_df = pd.DataFrame(results)
final_results_df.T

Features: 1 | KS Treino: 0.5263 | KS Teste: 0.4640 | Tempo (min): 6.5
Features: 2 | KS Treino: 0.8475 | KS Teste: 0.8617 | Tempo (min): 12.73
Features: 3 | KS Treino: 0.8204 | KS Teste: 0.8535 | Tempo (min): 18.65
Features: 4 | KS Treino: 0.8325 | KS Teste: 0.8634 | Tempo (min): 24.67
Features: 5 | KS Treino: 0.8316 | KS Teste: 0.8709 | Tempo (min): 30.47
Features: 6 | KS Treino: 0.8373 | KS Teste: 0.8661 | Tempo (min): 36.4
Features: 7 | KS Treino: 0.8349 | KS Teste: 0.8693 | Tempo (min): 45.38
Features: 8 | KS Treino: 0.8267 | KS Teste: 0.8703 | Tempo (min): 55.23
Features: 9 | KS Treino: 0.8287 | KS Teste: 0.8714 | Tempo (min): 61.85
Features: 10 | KS Treino: 0.8283 | KS Teste: 0.8707 | Tempo (min): 68.78
Features: 11 | KS Treino: 0.8281 | KS Teste: 0.8689 | Tempo (min): 75.58
Features: 12 | KS Treino: 0.8281 | KS Teste: 0.8672 | Tempo (min): 82.73
Features: 13 | KS Treino: 0.8261 | KS Teste: 0.8614 | Tempo (min): 89.6
Features: 14 | KS Treino: 0.8243 | KS Teste: 0.8577 | Tempo (min

Unnamed: 0,KS Treino,KS Teste
1,0.526273,0.463965
2,0.847451,0.861666
3,0.820413,0.853507
4,0.832459,0.863376
5,0.831589,0.870864
6,0.837278,0.866098
7,0.834939,0.869303
8,0.826703,0.870274
9,0.828683,0.871382
10,0.828279,0.870718


In [21]:
results = {}
hora_inicio = datetime.now()
lista = feature_importance[:1]
ks_final = 0
for i in range(1, len(feature_importance)+1):

    lista_desafiante = lista + feature_importance[i:i+1]

    X_train = X_train_scaled[lista_desafiante].copy()
    X_test = X_test_scaled[lista_desafiante].copy()

    # Construindo o Autoencoder
    input_dim = X_train.shape[1]  # Número de features
    encoding_dim = 4  # Tamanho da camada latente

    input_layer = Input(shape=(input_dim,))
    encoded = Dense(encoding_dim, activation='tanh')(input_layer)
    decoded = Dense(input_dim, activation='sigmoid')(encoded)

    autoencoder = Model(inputs=input_layer, outputs=decoded)
    autoencoder.compile(optimizer='rmsprop', loss='mean_squared_error')

    early_stopping = EarlyStopping(monitor='val_loss', patience=3)

    # Treinando o Autoencoder
    history = autoencoder.fit(X_train[y_train == 0], X_train[y_train == 0], 
                            epochs=50, 
                            batch_size=32, 
                            shuffle=True, 
                            validation_data=(X_test[y_test == 0], X_test[y_test == 0]), 
                            callbacks=[early_stopping],
                            verbose=0)
    
    # Calculando o erro de reconstrução (MSE) para o conjunto de teste
    X_train_pred = autoencoder.predict(X_train, verbose=0)
    mse_train = np.mean(np.power(X_train - X_train_pred, 2), axis=1)

    X_test_pred = autoencoder.predict(X_test, verbose=0)
    mse_test = np.mean(np.power(X_test - X_test_pred, 2), axis=1)

    scaler = MinMaxScaler()
    y_train_pred_proba = scaler.fit_transform(mse_train.values.reshape(-1, 1)).ravel()
    y_test_pred_proba = scaler.transform(mse_test.values.reshape(-1, 1)).ravel()

    # Performance do modelo sem idade e score de mercado
    ks_train = ks_metric(y_train, y_train_pred_proba)
    ks_test = ks_metric(y_test, y_test_pred_proba)

    if ks_train > ks_final:
        ks_final = ks_train
        lista = lista_desafiante

    results[i] = {
    'KS Treino': ks_train,
    'KS Teste': ks_test}

    print(f'Feature: {i} | KS Treino: {ks_train:.4f} | KS Teste: {ks_test:.4f} | Tempo (min): {round(((datetime.now() - hora_inicio).seconds/60), 2)}')

final_results_propagation_df = pd.DataFrame(results)
final_results_propagation_df.T

Feature: 1 | KS Treino: 0.8492 | KS Teste: 0.8508 | Tempo (min): 5.43
Feature: 2 | KS Treino: 0.8179 | KS Teste: 0.8495 | Tempo (min): 10.97
Feature: 3 | KS Treino: 0.8607 | KS Teste: 0.8720 | Tempo (min): 15.03
Feature: 4 | KS Treino: 0.8531 | KS Teste: 0.8644 | Tempo (min): 20.0
Feature: 5 | KS Treino: 0.8489 | KS Teste: 0.8662 | Tempo (min): 24.73
Feature: 6 | KS Treino: 0.8510 | KS Teste: 0.8685 | Tempo (min): 29.63
Feature: 7 | KS Treino: 0.8332 | KS Teste: 0.8740 | Tempo (min): 38.57
Feature: 8 | KS Treino: 0.8601 | KS Teste: 0.8642 | Tempo (min): 43.55
Feature: 9 | KS Treino: 0.8586 | KS Teste: 0.8622 | Tempo (min): 48.02
Feature: 10 | KS Treino: 0.8572 | KS Teste: 0.8737 | Tempo (min): 52.73
Feature: 11 | KS Treino: 0.8496 | KS Teste: 0.8641 | Tempo (min): 55.65
Feature: 12 | KS Treino: 0.8507 | KS Teste: 0.8536 | Tempo (min): 58.68
Feature: 13 | KS Treino: 0.8598 | KS Teste: 0.8678 | Tempo (min): 61.68
Feature: 14 | KS Treino: 0.8395 | KS Teste: 0.8629 | Tempo (min): 65.32
Fea

Unnamed: 0,KS Treino,KS Teste
1,0.849199,0.850777
2,0.817879,0.849479
3,0.860698,0.871989
4,0.85311,0.864445
5,0.848944,0.866243
6,0.850985,0.86849
7,0.833181,0.874012
8,0.860108,0.864199
9,0.858626,0.862229
10,0.857167,0.873713


In [23]:
lista

['v4', 'v14', 'v17']

In [24]:
X_train = X_train_scaled[lista].copy()
X_test = X_test_scaled[lista].copy()

# Construindo o Autoencoder
input_dim = X_train.shape[1]  # Número de features
encoding_dim = 4  # Tamanho da camada latente

input_layer = Input(shape=(input_dim,))
encoded = Dense(encoding_dim, activation='tanh')(input_layer)
decoded = Dense(input_dim, activation='sigmoid')(encoded)

autoencoder = Model(inputs=input_layer, outputs=decoded)
autoencoder.compile(optimizer='rmsprop', loss='mean_squared_error')

early_stopping = EarlyStopping(monitor='val_loss', patience=3)

# Treinando o Autoencoder
history = autoencoder.fit(X_train[y_train == 0], X_train[y_train == 0], 
                        epochs=50, 
                        batch_size=32, 
                        shuffle=True, 
                        validation_data=(X_test[y_test == 0], X_test[y_test == 0]), 
                        callbacks=[early_stopping],
                        verbose=0)

# Calculando o erro de reconstrução (MSE) para o conjunto de teste
X_train_pred = autoencoder.predict(X_train, verbose=0)
mse_train = np.mean(np.power(X_train - X_train_pred, 2), axis=1)

X_test_pred = autoencoder.predict(X_test, verbose=0)
mse_test = np.mean(np.power(X_test - X_test_pred, 2), axis=1)

scaler = MinMaxScaler()

df_train = X_train.copy()
df_train['score'] = scaler.fit_transform(mse_train.values.reshape(-1, 1)).ravel()
df_train['target'] = y_train

df_test = X_test.copy()
df_test['score'] = scaler.transform(mse_test.values.reshape(-1, 1)).ravel()
df_test['target'] = y_test

In [39]:
X_train = X_train_scaled[lista].copy()
X_test = X_test_scaled[lista].copy()

# Construindo o Autoencoder
optimizer = SGD(learning_rate=4.2423550697632125e-05)

input_dim = X_train.shape[1]  # Número de features
encoding_dim = 10  # Tamanho da camada latente

input_layer = Input(shape=(input_dim,))
encoded = Dense(encoding_dim, activation='sigmoid')(input_layer)
decoded = Dense(input_dim, activation='sigmoid')(encoded)

autoencoder = Model(inputs=input_layer, outputs=decoded)

autoencoder.compile(optimizer=optimizer, loss='binary_crossentropy')
early_stopping = EarlyStopping(monitor='val_loss', patience=3)

# Treinando o Autoencoder
history = autoencoder.fit(X_train[y_train == 0], X_train[y_train == 0], 
                        epochs=50, 
                        batch_size=64, 
                        shuffle=True, 
                        validation_data=(X_test[y_test == 0], X_test[y_test == 0]), 
                        callbacks=[early_stopping],
                        verbose=0)

# Calculando o erro de reconstrução (MSE) para o conjunto de teste
X_train_pred = autoencoder.predict(X_train, verbose=0)
mse_train = np.mean(np.power(X_train - X_train_pred, 2), axis=1)

X_test_pred = autoencoder.predict(X_test, verbose=0)
mse_test = np.mean(np.power(X_test - X_test_pred, 2), axis=1)

scaler = MinMaxScaler()

df_train = X_train.copy()
df_train['score'] = scaler.fit_transform(mse_train.values.reshape(-1, 1)).ravel()
df_train['target'] = y_train

df_test = X_test.copy()
df_test['score'] = scaler.transform(mse_test.values.reshape(-1, 1)).ravel()
df_test['target'] = y_test

In [45]:
ks_train = ks_metric(df_train['target'], df_train['score'])
ks_test = ks_metric(df_test['target'], df_test['score'])

print(f'KS Treino: {ks_train:.4f} | KS Teste: {ks_test:.4f}')

KS Treino: 0.8682 | KS Teste: 0.8866


In [41]:
bins = [0, 0.025, 0.04, 0.08, 1]
labels = ['Baixo', 'Médio', 'Alto', 'Altíssimo']

df_train['score_binned'] = pd.cut(df_train['score'], bins=bins, labels=labels)

df_train.groupby('score_binned').agg({'target': ['sum', 'count']})

  df_train.groupby('score_binned').agg({'target': ['sum', 'count']})


Unnamed: 0_level_0,target,target
Unnamed: 0_level_1,sum,count
score_binned,Unnamed: 1_level_2,Unnamed: 2_level_2
Baixo,63,226613
Médio,18,645
Alto,80,269
Altíssimo,233,317


In [42]:
df_test['score_binned'] = pd.cut(df_test['score'], bins=bins, labels=labels)

df_test.groupby('score_binned').agg({'target': ['sum', 'count']})

  df_test.groupby('score_binned').agg({'target': ['sum', 'count']})


Unnamed: 0_level_0,target,target
Unnamed: 0_level_1,sum,count
score_binned,Unnamed: 1_level_2,Unnamed: 2_level_2
Baixo,14,56654
Médio,4,151
Alto,21,80
Altíssimo,59,77
