# Objetivo

...

# Análise dos dados

...

# Modelagem

...

In [1]:
import pandas as pd
import numpy as np
from ydata_profiling import ProfileReport

from sklearn.model_selection import train_test_split

from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import roc_curve, roc_auc_score

from sklearn.tree import DecisionTreeClassifier
import lightgbm as lgbm
import xgboost as xgb

from sklearn.model_selection import GridSearchCV

pd.set_option('display.max_colwidth', None)


In [2]:
df = pd.read_csv('./data/creditcard.csv')
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


# Análise das variáveis preditivas e limpeza de dados

In [3]:
# Anotações:

# As variáveis são fruto de um PCA segundo a descrição do dataset. Também é bastante claro que o dado está normalizado.
# Não há missings, mas algumas variáveis têm outliers. Como vamos fazer modelos baseados em árvores com limitação de número de casos por nó, 
# o tratamento de outlier se torna desnecessário.

In [4]:
profile = ProfileReport(df, minimal=True)
profile

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [5]:
df.columns = df.columns.str.lower()

# Modelagem

### Bases de treino e teste

In [6]:
X = df[['v1', 'v2', 'v3', 'v4', 'v5', 'v6', 'v7', 'v8', 'v9', 'v10',
       'v11', 'v12', 'v13', 'v14', 'v15', 'v16', 'v17', 'v18', 'v19', 'v20',
       'v21', 'v22', 'v23', 'v24', 'v25', 'v26', 'v27', 'v28', 'amount']].copy()

y = df['class'].copy()

In [100]:
# O shuffle está como False porque a base está ordenada pelo tempo, e é do meu interesse que a base de teste tenha os dados mais recentes.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [8]:
def ks_metric(y_true, y_pred_proba):
    fpr, tpr, thresholds = roc_curve(y_true, y_pred_proba)
    ks = max(tpr - fpr)
    return ks

### Modelagem com Árvore simples

In [9]:
tree = DecisionTreeClassifier(
    criterion='gini', 
    min_samples_leaf=int(0.05 * len(X_train)),  # Garantir que cada nó tenha pelo menos 5% dos casos
    max_depth=None,
    random_state=42)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
roc_auc_scores = cross_val_score(tree, X_train, y_train, cv=cv, scoring='roc_auc')
tree.fit(X_train, y_train)

### Otimização de hiperparâmetros dos algortimos Decision Tree, LightGBM e XGBoost

# Decision Tree
dt_param_grid = {
    'min_samples_leaf': [int(0.005 * len(X_train)), int(0.02 * len(X_train)), int(0.05 * len(X_train))],
    'criterion':['gini']
}

# LightGBM
lgb_param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 500],
    'min_data_in_leaf': [int(0.02 * len(X_train)), int(0.05 * len(X_train))],
    'feature_fraction': [0.8, 1.0],
    'bagging_fraction': [0.8, 1.0]
}

# XGBoost
xgb_param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 500],
    'min_child_weight': [int(0.02 * len(X_train)), int(0.05 * len(X_train))],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'eval_metric': ['auc']
}

search_results = {}

for name, model, param_grid in zip(
    ['Decision Tree', 'LightGBM', 'XGBoost'],
    [DecisionTreeClassifier(random_state=42), lgbm.LGBMClassifier(random_state=42), xgb.XGBClassifier(random_state=42, use_label_encoder=False)],
    [dt_param_grid, lgb_param_grid, xgb_param_grid]
):
    grid_search = GridSearchCV(
        model,
        param_grid=param_grid,
        scoring='roc_auc',
        n_jobs=-1,
        cv=5,
        verbose=1
    )

    grid_search.fit(X_train, y_train)
    
    search_results[name] = {
        'Best Params': grid_search.best_params_,
        'Best Score (AUC)': grid_search.best_score_
    }

search_results_df = pd.DataFrame(search_results)
search_results_df

In [10]:
best_params_dt = {'criterion': 'gini', 'min_samples_leaf': 4556}

best_params_lgbm = {'bagging_fraction': 0.8,
 'feature_fraction': 1.0,
 'learning_rate': 0.05,
 'min_data_in_leaf': 11392,
 'n_estimators': 500}

best_params_xgb = {'colsample_bytree': 0.8,
 'eval_metric': 'auc',
 'gamma': 0,
 'learning_rate': 0.01,
 'min_child_weight': 4556,
 'n_estimators': 100,
 'subsample': 1.0}

best_params_dt.update(
    {'random_state':42}
)

best_params_lgbm.update(
    {'random_state':42}
)

best_params_xgb.update(
    {'random_state':42}
)

### Modelagem com hiperparâmetros ajustados

In [11]:
# Anotações sobre modelagem com hiperparâmetros ajustados:

# O LightGBM teve um excelente resultado, com alta possibilidade de overfitting. Por mais que a volumetria de dados esteja bem limitada, 
# tentemos analisar os 3 modelos numa visão "ao longo do tempo", separando a base de teste em 2.

In [12]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

models = {
    'Decision Tree': DecisionTreeClassifier(**best_params_dt),
    'LightGBM': lgbm.LGBMClassifier(**best_params_lgbm),
    'XGBoost': xgb.XGBClassifier(**best_params_xgb)
}

results = {}
for name, model in models.items():
    auc_scores = []
    ks_scores = []
    for train_index, val_index in skf.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
        y_tr, y_val = y_train.iloc[train_index], y_train.iloc[val_index]
        
        model.fit(X_tr, y_tr)
        y_val_pred = model.predict_proba(X_val)[:, 1]
        
        # AUC
        auc = roc_auc_score(y_val, y_val_pred)
        auc_scores.append(auc)
        
        # KS
        ks = ks_metric(y_val, y_val_pred)
        ks_scores.append(ks)
    

    # Avaliar no conjunto de treino
    model.fit(X_train, y_train)
    y_train_pred = model.predict_proba(X_train)[:, 1]
    
    train_auc = roc_auc_score(y_train, y_train_pred)
    train_ks = ks_metric(y_train, y_train_pred)
    
    # Avaliar no conjunto de teste
    y_test_pred = model.predict_proba(X_test)[:, 1]
    
    test_auc = roc_auc_score(y_test, y_test_pred)
    test_ks = ks_metric(y_test, y_test_pred)
    
    results[name] = {
        'AUC CV': np.mean(auc_scores),
        'AUC Treino': train_auc,
        'AUC Teste': test_auc,
        'KS CV': np.mean(ks_scores),
        'KS Treino': train_ks,
        'KS Teste': test_ks
    }

final_results_df = pd.DataFrame(results)
final_results_df



Unnamed: 0,Decision Tree,LightGBM,XGBoost
AUC CV,0.95814,0.982035,0.967613
AUC Treino,0.978781,1.0,0.973774
AUC Teste,0.953372,0.984481,0.960118
KS CV,0.849644,0.908806,0.857152
KS Treino,0.851725,1.0,0.854262
KS Teste,0.800527,0.890239,0.82468


### Análise de estabilidade

In [None]:
# Anotações:

# Todos os 3 modelos possuem instabilidade entre os conjuntos de Teste 1 e 2, o que não é um cenário ideal.
# A sugestão é usar o LightGBM por sua clara melhor performance e acompanhá-lo ao longo do tempo, até porque mesmo no pior cenário temos 
# uma taxa de detection de 63% impactando apenas 0,0527% das transações.

In [14]:
df['set'] = 'nulo'
df.loc[X_train.index, 'set'] = 'train'
df.loc[df.iloc[-round(len(X_test)/2):].index, 'set'] = 'test_1'
df.loc[df.iloc[-len(X_test):-round(len(X_test)/2)].index, 'set'] = 'test_2'

In [43]:
df['score_lightgbm'] = models['LightGBM'].predict_proba(df[X_train.columns])[:, 1]*1000
df['score_decision_tree'] = models['Decision Tree'].predict_proba(df[X_train.columns])[:, 1]*1000
df['score_xgboost'] = models['XGBoost'].predict_proba(df[X_train.columns])[:, 1]*1000

In [97]:
bins = [0, 10, 900, 1000]
labels = ['Baixo', 'Médio', 'Alto']

df['score_lightgbm_binned'] = pd.cut(df['score_lightgbm'], bins=bins, labels=labels)

df_grouped = df.groupby(['set', 'score_lightgbm_binned'], observed=False).agg(
    tx_maus=('class', lambda x: x.sum() / x.count()),  
    qtd_casos=('class', 'count'),  
    detection=('class', 'sum')  
)

total_qtd_casos_por_set = df.groupby('set')['class'].count()
df_grouped['perc_casos'] = df_grouped['qtd_casos'] / df_grouped.index.get_level_values('set').map(total_qtd_casos_por_set)

total_maus_por_set = df.groupby('set')['class'].sum()
df_grouped['detection'] = df_grouped['detection'] / df_grouped.index.get_level_values('set').map(total_maus_por_set)

df_grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,tx_maus,qtd_casos,detection,perc_casos
set,score_lightgbm_binned,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
test_1,Baixo,0.000211,28441,0.272727,0.998596
test_1,Médio,0.08,25,0.090909,0.000878
test_1,Alto,0.933333,15,0.636364,0.000527
test_2,Baixo,0.000352,28425,0.188679,0.998034
test_2,Médio,0.277778,18,0.09434,0.000632
test_2,Alto,1.0,38,0.716981,0.001334
train,Baixo,0.0,227373,0.0,0.997928
train,Médio,0.0,55,0.0,0.000241
train,Alto,1.0,417,1.0,0.00183


In [98]:
bins = [0, 2, 75, 1000]
labels = ['Baixo', 'Médio', 'Alto']

df['score_decision_tree_binned'] = pd.cut(df['score_decision_tree'], bins=bins, labels=labels)

df_grouped = df.groupby(['set', 'score_decision_tree_binned'], observed=False).agg(
    tx_maus=('class', lambda x: x.sum() / x.count()),  
    qtd_casos=('class', 'count'),  
    detection=('class', 'sum')  
)

total_qtd_casos_por_set = df.groupby('set')['class'].count()
df_grouped['perc_casos'] = df_grouped['qtd_casos'] / df_grouped.index.get_level_values('set').map(total_qtd_casos_por_set)

total_maus_por_set = df.groupby('set')['class'].sum()
df_grouped['detection'] = df_grouped['detection'] / df_grouped.index.get_level_values('set').map(total_maus_por_set)

df_grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,tx_maus,qtd_casos,detection,perc_casos
set,score_decision_tree_binned,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
test_1,Baixo,0.000638,6265,0.181818,0.219971
test_1,Médio,0.000764,1309,0.045455,0.04596
test_1,Alto,0.022378,715,0.727273,0.025104
test_2,Baixo,0.000622,6435,0.075472,0.22594
test_2,Médio,0.001555,1286,0.037736,0.045153
test_2,Alto,0.055156,834,0.867925,0.029283
train,Baixo,0.000703,54078,0.091127,0.237346
train,Médio,0.002268,8820,0.047962,0.038711
train,Alto,0.078797,4556,0.860911,0.019996


In [99]:
bins = [0, 47.6, 47.819, 1000]
labels = ['Baixo', 'Médio', 'Alto']

df['score_xgboost_binned'] = pd.cut(df['score_xgboost'], bins=bins, labels=labels)

df_grouped = df.groupby(['set', 'score_xgboost_binned'], observed=False).agg(
    tx_maus=('class', lambda x: x.sum() / x.count()),  
    qtd_casos=('class', 'count'),  
    detection=('class', 'sum')  
)

total_qtd_casos_por_set = df.groupby('set')['class'].count()
df_grouped['perc_casos'] = df_grouped['qtd_casos'] / df_grouped.index.get_level_values('set').map(total_qtd_casos_por_set)

total_maus_por_set = df.groupby('set')['class'].sum()
df_grouped['detection'] = df_grouped['detection'] / df_grouped.index.get_level_values('set').map(total_maus_por_set)

df_grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,tx_maus,qtd_casos,detection,perc_casos
set,score_xgboost_binned,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
test_1,Baixo,0.000249,28144,0.318182,0.988168
test_1,Médio,0.0,166,0.0,0.005828
test_1,Alto,0.087719,171,0.681818,0.006004
test_2,Baixo,0.000355,28138,0.188679,0.987957
test_2,Médio,0.01227,163,0.037736,0.005723
test_2,Alto,0.227778,180,0.773585,0.00632
train,Baixo,0.000271,224747,0.146283,0.986403
train,Médio,0.008963,1562,0.033573,0.006856
train,Alto,0.222656,1536,0.820144,0.006741


In [17]:
feature_importance_df = pd.DataFrame({
    'Feature': models['LightGBM'].feature_name_,
    'Importance': models['LightGBM'].feature_importances_
}).sort_values(by='Importance', ascending=False)

In [18]:
feature_importance_df['Feature'].to_list()

['v4',
 'v14',
 'amount',
 'v12',
 'v10',
 'v17',
 'v8',
 'v1',
 'v26',
 'v20',
 'v28',
 'v19',
 'v7',
 'v21',
 'v9',
 'v16',
 'v25',
 'v3',
 'v11',
 'v13',
 'v24',
 'v15',
 'v22',
 'v18',
 'v5',
 'v2',
 'v23',
 'v27',
 'v6']