In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_parquet('../data/01_raw/metrics_df.parquet')


In [3]:
labels_df = pd.read_csv("../data/01_raw/accounts_labels.csv").set_index('account_id')

metrics_df = df.join(labels_df, how='inner')

print(metrics_df.head())

            in_degree  out_degree   total_in  total_out  avg_retention_hours  \
account_id                                                                     
acc014958           4           2   15828.16    2693.18           159.031944   
acc032333           2           2    3246.27    3525.78          -368.813750   
acc021462           0           2       0.00   31889.71          9999.000000   
acc002326           4           4   83173.59   72816.52           -48.398333   
acc004656          23           1  438247.08    1438.85          -908.078333   

               ratio  is_suspect  is_fraud    role  
account_id                                          
acc014958   0.170151           0         0  Honest  
acc032333   1.086099           0         0  Honest  
acc021462   0.000000           0         0  Honest  
acc002326   0.875476           0         1    Mule  
acc004656   0.003283           0         1    Boss  


In [4]:
X = metrics_df[['in_degree', 'out_degree', 'total_in', 'ratio', 'avg_retention_hours']]

y = metrics_df['is_fraud'] # Em uma situação real essa informação não estaria disponível, mas já que eu tenho...

In [5]:
# Conjunto de treino == 70%
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=12345)

In [6]:
# Conjunto de Validação == 15%
# Conjunto de Teste == 15%
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=12345)


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import roc_auc_score, recall_score, precision_score

In [8]:
# Regressão Logística
model_lr = LogisticRegression(max_iter=10000, class_weight='balanced', random_state=12345)
model_lr.fit(X_train, y_train)
probs_lr = model_lr.predict_proba(X_val)[:, 1] # Para auc-roc
preds_lr = model_lr.predict(X_val)             # Para precision e recall
auc_lr = roc_auc_score(y_val, probs_lr)
recall_lr = recall_score(y_val, preds_lr)
precision_lr = precision_score(y_val, preds_lr)

print('---Métricas de Regressão Logística---')
print(f"AUC-ROC Score: {auc_lr:.2f}")
print(f"Recall Score: {recall_lr:.2f}")
print(f"Precision Score: {precision_lr:.2f}")

---Métricas de Regressão Logística---
AUC-ROC Score: 0.98
Recall Score: 0.92
Precision Score: 0.75


In [9]:
# Random Forest
model_rf = RandomForestClassifier(class_weight='balanced', random_state=12345)
model_rf.fit(X_train, y_train)
probs_rf = model_rf.predict_proba(X_val)[:, 1]
preds_rf = model_rf.predict(X_val)
auc_rf = roc_auc_score(y_val, probs_rf)
recall_rf = recall_score(y_val, preds_rf)
precision_rf = precision_score(y_val, preds_rf)

print('---Métricas de Random Forest---')
print(f"AUC-ROC Score: {auc_rf:.2f}")
print(f"Recall Score: {recall_rf:.2f}")
print(f"Precision Score: {precision_rf:.2f}")

---Métricas de Random Forest---
AUC-ROC Score: 0.97
Recall Score: 0.88
Precision Score: 0.97


In [10]:
# XGB
model_xgb = XGBClassifier(
    n_estimators=100, 
    random_state=12345, 
    learning_rate=0.05,
    reg_alpha=1.0, # diminui peso de features muito fortes
    reg_lambda=1.0
)
model_xgb.fit(X_train, y_train)
probs_xgb = model_xgb.predict_proba(X_val)[:, 1]
preds_xgb = model_xgb.predict(X_val)
auc_xgb = roc_auc_score(y_val, probs_xgb)
recall_xgb = recall_score(y_val, preds_xgb)
precision_xgb = precision_score(y_val, preds_xgb)

print('---Métricas de XGB---')
print(f"AUC-ROC Score: {auc_xgb:.2f}")
print(f"Recall Score: {recall_xgb:.2f}")
print(f"Precision Score: {precision_xgb:.2f}")


---Métricas de XGB---
AUC-ROC Score: 0.98
Recall Score: 0.88
Precision Score: 0.97


In [11]:
probs_test = model_xgb.predict_proba(X_test)[:, 1]
preds_test = model_xgb.predict(X_test)

auc_test = roc_auc_score(y_test, probs_test)
recall_test = recall_score(y_test, preds_test)
precision_test = precision_score(y_test, preds_test)

print('---Métricas de Teste para XGB---')
print(f"AUC-ROC Score: {auc_test:.2f}")
print(f"Recall Score: {recall_test:.2f}")
print(f"Precision Score: {precision_test:.2f}")

---Métricas de Teste para XGB---
AUC-ROC Score: 0.99
Recall Score: 0.92
Precision Score: 0.98


In [None]:
# Exportando em csv para Visualização e Power BI
best_model = model_xgb

all_probs = best_model.predict_proba(X)[:, 1]
all_preds = best_model.predict(X)

df_export = X.copy()
df_export['probabilidade_fraude'] = all_probs
df_export['predicao_classe'] = all_preds
df_export['is_fraud_real'] = y

cols_extras = ['role']
df_export = df_export.join(metrics_df[cols_extras], how='left')

df_export.to_csv('../data/01_raw/resultados_finais_modelagem.csv', index=True)


✅ Arquivo atualizado com as colunas necessárias!
