In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_parquet('../data/01_raw/metrics_df.parquet')


In [3]:
labels_df = pd.read_csv("../data/01_raw/accounts_labels.csv").set_index('account_id')

metrics_df = df.join(labels_df, how='inner')

print(metrics_df.head())

            in_degree  out_degree  total_in  total_out  avg_retention_hours  \
account_id                                                                    
acc599093           8           4  44956.33   33279.32           138.147569   
acc194554           6           5  10606.53   25456.37            22.116546   
acc729547           3           5   3789.99   33196.35           168.574259   
acc933654           9           6  65752.06    6654.72          -313.236806   
acc838332           5           2   9436.99    1159.69            89.337556   

               ratio  is_suspect  is_fraud    role  
account_id                                          
acc599093   0.740259           0         0  Honest  
acc194554   2.400064           0         0  Honest  
acc729547   8.758931           0         0  Honest  
acc933654   0.101209           0         0  Honest  
acc838332   0.122888           0         0  Honest  


In [6]:
X = metrics_df[['in_degree', 'out_degree', 'total_in', 'ratio', 'avg_retention_hours']]

y = metrics_df['is_fraud'] # Em uma situação real essa informação não estaria disponível, mas já que eu tenho...

In [7]:
# Conjunto de treino == 70%
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=12345)

In [8]:
# Conjunto de Validação == 15%
# Conjunto de Teste == 15%
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=12345)


In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import roc_auc_score, recall_score, precision_score

In [10]:
# Regressão Logística
model_lr = LogisticRegression(max_iter=10000, class_weight='balanced', random_state=12345)
model_lr.fit(X_train, y_train)
probs_lr = model_lr.predict_proba(X_val)[:, 1] # Para auc-roc
preds_lr = model_lr.predict(X_val)             # Para precision e recall
auc_lr = roc_auc_score(y_val, probs_lr)
recall_lr = recall_score(y_val, preds_lr)
precision_lr = precision_score(y_val, preds_lr)

print('---Métricas de Regressão Logística---')
print(f"AUC-ROC Score: {auc_lr:.2f}")
print(f"Recall Score: {recall_lr:.2f}")
print(f"Precision Score: {precision_lr:.2f}")

---Métricas de Regressão Logística---
AUC-ROC Score: 1.00
Recall Score: 1.00
Precision Score: 0.82


In [11]:
# Random Forest
model_rf = RandomForestClassifier(class_weight='balanced', random_state=12345)
model_rf.fit(X_train, y_train)
probs_rf = model_rf.predict_proba(X_val)[:, 1]
preds_rf = model_rf.predict(X_val)
auc_rf = roc_auc_score(y_val, probs_rf)
recall_rf = recall_score(y_val, preds_rf)
precision_rf = precision_score(y_val, preds_rf)

print('---Métricas de Random Forest---')
print(f"AUC-ROC Score: {auc_rf:.2f}")
print(f"Recall Score: {recall_rf:.2f}")
print(f"Precision Score: {precision_rf:.2f}")

---Métricas de Random Forest---
AUC-ROC Score: 1.00
Recall Score: 1.00
Precision Score: 0.82


In [12]:
# XGBM
model_xgb = XGBClassifier(n_estimators=100, random_state=12345, learning_rate=0.05)
model_xgb.fit(X_train, y_train)
probs_xgb = model_xgb.predict_proba(X_val)[:, 1]
preds_xgb = model_xgb.predict(X_val)
auc_xgb = roc_auc_score(y_val, probs_xgb)
recall_xgb = recall_score(y_val, preds_xgb)
precision_xgb = precision_score(y_val, preds_xgb)

print('---Métricas de XGBM---')
print(f"AUC-ROC Score: {auc_xgb:.2f}")
print(f"Recall Score: {recall_xgb:.2f}")
print(f"Precision Score: {precision_xgb:.2f}")


---Métricas de XGBM---
AUC-ROC Score: 1.00
Recall Score: 0.95
Precision Score: 0.93


In [15]:
probs_test = model_xgb.predict_proba(X_test)[:, 1]
preds_test = model_xgb.predict(X_test)

auc_test = roc_auc_score(y_test, probs_test)
recall_test = recall_score(y_test, preds_test)
precision_test = precision_score(y_test, preds_test)

print('---Métricas de Teste para XGBM---')
print(f"AUC-ROC Score: {auc_test:.2f}")
print(f"Recall Score: {recall_test:.2f}")
print(f"Precision Score: {precision_test:.2f}")

---Métricas de Teste para XGBM---
AUC-ROC Score: 1.00
Recall Score: 0.96
Precision Score: 0.91
