# Treinamento do modelo final para produção

## Instalação de Dependências (se necessário)

In [None]:
# Optional one-off environment setup: uncomment to run.
# `%pip` (rather than `! pip`) installs into the environment of the running kernel.
# NOTE(review): the original line named `scikit-optimizer`; presumably the
# `scikit-optimize` package was meant — confirm before relying on it.
# `imbalanced-learn` is the PyPI distribution that provides the `imblearn` module.
# %pip install scikit-optimize
# %pip install imbalanced-learn

## Importação de Bibliotecas

In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, precision_recall_curve, average_precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from joblib import dump, load
from imblearn.over_sampling import SMOTE


## Carregamento e Pré-processamento dos Dados

In [2]:
# Optional: download the Kaggle "mlg-ulb/creditcardfraud" archive to ../data/creditcardfraud.zip
# (uncomment to run; only needed when the archive is not already present).
# !curl -L -o ../data/creditcardfraud.zip https://www.kaggle.com/api/v1/datasets/download/mlg-ulb/creditcardfraud

In [3]:
# Optional: extract the archive (uncomment to run).
# `-d ../data` places the extracted files in ../data, which is where the
# loading cell reads creditcard.csv from; without it unzip extracts into the
# current working directory.
# !unzip ../data/creditcardfraud.zip -d ../data

In [4]:
# Read the raw dataset into a DataFrame; the path is relative to the notebook.
df = pd.read_csv(filepath_or_buffer='../data/creditcard.csv')

In [5]:
# Escalar as colunas 'Amount' e 'Time'
scaler = StandardScaler()
df['Amount'] = scaler.fit_transform(df['Amount'].values.reshape(-1, 1))
df['Time'] = scaler.fit_transform(df['Time'].values.reshape(-1, 1))

In [6]:
# Separar features (X) e target (y)
X = df.drop('Class', axis=1)
y = df['Class']

In [7]:
# Dividir os dados em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

## Balanceamento de Dados com SMOTE

In [8]:
# Oversample with SMOTE using a fixed seed. Only the training split is passed
# in, so the test split is not resampled.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

## Configuração do Modelo XGBoost

In [11]:

# Load the previously saved hyperparameters from the models directory.
# joblib files are pickle-based: only load artifacts produced by this project.
best_params = load(filename='../models/best_params.joblib')
print("Hiperparâmetros carregados com sucesso:", best_params)

Hiperparâmetros carregados com sucesso: OrderedDict({'colsample_bytree': 0.5980275811968299, 'gamma': 0.8442537170001781, 'learning_rate': 0.2825056172779145, 'max_depth': 10, 'min_child_weight': 6, 'n_estimators': 457, 'reg_alpha': 0.00010779280104817701, 'reg_lambda': 0.009484560648525507, 'scale_pos_weight': 70.79417166558567, 'subsample': 0.7943312655651524})


In [12]:
# Settings that are fixed for the final model. The hyperparameters loaded into
# `best_params` are merged in through keyword expansion; a key present in both
# mappings raises TypeError, exactly as a repeated keyword argument would.
# NOTE(review): `best_params` includes `scale_pos_weight` while the model is
# trained on SMOTE-resampled data — confirm that applying both is intended.
fixed_settings = {
    'objective': 'binary:logistic',
    'eval_metric': 'aucpr',
    'random_state': 42,
    'n_jobs': -1,
}
xgb_model = xgb.XGBClassifier(**fixed_settings, **best_params)

In [13]:
# Train on the resampled training data. The call is left as the last
# expression of the cell, so the notebook displays whatever `fit` returns.
xgb_model.fit(X=X_resampled, y=y_resampled)

## Avaliação do Modelo

In [16]:
# Score the held-out test split; column 1 of predict_proba is taken as the
# score of the positive class.
test_probabilities = xgb_model.predict_proba(X_test)
y_pred_proba = test_probabilities[:, 1]

# ROC AUC and average precision are computed from the raw scores.
auc = roc_auc_score(y_test, y_pred_proba)
ap = average_precision_score(y_test, y_pred_proba)

# Recall needs hard labels: a row counts as positive when its score exceeds 0.5.
recall = recall_score(y_test, y_pred_proba > 0.5)

print(f"AUC no conjunto de teste: {auc:.4f}")
print(f"Average Precision no conjunto de teste: {ap:.4f}")
print(f"Recall no conjunto de teste: {recall:.4f}")

AUC no conjunto de teste: 0.9799
Average Precision no conjunto de teste: 0.8717
Recall no conjunto de teste: 0.8776


## Salvando o Modelo e os Hiperparâmetros

In [17]:
# Write the trained model and the hyperparameters it was built with to the
# models directory, in that order.
artifacts = {
    '../models/final_model.joblib': xgb_model,
    '../models/best_params.joblib': best_params,
}
for artifact_path, artifact in artifacts.items():
    dump(artifact, artifact_path)
print("Modelo e hiperparâmetros salvos com sucesso!")

Modelo e hiperparâmetros salvos com sucesso!


In [9]:
# Export every split to ../data as CSV (one file per split, without the index).
# The mapping drives both the file names and the confirmation message, so the
# message always lists exactly the files that were written. The previous
# hard-coded message omitted X_test and y_test.
splits = {
    'X_train': X_train,
    'y_train': y_train,
    'X_resampled': X_resampled,
    'y_resampled': y_resampled,
    'X_test': X_test,
    'y_test': y_test,
}
for split_name, split in splits.items():
    split.to_csv(f'../data/{split_name}.csv', index=False)

split_names = list(splits)
print(f"{', '.join(split_names[:-1])} e {split_names[-1]} salvos com sucesso!")


X_train, y_train, X_resampled e y_resampled salvos com sucesso!
