In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, loguniform

from sklearn.metrics import (
    classification_report, 
    roc_auc_score, 
    roc_curve,
    confusion_matrix,
    ConfusionMatrixDisplay,
    precision_recall_curve,
    PrecisionRecallDisplay
)


In [0]:
# Path of the CSV file that holds the dataset used throughout the notebook.
DATA_PATH = '/Workspace/Users/nagahamasami@gmail.com/creditas-case/data/processed/clean-dataset.csv'

# Load the file into a DataFrame and preview its first rows as the cell output.
df = pd.read_csv(DATA_PATH)
df.head()

In [0]:
# Name of the label column the model is trained to predict.
TARGET = 'sent_to_analysis'

# Columns kept out of the predictor matrix: the label itself plus 'id' and
# 'pre_approved'.
# NOTE(review): presumably 'id' is a row identifier and 'pre_approved' would
# leak information about the label — confirm against the data dictionary.
columns_to_exclude = [TARGET, 'id', 'pre_approved']

X = df.drop(columns=columns_to_exclude)
y = df[TARGET]

In [0]:
# Hold out 25% of the rows for evaluation. The fixed seed makes the partition
# repeatable, and stratifying on y keeps the label proportions similar in both
# partitions.
split_options = dict(test_size=0.25, random_state=42, stratify=y)

X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [0]:
# Partition the predictor columns by dtype: numeric columns go to one list and
# everything else is treated as categorical.
numeric_columns = X.select_dtypes(include=np.number).columns
categorical_columns = X.select_dtypes(exclude=np.number).columns

numeric_features = numeric_columns.tolist()
categorical_features = categorical_columns.tolist()

# Report how many columns landed in each group.
print(f"\n{len(numeric_features)} features numéricas.")
print(f"{len(categorical_features)} features categóricas.")

In [0]:
# Numeric branch of the preprocessing: fill missing values with the column
# median, then standardise each column.
median_imputer = SimpleImputer(strategy='median')
standard_scaler = StandardScaler()

numeric_transformer = Pipeline(steps=[
    ('imputer', median_imputer),
    ('scaler', standard_scaler),
])

In [0]:
# Categorical branch of the preprocessing: fill missing values with the most
# frequent category, then one-hot encode.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # `drop` is intentionally not set. With handle_unknown='ignore' an unseen
    # category is encoded as an all-zero row; if the first category were also
    # dropped, that same all-zero row would represent the baseline category,
    # so unseen values would silently be scored as the baseline (and older
    # scikit-learn releases reject the combination outright). Keeping every
    # indicator column makes "unknown" distinguishable from every known level.
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [0]:
# Route each group of columns through its own transformer. Any column not
# listed in either group is passed through unchanged (remainder='passthrough').
column_branches = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]

preprocessor = ColumnTransformer(
    transformers=column_branches,
    remainder='passthrough',
)

In [0]:
# Hyperparameters of the logistic-regression classifier.
# class_weight='balanced' reweights samples inversely to class frequency,
# the seed keeps the solver reproducible, and max_iter caps solver iterations.
logreg_params = {
    'class_weight': 'balanced',
    'random_state': 42,
    'max_iter': 2000,
    'solver': 'liblinear',
}

logreg_model = LogisticRegression(**logreg_params)

In [0]:
# Chain preprocessing and the classifier so both are fitted together and the
# same transformations are applied at prediction time.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', logreg_model),
]

pipeline = Pipeline(steps=pipeline_steps)

In [0]:
# Fit the preprocessing steps and the classifier on the training partition only.
# The fitted pipeline is the cell's last expression, so the notebook displays it.
pipeline.fit(X=X_train, y=y_train)


In [0]:
# Score the held-out test set with the fitted `pipeline`.
# No hyperparameter-search object is built anywhere in this notebook, so the
# predictions come from `pipeline`, and they are bound to `y_pred` /
# `y_pred_proba`, the names the evaluation cells below read.
y_pred = pipeline.predict(X_test)

# predict_proba returns one column per class, ordered as `pipeline.classes_`;
# column 1 is the probability of the second class.
# NOTE(review): assumes a binary target whose positive class is classes_[1] — confirm.
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

In [0]:
# Per-class precision, recall and F1 on the test partition, printed as text.
report_text = classification_report(y_test, y_pred)
print(report_text)

In [0]:
# Area under the ROC curve, computed from the predicted probabilities on the
# test partition; `auc_score` is reused in the ROC plot legend.
auc_score = roc_auc_score(y_true=y_test, y_score=y_pred_proba)
print(f"AUC no conjunto de teste: {auc_score:.4f}")

In [0]:
# Confusion matrix of test-set predictions, drawn with the class labels the
# fitted pipeline exposes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
class_labels = pipeline.classes_

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot(cmap='Blues')

plt.title('Matriz de Confusão - Regressão Logística')
plt.show()

In [0]:
# ROC curve on the test partition; the thresholds returned by roc_curve are
# not needed for the plot and are discarded.
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)

roc_label = f'Curva ROC (AUC = {auc_score:.2f})'

plt.figure(figsize=(8, 6))
# Model curve.
plt.plot(fpr, tpr, color='darkorange', lw=2, label=roc_label)
# Diagonal reference line (a classifier with no discriminative power).
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

plt.title('Curva ROC')
plt.xlabel('Taxa de Falsos Positivos')
plt.ylabel('Taxa de Verdadeiros Positivos')
plt.legend(loc="lower right")
plt.show()

In [0]:
# Precision-recall curve on the test partition, drawn on an explicitly created
# axes so the title is set on the same axes the curve is plotted on.
fig, ax = plt.subplots(figsize=(8, 6))
PrecisionRecallDisplay.from_predictions(
    y_test,
    y_pred_proba,
    name="Logistic Regression",
    ax=ax,
)
ax.set_title('Curva Precision-Recall')
plt.show()