In [23]:
# Importação das bibliotecas necessárias
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, roc_auc_score, cohen_kappa_score, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, roc_curve, auc

# Fixando a seed para reprodutibilidade
np.random.seed(42)

In [24]:
# Função para calcular o R^2 para classificadores
def pseudo_r2(y_true, y_pred_proba):
    ll_null = np.sum(np.log(np.mean(y_true) * np.ones(len(y_true))))  # log-likelihood nula
    ll_model = np.sum(np.log(y_pred_proba))  # log-likelihood do modelo
    return 1 - (ll_model / ll_null)

In [25]:
# Função para avaliar o modelo
def evaluate_model(y_true, y_pred, y_pred_proba):
    accuracy = accuracy_score(y_true, y_pred)
    
    # AUC para multiclasse, especificando 'ovr' (one-vs-rest)
    try:
        auc_score = roc_auc_score(y_true, y_pred_proba, multi_class='ovr')
    except ValueError:
        auc_score = None  # Se o cálculo do AUC não for possível para algum caso específico
    
    cohen_k = cohen_kappa_score(y_true, y_pred)
    r2 = pseudo_r2(y_true, y_pred_proba)

    return accuracy, auc_score, cohen_k, r2

In [26]:
# Carregar os dados
df = pd.read_csv('../data/weaving_rejection_dataset_updated.csv')

In [27]:
# Separando as variáveis independentes (X) e a variável dependente (y)
X = df.drop('Rejection', axis=1)
y = df['Rejection']

In [28]:
# Dividindo os dados em treino e teste com seed fixa
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [29]:
# Escalando as variáveis
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [30]:
### 1. Modelo de Regressão Logística sem PCA
log_reg = LogisticRegression(random_state=42, multi_class='ovr', solver='lbfgs')  # multiclasse suportada
log_reg.fit(X_train_scaled, y_train)

LogisticRegression(multi_class='ovr', random_state=42)

In [31]:
# Previsão
y_pred = log_reg.predict(X_test_scaled)
y_pred_proba = log_reg.predict_proba(X_test_scaled)[:, 1]  # Probabilidade da classe positiva

In [32]:
# Avaliação do modelo
accuracy, auc_score, cohen_k, r2 = evaluate_model(y_test, y_pred, y_pred_proba)

In [35]:
# Avaliação do modelo sem PCA
print(f"### Regressão Logística sem PCA ###")
print(f"Accuracy: {accuracy:.4f}")
if auc_score is not None:
    print(f"AUC: {auc_score:.4f}")
else:
    print("AUC: N/A")
print(f"Cohen's Kappa: {cohen_k:.4f}")
print(f"Pseudo R²: {r2:.4f}")

### Regressão Logística sem PCA ###
Accuracy: 0.3159
AUC: N/A
Cohen's Kappa: 0.1032
Pseudo R²: 2.5736


In [36]:
### 2. Modelo de Regressão Logística com PCA
# Aplicar PCA
pca = PCA(n_components=0.95, random_state=42)  # Mantém 95% da variância
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [37]:
# Treinar o modelo com PCA
log_reg_pca = LogisticRegression(random_state=42)
log_reg_pca.fit(X_train_pca, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(random_state=42)

In [38]:
# Previsão
y_pred_pca = log_reg_pca.predict(X_test_pca)
y_pred_proba_pca = log_reg_pca.predict_proba(X_test_pca)[:, 1]

In [39]:
# Avaliação do modelo com PCA
accuracy_pca, auc_score_pca, cohen_k_pca, r2_pca = evaluate_model(y_test, y_pred_pca, y_pred_proba_pca)

In [40]:
# Avaliação do modelo com PCA
print(f"\n### Regressão Logística com PCA ###")
print(f"Accuracy: {accuracy_pca:.4f}")
if auc_score_pca is not None:
    print(f"AUC: {auc_score_pca:.4f}")
else:
    print("AUC: N/A")
print(f"Cohen's Kappa: {cohen_k_pca:.4f}")
print(f"Pseudo R²: {r2_pca:.4f}")


### Regressão Logística com PCA ###
Accuracy: 0.3265
AUC: N/A
Cohen's Kappa: 0.1201
Pseudo R²: 2.8211
