In [4]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification

# Generate synthetic data
X, y = make_classification(n_samples=400, n_features=20, n_informative=10, n_redundant=5, random_state=42)

# Split the data into training (70%), validation (10%), and test (20%) sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42, stratify=y_train_val)

# Define the range of L2 regularization strengths
C_param_range = [10**-6, 10**-5, 10**-4, 10**-3, 10**-2, 10**-1, 1, 10, 100, 1000, 10000, 100000, 1000000]

# Function to get AUC-ROC score
def get_auc(model, X, y):
    y_prob = model.predict_proba(X)[:, 1]
    return roc_auc_score(y, y_prob)

# Initialize variables to store the best model and results
best_auc = 0
best_C = None
best_model = None

# Train logistic regression models with different C values
for C in C_param_range:
    model = LogisticRegression(C=C, max_iter=3000, random_state=42)
    model.fit(X_train, y_train)
    
    # Evaluate on validation set
    val_auc = get_auc(model, X_val, y_val)
    
    if val_auc > best_auc:
        best_auc = val_auc
        best_C = C
        best_model = model

# Retrain the best model on the combined training and validation set
best_model.fit(X_train_val, y_train_val)

# Evaluate the best model on the test set
test_auc = get_auc(best_model, X_test, y_test)

# Print the results
print(f"Best C: {best_C}")
print(f"Validation AUC: {best_auc}")
print(f"Test AUC: {test_auc}")


Best C: 10
Validation AUC: 0.796875
Test AUC: 0.885
