In [9]:
import pandas as pd
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, roc_auc_score, f1_score
from scipy.stats import loguniform

In [None]:

# Read the three pre-split CSV files (train / validation / hold-out) from the
# notebook's working directory, in that order.
train_df, val_df, holdout_df = (
    pd.read_csv(csv_name)
    for csv_name in ('train_data.csv', 'val_data.csv', 'holdout_test_data.csv')
)

# Every column of the training frame except the label is treated as a feature.
target_col = 'is_canceled'
feature_cols = [col for col in train_df.columns if col != target_col]

# The feature list derived from the training frame is applied to all three
# frames, so each split is indexed with exactly the same columns.
(X_train, y_train), (X_val, y_val), (X_holdout, y_holdout) = (
    (frame[feature_cols], frame[target_col])
    for frame in (train_df, val_df, holdout_df)
)

# Base estimator; C, penalty and solver are overridden by the search below.
base_clf = LogisticRegression(random_state=42, max_iter=1000)

# Search space: C is drawn log-uniformly from [1e-4, 1e2]; penalty and solver
# are drawn from the listed options.
# NOTE(review): 'saga' is generally recommended with similarly scaled features —
# confirm the CSVs are already standardized, otherwise max_iter may be hit.
param_dist = dict(
    C=loguniform(1e-4, 1e2),
    penalty=['l1', 'l2'],
    solver=['liblinear', 'saga'],
)

# 20 sampled configurations, each scored by ROC AUC with cv=5 on the training
# data only; random_state fixes which configurations are sampled.
random_search = RandomizedSearchCV(
    base_clf,
    param_dist,
    scoring='roc_auc',
    cv=5,
    n_iter=20,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, y_train)

print("Best hyperparameters:", random_search.best_params_, sep="\n")

def _print_split_metrics(split_name, y_true, y_pred, y_prob):
    """Print the classification report, ROC AUC and F1 score for one data split.

    Parameters
    ----------
    split_name : str
        Label used as the prefix of every printed line (e.g. "Validation").
    y_true : array-like
        Ground-truth labels of the split.
    y_pred : array-like
        Hard class predictions; used for the classification report and F1.
    y_prob : array-like
        Scores taken from column 1 of ``predict_proba``; used for ROC AUC.
    """
    print(f"\n{split_name} Classification Report:")
    print(classification_report(y_true, y_pred))
    print(f"{split_name} ROC AUC: {roc_auc_score(y_true, y_prob):.4f}")
    print(f"{split_name} F1 Score: {f1_score(y_true, y_pred):.4f}")


# Probability cut-off used to turn hold-out scores into hard labels.
DECISION_THRESHOLD = 0.5

# Estimator kept by the search for its best-scoring configuration.
best_clf = random_search.best_estimator_

# Validation split: hard labels come from the estimator's own predict(), and
# the ROC AUC scores from column 1 of predict_proba (presumably is_canceled == 1).
val_preds = best_clf.predict(X_val)
val_probs = best_clf.predict_proba(X_val)[:, 1]
_print_split_metrics("Validation", y_val, val_preds, val_probs)

# Hold-out split: hard labels come from an explicit cut-off on the scores,
# and the scores are also stored on holdout_df as a new column.
holdout_probs = best_clf.predict_proba(X_holdout)[:, 1]
holdout_preds = (holdout_probs >= DECISION_THRESHOLD).astype(int)
holdout_df['predicted_prob_is_canceled'] = holdout_probs
_print_split_metrics("Holdout", y_holdout, holdout_preds, holdout_probs)



Best hyperparameters:
{'C': 62.20025976819163, 'penalty': 'l2', 'solver': 'saga'}

Validation Classification Report:
              precision    recall  f1-score   support

         0.0       0.84      0.91      0.87     11203
         1.0       0.82      0.71      0.76      6704

    accuracy                           0.83     17907
   macro avg       0.83      0.81      0.82     17907
weighted avg       0.83      0.83      0.83     17907

Validation ROC AUC: 0.9112
Validation F1 Score: 0.7629

Holdout Classification Report:
              precision    recall  f1-score   support

         0.0       0.85      0.91      0.88     11274
         1.0       0.83      0.72      0.77      6635

    accuracy                           0.84     17909
   macro avg       0.84      0.81      0.82     17909
weighted avg       0.84      0.84      0.84     17909

Holdout ROC AUC: 0.9119
Holdout F1 Score: 0.7669
