In [24]:
import pandas as pd
import optuna
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import VotingClassifier

In [25]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

train['BMI'] = train['weight(kg)'] / (train['height(cm)'] / 100) ** 2
test['BMI'] = test['weight(kg)'] / (test['height(cm)'] / 100) ** 2

X = train.drop(columns=['id', 'smoking'])
y = train['smoking']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [26]:
scaler = StandardScaler()
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_val_poly = poly.transform(X_test)
X_train_scaled = scaler.fit_transform(X_train_poly)
X_val_scaled = scaler.transform(X_val_poly)

In [27]:
def objective_log_reg(trial):
    C = trial.suggest_loguniform('C', 1e-4, 1e2)
    solver = trial.suggest_categorical('solver', ['liblinear', 'lbfgs'])
    max_iter = trial.suggest_int('max_iter', 100, 1000)
    
    model = LogisticRegression(C=C, solver=solver, max_iter=max_iter, random_state=42)
    score = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='roc_auc').mean()
    return score

def objective_rf(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 500)
    max_depth = trial.suggest_int('max_depth', 5, 30)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    
    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                   min_samples_split=min_samples_split, random_state=42)
    score = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='roc_auc').mean()
    return score

In [28]:
study_log_reg = optuna.create_study(direction='maximize')
study_log_reg.optimize(objective_log_reg, n_trials=20)
best_params_log_reg = study_log_reg.best_params

[I 2024-10-29 20:14:27,950] A new study created in memory with name: no-name-03450f55-23e9-4d5b-9e91-9734cca220b7
  C = trial.suggest_loguniform('C', 1e-4, 1e2)
[I 2024-10-29 20:14:36,832] Trial 0 finished with value: 0.8765843654886 and parameters: {'C': 0.059385380328492846, 'solver': 'liblinear', 'max_iter': 416}. Best is trial 0 with value: 0.8765843654886.
  C = trial.suggest_loguniform('C', 1e-4, 1e2)
[I 2024-10-29 20:14:42,520] Trial 1 finished with value: 0.8756177440872079 and parameters: {'C': 0.028647937136126712, 'solver': 'liblinear', 'max_iter': 540}. Best is trial 0 with value: 0.8765843654886.
  C = trial.suggest_loguniform('C', 1e-4, 1e2)
[I 2024-10-29 20:14:44,103] Trial 2 finished with value: 0.864359605856167 and parameters: {'C': 0.00016102703496309823, 'solver': 'liblinear', 'max_iter': 693}. Best is trial 0 with value: 0.8765843654886.
  C = trial.suggest_loguniform('C', 1e-4, 1e2)
[I 2024-10-29 20:14:54,311] Trial 3 finished with value: 0.8769035864580432 and pa

In [29]:
study_rf = optuna.create_study(direction='maximize')
study_rf.optimize(objective_rf, n_trials=20)
best_params_rf = study_rf.best_params

[I 2024-10-29 20:18:13,824] A new study created in memory with name: no-name-b93a8863-e798-448a-9fbf-3cf08c5516a1
[I 2024-10-29 20:19:33,501] Trial 0 finished with value: 0.8793370117901234 and parameters: {'n_estimators': 369, 'max_depth': 7, 'min_samples_split': 5}. Best is trial 0 with value: 0.8793370117901234.
[I 2024-10-29 20:21:02,816] Trial 1 finished with value: 0.88004816683103 and parameters: {'n_estimators': 214, 'max_depth': 19, 'min_samples_split': 3}. Best is trial 1 with value: 0.88004816683103.
[I 2024-10-29 20:23:14,671] Trial 2 finished with value: 0.8821465445225908 and parameters: {'n_estimators': 403, 'max_depth': 11, 'min_samples_split': 4}. Best is trial 2 with value: 0.8821465445225908.
[I 2024-10-29 20:26:49,728] Trial 3 finished with value: 0.8799813514036092 and parameters: {'n_estimators': 498, 'max_depth': 30, 'min_samples_split': 8}. Best is trial 2 with value: 0.8821465445225908.
[I 2024-10-29 20:28:28,048] Trial 4 finished with value: 0.8794375708747445

In [30]:
best_log_reg = LogisticRegression(**best_params_log_reg, random_state=42)
best_rf = RandomForestClassifier(**best_params_rf, random_state=42)

ensemble_model = VotingClassifier(
    estimators=[('log_reg', best_log_reg), ('rf', best_rf)],
    voting='soft'
)

In [31]:
ensemble_model.fit(X_train_scaled, y_train)
y_val_pred_prob = ensemble_model.predict_proba(X_val_scaled)[:, 1]
roc_auc = roc_auc_score(y_test, y_val_pred_prob)
print(f"Validation ROC-AUC Score: {roc_auc}")       

Validation ROC-AUC Score: 0.8850685012120832


In [40]:
test = test.drop(columns="id", axis=1)

In [41]:
X_test = test.copy()
X_val_poly = poly.transform(X_test)
X_val_scaled = scaler.transform(X_val_poly)

y_proba = ensemble_model.predict_proba(X_val_scaled)[:,1]

In [42]:
submission['smoking'] = y_proba
submission.to_csv('submission_1.csv', index=False)