In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score

# Read the training features and the matching labels from the working directory.
# NOTE(review): features and labels are paired purely by row position here —
# confirm both CSVs list respondents in the same order.
data = pd.read_csv('training_set_features.csv')
dataL = pd.read_csv('training_set_labels.csv')

# Model inputs: every feature column except the row identifier.
X = data.drop('respondent_id', axis=1)
# Model targets: the two vaccine label columns, kept together as a 2-column frame.
y = dataL.loc[:, ['xyz_vaccine', 'seasonal_vaccine']]

# Split the feature columns by dtype so each group gets its own preprocessing.
# Any numeric dtype counts as numerical (not only int64/float64) and every
# remaining column counts as categorical. Matching on the exact dtype names
# would leave e.g. int32, float32, category or dedicated string columns in
# neither list, so they would never reach the model.
numerical_cols = X.select_dtypes(include='number').columns.tolist()
categorical_cols = X.select_dtypes(exclude='number').columns.tolist()

# Numerical features: fill gaps with the column median, then standardize.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode. Categories not seen during fitting are ignored at transform time
# instead of raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Full model: preprocessing followed by one random forest per target column
# (MultiOutputClassifier fits a separate copy of the estimator for each label).
# The forest is seeded so that Restart & Run All reproduces the same fitted
# model and the same reported score; without a seed every run differs.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', MultiOutputClassifier(RandomForestClassifier(random_state=42)))
])

# Hold out 20% of the rows for validation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the preprocessing steps and the per-target classifiers on the training part.
model.fit(X_train, y_train)

# predict_proba returns one (n_samples, n_classes) array per target. Keep the
# column at index 1 of each (the positive class, assuming 0/1 labels) and
# transpose so the result is (n_samples, n_targets).
y_pred_proba = np.array(
    [proba[:, 1] for proba in model.predict_proba(X_test)]
).T

# Score each target separately, in the same column order as y, then average.
roc_auc_xyz, roc_auc_seasonal = (
    roc_auc_score(y_test[target], y_pred_proba[:, idx])
    for idx, target in enumerate(['xyz_vaccine', 'seasonal_vaccine'])
)
mean_roc_auc = (roc_auc_xyz + roc_auc_seasonal) / 2

print(f'Mean ROC AUC: {mean_roc_auc}')

# Score the unlabeled test set with the fitted pipeline.
Test_csv = pd.read_csv('test_set_features.csv')
test_proba = model.predict_proba(Test_csv.drop('respondent_id', axis=1))

# One probability array per target; take the index-1 class column of each
# (target 0 -> xyz_vaccine, target 1 -> seasonal_vaccine).
test_xyz_vaccine_proba, test_seasonal_vaccine_proba = (
    test_proba[target_idx][:, 1] for target_idx in (0, 1)
)

# Assemble the submission: identifier first, then one probability column per vaccine.
submission = pd.DataFrame({'respondent_id': Test_csv['respondent_id']})
submission['xyz_vaccine'] = test_xyz_vaccine_proba
submission['seasonal_vaccine'] = test_seasonal_vaccine_proba

# Write without the index so the file holds exactly the three columns above.
submission.to_csv('submission.csv', index=False)


Mean ROC AUC: 0.8386894867095358
