In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score




In [None]:
# Read the labelled training set and the unlabelled test set from the
# working directory.
train = pd.read_csv(filepath_or_buffer='train.csv')
test = pd.read_csv(filepath_or_buffer='test.csv')



In [None]:
# Feature matrix: everything in the training frame except the row identifier
# and the two label columns.
X = train.drop(columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])
# Two-column target frame (one column per vaccine label).
y = train.loc[:, ['xyz_vaccine', 'seasonal_vaccine']]
# Keep the test identifiers aside and strip them from the test features.
test_id = test.loc[:, 'respondent_id']
X_test = test.drop(columns=['respondent_id'])



In [None]:
# Column groups are chosen by dtype: int64/float64 columns are treated as
# numeric, object columns as categorical. Columns of any other dtype end up
# in neither list.
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Numeric branch: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown='ignore' keeps transform from raising on categories
# that were not seen during fit.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])



In [None]:
# Full prediction pipeline: preprocessing followed by a random forest.
# The forest is seeded so that repeated runs (Restart Kernel -> Run All)
# build the same trees and report the same validation score; 42 matches the
# seed already used for the train/validation split.
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])



In [None]:
# Hold out 20% of the labelled rows for validation; the fixed seed makes the
# split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)



In [None]:
# Fit the preprocessing steps and the classifier on the training split only.
# y_train carries both label columns, so one fit covers both targets.
clf.fit(X=X_train, y=y_train)



In [None]:
# Class probabilities for the held-out validation rows.
# NOTE: despite the `_train` suffix, these are predictions for X_val.
y_pred_train = clf.predict_proba(X=X_val)


In [None]:

# y_pred_train holds one probability array per target, in the column order of
# y. Column 1 of each array is taken as the score for roc_auc_score
# (presumably the positive class -- this assumes 0/1 labels; confirm).
y_pred_train_xyz, y_pred_train_seasonal = (
    y_pred_train[0][:, 1],
    y_pred_train[1][:, 1],
)


In [None]:
# Score each target separately on the validation split, then report the
# unweighted mean of the two ROC AUC values.
roc_auc_xyz = roc_auc_score(
    y_true=y_val.loc[:, 'xyz_vaccine'],
    y_score=y_pred_train_xyz,
)
roc_auc_seasonal = roc_auc_score(
    y_true=y_val.loc[:, 'seasonal_vaccine'],
    y_score=y_pred_train_seasonal,
)
mean_roc_auc = np.mean((roc_auc_xyz, roc_auc_seasonal))

print(f'Mean ROC AUC Score: {mean_roc_auc}')

In [None]:
# Score the unlabelled test rows and attach the column-1 probability of each
# target to the test frame (the frame is modified in place).
y_pred_test = clf.predict_proba(X=X_test)
test['xyz_vaccine'], test['seasonal_vaccine'] = (
    y_pred_test[0][:, 1],
    y_pred_test[1][:, 1],
)

In [None]:
# Keep only the identifier and the two predicted probabilities, then write
# them out without the DataFrame index.
submission = test.loc[:, ['respondent_id', 'xyz_vaccine', 'seasonal_vaccine']]
submission.to_csv(path_or_buf='submission.csv', index=False)