In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score


# Read the four input CSVs; the relative paths resolve against the current
# working directory.
training_set_features = pd.read_csv('training_set_features.csv')
training_set_labels = pd.read_csv('training_set_labels.csv')
test_set_features = pd.read_csv('test_set_features.csv')
submission_format = pd.read_csv('submission_format.csv')

# Attach the label columns to the training features, matching rows on
# 'respondent_id'. merge() defaults to an inner join, so a respondent that
# appears in only one of the two frames is dropped.
train_data = training_set_features.merge(training_set_labels, on='respondent_id')


# Columns routed to the categorical branch of the preprocessor.
# The order is significant: it determines the order of the encoded features.
categorical_cols = [
    'hhs_geo_region',
    'census_msa',
    'employment_industry',
    'employment_occupation',
    'age_group',
    'education',
    'race',
    'sex',
    'income_poverty',
    'marital_status',
    'rent_or_own',
    'employment_status',
]

# Columns routed to the numerical branch of the preprocessor.
# The order is significant here as well.
numerical_cols = [
    # xyz_* concern / knowledge columns
    'xyz_concern',
    'xyz_knowledge',
    # behavioral_* columns
    'behavioral_antiviral_meds',
    'behavioral_avoidance',
    'behavioral_face_mask',
    'behavioral_wash_hands',
    'behavioral_large_gatherings',
    'behavioral_outside_home',
    'behavioral_touch_face',
    # doctor_recc_* columns
    'doctor_recc_xyz',
    'doctor_recc_seasonal',
    # health-related columns
    'chronic_med_condition',
    'child_under_6_months',
    'health_worker',
    'health_insurance',
    # opinion_* columns
    'opinion_xyz_vacc_effective',
    'opinion_xyz_risk',
    'opinion_xyz_sick_from_vacc',
    'opinion_seas_vacc_effective',
    'opinion_seas_risk',
    'opinion_seas_sick_from_vacc',
    # household_* columns
    'household_adults',
    'household_children',
]


# Numeric branch: fill missing values with the column median, then
# standardise with StandardScaler.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' lets categories that were not
# seen during fit pass through transform instead of raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column list through its branch. No `remainder` is given, so
# columns named in neither list are left to ColumnTransformer's default.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Wrap the random forest so that it can be fitted against several target
# columns at once; the fixed random_state keeps runs reproducible.
model = MultiOutputClassifier(
    RandomForestClassifier(n_estimators=100, random_state=42)
)

# End-to-end estimator: preprocessing followed by the classifier.
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', model),
])


# Targets are the two vaccine columns; features are every other column of
# train_data except the respondent identifier.
target_cols = ['xyz_vaccine', 'seasonal_vaccine']
y = train_data[target_cols]
X = train_data.drop(columns=['respondent_id'] + target_cols)

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit preprocessing and classifier together on the training portion only.
clf.fit(X_train, y_train)


# y_pred is indexed per target below: element 0 is scored against
# 'xyz_vaccine' and element 1 against 'seasonal_vaccine'. Column 1 of each
# array is used as the positive-class probability (assumes the labels are
# 0/1 -- confirm against the label file).
y_pred = clf.predict_proba(X_val)

val_auc = {
    target: roc_auc_score(y_val[target], proba[:, 1])
    for target, proba in zip(['xyz_vaccine', 'seasonal_vaccine'], y_pred)
}
roc_auc_xyz = val_auc['xyz_vaccine']
roc_auc_seasonal = val_auc['seasonal_vaccine']

# Unweighted mean of the two per-target scores.
mean_roc_auc = (roc_auc_xyz + roc_auc_seasonal) / 2

print(f'ROC AUC for xyz_vaccine: {roc_auc_xyz}')
print(f'ROC AUC for seasonal_vaccine: {roc_auc_seasonal}')
print(f'Mean ROC AUC: {mean_roc_auc}')


# Remember the ids before the column is dropped, so each prediction can be
# matched to its submission row by id instead of by row position.
test_ids = pd.Index(test_set_features['respondent_id'])
test_set_features = test_set_features.drop(columns=['respondent_id'])
test_preds = clf.predict_proba(test_set_features)

# One probability column per target (column 1 of each per-target array),
# indexed by the respondent the row belongs to.
test_proba = pd.DataFrame(
    {
        'xyz_vaccine': test_preds[0][:, 1],
        'seasonal_vaccine': test_preds[1][:, 1],
    },
    index=test_ids,
)

# If the submission template carries its own ids and they are not in the same
# order as the test features, re-order the predictions to match. Writing by
# position in that situation would silently attach probabilities to the wrong
# respondents. When the template has no id column, or the order already
# matches, the values are written positionally as before.
if 'respondent_id' in submission_format.columns:
    submission_ids = pd.Index(submission_format['respondent_id'])
    if not submission_ids.equals(test_ids):
        test_proba = test_proba.reindex(submission_ids)
        if test_proba.isna().any().any():
            raise ValueError(
                'submission_format contains respondent_id values that have '
                'no matching row in test_set_features'
            )

# .to_numpy() forces positional assignment; test_proba is indexed by id while
# submission_format keeps its own row index.
submission_format['xyz_vaccine'] = test_proba['xyz_vaccine'].to_numpy()
submission_format['seasonal_vaccine'] = test_proba['seasonal_vaccine'].to_numpy()

# Write the completed template without the DataFrame index column.
submission_format.to_csv('submission.csv', index=False)


ROC AUC for xyz_vaccine: 0.8321716713309634
ROC AUC for seasonal_vaccine: 0.8518754513402149
Mean ROC AUC: 0.8420235613355891
