In [34]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import roc_auc_score

In [3]:
train_features = pd.read_csv('training_set_features.csv')
test_features = pd.read_csv('test_set_features.csv')
train_labels = pd.read_csv('training_set_labels.csv')

In [4]:
train_data = pd.merge(train_features, train_labels, on='respondent_id')

In [5]:
target_columns = ['xyz_vaccine', 'seasonal_vaccine']
X = train_data.drop(columns=['respondent_id'] + target_columns)
y = train_data[target_columns]

In [6]:
categorical_cols=train_data.select_dtypes(include=['object']).columns
ordinal_cols = ['xyz_concern', 'xyz_knowledge', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk',
                'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective', 'opinion_seas_risk',
                'opinion_seas_sick_from_vacc']
binary_cols = ['behavioral_antiviral_meds','behavioral_avoidance','behavioral_face_mask','behavioral_wash_hands',
             'behavioral_large_gatherings','behavioral_outside_home','behavioral_touch_face','doctor_recc_xyz',
             'doctor_recc_seasonal','chronic_med_condition','child_under_6_months','health_worker','health_insurance']

In [7]:
transform = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='median'), binary_cols + ordinal_cols),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_cols)
    ])

In [8]:
categorical_nb = CategoricalNB()

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
y_train1 = y_train['xyz_vaccine']
y_train2 = y_train['seasonal_vaccine']

In [11]:
y_test1 = y_test['xyz_vaccine']
y_test2 = y_test['seasonal_vaccine']

In [25]:
pipeline_categorical = Pipeline(steps=[
    ('transform', transform),
    ('classifier', categorical_nb)
])

In [26]:
pipeline_categorical.fit(X_train, y_train1)
pipeline_categorical.fit(X_train, y_train2)

In [27]:
categorical_y_xyz = pipeline_categorical.predict(X_test)
categorical_y_seasonal = pipeline_categorical.predict(X_test)

In [28]:
categorical_roc_auc_xyz = roc_auc_score(y_test1, categorical_y_xyz)
categorical_roc_auc_seasonal = roc_auc_score(y_test2, categorical_y_seasonal)
mean_roc_auc_categorical = (categorical_roc_auc_xyz + categorical_roc_auc_seasonal) / 2

In [29]:
print(f"ROC AUC for xyz_vaccine: {categorical_roc_auc_xyz}")
print(f"ROC AUC for seasonal_vaccine: {categorical_roc_auc_seasonal}")
print(f"Mean ROC AUC: {mean_roc_auc_categorical}")

ROC AUC for xyz_vaccine: 0.6715927522712184
ROC AUC for seasonal_vaccine: 0.7475640506186916
Mean ROC AUC: 0.709578401444955


In [30]:
for_pred = test_features.drop(columns=['respondent_id'])

In [31]:
xyz_pred = pipeline_categorical.predict_proba(for_pred)[:,0]
seasonal_pred = pipeline_categorical.predict_proba(for_pred)[:,1]

In [32]:
submission3 = pd.DataFrame({
    'respondent_id': test_features['respondent_id'],
    'xyz_vaccine': xyz_pred,
    'seasonal_vaccine': seasonal_pred
})

In [33]:
submission3.to_csv('submission3.csv', index=False)