In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score

In [2]:
train_features = pd.read_csv('training_set_features.csv')
test_features = pd.read_csv('test_set_features.csv')
train_labels = pd.read_csv('training_set_labels.csv')

In [3]:
train_data = pd.merge(train_features, train_labels, on='respondent_id')

In [4]:
target_columns = ['xyz_vaccine', 'seasonal_vaccine']
X = train_data.drop(columns=['respondent_id'] + target_columns)
y = train_data[target_columns]

In [5]:
categorical_cols=train_data.select_dtypes(include=['object']).columns
ordinal_cols = ['xyz_concern', 'xyz_knowledge', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk',
                'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective', 'opinion_seas_risk',
                'opinion_seas_sick_from_vacc']
binary_cols = ['behavioral_antiviral_meds','behavioral_avoidance','behavioral_face_mask','behavioral_wash_hands',
             'behavioral_large_gatherings','behavioral_outside_home','behavioral_touch_face','doctor_recc_xyz',
             'doctor_recc_seasonal','chronic_med_condition','child_under_6_months','health_worker','health_insurance']

In [6]:
transform = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='median'), binary_cols + ordinal_cols),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_cols)
    ])

In [7]:
gaussian_nb = GaussianNB()

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
y_train1 = y_train['xyz_vaccine']
y_train2 = y_train['seasonal_vaccine']

In [10]:
y_test1 = y_test['xyz_vaccine']
y_test2 = y_test['seasonal_vaccine']

In [15]:
pipeline_gaussian = Pipeline(steps=[
    ('transform', transform),
    ('scaler', StandardScaler()),
    ('classifier', gaussian_nb)
])

In [16]:
pipeline_gaussian.fit(X_train, y_train1)
pipeline_gaussian.fit(X_train, y_train2)

In [17]:
gaussian_y_xyz = pipeline_gaussian.predict(X_test)
gaussian_y_seasonal = pipeline_gaussian.predict(X_test)

In [18]:
gaussian_roc_auc_xyz = roc_auc_score(y_test1, gaussian_y_xyz)
gaussian_roc_auc_seasonal = roc_auc_score(y_test2, gaussian_y_seasonal)
mean_roc_auc_gaussian = (gaussian_roc_auc_xyz + gaussian_roc_auc_seasonal) / 2

In [19]:
print(f"ROC AUC for xyz_vaccine: {gaussian_roc_auc_xyz}")
print(f"ROC AUC for seasonal_vaccine: {gaussian_roc_auc_seasonal}")
print(f"Mean ROC AUC: {mean_roc_auc_gaussian}")

ROC AUC for xyz_vaccine: 0.5970116985603712
ROC AUC for seasonal_vaccine: 0.6592488456910055
Mean ROC AUC: 0.6281302721256883


In [20]:
for_pred = test_features.drop(columns=['respondent_id'])

In [22]:
xyz_pred = pipeline_gaussian.predict_proba(for_pred)[:,0]
seasonal_pred = pipeline_gaussian.predict_proba(for_pred)[:,1]

In [23]:
submission2 = pd.DataFrame({
    'respondent_id': test_features['respondent_id'],
    'xyz_vaccine': xyz_pred,
    'seasonal_vaccine': seasonal_pred
})

In [26]:
submission2.to_csv('submission2.csv', index=False)