In [28]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [29]:
# Read the three input tables from the working directory.
train_features = pd.read_csv('training_set_features.csv')
train_labels = pd.read_csv('training_set_labels.csv')
test_features = pd.read_csv('test_set_features.csv')

# Inner-join the two training tables on respondent_id (pandas' default join type),
# so only respondents present in both tables are kept.
train_data = pd.merge(train_features, train_labels, on='respondent_id')

# One label series per target column.
y_xyz, y_seasonal = train_data['xyz_vaccine'], train_data['seasonal_vaccine']

# Feature matrix: the merged table minus the identifier and both target columns.
X = train_data.drop(columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])

In [30]:
# Columns sent through the categorical branch (most-frequent imputation + one-hot).
cat_c = ['age_group', 'education', 'race', 'sex', 'income_poverty', 'marital_status',
         'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa',
         'employment_industry', 'employment_occupation', 'opinion_xyz_sick_from_vacc',
         'opinion_seas_vacc_effective', 'opinion_seas_risk', 'opinion_seas_sick_from_vacc']

# Every other feature column goes through the numerical branch.
# This must test membership against `cat_c`, the list defined just above; any other
# name is undefined on a fresh kernel and breaks Restart & Run All with a NameError.
num_c = [col for col in X.columns if col not in cat_c]

# Numerical branch: fill missing values with the column median, then standardise.
num_t = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown='ignore' keeps transform() from raising on
# categories that were not seen during fit.
cat_t = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Bundle both branches; each one is applied only to its own column list.
preprocessor = ColumnTransformer(transformers=[
    ('num', num_t, num_c),
    ('cat', cat_t, cat_c),
])


In [31]:
# Hold out 20% of the rows. Splitting the features and both targets in a single
# call keeps all three aligned on the same row partition.
X_train, X_test, y_train_xyz, y_test_xyz, y_train_seasonal, y_test_seasonal = train_test_split(
    X, y_xyz, y_seasonal, test_size=0.2, random_state=42
)

# One independently seeded forest per target.
model_xyz = RandomForestClassifier(random_state=42)
model_seasonal = RandomForestClassifier(random_state=42)

# Give each pipeline its own unfitted copy of the preprocessor. Pipeline stores the
# step objects it is given without copying them, so handing the same instance to
# both pipelines would make them share one transformer: fitting the second pipeline
# would refit the object the first pipeline predicts with. clone() returns a new
# unfitted estimator with the same parameters.
pipeline_xyz = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('model', model_xyz),
])
pipeline_seasonal = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('model', model_seasonal),
])

pipeline_xyz.fit(X_train, y_train_xyz)
pipeline_seasonal.fit(X_train, y_train_seasonal)

In [32]:
# Score the held-out rows with both fitted pipelines. Column index 1 of
# predict_proba is the second entry of classes_ (the positive class if the
# labels are 0/1 -- assumed, confirm against the label file).
y_pred_xyz, y_pred_seasonal = (
    fitted_pipeline.predict_proba(X_test)[:, 1]
    for fitted_pipeline in (pipeline_xyz, pipeline_seasonal)
)

# ROC AUC per target, plus the unweighted mean of the two.
roc_auc_xyz = roc_auc_score(y_true=y_test_xyz, y_score=y_pred_xyz)
roc_auc_seasonal = roc_auc_score(y_true=y_test_seasonal, y_score=y_pred_seasonal)
mean_roc_auc = (roc_auc_xyz + roc_auc_seasonal) / 2

# Report all three scores as percentages with two decimals.
print(f'ROC AUC for xyz_vaccine: {roc_auc_xyz:.2%}')
print(f'ROC AUC for seasonal_vaccine: {roc_auc_seasonal:.2%}')
print(f'Mean ROC AUC: {mean_roc_auc:.2%}')

ROC AUC for xyz_vaccine: 82.90%
ROC AUC for seasonal_vaccine: 84.97%
Mean ROC AUC: 83.94%


In [34]:
# Feature matrix for the rows to be scored: the test table without its identifier.
submission_features = test_features.drop(columns=['respondent_id'])

# Second predict_proba column from each fitted pipeline, one per target.
test_predictions_xyz = pipeline_xyz.predict_proba(submission_features)[:, 1]
test_predictions_seasonal = pipeline_seasonal.predict_proba(submission_features)[:, 1]

# Pair each respondent_id with its two predicted probabilities.
submission = pd.DataFrame({
    'respondent_id': test_features['respondent_id'],
    'xyz_vaccine': test_predictions_xyz,
    'seasonal_vaccine': test_predictions_seasonal,
})

# Write the result without the DataFrame index column.
submission.to_csv('submission.csv', index=False)