In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [5]:
import pandas as pd


train_features = pd.read_csv('/content/drive/MyDrive/dataset/training_set_features.csv')
train_labels = pd.read_csv('/content/drive/MyDrive/dataset/training_set_labels.csv')
test_features = pd.read_csv('/content/drive/MyDrive/dataset/training_set_features.csv')


train_features.head()

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb


In [6]:
train_labels.head()

Unnamed: 0,respondent_id,xyz_vaccine,seasonal_vaccine
0,0,0,0
1,1,0,1
2,2,0,0
3,3,0,1
4,4,0,0


In [7]:
test_features.head()

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb


In [8]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [10]:
binary_col = ['behavioral_antiviral_meds', 'behavioral_avoidance', 'behavioral_face_mask',
               'behavioral_wash_hands', 'behavioral_large_gatherings', 'behavioral_outside_home',
               'behavioral_touch_face', 'doctor_recc_xyz', 'doctor_recc_seasonal',
               'chronic_med_condition', 'child_under_6_months', 'health_worker', 'health_insurance']
ordinal_col = ['xyz_concern', 'xyz_knowledge', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk',
                'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective', 'opinion_seas_risk',
                'opinion_seas_sick_from_vacc']

nominal_col = ['age_group', 'education', 'race', 'sex', 'income_poverty', 'marital_status',
                'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa',
                'employment_industry', 'employment_occupation']

In [11]:
binary_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))
])

ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))
])

nominal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('bin', binary_transformer, binary_col),
        ('ord', ordinal_transformer, ordinal_col),
        ('nom', nominal_transformer, nominal_col)
    ])


In [12]:
X_train = train_features.drop(columns=['respondent_id'])
y_train = train_labels[['xyz_vaccine', 'seasonal_vaccine']]
X_test = test_features.drop(columns=['respondent_id'])

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


In [17]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

base_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

model = MultiOutputClassifier(base_classifier)

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', model)])

clf.fit(X_train, y_train)


In [20]:
from sklearn.metrics import roc_auc_score

pred_proba = clf.predict_proba(X_val)

prob_xyz_vac = pred_proba[0][:, 1]
prob_seas_vac = pred_proba[1][:, 1]

ra_xyz_vac = roc_auc_score(y_val['xyz_vaccine'], prob_xyz_vac)
ra_seas_vac = roc_auc_score(y_val['seasonal_vaccine'], prob_seas_vac)

mean_ra = (ra_xyz_vac + ra_seas_vac) / 2

print(f'Validation ROC AUC for xyz_vaccine: {ra_xyz_vac}')
print(f'Validation ROC AUC for seasonal_vaccine: {ra_seas_vac}')
print(f'Mean ROC AUC: {mean_ra}')


Validation ROC AUC for xyz_vaccine: 0.8266526737765675
Validation ROC AUC for seasonal_vaccine: 0.8507858841314672
Mean ROC AUC: 0.8387192789540173


In [21]:
t_prob = clf.predict_proba(X_test)

t_prob_xyz_vac = t_prob[0][:, 1]
t_prob_seas_vac = t_prob[1][:, 1]

submission = pd.DataFrame({
    'respondent_id': test_features['respondent_id'],
    'xyz_vaccine': t_prob_xyz_vac,
    'seasonal_vaccine': t_prob_seas_vac
})

submission.to_csv('submission.csv', index=False)
