In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score

In [2]:
# Read both training CSVs, then join them row-wise on the shared respondent_id key
train_x, train_y = (pd.read_csv(csv_path) for csv_path in ('train_x.csv', 'train_y.csv'))

data = train_x.merge(train_y, on='respondent_id')

In [3]:
# Separate the predictor columns (X) from the two target columns (y).
# respondent_id is only the join key, so it is excluded from the predictors.
target_columns = ['xyz_vaccine', 'seasonal_vaccine']
X = data.drop(columns=['respondent_id', *target_columns])
y = data[target_columns]


In [4]:
# Preprocessing steps
# Columns that are one-hot encoded; every other column goes through the
# `remainder` transformer below.
categorical_features = ['age_group','education','race','sex','income_poverty','marital_status','rent_or_own','employment_status', 
                        'hhs_geo_region','census_msa','employment_industry','employment_occupation']

# Categorical branch: fill missing entries with an explicit 'missing' label so
# they get their own indicator column, then one-hot encode.
# handle_unknown='ignore' makes categories unseen during fit encode as all zeros
# instead of raising at predict time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Remaining columns were previously passed through untouched, so any NaN in them
# reached the classifier. They are now median-imputed instead.
# NOTE(review): assumes every non-categorical column is numeric - confirm against the data.
preprocessor = ColumnTransformer(
    transformers=[('cat', categorical_transformer, categorical_features)],
    remainder=SimpleImputer(strategy='median'),
)


In [5]:
# Full model: preprocessing followed by a multi-output wrapper around a
# seeded random forest (the wrapper fits one forest per target column).
forest = RandomForestClassifier(random_state=42)
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', MultiOutputClassifier(forest)),
])


In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [None]:
# Fit the preprocessing + classifier pipeline on the training portion
model.fit(X=X_train, y=y_train)


In [None]:
# Class probabilities for the held-out rows; the result is indexed per target
# column below ([0] for xyz_vaccine, [1] for seasonal_vaccine)
y_pred_proba = model.predict_proba(X=X_test)


In [None]:
# For each target, keep only column 1 of its probability array
# (the probability assigned to the second class)
y_pred_proba_xyz = list(y_pred_proba[0][:, 1])
y_pred_proba_seasonal = list(y_pred_proba[1][:, 1])


In [None]:
# ROC (receiver operating characteristic) AUC (area under the curve) for each
# target on the held-out rows, plus the unweighted mean of the two scores
roc_auc_xyz = roc_auc_score(y_true=y_test['xyz_vaccine'], y_score=y_pred_proba_xyz)
roc_auc_seasonal = roc_auc_score(y_true=y_test['seasonal_vaccine'], y_score=y_pred_proba_seasonal)
mean_roc_auc = 0.5 * (roc_auc_xyz + roc_auc_seasonal)


In [None]:
# Report the per-target scores and their mean, one per line
print(
    f'ROC AUC for xyz_vaccine: {roc_auc_xyz}',
    f'ROC AUC for seasonal_vaccine: {roc_auc_seasonal}',
    f'Mean ROC AUC: {mean_roc_auc}',
    sep='\n',
)


In [None]:
# Load the test file and predict class probabilities for it.
# respondent_id is dropped first because it was not among the training features.
test_x = pd.read_csv('test_x.csv')
test_features = test_x.drop(columns=['respondent_id'])
test_pred_proba = model.predict_proba(test_features)


In [None]:
# One row per test respondent: the id plus column 1 of each target's
# probability array, written out without the DataFrame index
xyz_scores = test_pred_proba[0][:, 1]
seasonal_scores = test_pred_proba[1][:, 1]

submission = pd.DataFrame({
    'respondent_id': test_x['respondent_id'],
    'xyz_vaccine': xyz_scores,
    'seasonal_vaccine': seasonal_scores,
})

submission.to_csv('submission.csv', index=False)
