In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_auc_score, make_scorer

In [2]:
train_features = pd.read_csv('training_set_features.csv')
test_features = pd.read_csv('test_set_features.csv')
train_labels = pd.read_csv('training_set_labels.csv')

In [3]:
train_data = pd.merge(train_features, train_labels, on='respondent_id')

In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 38 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26707 non-null  int64  
 1   xyz_concern                  26615 non-null  float64
 2   xyz_knowledge                26591 non-null  float64
 3   behavioral_antiviral_meds    26636 non-null  float64
 4   behavioral_avoidance         26499 non-null  float64
 5   behavioral_face_mask         26688 non-null  float64
 6   behavioral_wash_hands        26665 non-null  float64
 7   behavioral_large_gatherings  26620 non-null  float64
 8   behavioral_outside_home      26625 non-null  float64
 9   behavioral_touch_face        26579 non-null  float64
 10  doctor_recc_xyz              24547 non-null  float64
 11  doctor_recc_seasonal         24547 non-null  float64
 12  chronic_med_condition        25736 non-null  float64
 13  child_under_6_mo

In [5]:
X = train_data.drop(columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])
y = train_data[['xyz_vaccine', 'seasonal_vaccine']]

In [6]:
categorical_columns = X.select_dtypes(include=['object']).columns.tolist()
numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

In [7]:
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [8]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns)
    ])

In [9]:
X_preprocessed = preprocessor.fit_transform(X)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

In [21]:
svm_model = SVC(probability=True, kernel='linear', random_state=42)
multi_output_svm = MultiOutputClassifier(svm_model, n_jobs=-1)
multi_output_svm.fit(X_train, y_train)

In [25]:
y_label_pred = multi_output_svm.predict(X_test)
y_xyz = y_label_pred[:,0]
y_seasonal = y_label_pred[:,1]

In [26]:
roc_auc_xyz = roc_auc_score(y_test['xyz_vaccine'], y_xyz)
roc_auc_seasonal = roc_auc_score(y_test['seasonal_vaccine'], y_seasonal)
mean_roc_auc = (roc_auc_xyz + roc_auc_seasonal) / 2

In [27]:
print(f"ROC AUC for xyz_vaccine: {roc_auc_xyz}")
print(f"ROC AUC for seasonal_vaccine: {roc_auc_seasonal}")
print(f"Mean ROC AUC: {mean_roc_auc}")

ROC AUC for xyz_vaccine: 0.6895435712544857
ROC AUC for seasonal_vaccine: 0.7807174194284067
Mean ROC AUC: 0.7351304953414461


In [49]:
for_pred = test_features.drop(columns=['respondent_id'])

In [50]:
test_preprocessed = preprocessor.fit_transform(for_pred)

In [51]:
y_pred_prob = multi_output_svm.predict_proba(test_preprocessed)

In [54]:
y_pred_prob

[array([[0.93539841, 0.06460159],
        [0.98222372, 0.01777628],
        [0.64314701, 0.35685299],
        ...,
        [0.82630772, 0.17369228],
        [0.93778033, 0.06221967],
        [0.39231363, 0.60768637]]),
 array([[0.70965636, 0.29034364],
        [0.95314356, 0.04685644],
        [0.49408283, 0.50591717],
        ...,
        [0.79278791, 0.20721209],
        [0.6470033 , 0.3529967 ],
        [0.5       , 0.5       ]])]

In [66]:
xyz_pred = y_pred_prob[0][:,0]
seasonal_pred = y_pred_prob[1][:,1]

In [67]:
submission5 = pd.DataFrame({
    'respondent_id': test_features['respondent_id'],
    'xyz_vaccine': xyz_pred,
    'seasonal_vaccine': seasonal_pred
})

In [68]:
submission5.to_csv('submission5.csv', index=False)