In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score


In [4]:
# Read the three input CSVs in a fixed order: training features, training
# labels, test features. Paths are relative to the current working directory.
train_features, train_labels, test_features = map(
    pd.read_csv,
    (
        "Downloads/dataset and all/training_set_features.csv",
        "Downloads/dataset and all/training_set_labels.csv",
        "Downloads/dataset and all/test_set_features.csv",
    ),
)


In [6]:
# Attach the label columns to the feature rows, matching on respondent_id
# (pandas' default inner join).
train_df = train_features.merge(train_labels, on='respondent_id')


In [8]:
# Pull out the two target columns before the feature preprocessing below.
y_xyz, y_seasonal = train_df['xyz_vaccine'], train_df['seasonal_vaccine']


In [10]:
# Remove the identifier from both frames so it is not fed to the models.
# The test-set ids are kept aside because the submission frame is keyed on them.
train_df = train_df.drop(columns='respondent_id')
test_ids = test_features.pop('respondent_id')  # pop() removes the column in place


In [12]:
# Split the columns by dtype: object columns are treated as categorical,
# everything else as numeric.
cat_cols = list(train_df.select_dtypes(include='object').columns)
num_cols = list(train_df.select_dtypes(exclude='object').columns)

# The two targets are still columns of train_df and are non-object, so take
# them out of the numeric feature list.
for label_col in ('xyz_vaccine', 'seasonal_vaccine'):
    num_cols.remove(label_col)


In [14]:
# Missing numeric values are replaced by the column mean, missing categorical
# values by the most frequent category.
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Each imputer learns its fill values from the training frame only and is then
# applied to both the training and the test frame.
for imputer, cols in ((num_imputer, num_cols), (cat_imputer, cat_cols)):
    imputer.fit(train_df[cols])
    train_df[cols] = imputer.transform(train_df[cols])
    test_features[cols] = imputer.transform(test_features[cols])


In [16]:
# Integer-encode every categorical column.
#
# A separate encoder is kept per column in `label_encoders`, so the codes can
# be mapped back to category values later via `label_encoders[col].classes_`
# (a single shared encoder would only remember the last column it was fitted on).
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    # Learn the vocabulary from train and test together: LabelEncoder.transform
    # raises ValueError for a value it did not see during fit, so a category
    # present only in the test set would otherwise abort the notebook. When the
    # test set contains no unseen category the resulting codes are identical
    # to fitting on the training column alone.
    le.fit(pd.concat([train_df[col], test_features[col]], ignore_index=True))
    train_df[col] = le.transform(train_df[col])
    test_features[col] = le.transform(test_features[col])
    label_encoders[col] = le


In [18]:
# Training feature matrix: every column of train_df except the two targets.
is_target = train_df.columns.isin(['xyz_vaccine', 'seasonal_vaccine'])
X = train_df.loc[:, ~is_target]

# The test frame is used as-is (this is an alias, not a copy).
X_test = test_features


In [20]:
# Standardize the features using statistics learned from the training matrix,
# then apply the same transformation to the test matrix.
# NOTE(review): the scaler is fitted on all of X, including the rows that are
# later held out for validation — confirm this is acceptable for the reported AUCs.
scaler = StandardScaler().fit(X)
X_scaled, X_test_scaled = scaler.transform(X), scaler.transform(X_test)


In [22]:
# Hold out 20% of the rows for validation. Splitting the features and both
# targets in one call applies the same shuffled row selection to all three.
(
    X_train, X_val,
    y_train_xyz, y_val_xyz,
    y_train_seasonal, y_val_seasonal,
) = train_test_split(X_scaled, y_xyz, y_seasonal, test_size=0.2, random_state=42)


In [24]:
def train_and_evaluate(model, X_train, y_train, X_val, y_val):
    """Fit ``model`` on the training split and return its validation ROC AUC.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and either ``predict_proba(X)`` or
        ``decision_function(X)``. It is fitted in place.
    X_train, y_train
        Features and labels used for fitting.
    X_val, y_val
        Features and labels used for scoring.

    Returns
    -------
    float
        ``roc_auc_score(y_val, scores)`` where ``scores`` is column 1 of
        ``predict_proba`` when the estimator provides it, otherwise the raw
        output of ``decision_function``.

    Raises
    ------
    AttributeError
        If the estimator offers neither ``predict_proba`` nor
        ``decision_function``.
    """
    model.fit(X_train, y_train)

    # ROC AUC only needs a ranking score, so estimators without probability
    # output (e.g. an SVC built without probability=True) can still be scored
    # through their decision function.
    if hasattr(model, 'predict_proba'):
        val_scores = model.predict_proba(X_val)[:, 1]
    else:
        val_scores = model.decision_function(X_val)

    return roc_auc_score(y_val, val_scores)


In [26]:
# Candidate classifiers, keyed by the display name used in the printed report.
models = {
    'SVM': SVC(probability=True, random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Naive Bayes': GaussianNB(),
}

# For every candidate, fit and score on the xyz target first, then refit the
# same estimator object and score on the seasonal target.
results = {}
for model_name, estimator in models.items():
    print(f'Training {model_name}...')
    scores = {
        'auc_xyz': train_and_evaluate(
            estimator, X_train, y_train_xyz, X_val, y_val_xyz
        ),
        'auc_seasonal': train_and_evaluate(
            estimator, X_train, y_train_seasonal, X_val, y_val_seasonal
        ),
    }
    results[model_name] = scores
    print(f'{model_name} - ROC AUC (xyz_vaccine): {scores["auc_xyz"]:.4f}')
    print(f'{model_name} - ROC AUC (seasonal_vaccine): {scores["auc_seasonal"]:.4f}')
    print('')


Training SVM...
SVM - ROC AUC (xyz_vaccine): 0.8361
SVM - ROC AUC (seasonal_vaccine): 0.8547

Training Logistic Regression...
Logistic Regression - ROC AUC (xyz_vaccine): 0.8312
Logistic Regression - ROC AUC (seasonal_vaccine): 0.8509

Training Naive Bayes...
Naive Bayes - ROC AUC (xyz_vaccine): 0.7860
Naive Bayes - ROC AUC (seasonal_vaccine): 0.8056



In [28]:
# Rank candidates by the sum of their two validation AUCs and keep the top one
# (on a tie, the candidate listed first in `results` wins).
combined_auc = {
    model_name: scores['auc_xyz'] + scores['auc_seasonal']
    for model_name, scores in results.items()
}
best_model_name = max(combined_auc, key=combined_auc.get)
best_model = models[best_model_name]


In [37]:
def _refit_and_score_test(target):
    """Refit ``best_model`` on all scaled training rows for ``target`` and
    return column 1 of its ``predict_proba`` output on the scaled test set."""
    best_model.fit(X_scaled, target)
    return best_model.predict_proba(X_test_scaled)[:, 1]


# The same estimator object is reused for both targets, so after this cell
# best_model is left fitted on the seasonal target.
y_pred_xyz = _refit_and_score_test(y_xyz)
y_pred_seasonal = _refit_and_score_test(y_seasonal)


In [33]:
# Assemble the submission table: one row per test respondent with the
# predicted score for each target, in the column order id / xyz / seasonal.
submission = test_ids.to_frame(name='respondent_id').assign(
    xyz_vaccine=y_pred_xyz,
    seasonal_vaccine=y_pred_seasonal,
)

# NOTE(review): the output name looks like it could be a template file that
# ships alongside the dataset in this directory — confirm overwriting it is
# intended.
submission.to_csv('Downloads/dataset and all/submission_format.csv', index=False)
