In [9]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

In [10]:
# Read the three CSV inputs: training features, training labels, and the
# test features that predictions will be produced for.
train_features = pd.read_csv('training_set_features.csv')
train_labels = pd.read_csv('training_set_labels.csv')
test_features = pd.read_csv('test_set_features.csv')

# respondent_id is dropped so that it is not used as a model input.
X_train = train_features.drop(columns='respondent_id')
X_test = test_features.drop(columns='respondent_id')

# One target column per vaccine; each is modelled separately.
# NOTE(review): features and labels are paired by row position only --
# assumes both training files share the same row order; confirm.
y_train_xyz = train_labels['xyz_vaccine']
y_train_seasonal = train_labels['seasonal_vaccine']

In [11]:
# Hold on to an independent copy of the test-set respondent IDs so they can
# be attached to the predictions when the submission is built.
test_respondent_ids = test_features.loc[:, 'respondent_id'].copy()

In [12]:
# Split the feature columns by dtype: object-typed columns are treated as
# categorical, every other column as numeric.
categorical_cols = X_train.select_dtypes(include='object').columns.to_list()
numeric_cols = X_train.select_dtypes(exclude='object').columns.to_list()

In [13]:
# Preprocessing for numeric features:
#   1. fill missing values with the column median,
#   2. standardize with StandardScaler.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

In [14]:
# Preprocessing for categorical features:
#   1. fill missing values with the most frequent category,
#   2. one-hot encode; handle_unknown='ignore' keeps transform from raising
#      on categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [15]:
# Route each group of columns through its own preprocessing pipeline:
# numeric columns through numeric_transformer, categorical columns through
# categorical_transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [16]:
# Fit the preprocessor on the training features, then apply the fitted
# transformation to the test features.
#
# The raw inputs are re-derived from train_features / test_features instead
# of being read from X_train / X_test. This cell rebinds X_train / X_test to
# the transformed matrices, so reading them back would make a second run of
# the cell pass already-transformed data to a ColumnTransformer that selects
# columns by name. Deriving the inputs from the untouched frames keeps the
# cell idempotent while producing the same result on the first run.
X_train = preprocessor.fit_transform(train_features.drop(columns='respondent_id'))
X_test = preprocessor.transform(test_features.drop(columns='respondent_id'))

In [0]:
# Linear-kernel SVM for the xyz_vaccine target. probability=True is needed
# so that predict_proba is available after fitting.
svm_xyz = SVC(
    C=1.0,
    kernel='linear',
    probability=True,
    random_state=1,
)
svm_xyz.fit(X=X_train, y=y_train_xyz)

In [0]:
# Linear-kernel SVM for the seasonal_vaccine target. probability=True is
# needed so that predict_proba is available after fitting.
svm_seasonal = SVC(
    C=1.0,
    kernel='linear',
    probability=True,
    random_state=1,
)
svm_seasonal.fit(X=X_train, y=y_train_seasonal)

In [0]:
# Score the test set with both fitted models, keeping column index 1 of each
# predict_proba output.
# NOTE(review): column 1 is presumably the positive class -- this assumes
# each model's classes_ is ordered [0, 1]; confirm against the label values.
y_pred_xyz, y_pred_seasonal = (
    model.predict_proba(X_test)[:, 1]
    for model in (svm_xyz, svm_seasonal)
)



In [0]:
# Assemble the submission table -- respondent IDs plus one predicted
# probability column per target -- and write it to CSV without the index.
submission = pd.DataFrame(
    {
        'respondent_id': test_respondent_ids,
        'xyz_vaccine': y_pred_xyz,
        'seasonal_vaccine': y_pred_seasonal,
    }
)
submission.to_csv(path_or_buf='submission_format.csv', index=False)