In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score
import numpy as np

In [3]:
# Dataset locations, given as bare file names (resolved against the working directory)
training_labels_path = 'training_set_labels.csv'
test_features_path = 'test_set_features.csv'
submission_format_path = 'submission_format.csv'

# Read each CSV into its own DataFrame, in the same order as the paths above
training_labels_df, test_features_df, submission_format_df = (
    pd.read_csv(csv_path)
    for csv_path in (training_labels_path, test_features_path, submission_format_path)
)

In [4]:
# Merge training features and labels
# The path is spelled out like the other dataset paths. Deriving it with
# str.replace on the test path would silently hand back the *test* path if the
# substring ever stopped matching, and the test features would be read instead.
train_features_path = 'training_set_features.csv'
train_features_df = pd.read_csv(train_features_path)

# validate="one_to_one" makes pandas raise MergeError if respondent_id is
# duplicated on either side, instead of silently multiplying rows.
train_df = pd.merge(
    train_features_df,
    training_labels_df,
    on="respondent_id",
    how="inner",
    validate="one_to_one",
)

In [5]:
# Model inputs: every training-feature column except the `respondent_id` merge key
feature_cols = train_features_df.drop(columns='respondent_id').columns

In [6]:
# Preprocess the data
# Objects fitted by the most recent ``preprocess(..., is_train=True)`` call.
# Non-training frames are transformed with these, so training and test data
# end up in the same feature space.
_fitted_preprocessing = {}


def preprocess(df, is_train=True):
    """Impute, encode and standardise the ``feature_cols`` columns of ``df``.

    With ``is_train=True`` the imputer, the per-column category vocabularies
    and the scaler are fitted on ``df`` and remembered at module level.
    With ``is_train=False`` those fitted objects are reused unchanged, so the
    frame receives exactly the same fill values, integer codes and scaling as
    the training data. (Re-fitting on each frame, or skipping the scaler for
    non-training frames, would feed the model inputs on a different scale
    from the one it was trained on.)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in the module-level
        ``feature_cols``. It is not modified; a processed copy is returned.
    is_train : bool, default True
        ``True`` to fit the preprocessing on ``df`` and apply it;
        ``False`` to apply the previously fitted preprocessing only.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` whose feature columns are numeric and standardised.
        Columns outside ``feature_cols`` are returned untouched.

    Raises
    ------
    RuntimeError
        If called with ``is_train=False`` before any training frame has been
        processed.
    """
    df = df.copy()
    cols = list(feature_cols)

    if is_train:
        # Decide what is categorical from the raw dtypes, before imputation:
        # for mixed-type input the imputer hands back one object-typed array,
        # after which numeric and text columns can no longer be told apart.
        categorical_cols = list(df[cols].select_dtypes(exclude=['number']).columns)
        imputer = SimpleImputer(strategy='most_frequent')
        imputed = imputer.fit_transform(df[cols])
    else:
        if not _fitted_preprocessing:
            raise RuntimeError(
                "preprocess(..., is_train=False) needs a prior call with "
                "is_train=True to fit the imputer, encoders and scaler."
            )
        categorical_cols = _fitted_preprocessing['categorical_cols']
        imputer = _fitted_preprocessing['imputer']
        imputed = imputer.transform(df[cols])

    features = pd.DataFrame(imputed, columns=cols, index=df.index)

    if is_train:
        # Sorted vocabulary per column: code i is the i-th smallest string.
        categories = {
            col: sorted(features[col].astype(str).unique())
            for col in categorical_cols
        }
    else:
        categories = _fitted_preprocessing['categories']

    for col in categorical_cols:
        # Values that were not seen while fitting get the code -1 rather than
        # raising or being assigned a code that means something else.
        features[col] = pd.Categorical(
            features[col].astype(str), categories=categories[col]
        ).codes

    values = features.astype(float).to_numpy()

    if is_train:
        scaler = StandardScaler()
        scaled = scaler.fit_transform(values)
        _fitted_preprocessing.clear()
        _fitted_preprocessing.update(
            imputer=imputer,
            categorical_cols=categorical_cols,
            categories=categories,
            scaler=scaler,
        )
    else:
        scaled = _fitted_preprocessing['scaler'].transform(values)

    # Assign through a DataFrame so each column is replaced (and takes the
    # float dtype) instead of being written into the old column in place.
    df[cols] = pd.DataFrame(scaled, columns=cols, index=df.index)

    return df

In [7]:
# Preprocess the training data
# The processed frame gets its own name so the raw merged `train_df` stays
# intact and re-running this cell does not preprocess already-processed data.
train_processed_df = preprocess(train_df, is_train=True)
X = train_processed_df[feature_cols]
y = train_processed_df[['xyz_vaccine', 'seasonal_vaccine']]

In [8]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Wrap a seeded random forest in the multi-output classifier and fit it on the
# training split (both label columns at once)
model = MultiOutputClassifier(
    estimator=RandomForestClassifier(random_state=42),
)
model.fit(X_train, y_train)

In [10]:
# Validate the model: predict_proba yields one probability array per target,
# so take the column at index 1 of each and place them side by side
y_pred_proba = np.column_stack(
    [target_proba[:, 1] for target_proba in model.predict_proba(X_valid)]
)

In [11]:
# Macro-averaged ROC AUC of the validation predictions
roc_auc = roc_auc_score(
    y_true=y_valid,
    y_score=y_pred_proba,
    average='macro',
)
print(f'Validation ROC AUC Score: {roc_auc}')

Validation ROC AUC Score: 0.8405018217266975


In [12]:
# Run the test features through preprocess() in non-training mode
test_df = preprocess(df=test_features_df, is_train=False)

In [13]:
# Predict on the test features: one probability array per target, from which
# the column at index 1 is kept and stacked into a single 2-D array
test_pred_proba = np.column_stack(
    [target_proba[:, 1] for target_proba in model.predict_proba(test_df[feature_cols])]
)

In [14]:
# Build the submission: copy the provided format and set the two target
# columns from the stacked test probabilities (column 0, then column 1).
# NOTE(review): assumes the rows of submission_format_df are in the same order
# as the rows of test_features_df; nothing here aligns them by id. Confirm.
submission_df = submission_format_df.assign(
    xyz_vaccine=test_pred_proba[:, 0],
    seasonal_vaccine=test_pred_proba[:, 1],
)
submission_df.to_csv('submission.csv', index=False)

print("Submission file created successfully!")

Submission file created successfully!
