In [1]:
import pandas as pd

# Load datasets
train_features = pd.read_csv('Data/training_set_features.csv')
test_features = pd.read_csv('Data/test_set_features.csv')
train_labels = pd.read_csv('Data/training_set_labels.csv')

In [2]:
print("Train Features:")
print(train_features.head())

print("\nTest Features:")
print(test_features.head())

print("\nTrain Labels:")
print(train_labels.head())


Train Features:
   respondent_id  xyz_concern  xyz_knowledge  behavioral_antiviral_meds  \
0              0          1.0            0.0                        0.0   
1              1          3.0            2.0                        0.0   
2              2          1.0            1.0                        0.0   
3              3          1.0            1.0                        0.0   
4              4          2.0            1.0                        0.0   

   behavioral_avoidance  behavioral_face_mask  behavioral_wash_hands  \
0                   0.0                   0.0                    0.0   
1                   1.0                   0.0                    1.0   
2                   1.0                   0.0                    0.0   
3                   1.0                   0.0                    1.0   
4                   1.0                   0.0                    1.0   

   behavioral_large_gatherings  behavioral_outside_home  \
0                          0.0           

In [3]:
print("Missing values in Train Features:")
print(train_features.isnull().sum())

print("\nMissing values in Train Labels:")
print(train_labels.isnull().sum())

# Check for missing values in test features
print("\nMissing values in Test Features:")
print(test_features.isnull().sum())

Missing values in Train Features:
respondent_id                      0
xyz_concern                       92
xyz_knowledge                    116
behavioral_antiviral_meds         71
behavioral_avoidance             208
behavioral_face_mask              19
behavioral_wash_hands             42
behavioral_large_gatherings       87
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_xyz                 2160
doctor_recc_seasonal            2160
chronic_med_condition            971
child_under_6_months             820
health_worker                    804
health_insurance               12274
opinion_xyz_vacc_effective       391
opinion_xyz_risk                 388
opinion_xyz_sick_from_vacc       395
opinion_seas_vacc_effective      462
opinion_seas_risk                514
opinion_seas_sick_from_vacc      537
age_group                          0
education                       1407
race                               0
sex                                0
inco

In [4]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [5]:
categorical_cols = train_features.select_dtypes(include=['object']).columns
numerical_cols = train_features.select_dtypes(include=['int64', 'float64']).columns

In [6]:
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [7]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])


In [8]:
X_train_processed = preprocessor.fit_transform(train_features)
X_test_processed = preprocessor.transform(test_features)

In [9]:
y_train_xyz = train_labels['xyz_vaccine']
y_train_seasonal = train_labels['seasonal_vaccine']

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [11]:
X_train_xyz, X_val_xyz, y_train_xyz, y_val_xyz = train_test_split(X_train_processed, y_train_xyz, test_size=0.2, random_state=42)
X_train_seasonal, X_val_seasonal, y_train_seasonal, y_val_seasonal = train_test_split(X_train_processed, y_train_seasonal, test_size=0.2, random_state=42)

model_xyz = RandomForestClassifier(random_state=42)
model_xyz.fit(X_train_xyz, y_train_xyz)
model_seasonal = RandomForestClassifier(random_state=42)
model_seasonal.fit(X_train_seasonal, y_train_seasonal)

y_pred_xyz = model_xyz.predict_proba(X_val_xyz)[:, 1]
y_pred_seasonal = model_seasonal.predict_proba(X_val_seasonal)[:, 1]
roc_auc_xyz = roc_auc_score(y_val_xyz, y_pred_xyz)
roc_auc_seasonal = roc_auc_score(y_val_seasonal, y_pred_seasonal)
mean_roc_auc = (roc_auc_xyz + roc_auc_seasonal) / 2

print(f'ROC AUC Score for xyz_vaccine: {roc_auc_xyz}')
print(f'ROC AUC Score for seasonal_vaccine: {roc_auc_seasonal}')
print(f'Mean ROC AUC Score: {mean_roc_auc}')

ROC AUC Score for xyz_vaccine: 0.8634221440637371
ROC AUC Score for seasonal_vaccine: 0.8547197855554478
Mean ROC AUC Score: 0.8590709648095924


In [12]:
test_pred_xyz = model_xyz.predict_proba(X_test_processed)[:, 1]
test_pred_seasonal = model_seasonal.predict_proba(X_test_processed)[:, 1]
submission = pd.DataFrame({
    'respondent_id': test_features['respondent_id'],
    'xyz_vaccine': test_pred_xyz,
    'seasonal_vaccine': test_pred_seasonal
})
submission.to_csv('submission.csv', index=False)