In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Read the training features and the matching labels, in that order
train_features, train_labels = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/training_set_features.csv',
        '/content/training_set_labels.csv',
    )
)

In [4]:
# Join each respondent's features with their labels; only respondent_id values
# present in both frames are kept (inner join, the pandas default)
train_data = train_features.merge(train_labels, how='inner', on='respondent_id')


In [7]:
# List the merged frame's columns: the id, the features and both target columns
train_data.columns

Index(['respondent_id', 'xyz_concern', 'xyz_knowledge',
       'behavioral_antiviral_meds', 'behavioral_avoidance',
       'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_xyz', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'health_insurance', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk',
       'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group',
       'education', 'race', 'sex', 'income_poverty', 'marital_status',
       'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa',
       'household_adults', 'household_children', 'employment_industry',
       'employment_occupation', 'xyz_vaccine', 'seasonal_vaccine'],
      dtype='object')

In [9]:
# Median-impute the numeric columns only; non-numeric columns are handled separately
numeric_columns = train_data.select_dtypes(include=['number']).columns
numeric_medians = train_data[numeric_columns].median()
train_data[numeric_columns] = train_data[numeric_columns].fillna(numeric_medians)


In [10]:
from sklearn.preprocessing import OneHotEncoder

# Columns that are imputed with their mode and then one-hot encoded
categorical_columns = [
    'age_group', 'education', 'race', 'sex', 'income_poverty',
    'marital_status', 'rent_or_own', 'employment_status',
    'hhs_geo_region', 'census_msa', 'employment_industry',
    'employment_occupation',
]

# Replace missing categories with each column's most frequent value
# (.mode() may return several rows on ties; the first row is used)
most_frequent = train_data[categorical_columns].mode().iloc[0]
train_data[categorical_columns] = train_data[categorical_columns].fillna(most_frequent)

# Expand into indicator columns, dropping the first level of every variable
train_data = pd.get_dummies(
    train_data,
    columns=categorical_columns,
    drop_first=True,
)


In [11]:
# Split the frame into predictors (X) and the two vaccine targets (y);
# respondent_id is an identifier, so it is excluded from the predictors
target_columns = ['xyz_vaccine', 'seasonal_vaccine']
X = train_data.drop(columns=['respondent_id'] + target_columns)
y = train_data[target_columns]

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

# Random forest used as the per-target estimator (seeded for reproducibility)
base_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)

# MultiOutputClassifier fits one copy of the base estimator per target column;
# n_jobs=-1 lets those fits run in parallel
model = MultiOutputClassifier(estimator=base_model, n_jobs=-1)

# Fit on the training split (the fitted model is the cell's displayed value)
model.fit(X_train, y_train)


In [14]:
from sklearn.metrics import roc_auc_score

# predict_proba returns one probability array per target, in the column order of y
y_pred_proba = model.predict_proba(X_val)

# Keep only the positive-class column (index 1) for each of the two targets
y_pred_proba_xyz, y_pred_proba_seasonal = (
    target_proba[:, 1] for target_proba in y_pred_proba
)

# Score each target separately, then average the two scores
roc_auc_xyz = roc_auc_score(y_val['xyz_vaccine'], y_pred_proba_xyz)
roc_auc_seasonal = roc_auc_score(y_val['seasonal_vaccine'], y_pred_proba_seasonal)
mean_roc_auc = 0.5 * (roc_auc_xyz + roc_auc_seasonal)

print(f'ROC AUC for xyz vaccine: {roc_auc_xyz}')
print(f'ROC AUC for seasonal vaccine: {roc_auc_seasonal}')
print(f'Mean ROC AUC: {mean_roc_auc}')


ROC AUC for xyz vaccine: 0.8256054131054131
ROC AUC for seasonal vaccine: 0.8505725714138944
Mean ROC AUC: 0.8380889922596537


In [15]:
# Load test set features
# NOTE(review): this path is relative while the other CSVs are read from
# '/content/' - confirm the notebook's working directory before running.
test_data = pd.read_csv('test_set_features.csv')

# Impute with statistics taken from the raw *training* features rather than
# from the test set itself, so test rows go through the same transformation
# the model was fitted on.
# Assumes train_features has the same feature columns as the test file and
# the same rows as the merged training frame - TODO confirm.
numeric_columns_test = test_data.select_dtypes(include=['number']).columns
train_medians = train_features[numeric_columns_test].median()
test_data[numeric_columns_test] = test_data[numeric_columns_test].fillna(train_medians)

# Fill missing categories with the training mode (first row of .mode() on ties)
train_modes = train_features[categorical_columns].mode().iloc[0]
test_data[categorical_columns] = test_data[categorical_columns].fillna(train_modes)

# Pin every categorical column to the sorted levels observed in training.
# get_dummies then emits one column per training level and drop_first removes
# the same baseline level as in training, even if the test set happens to lack
# that level. Levels never seen in training become NaN and therefore encode as
# all zeros (the baseline).
training_levels = {
    col: pd.CategoricalDtype(categories=sorted(train_features[col].dropna().unique()))
    for col in categorical_columns
}
test_data = test_data.astype(training_levels)

# One-hot encode categorical variables
test_data = pd.get_dummies(test_data, columns=categorical_columns, drop_first=True)


In [16]:
# Training columns absent from the encoded test frame (kept for inspection)
missing_cols = set(X.columns).difference(test_data.columns)

# Give the test frame exactly the training columns, in training order:
# absent columns are created filled with 0, extra columns are discarded
test_data = test_data.reindex(columns=X.columns, fill_value=0)

# Predict on the test set: one probability array per target
test_pred_proba = model.predict_proba(test_data)

# Positive-class probability (column index 1) for each of the two targets
test_pred_proba_xyz, test_pred_proba_seasonal = (
    target_proba[:, 1] for target_proba in test_pred_proba
)


In [18]:
# Read the submission template and preview its first five rows
submission_format = pd.read_csv("/content/submission_format.csv")
submission_format.head(n=5)

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
0,26707,0.5,0.7
1,26708,0.5,0.7
2,26709,0.5,0.7
3,26710,0.5,0.7
4,26711,0.5,0.7


In [20]:
# Overwrite the template's placeholder values with the predicted probabilities.
# Values are assigned by position, so the template rows must be in the same
# order as the rows that were predicted on.
for column, probabilities in (
    ('h1n1_vaccine', test_pred_proba_xyz),          # probabilities for xyz_vaccine
    ('seasonal_vaccine', test_pred_proba_seasonal),  # probabilities for seasonal_vaccine
):
    submission_format[column] = probabilities

# Write the submission without the DataFrame index so only the template columns are saved
submission_format.to_csv('submission.csv', index=False)

# Print the first rows as a quick check of the written values
print(submission_format.head())

   respondent_id  h1n1_vaccine  seasonal_vaccine
0          26707          0.22              0.30
1          26708          0.04              0.04
2          26709          0.54              0.76
3          26710          0.55              0.88
4          26711          0.25              0.38
