In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.naive_bayes import GaussianNB

In [2]:
# Read the training features, the training labels and the test features
# from CSV files in the current working directory.
features_path, labels_path, holdout_path = (
    'training_set_features.csv',
    'training_set_labels.csv',
    'test_set_features.csv',
)
train_features = pd.read_csv(features_path)
train_labels = pd.read_csv(labels_path)
test_features = pd.read_csv(holdout_path)

In [3]:
# Line the label rows up with the feature rows while the id column still exists.
# The targets are later paired with the features purely by row position, so if the
# two CSVs were ordered differently the models would silently train on the wrong
# labels. A left merge keeps the row order of train_features; `validate` raises if
# the labels file carries more than one row per respondent. A respondent without a
# label ends up with NaN targets, which makes the later model fit fail loudly
# instead of training on misaligned rows.
# NOTE(review): assumes the labels file also has a 'respondent_id' column; when it
# does not, the original positional pairing is kept unchanged.
if 'respondent_id' in train_labels.columns:
    train_labels = train_features[['respondent_id']].merge(
        train_labels,
        on='respondent_id',
        how='left',
        validate='many_to_one',
    )

# Drop respondent_id since it's not a feature; the test ids are kept aside
# because they are needed again when the predictions file is written.
train_features = train_features.drop(columns=['respondent_id'])
test_respondent_id = test_features['respondent_id']
test_features = test_features.drop(columns=['respondent_id'])

# Some values are missing. Numeric columns are filled with the column mean and
# text (object) columns with the most frequent value.
num_cols = train_features.select_dtypes(include=['int64', 'float64']).columns
cat_cols = train_features.select_dtypes(include=['object']).columns

num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Numeric columns: the fill values are learned from the training frame only
# and the same values are then applied to the test frame.
train_features[num_cols] = num_imputer.fit_transform(train_features[num_cols])
test_features[num_cols] = num_imputer.transform(test_features[num_cols])

# Categorical columns: same fit-on-train / apply-to-test pattern.
train_features[cat_cols] = cat_imputer.fit_transform(train_features[cat_cols])
test_features[cat_cols] = cat_imputer.transform(test_features[cat_cols])

# Encoding categorical features as integer codes, one LabelEncoder per column.
#
# Each encoder is fitted on the values found in BOTH frames. LabelEncoder.transform
# raises ValueError for a label it was not fitted on, so fitting on the training
# column alone would crash as soon as the test set contains a category that never
# appears in training. LabelEncoder keeps its classes sorted, so whenever the test
# set adds no new category the codes are exactly those of a train-only fit.
label_encoders = {}
for column in cat_cols:
    # Cast to str so every value in the column is encoded as a string label.
    train_values = train_features[column].astype(str)
    test_values = test_features[column].astype(str)

    le = LabelEncoder()
    le.fit(pd.concat([train_values, test_values], ignore_index=True))

    train_features[column] = le.transform(train_values)
    test_features[column] = le.transform(test_values)
    # Keep the fitted encoder so codes can be mapped back to labels later.
    label_encoders[column] = le

# Pull the two target columns out of the labels frame, one Series per vaccine.
y_xyz, y_seasonal = (
    train_labels['xyz_vaccine'],
    train_labels['seasonal_vaccine'],
)

In [4]:
# One independent Gaussian Naive Bayes classifier per target; both are trained
# on the same preprocessed feature frame.
model_xyz, model_seasonal = GaussianNB(), GaussianNB()

# xyz vaccine model
model_xyz.fit(train_features, y_xyz)

# seasonal vaccine model
model_seasonal.fit(train_features, y_seasonal)

In [5]:
# predict_proba returns one column per class; column index 1 is kept as the
# score for each vaccine.
# NOTE(review): presumably index 1 is the positive class (labels 0/1) - confirm
# against model.classes_.
xyz_class_probs = model_xyz.predict_proba(test_features)
pred_prob_xyz = xyz_class_probs[:, 1]

seasonal_class_probs = model_seasonal.predict_proba(test_features)
pred_prob_seasonal = seasonal_class_probs[:, 1]

In [6]:
# Assemble the output table: the test respondent ids next to the predicted
# probability for each vaccine.
prediction_columns = {
    'respondent_id': test_respondent_id,
    'xyz_vaccine': pred_prob_xyz,
    'seasonal_vaccine': pred_prob_seasonal,
}
predictions = pd.DataFrame(prediction_columns)

# Write the table to CSV without the DataFrame index column.
output_path = 'test_set_labels.csv'
predictions.to_csv(output_path, index=False)