In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.metrics import roc_auc_score


train = pd.read_csv('https://drive.google.com/uc?export=download&id=1fJ1cg01yPKYumyDh5yRlG_xhiP7vGk9B')
train_labels = pd.read_csv('https://drive.google.com/uc?export=download&id=1ZYBo0SaKikwroJ06eL5puB8ktQvth7md')
test = pd.read_csv('https://drive.google.com/uc?export=download&id=1JsFQB9XFD2q91WIqacx7H_ifeGYhbUNA')

# Merge labels with train data
train = train.merge(train_labels, on='respondent_id')

# Separate features and target variables
X = train.drop(columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])
y_xyz = train['xyz_vaccine'].values
y_s = train['seasonal_vaccine'].values
X_test = test.drop(columns=['respondent_id'])

# Impute missing values with the most frequent value
imputer = SimpleImputer(strategy='most_frequent')
X_i = imputer.fit_transform(X)
X_test_i = imputer.transform(X_test)

# Convert to DataFrame
X_i = pd.DataFrame(X_i, columns=X.columns)
X_test_i = pd.DataFrame(X_test_i, columns=X_test.columns)

# Encode categorical variables
label_encoders = {}
for column in X.columns:
    if X[column].dtype == 'object':
        le = LabelEncoder()
        X_i[column] = le.fit_transform(X_i[column])
        X_test_i[column] = le.transform(X_test_i[column])
        label_encoders[column] = le

# Scale the features
scaler = StandardScaler()
X = scaler.fit_transform(X_i)
X_test = scaler.transform(X_test_i)

# Logistic Regression
lr_xyz = LogisticRegression(max_iter=1000)
lr_xyz.fit(X, y_xyz)
lr_seasonal = LogisticRegression(max_iter=1000)
lr_seasonal.fit(X, y_s)

# Calculate ROC AUC for Logistic Regression
roc_auc_xyz_lr = roc_auc_score(y_xyz, lr_xyz.predict_proba(X)[:, 1])
roc_auc_seasonal_lr = roc_auc_score(y_s, lr_seasonal.predict_proba(X)[:, 1])
print(f"ROC AUC in Logistic Regression: {(roc_auc_xyz_lr + roc_auc_seasonal_lr) / 2:.3f}")

# Predictions for Logistic Regression
xyz_vaccine_lr = lr_xyz.predict_proba(X_test)[:, 1]
seasonal_vaccine_lr = lr_seasonal.predict_proba(X_test)[:, 1]

# Gaussian Naive Bayes
gnb_xyz = GaussianNB()
gnb_seasonal = GaussianNB()

gnb_xyz.fit(X, y_xyz)
gnb_seasonal.fit(X, y_s)

# Calculate ROC AUC for Gaussian Naive Bayes
roc_auc_xyz_gnb = roc_auc_score(y_xyz, gnb_xyz.predict_proba(X)[:, 1])
roc_auc_seasonal_gnb = roc_auc_score(y_s, gnb_seasonal.predict_proba(X)[:, 1])
print(f"ROC AUC in Gaussian Naive Bayes: {(roc_auc_xyz_gnb + roc_auc_seasonal_gnb) / 2:.3f}")

# Predictions for Gaussian Naive Bayes
xyz_vaccine_gnb = gnb_xyz.predict_proba(X_test)[:, 1]
seasonal_vaccine_gnb = gnb_seasonal.predict_proba(X_test)[:, 1]

# Support Vector Machine
svm_xyz = svm.SVC(probability=True, kernel='rbf')
svm_seasonal = svm.SVC(probability=True, kernel='rbf')

svm_xyz.fit(X, y_xyz)
svm_seasonal.fit(X, y_s)

# Calculate ROC AUC for SVM
roc_auc_xyz_svm = roc_auc_score(y_xyz, svm_xyz.predict_proba(X)[:, 1])
roc_auc_seasonal_svm = roc_auc_score(y_s, svm_seasonal.predict_proba(X)[:, 1])
print(f"ROC AUC in SVM: {(roc_auc_xyz_svm + roc_auc_seasonal_svm) / 2:.3f}")

# Predictions for SVM
xyz_vaccine_svm = svm_xyz.predict_proba(X_test)[:, 1]
seasonal_vaccine_svm = svm_seasonal.predict_proba(X_test)[:, 1]

# Prepare submission
test_ids = test['respondent_id']

submission_lr = pd.DataFrame({
    'respondent_id': test_ids,
    'xyz_vaccine': xyz_vaccine_lr,
    'seasonal_vaccine': seasonal_vaccine_lr
})
submission_lr.to_csv('submission_lr.csv', index=False)

submission_gnb = pd.DataFrame({
    'respondent_id': test_ids,
    'xyz_vaccine': xyz_vaccine_gnb,
    'seasonal_vaccine': seasonal_vaccine_gnb
})
submission_gnb.to_csv('submission_gnb.csv', index=False)

submission_svm = pd.DataFrame({
    'respondent_id': test_ids,
    'xyz_vaccine': xyz_vaccine_svm,
    'seasonal_vaccine': seasonal_vaccine_svm
})
submission_svm.to_csv('submission_svm.csv', index=False)


ROC AUC in Logistic Regression: 0.840
ROC AUC in Gaussian Naive Bayes: 0.793
ROC AUC in SVM: 0.904
