### Importing required modules

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Loading training and test datasets

In [2]:
# Read the training and test feature tables from the working directory.
train_data, test_data = (
    pd.read_csv(csv_name)
    for csv_name in ('training_set_features.csv', 'test_set_features.csv')
)
# Show the number of missing entries in each training column.
train_data.isna().sum()

respondent_id                      0
xyz_concern                       92
xyz_knowledge                    116
behavioral_antiviral_meds         71
behavioral_avoidance             208
behavioral_face_mask              19
behavioral_wash_hands             42
behavioral_large_gatherings       87
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_xyz                 2160
doctor_recc_seasonal            2160
chronic_med_condition            971
child_under_6_months             820
health_worker                    804
health_insurance               12274
opinion_xyz_vacc_effective       391
opinion_xyz_risk                 388
opinion_xyz_sick_from_vacc       395
opinion_seas_vacc_effective      462
opinion_seas_risk                514
opinion_seas_sick_from_vacc      537
age_group                          0
education                       1407
race                               0
sex                                0
income_poverty                  4423
m

### Importing some more libraries

In [32]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

### Loading the target variables from training dataset

In [4]:
# Columns of the label file that the models below are trained to predict.
target_vars = ['xyz_vaccine', 'seasonal_vaccine']
train_label = pd.read_csv('training_set_labels.csv')
# Keep only the target columns.
# NOTE(review): rows are assumed to be in the same order as training_set_features.csv -- confirm.
train_targets = train_label.loc[:, target_vars]

### Filling up missing values

In [20]:
# Impute missing entries with the most frequent value of each column.
# The fill values are learned from the training set only and then reused for
# the test set, so both frames go through exactly the same preprocessing
# (previously the test set was filled with its own, possibly different, modes).
fill_values = {}
for column in train_data.columns:
    counts = train_data[column].value_counts()
    # value_counts() drops NaN, so an all-NaN column yields an empty result;
    # skip it instead of failing on counts.index[0].
    if not counts.empty:
        fill_values[column] = counts.index[0]

# fillna with a dict fills each column with its own value; keys that are not
# columns of the frame are ignored.
train_data = train_data.fillna(fill_values)
test_data = test_data.fillna(fill_values)
train_data.head()

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,employment_occupation_qxajmpny,employment_occupation_rcertsgn,employment_occupation_tfqavkke,employment_occupation_ukymxvdu,employment_occupation_uqqtjvyb,employment_occupation_vlluhbov,employment_occupation_xgwztkwe,employment_occupation_xqwwgdyp,employment_occupation_xtkaffoo,employment_occupation_xzmlyyjv
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,False,False,False,False,False,False,False,False,True,False
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,False,False,False,False,False,False,True,False,False,False
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,True,False
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,False,False,False,False,False,False,False,False,True,False
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,False,False,False,False,False,False,False,False,False,False


### Converting categorical data into numerical form

In [21]:
# One-hot encode the categorical columns of both feature tables.
train_data = pd.get_dummies(train_data)
# get_dummies is applied to each frame independently, so a category that occurs
# in only one of them would produce different column sets. Force the test frame
# into the training layout: same columns, same order, indicator columns missing
# from the test data filled with False, and columns unknown to training dropped.
test_data = pd.get_dummies(test_data).reindex(columns=train_data.columns, fill_value=False)
test_data.head()

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,employment_occupation_qxajmpny,employment_occupation_rcertsgn,employment_occupation_tfqavkke,employment_occupation_ukymxvdu,employment_occupation_uqqtjvyb,employment_occupation_vlluhbov,employment_occupation_xgwztkwe,employment_occupation_xqwwgdyp,employment_occupation_xtkaffoo,employment_occupation_xzmlyyjv
0,26707,2.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,False,False,False,False,False,False,False,False,False,False
1,26708,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,True,False,False
2,26709,2.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,False,False,False,False,False,False,False,False,False,False
3,26710,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,True,False
4,26711,3.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,False,False,False,False,False,False,False,False,False,False


### Defining variables (features) and targets

In [25]:
# Feature matrix: every training column except the respondent identifier.
X = train_data.drop('respondent_id', axis=1)
# Targets: the two label columns selected earlier.
y = train_targets

### Splitting the data into training and testing sets

In [28]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

### Training the model using Logistic Regression with L1 and L2 regularization

In [31]:
# Hyper-parameter grid shared by both grid searches: penalty type crossed with
# the inverse regularization strength C.
params = dict(
    penalty=['l1', 'l2'],
    C=[0.01, 0.1, 1, 10, 100],
)

### Logistic Regression for xyz_vaccine

In [33]:
# Search the grid with 5-fold cross-validation, ranking candidates by ROC AUC.
xyz = GridSearchCV(
    estimator=LogisticRegression(solver='liblinear', random_state=42),
    param_grid=params,
    scoring='roc_auc',
    cv=5,
)
xyz.fit(X_train, y_train['xyz_vaccine'])
# Take the second predict_proba column as the score for the held-out rows
# (the positive class when the labels are 0/1).
xyz_pred = xyz.predict_proba(X_test)[:, 1]
# ROC AUC of the tuned model on the held-out split.
roc_auc_xyz = roc_auc_score(y_test['xyz_vaccine'], xyz_pred)

### Logistic Regression for seasonal_vaccine

In [37]:
# Search the grid with 5-fold cross-validation, ranking candidates by ROC AUC.
seasonal = GridSearchCV(
    estimator=LogisticRegression(solver='liblinear', random_state=42),
    param_grid=params,
    scoring='roc_auc',
    cv=5,
)
seasonal.fit(X_train, y_train['seasonal_vaccine'])
# Take the second predict_proba column as the score for the held-out rows
# (the positive class when the labels are 0/1).
seasonal_pred = seasonal.predict_proba(X_test)[:, 1]
# ROC AUC of the tuned model on the held-out split.
roc_auc_seasonal = roc_auc_score(y_test['seasonal_vaccine'], seasonal_pred)

### Mean ROC AUC score

In [34]:
# Unweighted average of the two held-out ROC AUC scores.
mean_roc_auc = 0.5 * (roc_auc_xyz + roc_auc_seasonal)

### Displaying ROC AUC scores

In [35]:
# Report the two per-target scores and their mean, one per line.
print(
    f'Best ROC AUC for xyz vaccine: {roc_auc_xyz}',
    f'Best ROC AUC for seasonal vaccine: {roc_auc_seasonal}',
    f'Mean ROC AUC: {mean_roc_auc}',
    sep='\n',
)

Best ROC AUC for xyz vaccine: 0.8316884333845986
Best ROC AUC for seasonal vaccine: 0.850279804472045
Mean ROC AUC: 0.8409841189283218


### Predicting probabilities on the test dataset

In [38]:
# Build the test feature matrix once (identifier column removed) and score it
# with both tuned models, keeping the second predict_proba column of each.
test_features = test_data.drop(columns=['respondent_id'])
xyz_test_pred, seasonal_test_pred = (
    model.predict_proba(test_features)[:, 1] for model in (xyz, seasonal)
)

### Preparing submission file

In [39]:
# Pair each test respondent id with its two predicted probabilities.
submission = test_data[['respondent_id']].assign(
    xyz_vaccine=xyz_test_pred,
    seasonal_vaccine=seasonal_test_pred,
)
# Write the result without the DataFrame index column.
submission.to_csv('submission.csv', index=False)