In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score

In [3]:
# Load datasets
training_set_features = pd.read_csv(r"C:\Documents\PRITHWISH\training_set_labels.csv")
#training_set_features = pd.read_csv('dataset/training_set_features.csv')
training_set_labels = pd.read_csv(r"C:\Documents\PRITHWISH\training_set_features.csv")
#training_set_labels = pd.read_csv('dataset/training_set_labels.csv')
test_set_features = pd.read_csv(r"C:\Documents\PRITHWISH\test_set_features.csv")
#test_set_features = pd.read_csv('dataset/test_set_features.csv')
submission_format = pd.read_csv(r"C:\Documents\PRITHWISH\submission_format.csv")
#submission_format = pd.read_csv('dataset/submission_format.csv')

In [5]:
training_set_features.head()

Unnamed: 0,respondent_id,xyz_vaccine,seasonal_vaccine
0,0,0,0
1,1,0,1
2,2,0,0
3,3,0,1
4,4,0,0


In [6]:
training_set_labels.head()

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb


In [7]:
submission_format.head()

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
0,26707,0.5,0.7
1,26708,0.5,0.7
2,26709,0.5,0.7
3,26710,0.5,0.7
4,26711,0.5,0.7


In [8]:
# Merge the training features and labels
df_train = pd.merge(training_set_features, training_set_labels, on='respondent_id')

X = df_train.drop(columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])
y = df_train[['xyz_vaccine', 'seasonal_vaccine']]

# Split the data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Identify categorical and numerical columns
categorical_cols = [cname for cname in X.columns if X[cname].dtype == "object"]
numerical_cols = [cname for cname in X.columns if X[cname].dtype in ['int64', 'float64']]

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [10]:
# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

In [11]:
# Define Naive Bayes models for each target separately (assuming GaussianNB)
nb_xyz = GaussianNB()
nb_seasonal = GaussianNB()

# Pipeline for xyz_vaccine using Naive Bayes
pipeline_xyz = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', nb_xyz)
])

# Pipeline for seasonal_vaccine using Naive Bayes
pipeline_seasonal = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', nb_seasonal)
])

# Split data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the models
pipeline_xyz.fit(X_train, y_train['xyz_vaccine'])
pipeline_seasonal.fit(X_train, y_train['seasonal_vaccine'])

# Predict probabilities on validation set
y_valid_pred_xyz = pipeline_xyz.predict_proba(X_valid)[:, 1]
y_valid_pred_seasonal = pipeline_seasonal.predict_proba(X_valid)[:, 1]

# Calculate ROC AUC for both targets
roc_auc_xyz = roc_auc_score(y_valid['xyz_vaccine'], y_valid_pred_xyz)
roc_auc_seasonal = roc_auc_score(y_valid['seasonal_vaccine'], y_valid_pred_seasonal)

print(f'ROC AUC for xyz_vaccine: {roc_auc_xyz}')
print(f'ROC AUC for seasonal_vaccine: {roc_auc_seasonal}')


ROC AUC for xyz_vaccine: 0.7058655632033213
ROC AUC for seasonal_vaccine: 0.7386638367979186


In [12]:
# Prepare test set for submission
X_test = test_set_features.drop(columns=['respondent_id'])

# Predict probabilities on test set
y_test_pred_xyz = pipeline_xyz.predict_proba(X_test)[:, 1]
y_test_pred_seasonal = pipeline_seasonal.predict_proba(X_test)[:, 1]

# Prepare submission dataframe
df_submission = pd.DataFrame({
    'respondent_id': test_set_features['respondent_id'],
    'xyz_vaccine': y_test_pred_xyz,
    'seasonal_vaccine': y_test_pred_seasonal
})

# Save submission file
#submission_file_path = r"C:\Users\PRITHWISH\Desktop\submission_format.csv"  
submission_file_path = "submission_format.csv"
df_submission.to_csv(submission_file_path, index=False)

print(f'Submission occured Successfully')

Submission occured Successfully
