In [280]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

In [281]:
x_df = pd.read_csv("data/training_set_features.csv")
test_df = pd.read_csv("data/test_set_features.csv")
x_df = x_df.drop(columns="respondent_id")
test_id = test_df["respondent_id"]
test_df = test_df.drop(columns="respondent_id")

In [282]:
x_df.head()

Unnamed: 0,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_xyz,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb


In [283]:
def na2mode(df):
    for col in df.columns:
        df[col].fillna(df[col].mode()[0], inplace=True)

In [284]:
def object2num(df, col):
    mp = {}
    lst = df[col].unique()
    for idx, val in enumerate(lst):
        mp[val] = idx
    df[col] = df[col].map(mp)

In [285]:
def col2num(df):
    col_to_change = []
    for col in df.columns:
        if(df[col].dtype == "object"):
            col_to_change.append(col)

    for col in col_to_change:
        object2num(df, col)

In [286]:
na2mode(x_df)
col2num(x_df)

In [287]:
x_df.shape

(26707, 35)

In [288]:
from sklearn.metrics import roc_auc_score
from tensorflow.keras.callbacks import Callback

class ROCAUCCallback(Callback):
    def on_epoch_end(self, epoch, logs=None):
        # Get the validation data from the validation_data attribute
        val_data = self.validation_data[0]
        val_labels = self.validation_data[1]
        val_predictions = self.model.predict(val_data)
        
        # Calculate ROC AUC score
        roc_auc = roc_auc_score(val_labels, val_predictions)
        
        # Log the ROC AUC score
        print(f' — val_roc_auc: {roc_auc:.4f}')

In [289]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Define the neural network model
def build_model(input_shape):
    model = Sequential()
    model.add(Dense(128, input_shape=input_shape, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(2, activation='sigmoid'))  # Output layer for multilabel classification
    return model


In [290]:
from tensorflow.keras.metrics import AUC
input_shape = (x_df.shape[1],)
model = build_model(input_shape)

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy', AUC(name='auc')])

# Summary of the model
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [291]:
y_df = pd.read_csv("data/training_set_labels.csv")
y_df.head()

Unnamed: 0,respondent_id,xyz_vaccine,seasonal_vaccine
0,0,0,0
1,1,0,1
2,2,0,0
3,3,0,1
4,4,0,0


In [292]:
y_df = y_df.drop(columns="respondent_id")
y_df.shape

(26707, 2)

In [293]:
history = model.fit(x_df, y_df, epochs=500, batch_size=32, validation_split=0.2, verbose=1)

Epoch 1/500
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.3072 - auc: 0.6980 - loss: 0.6071 - val_accuracy: 0.2941 - val_auc: 0.8200 - val_loss: 0.4820
Epoch 2/500
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.3023 - auc: 0.8083 - loss: 0.4954 - val_accuracy: 0.2935 - val_auc: 0.8324 - val_loss: 0.4738
Epoch 3/500
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.3039 - auc: 0.8321 - loss: 0.4694 - val_accuracy: 0.2937 - val_auc: 0.8382 - val_loss: 0.4619
Epoch 4/500
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.2979 - auc: 0.8341 - loss: 0.4678 - val_accuracy: 0.2958 - val_auc: 0.8386 - val_loss: 0.4615
Epoch 5/500
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.3030 - auc: 0.8376 - loss: 0.4645 - val_accuracy: 0.3079 - val_auc: 0.8440 - val_loss: 0.4568
Epoch 6/500
[1m668/668[

In [294]:
na2mode(test_df)
col2num(test_df)

In [295]:
test_df.shape

(26708, 35)

In [296]:
y_pred_probs = model.predict(test_df)
y_pred_probs

[1m835/835[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


array([[0.07843258, 0.43845132],
       [0.00254274, 0.00608393],
       [0.14586149, 0.29229647],
       ...,
       [0.16007785, 0.2276026 ],
       [0.03177316, 0.13447803],
       [0.49400944, 0.69132817]], dtype=float32)

In [297]:
y_pred_probs.shape

(26708, 2)

In [298]:
y_pred = pd.DataFrame(y_pred_probs, columns=['h1n1_vaccine', 'seasonal_vaccine'])

In [299]:
y_pred.head()

Unnamed: 0,h1n1_vaccine,seasonal_vaccine
0,0.078433,0.438451
1,0.002543,0.006084
2,0.145861,0.292296
3,0.474728,0.636221
4,0.6815,0.773865


In [300]:
test_id = pd.DataFrame(test_id, columns=['respondent_id'])

In [301]:
test_id.head()

Unnamed: 0,respondent_id
0,26707
1,26708
2,26709
3,26710
4,26711


In [302]:
combined = pd.concat([test_id, y_pred], axis=1)

In [303]:
combined.shape

(26708, 3)

In [304]:
combined.to_csv('submission.csv', index=False)