In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score


In [2]:
# Load datasets
# Every CSV is read from one directory, given relative to the notebook's
# working directory, so the location only has to be changed in one place.
DATA_DIR = 'dataset'

# Training inputs and their labels (joined later on `respondent_id`).
training_set_features = pd.read_csv(f'{DATA_DIR}/training_set_features.csv')
training_set_labels = pd.read_csv(f'{DATA_DIR}/training_set_labels.csv')

# Unlabelled rows to score, plus the submission template.
test_set_features = pd.read_csv(f'{DATA_DIR}/test_set_features.csv')
submission_format = pd.read_csv(f'{DATA_DIR}/submission_format.csv')

In [3]:
# Display the column index of each loaded frame as one tuple
# (the bare last expression is what the cell renders).
(
    training_set_features.columns,
    training_set_labels.columns,
    test_set_features.columns,
    submission_format.columns,
)

(Index(['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'], dtype='object'),
 Index(['respondent_id', 'xyz_concern', 'xyz_knowledge',
        'behavioral_antiviral_meds', 'behavioral_avoidance',
        'behavioral_face_mask', 'behavioral_wash_hands',
        'behavioral_large_gatherings', 'behavioral_outside_home',
        'behavioral_touch_face', 'doctor_recc_xyz', 'doctor_recc_seasonal',
        'chronic_med_condition', 'child_under_6_months', 'health_worker',
        'health_insurance', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk',
        'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective',
        'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group',
        'education', 'race', 'sex', 'income_poverty', 'marital_status',
        'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa',
        'household_adults', 'household_children', 'employment_industry',
        'employment_occupation'],
       dtype='object'),
 Index(['respondent_id', 'xyz_co

In [4]:
# Merge the training features and labels
# validate='one_to_one' makes pandas raise a MergeError if `respondent_id` is
# duplicated in either frame, instead of silently multiplying rows (which
# could also put copies of one respondent on both sides of the split below).
df_train = pd.merge(
    training_set_features,
    training_set_labels,
    on='respondent_id',
    validate='one_to_one',
)

# The two label columns, named once so X and y cannot drift apart.
target_cols = ['xyz_vaccine', 'seasonal_vaccine']

# X: every column except the join key and the labels; y: the two labels.
X = df_train.drop(columns=['respondent_id'] + target_cols)
y = df_train[target_cols]

# Split the data into training and validation sets
# 80/20 split; the fixed random_state keeps the partition reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# Identify categorical and numerical columns
# Columns are grouped purely by storage dtype: object -> categorical,
# int64/float64 -> numerical. A column of any other dtype lands in neither list.
categorical_cols = [col for col, col_dtype in X.dtypes.items() if col_dtype == "object"]
numerical_cols = [col for col, col_dtype in X.dtypes.items() if col_dtype in ['int64', 'float64']]

# Preprocessing for numerical data: fill gaps with the column median, then
# standardise.
numerical_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numerical_steps)

# Preprocessing for categorical data: one-hot encode; categories not seen
# during fit are ignored at transform time rather than raising.
# NOTE(review): there is no imputation step on this branch, so missing values
# reach the encoder as-is - confirm the installed scikit-learn handles that
# the way you expect.
categorical_steps = [
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [6]:
# Bundle preprocessing for numerical and categorical data
# Each entry is (name, transformer, columns it is applied to). No `remainder`
# is passed, so ColumnTransformer's default applies to any column that is in
# neither list.
column_transformers = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [10]:
# Define the Logistic Regression models for each target separately
logreg_xyz = LogisticRegression(solver='liblinear', random_state=42)
logreg_seasonal = LogisticRegression(solver='liblinear', random_state=42)

# NOTE(review): both pipelines below hold the *same* `preprocessor` object, so
# fitting either pipeline refits that shared object. Presumably harmless here
# because both are fitted on the same X_train - revisit if the two pipelines
# are ever trained on different data.

# Pipeline for xyz_vaccine
pipeline_xyz = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', logreg_xyz)
])

# Pipeline for seasonal_vaccine
pipeline_seasonal = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', logreg_seasonal)
])

# Explicit target-column -> pipeline mapping. The target is the key itself,
# not something parsed back out of the printed label, so the label text can
# change without breaking the column lookup.
pipelines_by_target = {
    'xyz_vaccine': pipeline_xyz,
    'seasonal_vaccine': pipeline_seasonal,
}

# Train and evaluate each model
for target, pipeline in pipelines_by_target.items():
    name = f'Logistic Regression - {target}'

    # Fit on the training split against this target's label column only.
    pipeline.fit(X_train, y_train[target])

    # Predict probabilities on validation set
    # Column 1 is the second entry of the classifier's classes_
    # (presumably label 1 - TODO confirm the labels are 0/1).
    y_valid_pred = pipeline.predict_proba(X_valid)[:, 1]

    # Calculate ROC AUC
    roc_auc = roc_auc_score(y_valid[target], y_valid_pred)

    print(f'\n{name} Results:')
    print(f'ROC AUC: {roc_auc}')


Logistic Regression - xyz_vaccine Results:
ROC AUC: 0.8317842405600517

Logistic Regression - seasonal_vaccine Results:
ROC AUC: 0.8560890655040101


In [13]:
# Prepare test set for submission
# The id column is kept aside for the output file and removed from the model
# inputs, mirroring how the training features were built.
respondent_ids = test_set_features['respondent_id']
X_test = test_set_features.drop(columns=['respondent_id'])

# Predict probabilities on test set
# Column 1 of predict_proba is the second entry of each classifier's classes_
# (presumably label 1 - TODO confirm the labels are 0/1).
y_test_pred_xyz = pipeline_xyz.predict_proba(X_test)[:, 1]
y_test_pred_seasonal = pipeline_seasonal.predict_proba(X_test)[:, 1]

# Prepare submission dataframe: one row per test respondent, one probability
# column per target.
submission_columns = {
    'respondent_id': respondent_ids,
    'xyz_vaccine': y_test_pred_xyz,
    'seasonal_vaccine': y_test_pred_seasonal,
}
df_submission = pd.DataFrame(submission_columns)

# Save submission file
# NOTE(review): this is written to the working directory under the same file
# name as the template loaded from dataset/ - confirm the name is intended.
submission_file_path = "submission_format.csv"
df_submission.to_csv(submission_file_path, index=False)

print('Submission Done')

Submission Done
