In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [5]:
pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.0.2-cp37-cp37m-win_amd64.whl (7.1 MB)
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Collecting scipy>=1.1.0
  Downloading scipy-1.7.3-cp37-cp37m-win_amd64.whl (34.1 MB)
Collecting joblib>=0.11
  Downloading joblib-1.3.2-py3-none-any.whl (302 kB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.3.2 scikit-learn-1.0.2 scipy-1.7.3 threadpoolctl-3.1.0
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\sc941\AppData\Local\Programs\Python\Python37\python.exe -m pip install --upgrade pip' command.


In [2]:
pip install pandas 

Collecting pandas
  Downloading pandas-1.3.5-cp37-cp37m-win_amd64.whl (10.0 MB)
Collecting numpy>=1.17.3; platform_machine != "aarch64" and platform_machine != "arm64" and python_version < "3.10"
  Downloading numpy-1.21.6-cp37-cp37m-win_amd64.whl (14.0 MB)
Collecting pytz>=2017.3
  Downloading pytz-2024.1-py2.py3-none-any.whl (505 kB)
Installing collected packages: numpy, pytz, pandas
Successfully installed numpy-1.21.6 pandas-1.3.5 pytz-2024.1
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\sc941\AppData\Local\Programs\Python\Python37\python.exe -m pip install --upgrade pip' command.


In [8]:
# Load the datasets
features_df = pd.read_csv('training_set_features.csv')
labels_df = pd.read_csv('training_set_labels.csv')

In [10]:

# Merge datasets on respondent_id
data = pd.merge(features_df, labels_df, on='respondent_id')

In [11]:
# Define feature columns and target columns
feature_cols = features_df.columns.drop('respondent_id')
target_cols = ['xyz_vaccine', 'seasonal_vaccine']

In [12]:

# Split the data
X = data[feature_cols]
y = data[target_cols]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# Preprocessing
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [14]:

# Define the model
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', MultiOutputClassifier(RandomForestClassifier(random_state=42)))])

In [15]:
# Train the model
model.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['xyz_concern', 'xyz_knowledge', 'behavioral_antiviral_meds',
       'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_hom...
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  Index(['age_group', 'education', 'race', 'sex', 'income_poverty',

In [16]:

# Make predictions
y_pred_proba = model.predict_proba(X_val)

In [17]:

# Calculate ROC AUC
roc_auc_xyz = roc_auc_score(y_val['xyz_vaccine'], y_pred_proba[0][:, 1])
roc_auc_seasonal = roc_auc_score(y_val['seasonal_vaccine'], y_pred_proba[1][:, 1])
mean_roc_auc = (roc_auc_xyz + roc_auc_seasonal) / 2

print(f'Mean ROC AUC: {mean_roc_auc}')

Mean ROC AUC: 0.8406199199127561


In [18]:
# Prepare submission file
test_df = pd.read_csv('test_set_features.csv')  # Assuming you have a test set
test_pred_proba = model.predict_proba(test_df[feature_cols])

submission = pd.DataFrame({
    'respondent_id': test_df['respondent_id'],
    'xyz_vaccine': test_pred_proba[0][:, 1],
    'seasonal_vaccine': test_pred_proba[1][:, 1]
})

submission.to_csv('submission.csv', index=False)

In [19]:
from sklearn.model_selection import cross_val_score
import numpy as np

# Define a cross-validation strategy
cv_scores = cross_val_score(model, X, y, cv=5, scoring='roc_auc_ovo')
mean_cv_score = np.mean(cv_scores)
print(f'Cross-validated Mean ROC AUC: {mean_cv_score}')


Traceback (most recent call last):
  File "c:\Users\sc941\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\model_selection\_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "c:\Users\sc941\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\metrics\_scorer.py", line 103, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "c:\Users\sc941\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\metrics\_scorer.py", line 309, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "c:\Users\sc941\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\metrics\_ranking.py", line 546, in roc_auc_score
    y_score = check_array(y_score, ensure_2d=False)
  File "c:\Users\sc941\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\utils\validation.py", line 796, in check_array
    % (array.ndim, estimator_name)
ValueError: Found array

Cross-validated Mean ROC AUC: nan


Traceback (most recent call last):
  File "c:\Users\sc941\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\model_selection\_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "c:\Users\sc941\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\metrics\_scorer.py", line 103, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "c:\Users\sc941\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\metrics\_scorer.py", line 309, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "c:\Users\sc941\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\metrics\_ranking.py", line 546, in roc_auc_score
    y_score = check_array(y_score, ensure_2d=False)
  File "c:\Users\sc941\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\utils\validation.py", line 796, in check_array
    % (array.ndim, estimator_name)
ValueError: Found array

In [20]:
from sklearn.metrics import confusion_matrix, classification_report

# Predict on validation set
y_val_pred = model.predict(X_val)

# Confusion matrix for xyz_vaccine
conf_matrix_xyz = confusion_matrix(y_val['xyz_vaccine'], y_val_pred[:, 0])
print('Confusion Matrix for xyz_vaccine:')
print(conf_matrix_xyz)

# Confusion matrix for seasonal_vaccine
conf_matrix_seasonal = confusion_matrix(y_val['seasonal_vaccine'], y_val_pred[:, 1])
print('Confusion Matrix for seasonal_vaccine:')
print(conf_matrix_seasonal)

# Classification report
report_xyz = classification_report(y_val['xyz_vaccine'], y_val_pred[:, 0])
report_seasonal = classification_report(y_val['seasonal_vaccine'], y_val_pred[:, 1])
print('Classification Report for xyz_vaccine:')
print(report_xyz)
print('Classification Report for seasonal_vaccine:')
print(report_seasonal)


Confusion Matrix for xyz_vaccine:
[[4040  172]
 [ 701  429]]
Confusion Matrix for seasonal_vaccine:
[[2341  550]
 [ 623 1828]]
Classification Report for xyz_vaccine:
              precision    recall  f1-score   support

           0       0.85      0.96      0.90      4212
           1       0.71      0.38      0.50      1130

    accuracy                           0.84      5342
   macro avg       0.78      0.67      0.70      5342
weighted avg       0.82      0.84      0.82      5342

Classification Report for seasonal_vaccine:
              precision    recall  f1-score   support

           0       0.79      0.81      0.80      2891
           1       0.77      0.75      0.76      2451

    accuracy                           0.78      5342
   macro avg       0.78      0.78      0.78      5342
weighted avg       0.78      0.78      0.78      5342



In [21]:
# Check for missing values after preprocessing
X_train_preprocessed = model.named_steps['preprocessor'].transform(X_train)
print(f'Number of missing values in preprocessed training set: {np.sum(np.isnan(X_train_preprocessed))}')

# Check feature scaling and encoding
print('First 5 rows of the preprocessed training set:')
print(X_train_preprocessed[:5])


Number of missing values in preprocessed training set: 0
First 5 rows of the preprocessed training set:
[[ 0.41923526 -0.42788801 -0.22745927  0.6133972  -0.27498724  0.46236622
  -0.74456928  1.40409693  0.69029205 -0.50569811 -0.66316442  1.62547693
  -0.29411436 -0.35230484 -3.79522715  1.14702612  1.29724496 -0.26288815
  -0.0270496   0.93886135 -0.08991278  0.14653398 -0.57291335  0.
   0.          1.          0.          0.          0.          0.
   0.          1.          0.          0.          0.          1.
   1.          0.          0.          0.          1.          1.
   0.          0.          1.          0.          0.          1.
   0.          0.          0.          0.          0.          0.
   1.          0.          0.          0.          1.          0.
   0.          0.          0.          0.          0.          1.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.         

In [22]:
from sklearn.linear_model import LogisticRegression

# Define and train a baseline model
baseline_model = MultiOutputClassifier(LogisticRegression(max_iter=1000, random_state=42))
baseline_model.fit(X_train, y_train)

# Evaluate the baseline model
baseline_y_pred_proba = baseline_model.predict_proba(X_val)
baseline_roc_auc_xyz = roc_auc_score(y_val['xyz_vaccine'], baseline_y_pred_proba[0][:, 1])
baseline_roc_auc_seasonal = roc_auc_score(y_val['seasonal_vaccine'], baseline_y_pred_proba[1][:, 1])
baseline_mean_roc_auc = (baseline_roc_auc_xyz + baseline_roc_auc_seasonal) / 2

print(f'Baseline Mean ROC AUC: {baseline_mean_roc_auc}')


ValueError: could not convert string to float: '45 - 54 Years'