In [48]:
import pandas as pd

In [49]:
features = pd.read_csv('/content/training_set_features.csv')
labels = pd.read_csv('/content/training_set_labels.csv')

In [50]:
labels.head(3)

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
0,0,0,0
1,1,0,1
2,2,0,0


In [51]:
df = features.merge(labels, on='respondent_id')

In [52]:
print("Shape of data:", df.shape)

Shape of data: (26707, 38)


In [53]:
df.head(2).T

Unnamed: 0,0,1
respondent_id,0,1
h1n1_concern,1.0,3.0
h1n1_knowledge,0.0,2.0
behavioral_antiviral_meds,0.0,0.0
behavioral_avoidance,0.0,1.0
behavioral_face_mask,0.0,0.0
behavioral_wash_hands,0.0,1.0
behavioral_large_gatherings,0.0,0.0
behavioral_outside_home,1.0,1.0
behavioral_touch_face,1.0,1.0


In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 38 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26707 non-null  int64  
 1   h1n1_concern                 26615 non-null  float64
 2   h1n1_knowledge               26591 non-null  float64
 3   behavioral_antiviral_meds    26636 non-null  float64
 4   behavioral_avoidance         26499 non-null  float64
 5   behavioral_face_mask         26688 non-null  float64
 6   behavioral_wash_hands        26665 non-null  float64
 7   behavioral_large_gatherings  26620 non-null  float64
 8   behavioral_outside_home      26625 non-null  float64
 9   behavioral_touch_face        26579 non-null  float64
 10  doctor_recc_h1n1             24547 non-null  float64
 11  doctor_recc_seasonal         24547 non-null  float64
 12  chronic_med_condition        25736 non-null  float64
 13  child_under_6_mo

In [55]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
respondent_id,26707.0,13353.0,7709.791156,0.0,6676.5,13353.0,20029.5,26706.0
h1n1_concern,26615.0,1.618486,0.910311,0.0,1.0,2.0,2.0,3.0
h1n1_knowledge,26591.0,1.262532,0.618149,0.0,1.0,1.0,2.0,2.0
behavioral_antiviral_meds,26636.0,0.048844,0.215545,0.0,0.0,0.0,0.0,1.0
behavioral_avoidance,26499.0,0.725612,0.446214,0.0,0.0,1.0,1.0,1.0
behavioral_face_mask,26688.0,0.068982,0.253429,0.0,0.0,0.0,0.0,1.0
behavioral_wash_hands,26665.0,0.825614,0.379448,0.0,1.0,1.0,1.0,1.0
behavioral_large_gatherings,26620.0,0.35864,0.47961,0.0,0.0,0.0,1.0,1.0
behavioral_outside_home,26625.0,0.337315,0.472802,0.0,0.0,0.0,1.0,1.0
behavioral_touch_face,26579.0,0.677264,0.467531,0.0,0.0,1.0,1.0,1.0


### hadle Missing Values

In [56]:
# Check missing values
missing = df.isnull().sum()
missing[missing > 0].sort_values(ascending=False)

Unnamed: 0,0
employment_occupation,13470
employment_industry,13330
health_insurance,12274
income_poverty,4423
doctor_recc_seasonal,2160
doctor_recc_h1n1,2160
rent_or_own,2042
employment_status,1463
marital_status,1408
education,1407


#### Handle High Missing features (> 40%):

- `employment_occupation`
- `employment_industry`
- `health_insurance`

In [57]:
df['employment_occupation'] = df['employment_occupation'].fillna('unknown')
df['employment_industry'] = df['employment_industry'].fillna('unknown')
df['health_insurance'] = df['health_insurance'].fillna(2)

#### Handle Moderate Missing features (10% -20%)

In [58]:
moderate_missinig_col = ['income_poverty', 'rent_or_own', 'employment_status', 'marital_status', 'education', 'doctor_recc_h1n1', 'doctor_recc_seasonal']

In [59]:
for col in moderate_missinig_col:
    df[col] = df[col].fillna(df[col].mode()[0])

#### Handle Low missing features (<5%)

In [60]:
low_missing_cols = [
    'chronic_med_condition', 'child_under_6_months', 'health_worker',
    'h1n1_concern', 'h1n1_knowledge',
    'behavioral_antiviral_meds', 'behavioral_avoidance', 'behavioral_face_mask',
    'behavioral_wash_hands', 'behavioral_large_gatherings', 'behavioral_outside_home',
    'behavioral_touch_face',
    'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk', 'opinion_h1n1_sick_from_vacc',
    'opinion_seas_vacc_effective', 'opinion_seas_risk', 'opinion_seas_sick_from_vacc',
    'household_adults', 'household_children'
]

In [61]:
for col in low_missing_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

In [62]:
df.isnull().sum()

Unnamed: 0,0
respondent_id,0
h1n1_concern,0
h1n1_knowledge,0
behavioral_antiviral_meds,0
behavioral_avoidance,0
behavioral_face_mask,0
behavioral_wash_hands,0
behavioral_large_gatherings,0
behavioral_outside_home,0
behavioral_touch_face,0


### Encoding the features

In [63]:
for col in df.select_dtypes(include='object'):
    print(col)


age_group
education
race
sex
income_poverty
marital_status
rent_or_own
employment_status
hhs_geo_region
census_msa
employment_industry
employment_occupation


In [64]:
df['race'].unique()

array(['White', 'Black', 'Other or Multiple', 'Hispanic'], dtype=object)

In [65]:
ForLabel = ['age_group', 'education', 'sex', 'income_poverty', 'marital_status', 'rent_or_own']
ForOneHot = ['race','employment_status', 'hhs_geo_region', 'census_msa', 'employment_industry', 'employment_occupation']

In [66]:
# from sklearn.preprocessing import LabelEncoder

# # Label Encoding
# le = LabelEncoder()
# for col in ForLabel:
#     df[col] = le.fit_transform(df[col])

# # One-Hot Encoding
# df = pd.get_dummies(df, columns=ForOneHot, drop_first=True)

In [67]:
from sklearn.preprocessing import LabelEncoder
import pickle

label_encoders = {}

# Label Encoding
for col in ForLabel:
    le = LabelEncoder()
    le.fit(df[col])  # Fit on training data
    df[col] = le.transform(df[col])  # Transform training
    label_encoders[col] = le  # Save the encoder


In [68]:
# Save label encoders
with open('label_encoders.pkl', 'wb') as f:
    pickle.dump(label_encoders, f)

In [69]:
# One-Hot Encoding
df = pd.get_dummies(df, columns=ForOneHot, drop_first=True)

# Save the final column list (used for reindexing test set later)
final_columns = df.columns.drop(['h1n1_vaccine', 'seasonal_vaccine']).tolist()

In [70]:
# Save the column names for later use in test data preprocessing
with open('final_columns.pkl', 'wb') as f:
    pickle.dump(final_columns, f)

In [71]:
df.columns

Index(['respondent_id', 'h1n1_concern', 'h1n1_knowledge',
       'behavioral_antiviral_meds', 'behavioral_avoidance',
       'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_h1n1', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'health_insurance', 'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk',
       'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group',
       'education', 'sex', 'income_poverty', 'marital_status', 'rent_or_own',
       'household_adults', 'household_children', 'h1n1_vaccine',
       'seasonal_vaccine', 'race_Hispanic', 'race_Other or Multiple',
       'race_White', 'employment_status_Not in Labor Force',
       'employment_status_Unemployed', 'hhs_geo_region_bhuqouqj',
       'hhs_geo_region_dqpwygqj', 'hhs_geo_region_fp

In [72]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 92 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   respondent_id                         26707 non-null  int64  
 1   h1n1_concern                          26707 non-null  float64
 2   h1n1_knowledge                        26707 non-null  float64
 3   behavioral_antiviral_meds             26707 non-null  float64
 4   behavioral_avoidance                  26707 non-null  float64
 5   behavioral_face_mask                  26707 non-null  float64
 6   behavioral_wash_hands                 26707 non-null  float64
 7   behavioral_large_gatherings           26707 non-null  float64
 8   behavioral_outside_home               26707 non-null  float64
 9   behavioral_touch_face                 26707 non-null  float64
 10  doctor_recc_h1n1                      26707 non-null  float64
 11  doctor_recc_sea

In [73]:
df.head(2)

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,employment_occupation_rcertsgn,employment_occupation_tfqavkke,employment_occupation_ukymxvdu,employment_occupation_unknown,employment_occupation_uqqtjvyb,employment_occupation_vlluhbov,employment_occupation_xgwztkwe,employment_occupation_xqwwgdyp,employment_occupation_xtkaffoo,employment_occupation_xzmlyyjv
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,False,False,False,True,False,False,False,False,False,False
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,False,False,False,False,False,False,True,False,False,False


In [74]:
# drop unwanted columns
df.drop(['respondent_id'], axis=1, inplace=True)

In [24]:
#Define X, y featurs
X = df.drop(['h1n1_vaccine', 'seasonal_vaccine'], axis=1)
y_h1n1 = df['h1n1_vaccine']
y_seasonal = df['seasonal_vaccine']

## Split data for Training and splitting

In [25]:
from sklearn.model_selection import train_test_split

# Split for H1N1 vaccine model
X_train_h1n1, X_test_h1n1, y_train_h1n1, y_test_h1n1 = train_test_split(
    X, y_h1n1, test_size=0.2, random_state=42, stratify=y_h1n1
)

# Split for Seasonal vaccine model
X_train_seasonal, X_test_seasonal, y_train_seasonal, y_test_seasonal = train_test_split(
    X, y_seasonal, test_size=0.2, random_state=42, stratify=y_seasonal
)


In [26]:
# import classification models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

# import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_auc_score


### For H1N1 Vaccine

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

# Define all models
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "DecisionTree": DecisionTreeClassifier(),
    "RandomForest": RandomForestClassifier(random_state=42),
    "GradientBoosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "KNN": KNeighborsClassifier(),
    "SVC": SVC(probability=True),
    "NaiveBayes": GaussianNB()
}

# Dictionary to store results
model_results = {}

# Loop through each model, train, and evaluate
for name, model in models.items():
    model.fit(X_train_h1n1, y_train_h1n1)

    y_pred = model.predict(X_test_h1n1)
    y_proba = model.predict_proba(X_test_h1n1)[:, 1]  # for ROC-AUC

    acc = accuracy_score(y_test_h1n1, y_pred)
    roc_auc = roc_auc_score(y_test_h1n1, y_proba)

    model_results[name] = {
        "Accuracy": acc,
        "ROC-AUC": roc_auc
    }

    print(f"\nModel: {name}")
    print(f"Accuracy: {acc:.4f}")
    print(f"ROC-AUC: {roc_auc:.4f}")
    print(classification_report(y_test_h1n1, y_pred))



Model: LogisticRegression
Accuracy: 0.8461
ROC-AUC: 0.8458
              precision    recall  f1-score   support

           0       0.87      0.95      0.91      4207
           1       0.70      0.48      0.57      1135

    accuracy                           0.85      5342
   macro avg       0.79      0.71      0.74      5342
weighted avg       0.83      0.85      0.83      5342


Model: DecisionTree
Accuracy: 0.7789
ROC-AUC: 0.6776
              precision    recall  f1-score   support

           0       0.86      0.85      0.86      4207
           1       0.48      0.50      0.49      1135

    accuracy                           0.78      5342
   macro avg       0.67      0.68      0.67      5342
weighted avg       0.78      0.78      0.78      5342


Model: RandomForest
Accuracy: 0.8501
ROC-AUC: 0.8431
              precision    recall  f1-score   support

           0       0.86      0.96      0.91      4207
           1       0.76      0.43      0.55      1135

    accuracy  

Parameters: { "use_label_encoder" } are not used.




Model: XGBoost
Accuracy: 0.8491
ROC-AUC: 0.8503
              precision    recall  f1-score   support

           0       0.88      0.94      0.91      4207
           1       0.70      0.52      0.59      1135

    accuracy                           0.85      5342
   macro avg       0.79      0.73      0.75      5342
weighted avg       0.84      0.85      0.84      5342


Model: KNN
Accuracy: 0.8194
ROC-AUC: 0.7418
              precision    recall  f1-score   support

           0       0.85      0.94      0.89      4207
           1       0.62      0.38      0.47      1135

    accuracy                           0.82      5342
   macro avg       0.74      0.66      0.68      5342
weighted avg       0.80      0.82      0.80      5342


Model: SVC
Accuracy: 0.8491
ROC-AUC: 0.8474
              precision    recall  f1-score   support

           0       0.86      0.97      0.91      4207
           1       0.77      0.42      0.54      1135

    accuracy                           0.85

In [None]:
pd.DataFrame(model_results).T.sort_values(by="ROC-AUC", ascending=False)

Unnamed: 0,Accuracy,ROC-AUC
GradientBoosting,0.849869,0.856436
XGBoost,0.84912,0.850346
SVC,0.84912,0.847382
LogisticRegression,0.846125,0.845776
RandomForest,0.850056,0.843132
KNN,0.819356,0.741823
NaiveBayes,0.665481,0.70778
DecisionTree,0.778922,0.677568


---

####  **Comparison Summary (Key Metrics)**

| Model              | Accuracy | ROC-AUC | Recall (class 1) | F1-score (class 1) |
|--------------------|----------|---------|------------------|--------------------|
| **LogisticRegression** | 0.8461   | 0.8458  | 0.48             | 0.57               |
| **DecisionTree**       | 0.7789   | 0.6776  | 0.50             | 0.49               |
| **RandomForest**       | 0.8501   | 0.8431  | 0.43             | 0.55               |
| **GradientBoosting**   | 0.8499   | 0.8564  | 0.47             | 0.57               |
| **XGBoost**            | 0.8491   | 0.8503  | 0.52             | 0.59               |
| **KNN**                | 0.8194   | 0.7418  | 0.38             | 0.47               |
| **SVC**                | 0.8491   | 0.8474  | 0.42             | 0.54               |
| **NaiveBayes**         | 0.6655   | 0.7078  | 0.68             | 0.46               |

---

#### **Top Performing Models (Balanced Performance)**

- **GradientBoosting**  
  - ✅ Highest **ROC-AUC** (`0.8564`) → best discrimination ability
  - ⚠️ Moderate recall (class 1): 0.47

- **XGBoost**  
  - ✅ Strong ROC-AUC (`0.8503`)
  - ✅ Best recall among top models (0.52)
  - 🔁 Balanced precision-recall

---

#### Observation

- Most models show **class imbalance impact**: high performance on class 0 (non-vaccinated), lower on class 1 (vaccinated).
- Naive Bayes has **high recall** (0.68) but poor precision → good if your use case prioritizes catching all positives even at cost of more false positives.

---

#### Hyperparameter Tuning with GridSearchCV

##### Tuning GradientBoosting:

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

# Define the model
model_gb = GradientBoostingClassifier()

# Define the parameter grid
param_grid_gb = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5, 10],
    'subsample': [0.8, 0.9, 1.0]
}

# Initialize GridSearchCV
grid_search_gb = GridSearchCV(estimator=model_gb, param_grid=param_grid_gb,
                              cv=5, scoring='roc_auc', verbose=1, n_jobs=-1)

# Fit the model
grid_search_gb.fit(X_train_h1n1, y_train_h1n1)

# Get the best parameters
best_params_gb = grid_search_gb.best_params_

# Print the best parameters
print("Best parameters for Gradient Boosting:", best_params_gb)

# Train the model with the best parameters
best_model_gb = grid_search_gb.best_estimator_

# Predict with the best model
y_pred_gb = best_model_gb.predict(X_test_h1n1)
y_proba_gb = best_model_gb.predict_proba(X_test_h1n1)[:, 1]

# Evaluation
print("Accuracy:", accuracy_score(y_test_h1n1, y_pred_gb))
print("ROC-AUC:", roc_auc_score(y_test_h1n1, y_proba_gb))
print("\nClassification Report:\n", classification_report(y_test_h1n1, y_pred_gb))


Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Best parameters for Gradient Boosting: {'learning_rate': 0.05, 'max_depth': 5, 'min_samples_split': 10, 'n_estimators': 200, 'subsample': 0.8}
Accuracy: 0.850617746162486
ROC-AUC: 0.8601489650666134

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.95      0.91      4207
           1       0.71      0.50      0.59      1135

    accuracy                           0.85      5342
   macro avg       0.79      0.72      0.75      5342
weighted avg       0.84      0.85      0.84      5342



In [None]:
import joblib
joblib.dump(best_model_gb, 'best_gradientboosting_model.pkl')

['best_gradientboosting_model.pkl']

##### Tuning XGBoost:

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# Define the model
model_xgb = XGBClassifier(eval_metric='logloss', use_label_encoder=False)

# Define the parameter grid
param_grid_xgb = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5],
    'min_child_weight': [1, 3, 5],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2]
}

# Initialize GridSearchCV
grid_search_xgb = GridSearchCV(estimator=model_xgb, param_grid=param_grid_xgb,
                               cv=5, scoring='roc_auc', verbose=1, n_jobs=-1)

# Fit the model
grid_search_xgb.fit(X_train_h1n1, y_train_h1n1)

# Get the best parameters
best_params_xgb = grid_search_xgb.best_params_

# Print the best parameters
print("Best parameters for XGBoost:", best_params_xgb)

# Train the model with the best parameters
best_model_xgb = grid_search_xgb.best_estimator_

# Predict with the best model
y_pred_xgb = best_model_xgb.predict(X_test_h1n1)
y_proba_xgb = best_model_xgb.predict_proba(X_test_h1n1)[:, 1]

# Evaluation
print("Accuracy:", accuracy_score(y_test_h1n1, y_pred_xgb))
print("ROC-AUC:", roc_auc_score(y_test_h1n1, y_proba_xgb))
print("\nClassification Report:\n", classification_report(y_test_h1n1, y_pred_xgb))


Fitting 5 folds for each of 729 candidates, totalling 3645 fits


Parameters: { "use_label_encoder" } are not used.



Best parameters for XGBoost: {'colsample_bytree': 0.6, 'gamma': 0.1, 'learning_rate': 0.05, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 200}
Accuracy: 0.8496817671284164
ROC-AUC: 0.8599071821769674

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.95      0.91      4207
           1       0.71      0.49      0.58      1135

    accuracy                           0.85      5342
   macro avg       0.79      0.72      0.74      5342
weighted avg       0.84      0.85      0.84      5342



In [None]:
# Save the model
joblib.dump(best_model_xgb, 'xgboost_seasonal_vaccine_model.pkl')

### For Sessional Vaccine

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

# Define all models
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "DecisionTree": DecisionTreeClassifier(),
    "RandomForest": RandomForestClassifier(random_state=42),
    "GradientBoosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "KNN": KNeighborsClassifier(),
    "SVC": SVC(probability=True),
    "NaiveBayes": GaussianNB()
}

# Dictionary to store results
model_results = {}

# Loop through each model, train, and evaluate
for name, model in models.items():
    model.fit(X_train_seasonal, y_train_seasonal)

    y_pred = model.predict(X_test_seasonal)
    y_proba = model.predict_proba(X_test_seasonal)[:, 1]  # for ROC-AUC

    acc = accuracy_score(y_test_seasonal, y_pred)
    roc_auc = roc_auc_score(y_test_seasonal, y_proba)

    model_results[name] = {
        "Accuracy": acc,
        "ROC-AUC": roc_auc
    }

    print(f"\nModel: {name}")
    print(f"Accuracy: {acc:.4f}")
    print(f"ROC-AUC: {roc_auc:.4f}")
    print(classification_report(y_test_seasonal, y_pred))
    print("-"*100)



Model: LogisticRegression
Accuracy: 0.7819
ROC-AUC: 0.8543
              precision    recall  f1-score   support

           0       0.78      0.83      0.80      2855
           1       0.79      0.73      0.76      2487

    accuracy                           0.78      5342
   macro avg       0.78      0.78      0.78      5342
weighted avg       0.78      0.78      0.78      5342

----------------------------------------------------------------------------------------------------

Model: DecisionTree
Accuracy: 0.6906
ROC-AUC: 0.6891
              precision    recall  f1-score   support

           0       0.71      0.71      0.71      2855
           1       0.67      0.67      0.67      2487

    accuracy                           0.69      5342
   macro avg       0.69      0.69      0.69      5342
weighted avg       0.69      0.69      0.69      5342

----------------------------------------------------------------------------------------------------

Model: RandomForest
Accuracy:

Parameters: { "use_label_encoder" } are not used.




Model: XGBoost
Accuracy: 0.7793
ROC-AUC: 0.8539
              precision    recall  f1-score   support

           0       0.78      0.82      0.80      2855
           1       0.78      0.74      0.76      2487

    accuracy                           0.78      5342
   macro avg       0.78      0.78      0.78      5342
weighted avg       0.78      0.78      0.78      5342

----------------------------------------------------------------------------------------------------

Model: KNN
Accuracy: 0.7226
ROC-AUC: 0.7810
              precision    recall  f1-score   support

           0       0.73      0.75      0.74      2855
           1       0.71      0.69      0.70      2487

    accuracy                           0.72      5342
   macro avg       0.72      0.72      0.72      5342
weighted avg       0.72      0.72      0.72      5342

----------------------------------------------------------------------------------------------------

Model: SVC
Accuracy: 0.7857
ROC-AUC: 0.8613
     

#### Hyperparameter Tuning with GridSearchCV

##### Tuning GradientBoosting:

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

# Define the model
model_gb = GradientBoostingClassifier(random_state=42)

# Define the parameter grid
param_grid_gb = {
    'n_estimators': [100, 150, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5, 10],
    'subsample': [0.6, 0.8, 1.0]
}

# Initialize GridSearchCV
grid_search_gb = GridSearchCV(estimator=model_gb, param_grid=param_grid_gb,
                              cv=5, scoring='roc_auc', verbose=1, n_jobs=-1)

# Fit to training data
grid_search_gb.fit(X_train_seasonal, y_train_seasonal)

# Best parameters and model
best_gb = grid_search_gb.best_estimator_
print("Best parameters for Gradient Boosting:", grid_search_gb.best_params_)

# Predictions
y_pred = best_gb.predict(X_test_seasonal)
y_proba = best_gb.predict_proba(X_test_seasonal)[:, 1]

# Evaluation
print("Accuracy:", accuracy_score(y_test_seasonal, y_pred))
print("ROC-AUC:", roc_auc_score(y_test_seasonal, y_proba))
print("\nClassification Report:\n", classification_report(y_test_seasonal, y_pred))


Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Best parameters for Gradient Boosting: {'learning_rate': 0.05, 'max_depth': 5, 'min_samples_split': 5, 'n_estimators': 200, 'subsample': 0.8}
Accuracy: 0.7918382628229128
ROC-AUC: 0.8649312678115342

Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.82      0.81      2855
           1       0.79      0.75      0.77      2487

    accuracy                           0.79      5342
   macro avg       0.79      0.79      0.79      5342
weighted avg       0.79      0.79      0.79      5342



In [None]:
import joblib
joblib.dump(best_gb, 'SessionalVaccine_best_gradientboosting_model.pkl')

['SessionalVaccine_best_gradientboosting_model.pkl']

##### Tunning RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True, False]
}

# Initialize model
rf = RandomForestClassifier(random_state=42)

# GridSearch with 5-fold cross-validation
grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid,
                              cv=5, scoring='roc_auc', n_jobs=-1, verbose=1)

# Fit the model
grid_search_rf.fit(X_train_seasonal, y_train_seasonal)

# Best model
best_rf = grid_search_rf.best_estimator_
print("Best parameters for Random Forest:", grid_search_rf.best_params_)

# Evaluation
y_pred = best_rf.predict(X_test_seasonal)
y_proba = best_rf.predict_proba(X_test_seasonal)[:, 1]

print("Accuracy:", accuracy_score(y_test_seasonal, y_pred))
print("ROC-AUC:", roc_auc_score(y_test_seasonal, y_proba))
print("\nClassification Report:\n", classification_report(y_test_seasonal, y_pred))

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best parameters for Random Forest: {'bootstrap': True, 'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 300}
Accuracy: 0.7850992137776114
ROC-AUC: 0.8579116484528655

Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.83      0.80      2855
           1       0.79      0.74      0.76      2487

    accuracy                           0.79      5342
   macro avg       0.79      0.78      0.78      5342
weighted avg       0.79      0.79      0.78      5342



In [None]:
# Save the model
joblib.dump(best_rf, 'random_forest_seasonal_vaccine_model.pkl')

['random_forest_seasonal_vaccine_model.pkl']

##### Tunning Xgboost

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# Define parameter grid for XGBoost
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5],
    'min_child_weight': [1, 3],
    'gamma': [0, 0.1],
    'colsample_bytree': [0.6, 0.8]
}

# Initialize model
xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')

# GridSearch with 5-fold cross-validation
grid_search_xgb = GridSearchCV(estimator=xgb, param_grid=param_grid,
                               cv=5, scoring='roc_auc', n_jobs=-1, verbose=1)

# Fit the model
grid_search_xgb.fit(X_train_seasonal, y_train_seasonal)

# Best model
best_xgb = grid_search_xgb.best_estimator_
print("Best parameters for XGBoost:", grid_search_xgb.best_params_)

# Evaluation
y_pred = best_xgb.predict(X_test_seasonal)
y_proba = best_xgb.predict_proba(X_test_seasonal)[:, 1]

print("Accuracy:", accuracy_score(y_test_seasonal, y_pred))
print("ROC-AUC:", roc_auc_score(y_test_seasonal, y_proba))
print("\nClassification Report:\n", classification_report(y_test_seasonal, y_pred))

# Save the model
joblib.dump(best_xgb, 'xgboost_seasonal_vaccine_model.pkl')


Fitting 5 folds for each of 96 candidates, totalling 480 fits


Parameters: { "use_label_encoder" } are not used.



Best parameters for XGBoost: {'colsample_bytree': 0.6, 'gamma': 0.1, 'learning_rate': 0.05, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 200}
Accuracy: 0.7873455634593786
ROC-AUC: 0.8651668184190012

Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.82      0.81      2855
           1       0.79      0.74      0.77      2487

    accuracy                           0.79      5342
   macro avg       0.79      0.78      0.79      5342
weighted avg       0.79      0.79      0.79      5342



['xgboost_seasonal_vaccine_model.pkl']

###Best Model

In [27]:
from sklearn.ensemble import GradientBoostingClassifier
import joblib

# Train Gradient Boosting for H1N1 vaccine
gb_h1n1 = GradientBoostingClassifier(
    learning_rate=0.05,
    max_depth=5,
    min_samples_split=5,
    n_estimators=200,
    subsample=0.8,
    random_state=42
)

gb_h1n1.fit(X_train_h1n1, y_train_h1n1)

# Save model
joblib.dump(gb_h1n1, 'gradientboosting_h1n1_model.pkl')


['gradientboosting_h1n1_model.pkl']

In [28]:
from xgboost import XGBClassifier

# Train XGBoost for Seasonal vaccine
xgb_seasonal = XGBClassifier(
    learning_rate=0.05,
    max_depth=5,
    min_child_weight=3,
    n_estimators=200,
    gamma=0.1,
    colsample_bytree=0.6,
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42
)

xgb_seasonal.fit(X_train_seasonal, y_train_seasonal)

# Save model
joblib.dump(xgb_seasonal, 'xgboost_seasonal_vaccine_model.pkl')


Parameters: { "use_label_encoder" } are not used.



['xgboost_seasonal_vaccine_model.pkl']

## Submission

In [75]:
import pandas as pd

# Load test data
test_df = pd.read_csv('/content/test_set_features.csv')

In [76]:
test_df.head(2)

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,26707,2.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,"> $75,000",Not Married,Rent,Employed,mlyzmhmf,"MSA, Not Principle City",1.0,0.0,atmlpfrs,hfxkjkmi
1,26708,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,Non-MSA,3.0,0.0,atmlpfrs,xqwwgdyp


In [77]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26708 entries, 0 to 26707
Data columns (total 36 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26708 non-null  int64  
 1   h1n1_concern                 26623 non-null  float64
 2   h1n1_knowledge               26586 non-null  float64
 3   behavioral_antiviral_meds    26629 non-null  float64
 4   behavioral_avoidance         26495 non-null  float64
 5   behavioral_face_mask         26689 non-null  float64
 6   behavioral_wash_hands        26668 non-null  float64
 7   behavioral_large_gatherings  26636 non-null  float64
 8   behavioral_outside_home      26626 non-null  float64
 9   behavioral_touch_face        26580 non-null  float64
 10  doctor_recc_h1n1             24548 non-null  float64
 11  doctor_recc_seasonal         24548 non-null  float64
 12  chronic_med_condition        25776 non-null  float64
 13  child_under_6_mo

In [78]:
test_df.shape

(26708, 36)

### Data preprocessing

#### Hadle Missing Value

In [79]:
# Check missing values
missing = test_df.isnull().sum()
missing[missing > 0].sort_values(ascending=False)

Unnamed: 0,0
employment_occupation,13426
employment_industry,13275
health_insurance,12228
income_poverty,4497
doctor_recc_seasonal,2160
doctor_recc_h1n1,2160
rent_or_own,2036
employment_status,1471
marital_status,1442
education,1407


In [80]:
test_df['employment_occupation'] = test_df['employment_occupation'].fillna('unknown')
test_df['employment_industry'] = test_df['employment_industry'].fillna('unknown')
test_df['health_insurance'] = test_df['health_insurance'].fillna(2)

In [81]:
columns = [
    "income_poverty", "doctor_recc_seasonal", "doctor_recc_h1n1", "rent_or_own",
    "employment_status", "marital_status", "education", "chronic_med_condition",
    "child_under_6_months", "health_worker", "opinion_seas_sick_from_vacc",
    "opinion_seas_risk", "opinion_seas_vacc_effective", "opinion_h1n1_vacc_effective",
    "opinion_h1n1_risk", "opinion_h1n1_sick_from_vacc", "household_children",
    "household_adults", "behavioral_avoidance", "behavioral_touch_face",
    "h1n1_knowledge", "h1n1_concern", "behavioral_outside_home",
    "behavioral_antiviral_meds", "behavioral_large_gatherings", "behavioral_wash_hands",
    "behavioral_face_mask"]


In [82]:
for col in columns:
    test_df[col] = test_df[col].fillna(test_df[col].mode()[0])

In [83]:
test_df.isnull().sum()

Unnamed: 0,0
respondent_id,0
h1n1_concern,0
h1n1_knowledge,0
behavioral_antiviral_meds,0
behavioral_avoidance,0
behavioral_face_mask,0
behavioral_wash_hands,0
behavioral_large_gatherings,0
behavioral_outside_home,0
behavioral_touch_face,0


#### Encoding

In [84]:
# Load label encoders and final columns
with open('label_encoders.pkl', 'rb') as f:
    label_encoders = pickle.load(f)

with open('final_columns.pkl', 'rb') as f:
    final_columns = pickle.load(f)

# Apply LabelEncoder to test set (same as training set)
for col in ForLabel:
    le = label_encoders[col]
    test_df[col] = le.transform(test_df[col])  # Apply label encoding

# Apply One-Hot Encoding to the test set
test_df = pd.get_dummies(test_df, columns=ForOneHot, drop_first=True)

# Align the test set columns with the training columns
test_df = test_df.reindex(columns=final_columns, fill_value=0)

### Prediction

In [88]:
# Load your trained models
import joblib
h1n1_model = joblib.load('/content/gradientboosting_h1n1_model.pkl')
seasonal_model = joblib.load('/content/xgboost_seasonal_vaccine_model.pkl')

In [89]:
respondent_ids = test_df['respondent_id']

# Drop respondent_id before predictions
X_test = test_df.drop('respondent_id', axis=1)

In [90]:
# Get prediction probabilities ([:, 1] = prob of class 1)
h1n1_probs = h1n1_model.predict_proba(X_test)[:, 1]
seasonal_probs = seasonal_model.predict_proba(X_test)[:, 1]

In [91]:
# Create submission DataFrame
submission = pd.DataFrame({
    'respondent_id': respondent_ids,
    'h1n1_vaccine': h1n1_probs,
    'seasonal_vaccine': seasonal_probs
})

# Save to CSV
submission.to_csv('submission.csv', index=False)

In [92]:
submission

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
0,26707,0.114463,0.195326
1,26708,0.028418,0.040216
2,26709,0.162349,0.654389
3,26710,0.706376,0.898085
4,26711,0.268353,0.520381
...,...,...,...
26703,53410,0.289232,0.463167
26704,53411,0.187437,0.292423
26705,53412,0.134287,0.223433
26706,53413,0.021567,0.363626
