**Import Libraries**

In [191]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge, LinearRegression, Ridge, RidgeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import (
    mean_squared_error, r2_score, f1_score, accuracy_score, classification_report
)
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from scipy.stats import loguniform

**Load and Inspect Data**

In [192]:
df= pd.read_csv('/content/dataset_B_training.csv')

In [193]:
df.head()

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,sex,income_poverty,marital_status,rent_or_own,employment_status,census_msa,household_adults,household_children,employment_sector,h1n1_vaccine
0,1,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,Female,"<= $75,000, Above Poverty",Married,Own,Employed,"MSA, Not Principle City",2.0,1.0,construction,0
1,2,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Female,Below Poverty,Not Married,Own,Employed,Non-MSA,0.0,3.0,wholesale,0
2,3,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,Female,"> $75,000",Not Married,Own,Employed,"MSA, Principle City",0.0,0.0,real_estate,1
3,4,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,Female,"<= $75,000, Above Poverty",Not Married,Rent,Not in Labor Force,Non-MSA,0.0,0.0,,0
4,5,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Female,,Not Married,,Unemployed,Non-MSA,3.0,0.0,,0


In [194]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4756 entries, 0 to 4755
Data columns (total 31 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                4756 non-null   int64  
 1   h1n1_concern                 4741 non-null   float64
 2   h1n1_knowledge               4734 non-null   float64
 3   behavioral_antiviral_meds    4739 non-null   float64
 4   behavioral_avoidance         4729 non-null   float64
 5   behavioral_face_mask         4752 non-null   float64
 6   behavioral_wash_hands        4748 non-null   float64
 7   behavioral_large_gatherings  4747 non-null   float64
 8   behavioral_outside_home      4741 non-null   float64
 9   behavioral_touch_face        4736 non-null   float64
 10  doctor_recc_h1n1             4437 non-null   float64
 11  chronic_med_condition        4595 non-null   float64
 12  child_under_6_months         4622 non-null   float64
 13  health_worker     

In [195]:
df.shape

(4756, 31)

In [196]:
df.columns

Index(['respondent_id', 'h1n1_concern', 'h1n1_knowledge',
       'behavioral_antiviral_meds', 'behavioral_avoidance',
       'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_h1n1', 'chronic_med_condition',
       'child_under_6_months', 'health_worker', 'health_insurance',
       'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk',
       'opinion_h1n1_sick_from_vacc', 'age_group', 'education', 'race', 'sex',
       'income_poverty', 'marital_status', 'rent_or_own', 'employment_status',
       'census_msa', 'household_adults', 'household_children',
       'employment_sector', 'h1n1_vaccine'],
      dtype='object')

In [197]:
df.describe()

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,chronic_med_condition,child_under_6_months,health_worker,health_insurance,opinion_h1n1_vacc_effective,opinion_h1n1_risk,opinion_h1n1_sick_from_vacc,household_adults,household_children,h1n1_vaccine
count,4756.0,4741.0,4734.0,4739.0,4729.0,4752.0,4748.0,4747.0,4741.0,4736.0,...,4595.0,4622.0,4625.0,2831.0,4687.0,4692.0,4692.0,4722.0,4722.0,4756.0
mean,2378.5,1.658933,1.30714,0.057396,0.74096,0.077652,0.842039,0.359385,0.341278,0.704814,...,0.314037,0.090437,0.141189,0.901801,3.965436,2.512361,2.412191,0.888818,0.542143,0.394029
std,1373.08327,0.892478,0.612811,0.232622,0.438154,0.267651,0.364743,0.479871,0.474189,0.456174,...,0.464182,0.286838,0.348254,0.297635,0.981737,1.336426,1.372509,0.755777,0.931243,0.488692
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,1189.75,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,4.0,1.0,1.0,0.0,0.0,0.0
50%,2378.5,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,4.0,2.0,2.0,1.0,0.0,0.0
75%,3567.25,2.0,2.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,1.0,0.0,0.0,1.0,5.0,4.0,4.0,1.0,1.0,1.0
max,4756.0,3.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,5.0,5.0,5.0,3.0,3.0,1.0


In [198]:
# Quick overview
print("\nData Types:")
print(df.dtypes)


Data Types:
respondent_id                    int64
h1n1_concern                   float64
h1n1_knowledge                 float64
behavioral_antiviral_meds      float64
behavioral_avoidance           float64
behavioral_face_mask           float64
behavioral_wash_hands          float64
behavioral_large_gatherings    float64
behavioral_outside_home        float64
behavioral_touch_face          float64
doctor_recc_h1n1               float64
chronic_med_condition          float64
child_under_6_months           float64
health_worker                  float64
health_insurance               float64
opinion_h1n1_vacc_effective    float64
opinion_h1n1_risk              float64
opinion_h1n1_sick_from_vacc    float64
age_group                       object
education                       object
race                            object
sex                             object
income_poverty                  object
marital_status                  object
rent_or_own                     object
employment_s

In [199]:
df.nunique()

Unnamed: 0,0
respondent_id,4756
h1n1_concern,4
h1n1_knowledge,3
behavioral_antiviral_meds,2
behavioral_avoidance,2
behavioral_face_mask,2
behavioral_wash_hands,2
behavioral_large_gatherings,2
behavioral_outside_home,2
behavioral_touch_face,2


In [200]:
df.isnull().sum()

Unnamed: 0,0
respondent_id,0
h1n1_concern,15
h1n1_knowledge,22
behavioral_antiviral_meds,17
behavioral_avoidance,27
behavioral_face_mask,4
behavioral_wash_hands,8
behavioral_large_gatherings,9
behavioral_outside_home,15
behavioral_touch_face,20


In [201]:
missing_percent = (df.isnull().sum() / len(df)) * 100
missing_percent.sort_values(ascending=False)


Unnamed: 0,0
employment_sector,49.705635
health_insurance,40.475189
income_poverty,14.970563
doctor_recc_h1n1,6.707317
rent_or_own,6.623213
employment_status,4.941127
marital_status,4.751892
education,4.70984
chronic_med_condition,3.385198
child_under_6_months,2.817494


**Data Cleaning and Imputation**


In [202]:

df.drop(columns=['employment_sector'], inplace=True)

# Impute categorical columns using mode
cat_impute_cols = [
    'health_insurance', 'doctor_recc_h1n1', 'rent_or_own', 'employment_status',
    'education', 'marital_status', 'income_poverty',
    'chronic_med_condition', 'child_under_6_months', 'health_worker'
]

for col in cat_impute_cols:
    if col in df.columns:
        df[col] = df[col].fillna(df[col].mode()[0])

# Impute numeric/behavioral columns using median
behavioral_cols = [
    'h1n1_concern', 'h1n1_knowledge', 'behavioral_antiviral_meds',
    'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands',
    'behavioral_large_gatherings', 'behavioral_outside_home',
    'behavioral_touch_face', 'opinion_h1n1_vacc_effective',
    'opinion_h1n1_risk', 'opinion_h1n1_sick_from_vacc',
    'household_adults', 'household_children'
]

for col in behavioral_cols:
    if col in df.columns:
        df[col] = df[col].fillna(df[col].median())

# all missing values handled
df.isnull().sum()[df.isnull().sum() > 0]


Unnamed: 0,0


In [203]:
cat_cols = df.select_dtypes(include=['object']).columns.tolist()
cat_cols


['age_group',
 'education',
 'race',
 'sex',
 'income_poverty',
 'marital_status',
 'rent_or_own',
 'employment_status',
 'census_msa']

**Encoding Categorical Variables**

In [204]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode categorical columns
df_encoded = pd.get_dummies(df, columns=cat_cols, drop_first=True)

print("Shape after encoding:", df_encoded.shape)
df_encoded.head()


Shape after encoding: (4756, 40)


Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,race_White,sex_Male,"income_poverty_> $75,000",income_poverty_Below Poverty,marital_status_Not Married,rent_or_own_Rent,employment_status_Not in Labor Force,employment_status_Unemployed,"census_msa_MSA, Principle City",census_msa_Non-MSA
0,1,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,True,False,False,False,False,False,False,False,False,False
1,2,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,True,False,False,True,True,False,False,False,False,True
2,3,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,True,False,True,False,True,False,False,False,True,False
3,4,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,True,False,False,False,True,True,True,False,False,True
4,5,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,True,False,False,False,True,False,False,True,False,True


**Feature Scaling**

In [205]:
from sklearn.preprocessing import StandardScaler

# Scale only numeric features
num_cols = df_encoded.select_dtypes(include=['float64', 'int64']).columns.drop('h1n1_vaccine')

scaler = StandardScaler()
df_encoded[num_cols] = scaler.fit_transform(df_encoded[num_cols])


In [206]:
print("Final Shape:", df_encoded.shape)
print("Any Missing Values?:", df_encoded.isnull().sum().sum())
df_encoded.head()

df.isnull().sum()


Final Shape: (4756, 40)
Any Missing Values?: 0


Unnamed: 0,0
respondent_id,0
h1n1_concern,0
h1n1_knowledge,0
behavioral_antiviral_meds,0
behavioral_avoidance,0
behavioral_face_mask,0
behavioral_wash_hands,0
behavioral_large_gatherings,0
behavioral_outside_home,0
behavioral_touch_face,0


**Train-Test Split**

In [207]:
from sklearn.model_selection import train_test_split

# Define features and target
X = df_encoded.drop('h1n1_vaccine', axis=1)
y = df_encoded['h1n1_vaccine']

# Split dataset (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


Train shape: (3804, 39)
Test shape: (952, 39)


**Ridge Classifier - Base**

In [208]:
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Initialize Ridge Classifier
ridge_clf = RidgeClassifier(alpha=1.0)

# Train
ridge_clf.fit(X_train, y_train)

# Predict
y_pred_ridge = ridge_clf.predict(X_test)

# Evaluate
acc = accuracy_score(y_test, y_pred_ridge)
f1 = f1_score(y_test, y_pred_ridge)

print("Ridge Classifier Accuracy:", acc)
print("Ridge Classifier F1 Score:", f1)
print("\nClassification Report:\n", classification_report(y_test, y_pred_ridge))


Ridge Classifier Accuracy: 0.7489495798319328
Ridge Classifier F1 Score: 0.6571018651362984

Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.84      0.80       577
           1       0.71      0.61      0.66       375

    accuracy                           0.75       952
   macro avg       0.74      0.72      0.73       952
weighted avg       0.75      0.75      0.74       952



**Ridge Classifier (Hyperparameter Tuning)**

In [209]:
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score, classification_report
from scipy.stats import loguniform

# Base model
ridge_clf = RidgeClassifier(random_state=42)

# Hyperparameter distributions
param_distributions = {
    'alpha': loguniform(1e-3, 1e2),   # continuous log-uniform distribution between 0.001 and 100
    'solver': ['auto', 'saga', 'lsqr']  # categorical options
}

# Randomized search
random_search_ridge = RandomizedSearchCV(
    estimator=ridge_clf,
    param_distributions=param_distributions,
    n_iter=20,            # number of random combinations to try (adjust as needed)
    cv=5,
    scoring='f1',         # optimize for F1 score
    n_jobs=-1,
    verbose=2,
    random_state=42
)

# Fit
random_search_ridge.fit(X_train, y_train)

# Best model
best_ridge_clf = random_search_ridge.best_estimator_
print("\n Best Ridge Classifier Parameters:\n", random_search_ridge.best_params_)

# Predictions
y_pred_ridge_best = best_ridge_clf.predict(X_test)

# Evaluate
acc_ridge_best = accuracy_score(y_test, y_pred_ridge_best)
f1_ridge_best = f1_score(y_test, y_pred_ridge_best)

print("\n Ridge Classifier (After Random Search Tuning)")
print(f"Accuracy: {acc_ridge_best:.4f}")
print(f"F1 Score: {f1_ridge_best:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred_ridge_best))


Fitting 5 folds for each of 20 candidates, totalling 100 fits

 Best Ridge Classifier Parameters:
 {'alpha': np.float64(0.0060252157362038605), 'solver': 'lsqr'}

 Ridge Classifier (After Random Search Tuning)
Accuracy: 0.7500
F1 Score: 0.6590

Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.84      0.80       577
           1       0.71      0.61      0.66       375

    accuracy                           0.75       952
   macro avg       0.74      0.73      0.73       952
weighted avg       0.75      0.75      0.75       952



**RandomForestClassifier - Base Model**

In [210]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# Train model
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Predictions
y_pred_rf = rf.predict(X_test)

# Evaluation
acc_rf = accuracy_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)

print("Random Forest Accuracy:", acc_rf)
print("Random Forest F1 Score:", f1_rf)
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))


Random Forest Accuracy: 0.740546218487395
Random Forest F1 Score: 0.6446043165467625

Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.83      0.80       577
           1       0.70      0.60      0.64       375

    accuracy                           0.74       952
   macro avg       0.73      0.72      0.72       952
weighted avg       0.74      0.74      0.74       952



**RandomForestClassifier - Hyperparameter tuning**

In [211]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Base model
rf = RandomForestClassifier(random_state=42)

# Define hyperparameter search space
param_distributions = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

# Randomized search with 5-fold cross-validation
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_distributions,
    n_iter=20,
    scoring='f1',
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit search
random_search.fit(X_train, y_train)

# Best model
best_rf = random_search.best_estimator_
print("\n Best Random Forest Parameters:")
print(random_search.best_params_)

# Predictions
y_pred_rf_best = best_rf.predict(X_test)

# Evaluate
acc_rf_best = accuracy_score(y_test, y_pred_rf_best)
f1_rf_best = f1_score(y_test, y_pred_rf_best)

print("\n Random Forest (After Random Search Tuning)")
print(f"Accuracy: {acc_rf_best:.4f}")
print(f"F1 Score: {f1_rf_best:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf_best))

Fitting 5 folds for each of 20 candidates, totalling 100 fits

 Best Random Forest Parameters:
{'n_estimators': 300, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 40}

 Random Forest (After Random Search Tuning)
Accuracy: 0.7468
F1 Score: 0.6492

Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.85      0.80       577
           1       0.71      0.59      0.65       375

    accuracy                           0.75       952
   macro avg       0.74      0.72      0.73       952
weighted avg       0.74      0.75      0.74       952



In [218]:
from sklearn.metrics import f1_score
from google.colab import files

# Compute F1 scores for tuned models
f1_ridge = f1_score(y_test, best_ridge_clf.predict(X_test))
f1_rf = f1_score(y_test, best_rf.predict(X_test))

print(f"Ridge Classifier (Tuned) F1 Score: {f1_ridge:.4f}")
print(f"Random Forest (Tuned) F1 Score: {f1_rf:.4f}")

# Load and preprocess test data
test_df = pd.read_csv('/content/dataset_B_testing.csv')
test_ids = test_df['respondent_id']
if 'employment_sector' in test_df.columns:
    test_df.drop(columns=['employment_sector'], inplace=True)

for col in cat_impute_cols:
    if col in test_df.columns:
        test_df[col] = test_df[col].fillna(df[col].mode()[0])
for col in behavioral_cols:
    if col in test_df.columns:
        test_df[col] = test_df[col].fillna(df[col].median())

test_df_encoded = pd.get_dummies(test_df, columns=cat_cols, drop_first=True)
test_df_encoded = test_df_encoded.reindex(columns=X_train.columns, fill_value=0)
test_df_encoded[num_cols] = scaler.transform(test_df_encoded[num_cols])

# Predictions from both models
ridge_preds = best_ridge_clf.predict(test_df_encoded)
rf_preds = best_rf.predict(test_df_encoded)

# Create submission DataFrames
ridge_submission = pd.DataFrame({
    'respondent_id': test_ids,
    'h1n1_vaccine_prediction': ridge_preds
})
rf_submission = pd.DataFrame({
    'respondent_id': test_ids,
    'h1n1_vaccine_prediction': rf_preds
})

# Save and download CSV files
ridge_path = '/content/h1n1_predictions_ridge.csv'
rf_path = '/content/h1n1_predictions_rf.csv'

ridge_submission.to_csv(ridge_path, index=False)
rf_submission.to_csv(rf_path, index=False)

files.download(ridge_path)
files.download(rf_path)


Ridge Classifier (Tuned) F1 Score: 0.6590
Random Forest (Tuned) F1 Score: 0.6492


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>