In [20]:
import os

from sklearn.experimental import enable_iterative_imputer

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    mean_squared_error, r2_score, accuracy_score, f1_score, classification_report
)

from sklearn.ensemble import (
    GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier, RandomForestRegressor
)

from sklearn.pipeline import Pipeline
from sklearn.linear_model import (
    BayesianRidge, LinearRegression, LogisticRegression
)
from sklearn.decomposition import PCA
from sklearn.preprocessing import (
    StandardScaler, OneHotEncoder, OrdinalEncoder
)
from sklearn.model_selection import train_test_split

In [4]:
# Load dataset_B_testing.csv into df_test.
# NOTE(review): hard-coded absolute path; presumably a Colab session path —
# confirm or make it configurable before running in another environment.
df_test = pd.read_csv('/content/dataset_B_testing.csv')

In [6]:
# Remove the employment_sector column from the frame.
# errors='ignore' keeps this cell re-runnable: without it a second execution
# raises KeyError because the column has already been dropped in place.
df_test.drop(columns = ['employment_sector'], inplace = True, errors = 'ignore')

# Impute categorical columns using mode
cat_impute_cols = [
    'health_insurance', 'doctor_recc_h1n1', 'rent_or_own', 'employment_status',
    'education', 'marital_status', 'income_poverty',
    'chronic_med_condition', 'child_under_6_months', 'health_worker'
]

for col in cat_impute_cols:
    if col in df_test.columns:
        col_modes = df_test[col].mode()
        # mode() returns an empty Series when the column is entirely NaN;
        # skip the column instead of failing on the first-element lookup.
        if not col_modes.empty:
            df_test[col] = df_test[col].fillna(col_modes.iloc[0])

# Impute numeric/behavioral columns using median
behavioral_cols = [
    'h1n1_concern', 'h1n1_knowledge', 'behavioral_antiviral_meds',
    'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands',
    'behavioral_large_gatherings', 'behavioral_outside_home',
    'behavioral_touch_face', 'opinion_h1n1_vacc_effective',
    'opinion_h1n1_risk', 'opinion_h1n1_sick_from_vacc',
    'household_adults', 'household_children'
]

for col in behavioral_cols:
    if col in df_test.columns:
        df_test[col] = df_test[col].fillna(df_test[col].median())

# Sanity check: list columns that still contain missing values.
# An empty result means every missing value has been handled.
df_test.isnull().sum().loc[lambda counts: counts > 0]


Unnamed: 0,0


In [11]:
# Categorical columns, grouped by the encoding applied to them.
ohe_features = ["sex", "income_poverty", "rent_or_own", "census_msa", "race"]
ord_features = ["age_group", "education", "marital_status", "employment_status"]

# Explicit category orderings for the ordinal columns (first entry -> code 0).
age_order = [
    "18 - 34 Years",
    "35 - 44 Years",
    "45 - 54 Years",
    "55 - 64 Years",
    "65+ Years",
]
edu_order = [
    "< 12 Years",
    "12 Years",
    "Some College",
    "College Graduate",
]
marital_order = ["Not Married", "Married"]
emp_order = ["Not in Labor Force", "Unemployed", "Employed"]

# Positional match with ord_features: one ordering per ordinal column.
ordinal_categories = [age_order, edu_order, marital_order, emp_order]

# One-hot branch: fill gaps with the most frequent value, then expand to dense
# indicator columns.
ohe_tf = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
])

# Ordinal branch: fill gaps with the most frequent value, then map each
# category to its position in the matching order list; values outside the
# lists are encoded as -1.
ord_tf = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    (
        "ord",
        OrdinalEncoder(
            categories=ordinal_categories,
            unknown_value=-1,
            handle_unknown="use_encoded_value",
        ),
    ),
])

# Columns not listed in either group are passed through unchanged.
preprocess = ColumnTransformer(
    remainder="passthrough",
    transformers=[
        ("ohe", ohe_tf, ohe_features),
        ("ord", ord_tf, ord_features),
    ],
)

# Work on a copy so df_test itself is not modified by the transformer.
# NOTE: no target is extracted here; the `y = df_test["h1n1_vaccine"]` line was
# left disabled in this cell.
X = df_test.copy()

# NOTE(review): the preprocessor is fitted on df_test itself — confirm this is
# intended rather than reusing a preprocessor fitted on training data.
X_transformed = preprocess.fit_transform(X)
encoded_names = preprocess.get_feature_names_out()

# Re-wrap the encoded array as a DataFrame aligned with df_test's row index.
X_prepared = pd.DataFrame(
    data=X_transformed,
    index=df_test.index,
    columns=encoded_names,
)

In [12]:
# Normalise column labels to snake_case: lower-case everything, collapse each
# run of characters outside [a-z0-9] into a single underscore, then trim
# underscores from both ends.
X_prepared.columns = X_prepared.columns.str.lower().str.replace(
    "[^a-z0-9]+", "_", regex=True
).str.strip("_")

In [13]:
# Value handed to PCA(n_components=...); a float in (0, 1) asks PCA to keep
# enough components to explain that fraction of the variance.
n_comp = 0.8

# Standardise the features, then project them onto principal components.
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    (
        "pca",
        PCA(
            n_components=n_comp,
            svd_solver="auto",
            whiten=False,
            random_state=42,
        ),
    ),
])

X_pca = pipe.fit_transform(X_prepared)

# Label the components pc1..pcN and keep X_prepared's row index.
pca_cols = [f"pc{i+1}" for i in range(X_pca.shape[1])]
PCA_df = pd.DataFrame(data=X_pca, index=X_prepared.index, columns=pca_cols)

# Per-component explained-variance ratio and its running total.
explained = pipe.named_steps["pca"].explained_variance_ratio_
cum_explained = explained.cumsum()

In [17]:
# Reduced component set: remove a fixed list of principal components.
# NOTE(review): the list is hard-coded; presumably it comes from a selection
# step performed elsewhere — confirm it still matches the fitted PCA.
X_pca_red = PCA_df.drop(
    columns=["pc21", "pc19", "pc17", "pc13", "pc11", "pc10", "pc1"]
)

In [19]:
# Number of components left after the drop.
X_pca_red.shape[1]

14

In [22]:
def train_and_predict(X, y, model, random_state = 42):
    """Fit a classifier on a 70/30 hold-out split, report metrics and save predictions.

    Parameters
    ----------
    X : DataFrame or array-like
        Feature matrix passed to ``train_test_split``.
    y : Series or array-like
        Target labels aligned with ``X``.
    model : callable or estimator instance
        Estimator class (or factory) that accepts a ``random_state`` keyword;
        it is called as ``model(random_state=random_state)``. A non-callable
        object is treated as an already constructed estimator and used as is.
    random_state : int, default 42
        Seed used for the split and, when ``model`` is callable, for building
        the estimator.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, predictions, accuracy, f1)``.

    Side effects
    ------------
    Prints the F1 score, the accuracy and a classification report, and writes
    the hold-out predictions to ``pred_df<EstimatorClassName>.csv`` in the
    current working directory.

    Notes
    -----
    ``f1_score`` is called with its default arguments.
    NOTE(review): presumably the target is binary — confirm, or pass an
    explicit ``average`` for multi-class targets.
    """
    # Split into train/test sets (30% held out for evaluation)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size = 0.3, random_state = random_state
    )

    # Build the estimator; keep the `model` argument itself untouched
    estimator = model(random_state = random_state) if callable(model) else model

    # Train on the training split and predict the held-out rows
    estimator.fit(X_train, y_train)
    predictions = estimator.predict(X_test)

    f1 = f1_score(y_test, predictions)
    accuracy = accuracy_score(y_test, predictions)

    # A class name is always non-empty, so the metrics are reported
    # unconditionally (the former elif/else fallbacks could never run).
    model_name = type(estimator).__name__
    print(f"{model_name} F1 Score:", f1)
    print(f"{model_name} Accuracy:", accuracy)

    print("\n")
    print(classification_report(y_test, predictions))

    # Keep the original row labels when X_test carries an index (DataFrame
    # input); plain arrays fall back to a default RangeIndex.
    pred_df = pd.DataFrame(predictions, index = getattr(X_test, "index", None))

    pred_df.to_csv(f"pred_df{model_name}.csv", index = True)

    return X_train, X_test, y_train, y_test, predictions, accuracy, f1