In [None]:
# Ridge regularisation strengths to evaluate (the alpha values used in GridSearch).
# NOTE(review): each alpha is scored on X_test / y_test below, so if this curve
# is used to choose alpha the reported F1 is no longer an untouched held-out
# estimate — confirm that is intended.
alphas = [0.001, 0.01, 0.1, 1, 10, 100]
f1_scores = []

# One fit per alpha: train on the training split, predict the test split,
# threshold the continuous Ridge output at 0.5 to get 0/1 labels, record F1.
for alpha in alphas:
    ridge = Ridge(alpha=alpha).fit(X_train, y_train)
    y_pred = ridge.predict(X_test)
    y_pred_binary = (y_pred >= 0.5).astype(int)
    f1 = f1_score(y_test, y_pred_binary)
    f1_scores += [f1]

# F1 as a function of alpha, with alpha on a log axis because the grid spans
# several orders of magnitude.
f1_fig, f1_ax = plt.subplots(figsize=(8, 5))
f1_ax.plot(alphas, f1_scores, marker='o', linestyle='-', color='royalblue', linewidth=2)
f1_ax.set_xscale('log')
f1_ax.set_title('F1 Score vs Ridge Alpha', fontsize=14)
f1_ax.set_xlabel('Alpha (log scale)', fontsize=12)
f1_ax.set_ylabel('F1 Score', fontsize=12)
f1_ax.grid(True, linestyle='--', alpha=0.6)
plt.show()


In [None]:
# Visualization Style
# Apply the seaborn "whitegrid" theme first, then override the default figure
# size; the size is set afterwards so the theme call cannot overwrite it.
sns.set(style="whitegrid")
plt.rcParams.update({"figure.figsize": (8, 5)})

In [None]:
# Target Variable Distribution
# Bar chart of how many rows fall into each `h1n1_vaccine` class.
plt.figure()
# Passing `palette` without `hue` is deprecated in seaborn 0.13, so the x
# variable is also mapped to hue. dodge=False keeps one full-width bar per
# class on older seaborn versions, where a hue variable would otherwise
# shift and narrow the bars.
target_ax = sns.countplot(
    data=df, x="h1n1_vaccine", hue="h1n1_vaccine", palette="Set2", dodge=False
)
# The hue legend would only repeat the x tick labels, so drop it if one was drawn.
if target_ax.get_legend() is not None:
    target_ax.get_legend().remove()
plt.title("H1N1 Vaccine Uptake")
plt.xlabel("Vaccinated (1) vs Not Vaccinated (0)")
plt.ylabel("Count")
plt.show()

In [None]:
# Numeric Feature Distributions
# Every float64/int64 column except the row identifier and the target
# (either name is skipped silently if it is not present).
numeric_cols = (
    df.select_dtypes(include=["float64", "int64"])
    .columns
    .drop(["respondent_id", "h1n1_vaccine"], errors="ignore")
)

# One histogram per numeric column, laid out on a single large figure.
df.loc[:, numeric_cols].hist(bins=20, figsize=(16, 14), color="skyblue", edgecolor="black")
plt.gcf().suptitle("Distribution of Numeric Features", fontsize=16)
plt.show()

In [None]:
# Categorical Features vs Vaccination
# Object-dtype columns are treated as the categorical features.
categorical_cols = df.select_dtypes(include="object").columns

# One figure per categorical column: category counts split by vaccination status.
for col in categorical_cols:
    cat_fig, cat_ax = plt.subplots(figsize=(8, 5))
    sns.countplot(data=df, x=col, hue="h1n1_vaccine", palette="Set2", ax=cat_ax)
    cat_ax.set_title(f"Vaccination by {col}")
    cat_ax.set_xlabel(col)
    cat_ax.set_ylabel("Count")
    # Slant the category labels so long names do not overlap.
    plt.setp(cat_ax.get_xticklabels(), rotation=45, ha="right")
    cat_fig.tight_layout()
    plt.show()

In [None]:
# Correlation Heatmap
# Pairwise correlations between the numeric features and the target;
# `union` adds the target column without duplicating it if already listed.
heatmap_cols = numeric_cols.union(["h1n1_vaccine"])
corr = df[heatmap_cols].corr()

# Diverging colour map centred on zero so positive and negative correlations
# are visually symmetric.
corr_fig, corr_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr, annot=False, cmap="coolwarm", center=0, ax=corr_ax)
corr_ax.set_title("Correlation Heatmap (Numeric Features)")
plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import f1_score

# --- F1 score for each Ridge alpha, computed with an explicit loop ---
# NOTE(review): alpha is ranked on X_test / y_test, so the "best" alpha
# reported below has been selected on the test split — confirm that is intended.
alphas = [0.001, 0.01, 0.1, 1, 10, 100]
f1_scores = []

for alpha in alphas:
    # Fit on the training split, predict the test split, and threshold the
    # continuous Ridge output at 0.5 to obtain 0/1 labels for f1_score.
    ridge = Ridge(alpha=alpha).fit(X_train, y_train)
    y_pred = ridge.predict(X_test)
    y_pred_binary = (y_pred >= 0.5).astype(int)
    f1 = f1_score(y_test, y_pred_binary)
    f1_scores += [f1]

# --- F1 against alpha on a log-scaled x axis ---
best_fig, best_ax = plt.subplots(figsize=(8, 5))
best_ax.plot(alphas, f1_scores, marker='o', linestyle='-', color='mediumseagreen')
best_ax.set_xscale('log')
best_ax.set_title("F1 Score vs Ridge Alpha")
best_ax.set_xlabel("Alpha (log scale)")
best_ax.set_ylabel("F1 Score")
best_ax.grid(True, linestyle='--', alpha=0.6)
plt.show()

# Report the alpha with the highest F1 (np.argmax returns the first one on ties).
best_alpha_f1 = alphas[np.argmax(f1_scores)]
print(f"Best alpha based on F1 Score: {best_alpha_f1}")
print(f"Highest F1 Score: {max(f1_scores):.4f}")


In [None]:
# DATA CLEANING WITH BEFORE / AFTER DIAGNOSTICS
#
# Reports the frame as loaded, imputes missing values (median for numeric
# columns, mode for categorical ones), label-encodes the categorical columns,
# drops IQR outliers, then reports the cleaned frame. `df` is rebound to the
# cleaned frame; `label_encoders` keeps the fitted encoder of each column.

# Display name and colour for each target class, keyed by class value so a
# pie slice is always labelled by what it contains rather than by position.
_VACCINE_LABELS = {0: "Not Vaccinated (0)", 1: "Vaccinated (1)"}
_VACCINE_COLORS = {0: "lightcoral", 1: "lightgreen"}

# Kept for any later cell that still reads the positional label list.
labels = ["Not Vaccinated (0)", "Vaccinated (1)"]


def _plot_vaccine_pie(counts, title):
    """Draw a pie chart of target class counts.

    counts: Series of row counts indexed by `h1n1_vaccine` class value.
    title:  figure title.

    Labels and colours are looked up from the class value, and the number of
    slices follows `counts`, so the chart stays valid if a class is missing.
    """
    classes = list(counts.index)
    plt.figure(figsize=(6, 6))
    plt.pie(
        counts,
        labels=[_VACCINE_LABELS.get(c, str(c)) for c in classes],
        autopct="%1.1f%%",
        startangle=90,
        colors=[_VACCINE_COLORS.get(c, "lightgray") for c in classes],
        # Pull out the first slice only, whatever the number of slices.
        explode=[0.05 if i == 0 else 0 for i in range(len(classes))],
    )
    plt.title(title)
    plt.show()


def _plot_concern_hist(frame, title, color):
    """Draw a 10-bin histogram with KDE of `h1n1_concern` from `frame`."""
    plt.figure(figsize=(8, 5))
    sns.histplot(frame["h1n1_concern"], bins=10, kde=True, color=color)
    plt.title(title)
    plt.xlabel("H1N1 Concern")
    plt.ylabel("Frequency")
    plt.show()


# BEFORE CLEANING
print("===== BEFORE CLEANING =====")
# DataFrame.info() prints its own report and returns None, so it is called
# directly rather than wrapped in print().
df.info()
print("\nMissing values per column:\n", df.isnull().sum())

# PIE CHART (Before Cleaning)
vaccine_counts = df["h1n1_vaccine"].value_counts()
_plot_vaccine_pie(vaccine_counts, "H1N1 Vaccine Uptake (Before Cleaning)")

# HISTOGRAM (Before Cleaning)
_plot_concern_hist(df, "Distribution of H1N1 Concern (Before Cleaning)", "skyblue")

# DATA CLEANING

# Remember which columns were numeric / categorical as loaded; after label
# encoding the two groups can no longer be told apart by dtype.
numeric_input_cols = df.select_dtypes(include=["float64", "int64"]).columns
categorical_input_cols = df.select_dtypes(include=["object"]).columns

# Fill numeric NaN with median. The result is assigned back to the column:
# `df[col].fillna(..., inplace=True)` acts on a temporary and is not
# guaranteed to update `df` (it does nothing under pandas Copy-on-Write).
for col in numeric_input_cols:
    df[col] = df[col].fillna(df[col].median())

# Fill categorical NaN with mode (first mode on ties)
for col in categorical_input_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# Encode categorical variables using LabelEncoder, keeping each fitted
# encoder so the integer codes can be mapped back to the original labels.
label_encoders = {}
for col in categorical_input_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# All numeric columns after encoding, minus the row identifier
# (same definition as before, for any later cell that reads it).
numeric_cols = df.select_dtypes(include=["float64", "int64"]).columns.drop(["respondent_id"], errors="ignore")

# Remove outliers using IQR (Interquartile Range).
# Only columns that were numeric as loaded are screened, and never the target
# or the identifier: an IQR fence on a binary target with a small positive
# class collapses to a single value and deletes that whole class, and
# label-encoded codes have no ordering for a fence to act on.
outlier_cols = numeric_input_cols.drop(["respondent_id", "h1n1_vaccine"], errors="ignore")
q1 = df[outlier_cols].quantile(0.25)
q3 = df[outlier_cols].quantile(0.75)
iqr = q3 - q1
# A zero IQR means at least half the rows share one value; the fence would
# then drop every other value, so such columns are skipped.
outlier_cols = iqr[iqr > 0].index
lower = (q1 - 1.5 * iqr)[outlier_cols]
upper = (q3 + 1.5 * iqr)[outlier_cols]
# All fences are computed on the same full frame and combined into one mask,
# so the outcome does not depend on column order.
within_fences = (
    df[outlier_cols].ge(lower, axis="columns")
    & df[outlier_cols].le(upper, axis="columns")
).all(axis=1)
df = df.loc[within_fences].copy()

# AFTER CLEANING

print("\n===== AFTER CLEANING =====")
df.info()
print("\nMissing values per column:\n", df.isnull().sum())
print("\nShape after cleaning:", df.shape)

# PIE CHART (After Cleaning)
vaccine_counts_clean = df["h1n1_vaccine"].value_counts()
_plot_vaccine_pie(vaccine_counts_clean, "H1N1 Vaccine Uptake (After Cleaning)")

# HISTOGRAM (After Cleaning)
_plot_concern_hist(df, "Distribution of H1N1 Concern (After Cleaning)", "mediumseagreen")
