
# SVM on BRFSS Diabetes Data (Imbalanced)

This notebook trains a Support Vector Machine on the imbalanced BRFSS diabetes dataset (`archive/diabetes_binary_health_indicators_BRFSS2015.csv`), performs exploratory data analysis, and tunes hyperparameters with grid search while reweighting classes.


In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    classification_report,
)

# Notebook-wide display defaults: never truncate DataFrame columns when
# rendering, and draw every seaborn/matplotlib figure with the "whitegrid" theme.
pd.options.display.max_columns = None
sns.set_theme(style="whitegrid")



## Load data
We use the original dataset rather than a 50/50 class-balanced version, so that the natural class imbalance is preserved.


In [None]:

# Path is relative to the notebook's working directory.
DATA_PATH = Path("archive/diabetes_binary_health_indicators_BRFSS2015.csv")
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Report (rows, columns), then show the first five rows as the cell output.
print(f"Shape: {df.shape}")
df.head(n=5)



## Exploratory data analysis
Check class balance, missingness, and feature correlations with the diabetes indicator.


In [None]:

# Target distribution: absolute counts and relative frequencies of each
# Diabetes_binary class.
class_counts = df['Diabetes_binary'].value_counts()
class_props = df['Diabetes_binary'].value_counts(normalize=True)

# Line breaks in the output are written as "\n" escapes: a single-quoted string
# literal must not span several physical source lines (that is a SyntaxError).
print("Class counts:\n", class_counts)
print("\nClass proportions:\n", class_props)

# Bar chart of the raw counts so the imbalance is visible at a glance.
fig, ax = plt.subplots(figsize=(5, 4))
sns.barplot(x=class_counts.index, y=class_counts.values, palette="crest", ax=ax)
ax.set_title("Diabetes class distribution (imbalanced)")
ax.set_xlabel("Diabetes_binary")
ax.set_ylabel("Count")
plt.show()


In [None]:

# Missing values and basic stats
# Count nulls per column; the "\n" escape keeps the string literal on one
# source line (a raw line break inside the quotes is a SyntaxError).
missing = df.isnull().sum()
print("Missing values per column:\n", missing)

# Transposed describe() gives one row per feature; the final expression is
# rendered as the cell output.
summary = df.describe().T
summary[['mean', 'std', 'min', '25%', '50%', '75%', 'max']]


In [None]:

# Pearson correlation of every feature with the target (the target's
# correlation with itself is dropped).
corr = df.corr(numeric_only=True)['Diabetes_binary'].drop('Diabetes_binary')

# Order features by absolute correlation, strongest first, keeping the sign.
by_magnitude = corr.abs().sort_values(ascending=False).index
strong_corr = corr.reindex(by_magnitude)

# Horizontal bars for the ten strongest correlations.
top_ten = strong_corr.iloc[:10]
plt.figure(figsize=(8, 6))
corr_ax = sns.barplot(x=top_ten.values, y=top_ten.index, palette="flare")
corr_ax.set_title("Top correlations with Diabetes_binary")
corr_ax.set_xlabel("Pearson r")
corr_ax.set_ylabel("Feature")
plt.tight_layout()
plt.show()

# Same ten features as a table (cell output).
strong_corr.sort_values(key=pd.Series.abs, ascending=False).head(10)


In [None]:

# Side-by-side look at two example features, split by diabetes status.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
bmi_ax, health_ax = axes

# Left panel: BMI density per class; common_norm=False normalizes each class's
# density independently.
sns.kdeplot(
    data=df,
    x='BMI',
    hue='Diabetes_binary',
    fill=True,
    common_norm=False,
    ax=bmi_ax,
)
bmi_ax.set_title('BMI by diabetes status')

# Right panel: counts of each GenHlth level, one bar per class.
sns.countplot(
    data=df,
    x='GenHlth',
    hue='Diabetes_binary',
    palette='crest',
    ax=health_ax,
)
health_ax.set_title('General health by diabetes status')
health_ax.legend(title='Diabetes_binary')

fig.tight_layout()
plt.show()



## Modeling with class-weighted SVM
We standardize features, apply a linear SVM with `class_weight='balanced'`, and tune the regularization strength using grid search scored by ROC-AUC. Stratified splits preserve the original imbalance.


In [None]:

# Target is the binary diabetes indicator; every other column is a feature.
y = df['Diabetes_binary']
X = df.drop(columns=['Diabetes_binary'])

# 80/20 hold-out split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Standardize features, then fit a class-weighted linear SVM.
scaler_step = ("scaler", StandardScaler())
svc_step = ("linear_svc", LinearSVC(class_weight="balanced", max_iter=5000))
pipe = Pipeline(steps=[scaler_step, svc_step])

# Only C is actually searched; loss and dual are single-valued grid entries.
param_grid = {
    "linear_svc__C": [0.01, 0.1, 1.0, 10.0],
    "linear_svc__loss": ["squared_hinge"],
    "linear_svc__dual": [False],
}

# Three shuffled, stratified folds with a fixed seed.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Grid search scored by ROC-AUC. It runs in a single process (n_jobs=1) to
# avoid OS semaphore limits in some sandboxed environments.
search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=cv,
    n_jobs=1,
    verbose=1,
)
search.fit(X=X_train, y=y_train)

print("Best params:", search.best_params_)
print("Best CV ROC-AUC:", search.best_score_)


In [None]:

# Evaluate on hold-out test set
best_model = search.best_estimator_
# NOTE: despite the name, `probs` holds decision_function margins, not
# probabilities. roc_auc_score accepts such non-thresholded decision values.
probs = best_model.decision_function(X_test)
preds = best_model.predict(X_test)

# ROC-AUC uses the continuous scores; the other metrics use the hard labels.
metrics = {
    "roc_auc": roc_auc_score(y_test, probs),
    "accuracy": accuracy_score(y_test, preds),
    "precision": precision_score(y_test, preds),
    "recall": recall_score(y_test, preds),
    "f1": f1_score(y_test, preds),
}

print("Test metrics:")
for name, val in metrics.items():
    print(f"{name}: {val:.4f}")

# Line breaks are written as "\n" escapes: a single-quoted string literal
# cannot span several physical source lines (that is a SyntaxError).
cm = confusion_matrix(y_test, preds)
print("\nConfusion matrix:\n", cm)
print("\nClassification report:\n", classification_report(y_test, preds))

# Heatmap of the confusion matrix: rows are actual classes, columns predicted.
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
