# WEEK 5 - support vector machines (SVMs), the kernel trick, and regularization for support vector machines.


In [1]:
!pip install statsmodels


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import statsmodels.api as sm
import networkx as nx

In [3]:

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import classification_report, confusion_matrix

In [4]:
#1
df = pd.read_csv("diabetes_012_health_indicators_BRFSS2015.csv")

#2
df_pima = pd.read_csv("pima_indian_diabetes_dataset.csv") 

# Dataset 1

SVM

In [None]:
# Support Vector Machine for Classification Dataset 1
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# --- 1. Prepare data ---
X = df.drop(columns=['Diabetes_012']) 
y = df['Diabetes_012']

# --- 2. Scale features ---
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# --- 3. Train/test split ---
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.02, random_state=42, stratify=y
)

# --- 4. Define SVM model + parameter grid ---
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto']
}

grid_svm = GridSearchCV(
    SVC(probability=True, class_weight='balanced', random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

# --- 5. Fit model ---
grid_svm.fit(X_train, y_train)

# --- 6. Evaluate model ---
best_svm = grid_svm.best_estimator_
y_pred = best_svm.predict(X_test)
y_prob = best_svm.predict_proba(X_test)

print(f"Best Parameters: {grid_svm.best_params_}")
print(f"Best CV Accuracy: {grid_svm.best_score_:.3f}")
print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.3f}")

# For multi-class targets, macro AUC is appropriate
auc = roc_auc_score(y_test, y_prob, multi_class='ovr')
print(f"AUC: {auc:.3f}\n")

print("Classification Report:")
print(classification_report(y_test, y_pred))


In [23]:
# Support Vector Machine for Classification Dataset 1

from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
import numpy as np

# --- 1. Prepare data ---

df_small = df.sample(10000, random_state=42)
X = df_small.drop(columns=['Diabetes_012'])
y = df_small['Diabetes_012']


# Scale
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, stratify=y, random_state=42
)

# Randomized search setup
param_dist = {
    'C': [0.1, 1, 10, 50],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

rand_svm = RandomizedSearchCV(
    SVC(class_weight='balanced', probability=True, random_state=42),
    param_distributions=param_dist,
    n_iter=5,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

rand_svm.fit(X_train, y_train)

# Evaluate
best_svm = rand_svm.best_estimator_
y_pred = best_svm.predict(X_test)
y_prob = best_svm.predict_proba(X_test)

print(f"Best Parameters: {rand_svm.best_params_}")
print(f"Best CV Accuracy: {rand_svm.best_score_:.3f}")
print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.3f}")
print(f"AUC: {roc_auc_score(y_test, y_prob, multi_class='ovr'):.3f}\n")
print(classification_report(y_test, y_pred))


Fitting 3 folds for each of 5 candidates, totalling 15 fits


Best Parameters: {'kernel': 'rbf', 'gamma': 'scale', 'C': 1}
Best CV Accuracy: 0.668
Test Accuracy: 0.669
AUC: 0.740

              precision    recall  f1-score   support

         0.0       0.95      0.67      0.79      1692
         1.0       0.03      0.14      0.05        36
         2.0       0.31      0.71      0.43       272

    accuracy                           0.67      2000
   macro avg       0.43      0.51      0.42      2000
weighted avg       0.84      0.67      0.73      2000



# Dataset 2 

In [None]:
# Support Vector Machine for Classification Dataset 2
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# --- 1. Prepare data ---
X = df_pima.drop(columns=['Outcome']) 
y = df_pima['Outcome']

# --- 2. Scale features ---
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# --- 3. Train/test split ---
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

# --- 4. Define SVM model + parameter grid ---
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto']
}

grid_svm = GridSearchCV(
    SVC(probability=True, class_weight='balanced', random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

# --- 5. Fit model ---
grid_svm.fit(X_train, y_train)

# --- 6. Evaluate model ---
best_svm = grid_svm.best_estimator_
y_pred = best_svm.predict(X_test)
y_prob = best_svm.predict_proba(X_test)

print(f"Best Parameters: {grid_svm.best_params_}")
print(f"Best CV Accuracy: {grid_svm.best_score_:.3f}")
print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.3f}")

# For multi-class targets, macro AUC is appropriate
auc = roc_auc_score(y_test, y_prob, multi_class='ovr')
print(f"AUC: {auc:.3f}\n")

print("Classification Report:")
print(classification_report(y_test, y_pred))


Best Parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Best CV Accuracy: 0.761
Test Accuracy: 0.740


ValueError: y should be a 1d array, got an array of shape (154, 2) instead.

In [10]:
# Support Vector Machine for Classification 
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# --- 1. Prepare data ---
X = df_pima.drop(columns=['Outcome'])   # Features
y = df_pima['Outcome']                  # Target (binary: 0/1)

# --- 2. Scale features ---
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# --- 3. Train/test split ---
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

# --- 4. Define SVM model + parameter grid ---
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

grid_svm = GridSearchCV(
    SVC(probability=True, class_weight='balanced', random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

# --- 5. Fit model ---
grid_svm.fit(X_train, y_train)

# --- 6. Evaluate model ---
best_svm = grid_svm.best_estimator_
y_pred = best_svm.predict(X_test)
y_prob = best_svm.predict_proba(X_test)[:, 1]  # only the positive class probs

print(f"Best Parameters: {grid_svm.best_params_}")
print(f"Best CV Accuracy: {grid_svm.best_score_:.3f}")
print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.3f}")

auc = roc_auc_score(y_test, y_prob)
print(f"AUC: {auc:.3f}\n")

print("Classification Report:")
print(classification_report(y_test, y_pred))


Best Parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Best CV Accuracy: 0.761
Test Accuracy: 0.740
AUC: 0.825

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.76      0.79       100
           1       0.61      0.70      0.66        54

    accuracy                           0.74       154
   macro avg       0.72      0.73      0.72       154
weighted avg       0.75      0.74      0.74       154



In [11]:
# Support Vector Machine for Classification 
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# --- 1. Prepare data ---
X = df_pima.drop(columns=['Outcome'])   # Features
y = df_pima['Outcome']                  # Target (binary: 0/1)

# --- 2. Scale features ---
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# --- 3. Train/test split ---
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, random_state=42, stratify=y
)

# --- 4. Define SVM model + parameter grid ---
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

grid_svm = GridSearchCV(
    SVC(probability=True, class_weight='balanced', random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

# --- 5. Fit model ---
grid_svm.fit(X_train, y_train)

# --- 6. Evaluate model ---
best_svm = grid_svm.best_estimator_
y_pred = best_svm.predict(X_test)
y_prob = best_svm.predict_proba(X_test)[:, 1]  # only the positive class probs

print(f"Best Parameters: {grid_svm.best_params_}")
print(f"Best CV Accuracy: {grid_svm.best_score_:.3f}")
print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.3f}")

auc = roc_auc_score(y_test, y_prob)
print(f"AUC: {auc:.3f}\n")

print("Classification Report:")
print(classification_report(y_test, y_pred))


Best Parameters: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}
Best CV Accuracy: 0.754
Test Accuracy: 0.755
AUC: 0.832

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.79      0.81       125
           1       0.64      0.69      0.66        67

    accuracy                           0.76       192
   macro avg       0.73      0.74      0.74       192
weighted avg       0.76      0.76      0.76       192



# WEEK 6 - Decision Tree and Random Forest

In [21]:
# Random Forest - Dataset 1 

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# --- 1. Prepare data ---
X = df.drop(columns=['Diabetes_012'])   
y = df['Diabetes_012']

# --- 2. Split data ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.02, stratify=y, random_state=42
)

# --- 3. Define model + parameter distributions ---
rf = RandomForestClassifier(class_weight='balanced', random_state=42, n_jobs=-1)

param_dist = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [5, 10, 20, None],
    'max_features': ['sqrt', 'log2']
}

# --- 4. Randomized Search ---
rf_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=5,              
    cv=3,                   
    scoring='accuracy',
    n_jobs=-1,              
    verbose=1,
    random_state=42
)

# --- 5. Fit model ---
rf_search.fit(X_train, y_train)

# --- 6. Evaluate best model ---
best_rf = rf_search.best_estimator_
y_pred = best_rf.predict(X_test)
y_prob = best_rf.predict_proba(X_test)

print(f"Best Parameters: {rf_search.best_params_}")
print(f"Best CV Accuracy: {rf_search.best_score_:.3f}")
print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.3f}")

# Compute multi-class AUC if needed
if len(np.unique(y)) > 2:
    auc = roc_auc_score(y_test, y_prob, multi_class='ovr')
else:
    auc = roc_auc_score(y_test, y_prob[:, 1])

print(f"AUC: {auc:.3f}\n")

print("Classification Report:")
print(classification_report(y_test, y_pred))


Fitting 3 folds for each of 5 candidates, totalling 15 fits




Best Parameters: {'n_estimators': 200, 'max_features': 'log2', 'max_depth': None}
Best CV Accuracy: 0.839
Test Accuracy: 0.838
AUC: 0.742

Classification Report:
              precision    recall  f1-score   support

         0.0       0.86      0.97      0.91      4274
         1.0       0.00      0.00      0.00        93
         2.0       0.47      0.17      0.25       707

    accuracy                           0.84      5074
   macro avg       0.44      0.38      0.39      5074
weighted avg       0.79      0.84      0.80      5074



In [15]:
# Random Forest - Dataset 2 

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# --- 1. Prepare data ---
X = df_pima.drop(columns=['Outcome'])  
y = df_pima['Outcome']

# --- 2. Split data ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

# --- 3. Define model + parameter distributions ---
rf = RandomForestClassifier(class_weight='balanced', random_state=42, n_jobs=-1)

param_dist = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [5, 10, 20, None],
    'max_features': ['sqrt', 'log2']
}

# --- 4. Randomized Search ---
rf_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=15,              
    cv=5,                   
    scoring='accuracy',
    n_jobs=-1,              
    verbose=1,
    random_state=42
)

# --- 5. Fit model ---
rf_search.fit(X_train, y_train)

# --- 6. Evaluate best model ---
best_rf = rf_search.best_estimator_
y_pred = best_rf.predict(X_test)
y_prob = best_rf.predict_proba(X_test)

print(f"Best Parameters: {rf_search.best_params_}")
print(f"Best CV Accuracy: {rf_search.best_score_:.3f}")
print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.3f}")

# Compute multi-class AUC if needed
if len(np.unique(y)) > 2:
    auc = roc_auc_score(y_test, y_prob, multi_class='ovr')
else:
    auc = roc_auc_score(y_test, y_prob[:, 1])

print(f"AUC: {auc:.3f}\n")

print("Classification Report:")
print(classification_report(y_test, y_pred))


Fitting 5 folds for each of 15 candidates, totalling 75 fits
Best Parameters: {'n_estimators': 200, 'max_features': 'sqrt', 'max_depth': 20}
Best CV Accuracy: 0.767
Test Accuracy: 0.760
AUC: 0.818

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.86      0.82       125
           1       0.68      0.58      0.63        67

    accuracy                           0.76       192
   macro avg       0.74      0.72      0.73       192
weighted avg       0.75      0.76      0.76       192



In [19]:
df_pima["Outcome"].value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64