In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np

# 1. Load and prepare the dataset for binary classification.
# Read the CSV and discard 'id' straight away: it identifies a row and is not
# used as a feature for classification.
df = pd.read_csv('breast-cancer 1.csv').drop(columns=['id'])

# Replace the 'diagnosis' labels with integer codes (target variable: M=1, B=0)
label_encoder = LabelEncoder()
df['diagnosis'] = label_encoder.fit_transform(df['diagnosis'])

# Target vector (y) and feature matrix (X: every remaining column)
y = df['diagnosis']
X = df.drop(columns=['diagnosis'])

# Hold out 20% of the rows for testing; stratify so that both partitions keep
# the class proportions of y
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardise the features. The scaler is fitted on the training partition
# only and then applied unchanged to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Class distribution in y_train:")
print(y_train.value_counts())

# 2. Train an SVM with linear and RBF kernel.
# Both baseline models use scikit-learn's default hyperparameters and are
# scored on the held-out test partition.

# Train SVM with linear kernel
svm_linear = SVC(kernel='linear', random_state=42)
svm_linear.fit(X_train_scaled, y_train)

# Predict and evaluate for linear kernel
y_pred_linear = svm_linear.predict(X_test_scaled)
print("\n--- Initial Linear Kernel SVM Performance ---")
print(f"Accuracy: {accuracy_score(y_test, y_pred_linear):.4f}")
# zero_division=0 reports precision/F1 as 0 for a class that is never
# predicted (the same value the default produces) without emitting an
# UndefinedMetricWarning for every affected metric.
print(classification_report(y_test, y_pred_linear, zero_division=0))

# Train SVM with RBF kernel
svm_rbf = SVC(kernel='rbf', random_state=42)
svm_rbf.fit(X_train_scaled, y_train)

# Predict and evaluate for RBF kernel
y_pred_rbf = svm_rbf.predict(X_test_scaled)
print("\n--- Initial RBF Kernel SVM Performance ---")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rbf):.4f}")
print(classification_report(y_test, y_pred_rbf, zero_division=0))

# 3. Visualize decision boundary using 2D data.
# Project the standardised features onto two principal components so the
# classifiers can be drawn in a plane. The projection is fitted on the
# training partition and re-used for the test partition.
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# One SVM per kernel, fitted on the 2-D projection (linear first, then RBF).
# These models exist for visualization only.
svm_linear_pca, svm_rbf_pca = (
    SVC(kernel=kernel_name, random_state=42).fit(X_train_pca, y_train)
    for kernel_name in ('linear', 'rbf')
)

# Function to plot decision boundary
def plot_decision_boundary(X, y, model, title, filename, resolution=100, padding=1):
    """Save a 2-D decision-region plot for a fitted classifier.

    The rectangle spanned by the two columns of ``X`` (widened by ``padding``
    on every side) is sampled on a ``resolution`` x ``resolution`` grid. Every
    grid point is classified with ``model.predict``; the predictions are drawn
    as filled contours and the samples in ``X`` are scattered on top, coloured
    by ``y``. The figure is written to ``filename`` and then closed, so it is
    not left open for inline display.

    Parameters
    ----------
    X : array-like of shape (n_samples, 2)
        Coordinates of the samples; converted with ``np.asarray``.
    y : array-like
        Values used to colour the scattered samples (passed to ``scatter`` as ``c``).
    model : object
        Fitted estimator whose ``predict`` accepts an ``(n_points, 2)`` array.
    title : str
        Title of the plot.
    filename : str or path-like
        Destination handed to ``Figure.savefig``.
    resolution : int, default 100
        Number of grid points along each axis.
    padding : float, default 1
        Margin added around the data range on both axes.

    Raises
    ------
    ValueError
        If ``X`` is empty or is not two-dimensional with exactly two columns.

    Notes
    -----
    The axis labels are fixed to 'Principal Component 1' / 'Principal Component 2'.
    """
    points = np.asarray(X)
    if points.ndim != 2 or points.shape[1] != 2 or points.shape[0] == 0:
        raise ValueError(
            "X must be a non-empty 2-D array with exactly two columns, "
            f"got shape {points.shape}"
        )

    x_min, x_max = points[:, 0].min() - padding, points[:, 0].max() + padding
    y_min, y_max = points[:, 1].min() - padding, points[:, 1].max() + padding
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, resolution),
                         np.linspace(y_min, y_max, resolution))

    # Classify the grid before any figure exists, so a failing predict() cannot
    # leave an orphaned figure behind.
    Z = np.asarray(model.predict(np.c_[xx.ravel(), yy.ravel()])).reshape(xx.shape)

    # Explicit figure/axes objects instead of the implicit pyplot "current
    # figure": the function then only touches the figure it created.
    fig, ax = plt.subplots(figsize=(10, 7))
    try:
        ax.contourf(xx, yy, Z, alpha=0.8, cmap=plt.cm.coolwarm)
        ax.scatter(points[:, 0], points[:, 1], c=y, cmap=plt.cm.coolwarm, edgecolors='k')
        ax.set_title(title)
        ax.set_xlabel('Principal Component 1')
        ax.set_ylabel('Principal Component 2')
        ax.grid(True)
        fig.savefig(filename)
    finally:
        # Always release the figure, even if drawing or saving raised; this also
        # prevents it from displaying in environments like notebooks.
        plt.close(fig)

# Draw and save one decision-boundary figure per kernel (linear first, then
# RBF), using the 2-D PCA projection of the training data for both.
boundary_plots = [
    (
        svm_linear_pca,
        'SVM Linear Kernel Decision Boundary (PCA)',
        'svm_linear_boundary.png',
        "\nGenerated svm_linear_boundary.png for linear kernel decision boundary.",
    ),
    (
        svm_rbf_pca,
        'SVM RBF Kernel Decision Boundary (PCA)',
        'svm_rbf_boundary.png',
        "Generated svm_rbf_boundary.png for RBF kernel decision boundary.",
    ),
]

for fitted_model, plot_title, output_file, done_message in boundary_plots:
    plot_decision_boundary(X_train_pca, y_train, fitted_model, plot_title, output_file)
    print(done_message)


# 4. Tune hyperparameters like C and gamma.
# 5. Use cross-validation to evaluate performance.

# Use StratifiedKFold for cross-validation due to small dataset and class imbalance.
# The fold count is derived from the data instead of being hard-coded: it is
# capped by the size of the least populated class in y_train (so stratification
# is not asked for more folds than that class has samples), never exceeds 5,
# and never drops below 2, the minimum StratifiedKFold accepts.
minority_class_count = int(y_train.value_counts().min())
n_splits = max(2, min(5, minority_class_count))
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# NOTE(review): X_train_scaled was standardised with statistics computed on the
# whole training partition, so every validation fold has already influenced the
# scaling it is evaluated with. Searching over a Pipeline(StandardScaler, SVC)
# on the unscaled X_train would remove that leakage; it is left as-is here so
# that best_estimator_ remains a plain SVC that accepts X_test_scaled.

# Define the parameter grid for linear SVM
param_grid_linear = {
    'C': [0.1, 1, 10, 100]
}

# Define the parameter grid for RBF SVM
param_grid_rbf = {
    'C': [0.1, 1, 10, 100],
    'gamma': [0.001, 0.01, 0.1, 1]
}

# Grid search for linear SVM
grid_search_linear = GridSearchCV(SVC(kernel='linear', random_state=42), param_grid_linear, cv=cv, scoring='accuracy')
grid_search_linear.fit(X_train_scaled, y_train)

print("\n--- Hyperparameter Tuning Results for Linear SVM ---")
print("Best parameters for Linear SVM:", grid_search_linear.best_params_)
print("Best cross-validation accuracy for Linear SVM:", grid_search_linear.best_score_)

# Grid search for RBF SVM
grid_search_rbf = GridSearchCV(SVC(kernel='rbf', random_state=42), param_grid_rbf, cv=cv, scoring='accuracy')
grid_search_rbf.fit(X_train_scaled, y_train)

print("\n--- Hyperparameter Tuning Results for RBF SVM ---")
print("Best parameters for RBF SVM:", grid_search_rbf.best_params_)
print("Best cross-validation accuracy for RBF SVM:", grid_search_rbf.best_score_)

# Evaluate the best models on the test set.
# best_estimator_ is the winning configuration refitted on all of X_train_scaled.
best_svm_linear = grid_search_linear.best_estimator_
y_pred_linear_tuned = best_svm_linear.predict(X_test_scaled)
print("\n--- Tuned Linear Kernel SVM Performance on Test Set ---")
print(f"Accuracy: {accuracy_score(y_test, y_pred_linear_tuned):.4f}")
# zero_division=0 reports precision/F1 as 0 for a class that is never
# predicted (the same value the default produces) without emitting an
# UndefinedMetricWarning for every affected metric.
print(classification_report(y_test, y_pred_linear_tuned, zero_division=0))

best_svm_rbf = grid_search_rbf.best_estimator_
y_pred_rbf_tuned = best_svm_rbf.predict(X_test_scaled)
print("\n--- Tuned RBF Kernel SVM Performance on Test Set ---")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rbf_tuned):.4f}")
print(classification_report(y_test, y_pred_rbf_tuned, zero_division=0))

Class distribution in y_train:
diagnosis
1    21
0     2
Name: count, dtype: int64

--- Initial Linear Kernel SVM Performance ---
Accuracy: 1.0000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         5

    accuracy                           1.00         6
   macro avg       1.00      1.00      1.00         6
weighted avg       1.00      1.00      1.00         6


--- Initial RBF Kernel SVM Performance ---
Accuracy: 0.8333
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.83      1.00      0.91         5

    accuracy                           0.83         6
   macro avg       0.42      0.50      0.45         6
weighted avg       0.69      0.83      0.76         6



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Generated svm_linear_boundary.png for linear kernel decision boundary.
Generated svm_rbf_boundary.png for RBF kernel decision boundary.

--- Hyperparameter Tuning Results for Linear SVM ---
Best parameters for Linear SVM: {'C': 0.1}
Best cross-validation accuracy for Linear SVM: 0.9128787878787878

--- Hyperparameter Tuning Results for RBF SVM ---
Best parameters for RBF SVM: {'C': 0.1, 'gamma': 0.001}
Best cross-validation accuracy for RBF SVM: 0.9128787878787878

--- Tuned Linear Kernel SVM Performance on Test Set ---
Accuracy: 1.0000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         5

    accuracy                           1.00         6
   macro avg       1.00      1.00      1.00         6
weighted avg       1.00      1.00      1.00         6


--- Tuned RBF Kernel SVM Performance on Test Set ---
Accuracy: 0.8333
              precision    recall  f1-score   support

   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
