In [None]:
import os, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

from sklearn.model_selection import train_test_split, learning_curve
from sklearn.svm import SVC
from sklearn.metrics import (
    classification_report, confusion_matrix, ConfusionMatrixDisplay,
    RocCurveDisplay, PrecisionRecallDisplay, roc_auc_score
)
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE


# Mount Google Drive so the CSV stored there can be read from this runtime.
from google.colab import drive
drive.mount('/content/drive')

# Load the dataset into a DataFrame.
CSV_PATH = "/content/drive/MyDrive/ML/Breast_Cancer_Diagnostic.csv"
df = pd.read_csv(CSV_PATH)

# Quick inspection: first rows, schema, summary statistics, missing counts.
display(df.head())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would also render a stray "None".
df.info()
display(df.describe())
display(df.isnull().sum())

# Separate the label column from the predictor columns.
# NOTE(review): every remaining column is used as a feature; this presumes
# they are all numeric and free of missing values -- confirm against the CSV.
y = df["diagnosis"]
X = df.drop(columns="diagnosis")

# Stratified 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=40,
)

# Learn the standardisation statistics from the training rows only, then
# apply the same transform to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Linear-kernel SVM; probability=True enables the predict_proba call below.
# NOTE(review): scikit-learn documents gamma as applying to the rbf/poly/
# sigmoid kernels, so it presumably has no effect here -- confirm.
model3 = SVC(
    kernel="linear",
    C=30,
    gamma="auto",
    probability=True,
    random_state=42,
)
model3.fit(X_train_scaled, y_train)

# Hold-out predictions: hard labels, and the probability of the second class
# (column 1 of predict_proba).
y_pred = model3.predict(X_test_scaled)
y_proba = model3.predict_proba(X_test_scaled)[:, 1]

# Overall accuracy on the test partition.
svm_score = model3.score(X_test_scaled, y_test)
print("SVM Accuracy:", svm_score)

# Per-class precision / recall / F1.
print("\n Classification Report (SVM):")
print(classification_report(y_test, y_pred))

# Confusion matrix, with rows/columns in the classifier's own class order.
class_labels = model3.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot(cmap="Greens")
plt.title("SVM - Confusion Matrix")
plt.show()

# ROC curve computed directly from the fitted estimator.
RocCurveDisplay.from_estimator(model3, X_test_scaled, y_test)
plt.title("SVM - ROC Curve")
plt.show()

# ROC-AUC from y_proba: the truth is binarised against the last label in
# sorted order, i.e. the class that column 1 of predict_proba refers to
# when there are exactly two classes.
positive_label = sorted(y.unique())[-1]
y_test_binary = (y_test == positive_label).astype(int)
auc = roc_auc_score(y_test_binary, y_proba)
print(" ROC-AUC Score:", auc)

# Precision-recall curve on the same held-out data.
PrecisionRecallDisplay.from_estimator(model3, X_test_scaled, y_test)
plt.title("SVM - Precision-Recall Curve")
plt.show()

# Permutation feature importance, measured on the held-out set
# (15 shuffles per feature, fixed seed).
r = permutation_importance(
    model3,
    X_test_scaled,
    y_test,
    n_repeats=15,
    random_state=42,
)

# Rank features by mean importance and keep the 12 highest.
feat_imp = (
    pd.Series(r.importances_mean, index=X.columns)
    .sort_values(ascending=False)
    .head(12)
)

# Horizontal bar chart of the retained features.
plt.figure(figsize=(8, 6))
sns.barplot(x=feat_imp.values, y=feat_imp.index, palette="viridis")
plt.xlabel("Importance Score")
plt.title("Top Features (Permutation Importance - SVM)")
plt.show()

# PCA 2D projection with decision boundary
# The projection is fitted on the training rows only and the test rows are
# merely transformed, so the 2-D space in which the boundary is learned is
# not influenced by held-out data.
pca = PCA(n_components=2, random_state=42)
X_train_proj = pca.fit_transform(X_train_scaled)
X_test_proj = pca.transform(X_test_scaled)

# Stack train rows first, then test rows; y_all follows the same order so
# points and labels stay aligned in the scatter plot.
X_proj = np.vstack((X_train_proj, X_test_proj))
y_all = np.concatenate((y_train, y_test))

# Refit an SVM on the projected 2-D training data (plotting aid only).
svm2d = SVC(kernel="linear", C=30, gamma="auto")
svm2d.fit(X_train_proj, y_train)

# Evaluate the 2-D model on a 300x300 grid covering all projected points
# with a margin of 1 on each side.
xx, yy = np.meshgrid(
    np.linspace(X_proj[:, 0].min() - 1, X_proj[:, 0].max() + 1, 300),
    np.linspace(X_proj[:, 1].min() - 1, X_proj[:, 1].max() + 1, 300)
)
Z = svm2d.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

# contourf needs numeric values: encode the last fitted class as 1 and
# everything else as 0. Taking the label from classes_ (instead of a
# hard-coded 'M') keeps this correct however the target is encoded.
Z_numeric = np.where(Z == svm2d.classes_[-1], 1, 0)

plt.figure(figsize=(8, 6))
plt.contourf(xx, yy, Z_numeric, alpha=0.3, cmap="coolwarm")
sns.scatterplot(x=X_proj[:, 0], y=X_proj[:, 1], hue=y_all, palette="Set1", s=50, alpha=0.8)
plt.title("SVM Decision Boundary (PCA 2D projection)")
plt.show()

# t-SNE visualisation
# Embed train and test rows together (train rows first, the same order as
# y_all) into two dimensions with a fixed seed.
tsne = TSNE(
    n_components=2,
    perplexity=30,
    learning_rate="auto",
    random_state=42,
)
X_tsne = tsne.fit_transform(
    np.concatenate((X_train_scaled, X_test_scaled), axis=0)
)

# Scatter of the embedding, coloured by class label.
plt.figure(figsize=(8, 6))
sns.scatterplot(
    x=X_tsne[:, 0],
    y=X_tsne[:, 1],
    hue=y_all,
    palette="Set2",
    s=50,
    alpha=0.8,
)
plt.title("t-SNE (2D) Clustering of Classes")
plt.show()

# Learning curve
# 5-fold cross-validated accuracy of model3 at six training-set fractions
# evenly spaced between 10% and 100% of the scaled training data.
train_sizes, train_scores, val_scores = learning_curve(
    model3,
    X_train_scaled,
    y_train,
    train_sizes=np.linspace(0.1, 1.0, 6),
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
)

# Average across folds (axis 1) to get one score per training size.
train_mean = np.mean(train_scores, axis=1)
val_mean = np.mean(val_scores, axis=1)

plt.figure(figsize=(8, 6))
for curve, label in (
    (train_mean, "Training Accuracy"),
    (val_mean, "Validation Accuracy"),
):
    plt.plot(train_sizes, curve, marker='o', label=label)
plt.title("Learning Curve - SVM")
plt.xlabel("Training Samples")
plt.ylabel("Accuracy")
plt.grid(True)
plt.legend()
plt.show()

# ====== SHAP explanations for SVM (model-agnostic)
!pip -q install shap
import shap, numpy as np, pandas as pd
shap.initjs()

classes = np.array(model3.classes_)
pos_label = classes[-1]

rng = np.random.RandomState(42)
bg_idx = rng.choice(X_train_scaled.shape[0], size=min(200, X_train_scaled.shape[0]), replace=False)
X_bg = X_train_scaled[bg_idx]

test_idx = rng.choice(X_test_scaled.shape[0], size=min(100, X_test_scaled.shape[0]), replace=False)
X_te_small = X_test_scaled[test_idx]
y_te_small = np.array(y_test)[test_idx]

pos_col = np.where(classes == pos_label)[0][0]
f = lambda data: model3.predict_proba(data)[:, pos_col]

explainer = shap.KernelExplainer(f, X_bg)
shap_values = explainer.shap_values(X_te_small, nsamples="auto")

feat_names = list(X.columns)
X_te_df = pd.DataFrame(X_te_small, columns=feat_names)


# Global view: bar summary of the SHAP values across the explained rows.
shap.summary_plot(shap_values, X_te_df, plot_type="bar", show=True)

# Global view: default summary plot of the per-row SHAP values.
shap.summary_plot(shap_values, X_te_df, show=True)

# Local view: force plot for the first explained row.
# show=False keeps the matplotlib figure open so the title below is drawn on
# it; with the default show=True the figure is rendered inside force_plot
# and plt.title would land on a new, empty figure.
shap.force_plot(
    explainer.expected_value,
    shap_values[0, :],
    X_te_df.iloc[0, :],
    matplotlib=True,
    show=False,
)
plt.title(f"SHAP Force Plot — SVM, true label: {y_te_small[0]}")
plt.show()