In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Read in and Explore Data

In [None]:
# Location of the Breast Cancer Wisconsin CSV, relative to the notebook.
DATA_PATH = "../input/breast-cancer-wisconsin-data/data.csv"
data = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows to check that the file was parsed as expected.
data.head()

In [None]:
# Column names, dtypes and non-null counts for every column.
data.info()

In [None]:
# Remove the "Unnamed: 32" column before analysis.
# NOTE(review): presumably an empty artifact column from the CSV export — confirm
# against the data.info() output.
# Rebinding with errors="ignore" (rather than inplace=True) keeps this cell safe to
# re-run: the in-place version raised KeyError on a second execution because the
# column had already been removed.
data = data.drop(columns="Unnamed: 32", errors="ignore")

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
data.describe()

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Columns to plot: everything except the `diagnosis` and `id` columns.
variables = data.columns.drop(["diagnosis", "id"])

# One histogram per remaining column to inspect its distribution.
for feature in variables:
    sns.histplot(data=data, x=feature)
    plt.show()

In [None]:
# One box plot per plotted column, split by diagnosis, to compare the two classes.
for feature in variables:
    sns.boxplot(data=data, x="diagnosis", y=feature)
    plt.show()

## SVM Model

In [None]:
# scikit-learn utilities for splitting, tuning, modelling and evaluation.
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    precision_score,
    recall_score,
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.svm import SVC

# plot_confusion_matrix / plot_roc_curve were deprecated in scikit-learn 1.0 and
# removed in 1.2, so importing them unconditionally breaks this cell on current
# releases. When they are unavailable, fall back to the Display classes'
# from_estimator constructors, which take the same (estimator, X, y) positional
# arguments, so the plotting cells keep working unchanged.
try:
    from sklearn.metrics import plot_confusion_matrix, plot_roc_curve
except ImportError:
    from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay

    plot_confusion_matrix = ConfusionMatrixDisplay.from_estimator
    plot_roc_curve = RocCurveDisplay.from_estimator

In [None]:
# Keep the `id` values aside and remove the column so it is not used as a feature.
# The guard makes the cell re-runnable: the unguarded in-place version raised
# KeyError on a second execution because "id" had already been dropped.
if "id" in data.columns:
    id_column = data["id"]
    data = data.drop(columns="id")

In [None]:
# Target is the diagnosis label; features are all the remaining columns.
y = data["diagnosis"]
X = data.drop(columns="diagnosis")

In [None]:
# Hold out 25% of the rows for testing. Stratifying on y keeps the class
# proportions equal across both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

In [None]:
# Check that stratification preserved the class proportions in both splits.
train_class_share = y_train.value_counts(normalize=True)
test_class_share = y_test.value_counts(normalize=True)

print(train_class_share)
print("\n")
print(test_class_share)

In [None]:
# Rescale each sample to unit L2 norm, then fit a support vector classifier.
# NOTE(review): Normalizer rescales rows (samples), not columns (features) —
# confirm that per-sample normalisation rather than per-feature scaling is intended.
pipeline = make_pipeline(
    Normalizer(norm="l2"),
    SVC(random_state=42),
)

In [None]:
# Hyper-parameter grid for the "svc" step of the pipeline.
#
# Settings searched for both kernels:
#   - C: odd values from 1 to 199
#   - class_weight: class "M" weighted 12x relative to class "B"
#   - gamma: the two built-in heuristics
shared_svc_grid = {
    "svc__C": list(range(1, 200, 2)),
    "svc__class_weight": [{"B": 1, "M": 12}],
    "svc__gamma": ["scale", "auto"],
}

# `degree` only affects the polynomial kernel (SVC ignores it for rbf), so the
# grid is split per kernel. A single dict crossed all nine degrees with rbf too,
# fitting every rbf candidate nine times for identical results. GridSearchCV
# accepts a list of grids and searches their union; for rbf candidates the
# degree entry is simply absent from the reported parameters.
svc_params = [
    {"svc__kernel": ["poly"], "svc__degree": list(range(1, 10)), **shared_svc_grid},
    {"svc__kernel": ["rbf"], **shared_svc_grid},
]

In [None]:
# Exhaustive 5-fold cross-validated search over svc_params, using 2 parallel workers.
# No `scoring` is passed, so candidates are ranked by the estimator's default score.
clf = GridSearchCV(
    estimator=pipeline,
    param_grid=svc_params,
    cv=5,
    n_jobs=2,
)

In [None]:
# Run the grid search on the training split only; the test split stays untouched
# until evaluation.
clf.fit(X_train, y_train)

In [None]:
# Collect the per-candidate cross-validation results into a table and preview it.
cv_results_df = pd.DataFrame(clf.cv_results_)
cv_results_df.head()

In [None]:
# Compare the two kernels by the mean cross-validation rank of their candidates
# (a lower rank is better).
kernel_column = cv_results_df["param_svc__kernel"]
rank_column = cv_results_df["rank_test_score"]

rbf_mean_rank = rank_column[kernel_column == "rbf"].mean()
poly_mean_rank = rank_column[kernel_column == "poly"].mean()

print(
    f"The average rank of the model using the rbf kernel is {rbf_mean_rank} "
    f"while the average rank of the model using the poly kernel is {poly_mean_rank}"
)

In [None]:
# Hyper-parameter combination selected by the grid search.
clf.best_params_

In [None]:
# Predict diagnosis labels for the held-out test split and display them.
y_predictions = clf.predict(X_test)
y_predictions

In [None]:
# Share of test samples whose predicted label matches the true label.
accuracy_score(y_test, y_predictions)

In [None]:
# Per-class precision, recall, F1 and support on the test split.
print(classification_report(y_test, y_predictions))

In [None]:
# Confusion matrix of the tuned model on the test split.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed
# in 1.2; ConfusionMatrixDisplay.from_estimator is the documented replacement.
plot_confusion_matrix(clf, X_test, y_test)

In [None]:
# ROC curve of the tuned model on the test split.
# NOTE(review): plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2;
# RocCurveDisplay.from_estimator is the documented replacement.
plot_roc_curve(clf, X_test, y_test)

In [None]:
# Precision with "M" as the positive label: of the samples predicted "M",
# the share that are truly "M".
precision_score(y_test, y_predictions, pos_label = "M")

In [None]:
# Recall with "M" as the positive label: of the samples that are truly "M",
# the share the model predicted as "M".
recall_score(y_test, y_predictions, pos_label = "M")

Overall, the model correctly identified 100% of the malignant cases in the test set, as measured by recall with malignant ("M") as the positive label. Because missing a malignant cancer is more dangerous than mistaking a benign one for malignant, recall is the more important metric in this setting: reducing the number of false negatives matters most for identifying which patients actually have a malignant cancer.