## Data Preparation

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the breast-cancer dataset; the path is relative, so the CSV must sit in the
# notebook's working directory.
df = pd.read_csv('Breast_cancer_dataset.csv')

In [None]:
# Preview the first rows to check that the file parsed as expected.
df.head()

In [None]:
# List the column names; the following cell removes 'id' and 'Unnamed: 32' by name.
df.columns

In [None]:
# Remove the two columns that are not used as model inputs.
# The frame is rebound instead of mutated in place and errors='ignore' is used so
# that re-running this cell does not raise KeyError once the columns are gone.
df = df.drop(columns=['id', 'Unnamed: 32'], errors='ignore')

# Feature names = every remaining column except the target. Selecting by name
# (rather than by position) keeps this correct regardless of column order.
features = df.columns.drop('diagnosis')

In [None]:
# Show the feature names (they are used later to label the decision-tree plot).
features

In [None]:
# Feature matrix: every column except the target.
X = df.drop(columns=['diagnosis'])
# Target vector: encode the diagnosis letters as integers, 'M' -> 1 and 'B' -> 0.
y = df['diagnosis'].map({'M': 1, 'B': 0})

In [None]:
# Peek at the first five encoded labels.
y.iloc[:5]

In [None]:
# Code here: Create X_train, X_test, y_train, y_test , use stratified sampling, random_state = 12345

In [None]:
# Sanity check: shapes of the four split pieces, in the order
# X_train, X_test, y_train, y_test.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

In [None]:
# Code here: Scale your dataset; store the scaled features as X_train_sc and X_test_sc (later cells use these names)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, ConfusionMatrixDisplay, confusion_matrix
import matplotlib.pyplot as plt
def get_metrics(y_true, y_pred, y_prob, train=True, show_roc = True):
    """Print classification metrics and plot the confusion matrix for one data split.

    Accuracy, precision, recall and F1 are printed as percentages and the
    confusion matrix is drawn. For the training split (``train=True``) with
    ``show_roc=True`` the AUC, the threshold maximising Youden's J statistic
    and the ROC curve are reported as well.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, expected to be encoded as 0 (benign) / 1 (malignant).
    y_pred : array-like
        Predicted labels in the same encoding.
    y_prob : array-like
        Scores used for the ROC curve. A 2-D array is read as ``predict_proba``
        output and column 1 is used; a 1-D array is used directly as the
        positive-class score.
    train : bool, default True
        Selects the printed header ("Training Set Metrics" / "Test Set Metrics").
        The ROC section is only produced when this is True.
    show_roc : bool, default True
        Set to False to suppress the AUC line and the ROC plot on the training split.

    Returns
    -------
    None
        Results are printed and plotted, not returned.
    """
    sk_acc = accuracy_score(y_true=y_true, y_pred=y_pred)
    sk_prec = precision_score(y_true=y_true, y_pred=y_pred)
    sk_rec = recall_score(y_true=y_true, y_pred=y_pred)
    sk_f1 = f1_score(y_true=y_true, y_pred=y_pred)

    print("------------------------------------------------------------------")
    if train:
        print("Training Set Metrics")
    else:
        print("Test Set Metrics")

    # labels=[0, 1] keeps the matrix 2x2 even when one class is absent from this
    # split, so it always matches the two display labels below.
    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=[0, 1])
    cm_display = ConfusionMatrixDisplay(confusion_matrix = conf_matrix, display_labels = ["benign", "malignant"])

    # Draw on an explicit axes. Calling plt.figure() beforehand would only leave
    # an empty extra figure, because plot() creates its own when no ax is given.
    fig, ax = plt.subplots(figsize=(8, 6))
    cm_display.plot(ax=ax)
    plt.show()

    print(f"Accuracy {np.round(sk_acc*100,2)}%, Precision {np.round(sk_prec*100,2)}%, Recall {np.round(sk_rec*100,2)}%, F1 Score {np.round(sk_f1*100,2)}%")

    if train and show_roc:
        # The ROC quantities are only needed here, so they are computed here.
        # Accept either the full predict_proba matrix or a ready-made score vector.
        y_score = np.asarray(y_prob)
        if y_score.ndim == 2:
            y_score = y_score[:, 1]

        fpr, tpr, thresholds = roc_curve(y_true, y_score)

        # Area Under the Curve (AUC) for the ROC curve
        roc_auc = auc(fpr, tpr)

        # Youden's J statistic (TPR - FPR) for each threshold; its maximum marks
        # the operating point reported as "optimal".
        youden_j = tpr - fpr
        optimal_threshold_index = np.argmax(youden_j)
        optimal_threshold = thresholds[optimal_threshold_index]

        print(f"Area Under Curve {roc_auc}, optimal threshold {optimal_threshold}")

        plt.figure(figsize=(4, 3))
        plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
        plt.scatter(fpr[optimal_threshold_index], tpr[optimal_threshold_index], color='red', marker='o', label=f'Optimal Threshold = {optimal_threshold:.4f}')
        # Diagonal = performance of a random classifier, for reference
        plt.plot([0, 1], [0, 1], color='grey', lw=2, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic (ROC) Curve Training')
        plt.legend(loc='lower right')
        plt.show()

# **Naive Bayes**

In [None]:
# Code here: Import the GaussianNB class from sklearn.naive_bayes

In [None]:
# Code here: Instantiate your model

In [None]:
# Code here: Fit your model

In [None]:
# Code here: Make Predictions on Training and Test Set
# y_tr_pred, y_tr_prob, y_ts_pred, y_ts_prob

In [None]:
# Training-set report for the Naive Bayes predictions made in the cell above.
# train defaults to True, so the ROC curve and AUC are shown as well.
get_metrics(y_pred=y_tr_pred, y_true=y_train, y_prob=y_tr_prob)

In [None]:
# Test-set report for Naive Bayes; train=False switches the header and skips the ROC section.
get_metrics(y_pred=y_ts_pred, y_true=y_test, y_prob=y_ts_prob, train=False)

# **K Nearest Neighbor**

In [None]:
# Import KNeighborsClassifier class from sklearn.neighbors

In [None]:
# Instantiate your model as knn_model, set n_neighbors to 3

In [None]:
# Fit your model on Training Set

In [None]:
# Code here: Make Predictions on Training and Test Set
# y_tr_pred, y_tr_prob, y_ts_pred, y_ts_prob

In [None]:
# Training-set report for KNN. The prediction variable names are reused across models,
# so this relies on the KNN prediction cell above having been run first.
get_metrics(y_true = y_train, y_pred = y_tr_pred, y_prob=y_tr_prob)

In [None]:
# Test-set report for KNN; train=False switches the header and skips the ROC section.
get_metrics(y_true = y_test, y_pred = y_ts_pred, y_prob=y_ts_prob, train=False)

In [None]:
# Code here: Set up a dictionary grid_knn and set values for n_neighbors parameter

In [None]:
# Display the KNN parameter grid to confirm its contents before searching.
grid_knn

In [None]:
# Code here: Import GridSearchCV and RepeatedKFold

In [None]:
# Code here: Create Cross Validation cv

In [None]:
# Code here: Instantiate a new KNN Model gs_knn_model

In [None]:
# Code here: Instantiate Grid Search

In [None]:
# Exercise placeholder: the right-hand side must be filled in before this cell can run
# (as written, the line is not valid Python). The decision-tree section follows the same
# pattern: <GridSearchCV object>.fit(X_train_sc, y_train).
gs_knn_results = # Code here: Gridsearch.fit

In [None]:
# Best parameter combination found by the KNN search
# (assumes gs_knn_results is the fitted grid-search object from the previous cell).
gs_knn_results.best_params_

# **Decision Trees**

In [None]:
# Code here: Import DecisionTreeClassifier from sklearn.tree, instantiate a model named dt_model (the plotting cell below uses this name) by specifying random_state = 12345 and fit the model

In [None]:
# Code here: Make predictions on training and test set as done above

In [None]:
# Training-set report for the decision tree. The prediction variable names are reused
# across models, so this relies on the decision-tree prediction cell above having been run first.
get_metrics(y_true=y_train, y_pred = y_tr_pred, y_prob = y_tr_prob)

In [None]:
# Test-set report for the decision tree; train=False switches the header and skips the ROC section.
get_metrics(y_true=y_test, y_pred = y_ts_pred, y_prob = y_ts_prob, train=False)

In [None]:
# Plot the fitted decision tree and save the figure as a PNG
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (4,3), dpi=300)
# feature_names is passed as a plain list because some scikit-learn releases reject a
# pandas Index here. class_names follow the label encoding 0 = benign, 1 = malignant.
plot_tree(dt_model, filled=True, rounded=True, ax=axes, feature_names=list(features), class_names=['benign','malignant'])
# Save before showing so the file is written from the fully drawn figure.
fig.savefig('treecancer.png', format='png')
plt.show()

In [None]:
# Search space for the decision tree: candidate max_depth values 2 through 9.
grid_dtree = {'max_depth': np.arange(2, 10, 1)}

In [None]:
# Now we do grid search
# Fresh, unfitted tree to hand to the grid search; max_depth is supplied by the search,
# and random_state uses the same seed as the rest of the notebook.
gs_tree_model = DecisionTreeClassifier(random_state=12345)

In [None]:
# Grid search over grid_dtree, scored by accuracy, using the `cv` object created in the
# KNN section; n_jobs=-1 asks for all available processors.
gs_tree_search = GridSearchCV(
    estimator=gs_tree_model,
    param_grid=grid_dtree,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)

In [None]:
# Run the search on the training data (X_train_sc is presumably the scaled feature
# matrix from the scaling cell — confirm) and keep the fitted search object.
gs_tree_result = gs_tree_search.fit(X_train_sc, y_train)

In [None]:
# Best max_depth found by the decision-tree grid search.
gs_tree_result.best_params_

# **Support Vector Machines**

In [None]:
# Code here: From sklearn.svm import the SVC (Support Vector Classifier) class

In [None]:
# Code here: Instantiate a model named svm_model, set the probability parameter to True

In [None]:
# Code here: Fit your model

In [None]:
# Class predictions and class-probability estimates from the fitted SVM, for both splits.
# predict_proba is only available because the model is created with probability=True.
y_tr_pred, y_tr_prob = svm_model.predict(X_train_sc), svm_model.predict_proba(X_train_sc)
y_ts_pred, y_ts_prob = svm_model.predict(X_test_sc), svm_model.predict_proba(X_test_sc)

In [None]:
# Training-set report for the SVM; train defaults to True, so the ROC curve and AUC are shown as well.
get_metrics(y_true = y_train, y_pred = y_tr_pred, y_prob = y_tr_prob)

In [None]:
# Test-set report for the SVM; train=False switches the header and skips the ROC section.
get_metrics(y_true = y_test, y_pred = y_ts_pred, y_prob = y_ts_prob, train=False)

In [None]:
# Code here: Homework - Do grid search for parameter C in SVM