In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

# Splitting Data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

# Modeling
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, recall_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.tree import plot_tree

In [None]:
cancer = pd.read_csv('../input/breast-cancer-wisconsin-data/data.csv')
cancer

In [None]:
cancer.info()

**Let's check if the cancer is Malignant or Benign**

# Data Cleaning

*Drop Columns*

In [None]:
cancer.drop(columns=['id', 'Unnamed: 32'], inplace = True)

*Missing Value*

In [None]:
cancer.isna().sum()/len(cancer.index)*100

# Final Dataset

In [None]:
cancer

# PreProcessing

### *Define Target Data*
* If the cancer is Benign, it will be 0
* If the cancer is Malignant, it will be 1

In [None]:
cancer['diagnosis'] = np.where(cancer['diagnosis'] == 'M', 1, 0)
cancer['diagnosis'].value_counts()/cancer.shape[0]*100

* Data is imbalanced.

In [None]:
X = cancer.drop('diagnosis', axis = 1)
y = cancer['diagnosis']

In [None]:
robust = RobustScaler()
X_scaled = robust.fit_transform(X)

* In the case of breast cancer, I want to reduce predictions to people who are misdiagnosed, diagnosed as benign, but it turns out to be malignant, that is, the person we predict is not the default (FN). Evaluation metrics used: **Recall**

### *Data Splitting*

In [None]:
X.shape

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y,
                                                   stratify = y,
                                                    test_size = 0.3,
                                                   random_state = 3030)

* I use 0.3 as default score for test_size and X.shape for random_state so the data will be devided equally.

# Modeling

**KNeighbors Classifier**

In [None]:
k = range(1,100,2)
testing_accuracy = []
training_accuracy = []
score = 0

for i in k:
    knn = KNeighborsClassifier(n_neighbors = i)
    knn.fit(X_train, y_train)
    
    y_predict_train = knn.predict(X_train)
    training_accuracy.append(accuracy_score(y_train, y_predict_train))
    
    y_predict_test = knn.predict(X_test)
    acc_score = accuracy_score(y_test,y_predict_test)
    testing_accuracy.append(acc_score)
    
    if score < acc_score:
        score = acc_score
        best_k = i

sns.lineplot(k, training_accuracy)
sns.scatterplot(k, training_accuracy)
sns.lineplot(k, testing_accuracy)
sns.scatterplot(k, testing_accuracy)
plt.legend(['training accuracy', 'testing accuracy'])

In [None]:
print('This is the best K for KNeighbors Classifier: ', best_k, '\nAccuracy score is: ', score)

* This model indicates **underfitting** because training accuracy and testing accuracy are both decreases.

**Decision Tree Classifier**

In [None]:
depth = range(1,25)
testing_accuracy = []
training_accuracy = []
score = 0

for i in depth:
    tree = DecisionTreeClassifier(max_depth = i, criterion = 'entropy')
    tree.fit(X_train, y_train)
    
    y_predict_train = tree.predict(X_train)
    training_accuracy.append(accuracy_score(y_train, y_predict_train))
    
    y_predict_test = tree.predict(X_test)
    acc_score = accuracy_score(y_test,y_predict_test)
    testing_accuracy.append(acc_score)
    
    if score < acc_score:
        score = acc_score
        best_depth = i
        
sns.lineplot(depth, training_accuracy)
sns.scatterplot(depth, training_accuracy)
sns.lineplot(depth, testing_accuracy)
sns.scatterplot(depth, testing_accuracy)
plt.legend(['training accuracy', 'testing accuracy'])

In [None]:
print('This is the best depth for Decision Tree Classifier: ', best_depth, '\nAccuracy score is: ', score)

* This model indicates **overfitting** because training accuracy is good and the testing accuracy is decreased.

### *Define Model*

* I use **KNeighbors Classifier** with best K score and **Decision Tree Classifier** with best depth score.

In [None]:
knn = KNeighborsClassifier(n_neighbors = 3)
tree = DecisionTreeClassifier(max_depth = 3, random_state = 3030)

In [None]:
def model_evaluation(model, metric):
    model_cv = cross_val_score(model, X_train, y_train, cv = StratifiedKFold(n_splits = 5), scoring = metric)
    return model_cv

knn_cv = model_evaluation(knn, 'recall')
tree_cv = model_evaluation(tree, 'recall')

for model in [knn, tree]:
    model.fit(X_train, y_train)

score_cv = [knn_cv.round(5), tree_cv.round(5)]
score_mean = [knn_cv.mean(), tree_cv.mean()]
score_std = [knn_cv.std(), tree_cv.std()]
score_recall_score = [recall_score(y_test, knn.predict(X_test)), 
            recall_score(y_test, tree.predict(X_test))]
method_name = [ 'KNN Classifier', 'Decision Tree Classifier']
cv_summary = pd.DataFrame({
    'method': method_name,
    'cv score': score_cv,
    'mean score': score_mean,
    'std score': score_std,
    'recall score': score_recall_score
})
cv_summary

* From the cross validation and model evaluation processes, I decide to continue with **Decision Tree Classifier** even the score is indicated overfitting. Let's tune the model.

# HyperParam Tuning

In [None]:
tree = DecisionTreeClassifier(max_depth = 3, random_state = 3030)

hyperparam_space = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [3, 5, 7, 9, 11],
    'min_samples_leaf': [3, 9, 13, 15, 17],
    'class_weight': ['list', 'dict', 'balanced'],
    'random_state': [3030]
}

grid = GridSearchCV(
                tree,
                param_grid = hyperparam_space,
                cv = StratifiedKFold(n_splits = 5),
                scoring = 'recall',
                n_jobs = -1)

grid.fit(X_train, y_train)

print('best score', grid.best_score_)
print('best param', grid.best_params_)

# Comparison Between Before & After Tuning

In [None]:
tree.fit(X_train, y_train)
tree_recall = (recall_score(y_test, tree.predict(X_test)))

grid.best_estimator_.fit(X_train, y_train)
grid_recall = (recall_score(y_test, grid.predict(X_test)))

score_list = [tree_recall, grid_recall]
method_name = ['Decision Tree Classifier Before Tuning', 'Decision Tree Classifier After Tuning']
best_summary = pd.DataFrame({
    'method': method_name,
    'score': score_list
})
best_summary

* This is the comparison between before tuning score and after tuning score using Decision Tree Classifier. **I choose to use Decision Tree Classifier after tuning** score in this section.

# Decision Tree Classifier Plot

In [None]:
plt.figure(figsize=(15,8))
plot_tree(grid.best_estimator_, feature_names = list(X), class_names = ['Benign','Malignant'], filled = True)
plt.title('Tree Plot')
plt.show()

# Feature Importance

In [None]:
importance_table = pd.DataFrame({
    'imp': grid.best_estimator_.feature_importances_
}, index = X.columns)
importance_table.sort_values('imp', ascending = False)

In [None]:
importance_table.sort_values('imp', ascending = True).plot(kind = 'barh', figsize = (15,8))

* The results suggest perhaps 4 of the 30 features as being important to prediction.

# Summary

- In the first step, I did **scaling at X data using Robust Scaler** because I believe there are so many outliers.
- I only use **KNeighbor Classifier (KNN) and the Decision Tree Classifier (Tree)** in this prediction. I try to find the best K score and best depth for each model and see how the training and testing data on both models either.
- From the cross-validation process, the KNN model has the highest score with 0.9 but after model evaluation using recall metric, the **Tree model has the highest score with 0.92**. Even the Tree model **indicated overfitting**, I still choose to use this score to continue the process.
- I decide to get the best parameter for the Tree model by Tuning with the best score of 0.95 which is increasing, then compare the Tree model score before and after tuning. The comparison results prove that the **Tree model after the Tuning process is higher than before with 0.9375**.
- I check again to see the data using the Feature Importance process. Surprisingly, from 30 features (columns), **only 4 features that is important** to prediction.