In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Introduction**

I will import required libraries.

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis, LocalOutlierFactor
from sklearn.decomposition import PCA

# Silence every warning so the cell outputs stay readable.
# Note that this also hides deprecation warnings raised by the plotting/ML libraries.
import warnings
warnings.filterwarnings(action="ignore")

I will load the dataset into the notebook.

In [None]:
# Read the breast-cancer CSV from the Kaggle input directory (path relative to the working directory).
csv_path = "../input/breast-cancer-wisconsin-data/data.csv"
df = pd.read_csv(csv_path)

In [None]:
# Preview the first ten rows of the raw frame.
df.head(n=10)

As you can see, the "id" and "Unnamed: 32" columns are not relevant to our work, so they should be dropped.

In [None]:
# Drop the two columns that carry no information for the model.
# Rebinding `df` (instead of inplace=True) together with errors="ignore" keeps
# this cell safe to re-run: a second execution is a no-op instead of a KeyError.
df = df.drop(columns=["id", "Unnamed: 32"], errors="ignore")
df.head()

The first column, "diagnosis", has a confusing name, so I will rename it to "target".

In [None]:
# "diagnosis" holds the class label; it is called "target" for the rest of the notebook.
df = df.rename({"diagnosis": "target"}, axis="columns")

We can visualize how many benign and malignant cells the dataset has.

In [None]:
# Pass the column by keyword: recent seaborn releases interpret a lone
# positional argument as `data` rather than `x`, so countplot(df["target"])
# no longer draws one bar per class.
sns.countplot(x="target", data=df)
# Exact number of samples in each class.
print(df["target"].value_counts())

We can convert our "target" column to binary form, so we can easily apply algorithm.

In [None]:
# Encode the label: 1 when the (whitespace-stripped) value is "M", 0 otherwise.
df["target"] = df["target"].map(lambda label: 1 if label.strip() == "M" else 0)

In [None]:
# Show the frame's (rows, columns) after the column drop above.
print("Data shape: ", df.shape)

In [None]:
df.info() # dtypes and non-null counts per column: a quick way to spot missing data

In [None]:
# Summary statistics for every numeric column.
df.describe()

# **EDA**

Let's take a close look at relations between features.

In [None]:
# Pairwise Pearson correlation between all columns (features and encoded target).
corr_matrix = df.corr(method="pearson")
corr_matrix

Now we can gain more insight about correlation thanks to better visualization.

In [None]:
# clustermap builds its own figure with several axes, so plt.title() would
# label whichever axes happens to be current instead of the whole figure.
# Keep the returned grid and put the title on its figure.
corr_grid = sns.clustermap(corr_matrix, annot=True, fmt=".2f", figsize=(20, 20))  # annot: print the value in each cell
corr_grid.fig.suptitle("Correlation Between Features", y=1.02)
plt.show()

I want to keep only the features whose absolute correlation with the target is above a threshold of 0.75.

In [None]:
# Boolean mask over the correlation matrix's columns: True where the absolute
# correlation with "target" exceeds the cut-off ("target" itself is always True).
# NOTE: `filter` shadows the Python builtin; the name is kept because the next cell reads it.
threshold = 0.75
filter = corr_matrix["target"].abs() > threshold
filter

In [None]:
# Names of the columns selected by the correlation mask (includes "target").
corr_features = corr_matrix.columns[filter].tolist()

# clustermap creates its own multi-axes figure; title the figure through the
# returned grid, because plt.title() would only label the current axes.
feature_grid = sns.clustermap(df[corr_features].corr(), annot=True, fmt=".2f")
feature_grid.fig.suptitle("Correlation Between Features w/ Corr Threshold 0.75", y=1.02)
plt.show()

In [None]:
# Scatter matrix of the strongly correlated features, coloured by class,
# with a kernel-density estimate on the diagonal.
sns.pairplot(
    data=df[corr_features],
    hue="target",
    diag_kind="kde",
    markers="o",
)
plt.show()

If a feature's distribution is skewed, we should handle it, for example by bringing it closer to a Gaussian shape. Some of the plots above are skewed (such as the third orange one), so we will use outlier detection to reduce the effect of extreme values.

# **Outlier Detection**

I will separate features and target to x and y from DataFrame.

In [None]:
# Labels and feature matrix; the feature names are kept so they can be
# re-attached after scaling turns the data into a plain array.
y = df["target"]
x = df.drop(columns=["target"])
column_names = list(x.columns)

I will use the Local Outlier Factor (LOF) method, a density-based outlier detection technique.

In [None]:
# Fit Local Outlier Factor (default settings) on the feature matrix.
# y_outlier_pred is not read again in the visible part of the notebook; the
# scores used later come from clf.negative_outlier_factor_.
# NOTE(review): x has not been standardized at this point, so features with
# large numeric ranges may dominate the neighbour distances. Confirm this is intended.
clf = LocalOutlierFactor()
y_outlier_pred = clf.fit_predict(x)

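LOF stores its scores in `negative_outlier_factor_`: inliers score close to -1, and the more negative the score, the more likely the point is an outlier. (In the labels returned by `fit_predict`, -1 marks an outlier and 1 marks an inlier.)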

In [None]:
# One LOF score per row, in the same row order as x.
outlier_score = pd.DataFrame({"score": clf.negative_outlier_factor_})

For better understanding, I am going to visualize the data in a 2D graph together with the outlier scores, which are drawn as red circles. Note that some circles are wide even though the points look close to their neighbours in this plot: only two features are shown here, and the score also depends on all the other features. I will then flag as outliers the points whose score is below a threshold of -2.2.

In [None]:
# Rows whose LOF score falls below this cut-off are treated as outliers.
# NOTE: `filter` shadows the Python builtin; name kept from the original cell.
threshold = -2.2
filter = outlier_score["score"] < threshold
outlier_index = outlier_score[filter].index.tolist()

# Columns 3 and 5 of x are used as the two plot axes.
plt.figure(figsize=(10,5))
plt.scatter(x.iloc[outlier_index,3], x.iloc[outlier_index,5], color = "blue", s = 50, label = "Outliers")
plt.scatter(x.iloc[:,3], x.iloc[:,5], color = "k", s = 3, label = "Data Points")

# Min-max normalise the scores to [0, 1] (most negative score -> 1) so that
# the circle size grows with how outlying a point is.
lof_scores = clf.negative_outlier_factor_
radius = (lof_scores.max() - lof_scores)/(lof_scores.max() - lof_scores.min())
outlier_score["radius"] = radius
plt.scatter(x.iloc[:,3], x.iloc[:,5], s = 1000*radius, edgecolor = "r", facecolors = "none", label = "Outlier Score")
plt.xlabel(x.columns[3])
plt.ylabel(x.columns[5])
plt.legend()
plt.show()  # the original `plt.show` lacked parentheses, so it was never called

Now it is time to drop the outliers.

In [None]:
# Remove the flagged rows from both the features and the labels.
# From here on y is a plain NumPy array rather than a Series.
x = x.drop(index=outlier_index)
y = y.drop(index=outlier_index).to_numpy()

# **Train-Test Split**

In [None]:
# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

# **Standardization**

In [None]:
# Learn mean/std on the training rows only, then apply the same
# transformation to both splits so the test set never influences the scaling.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
# Re-attach the column names to the scaled training array and append the labels.
x_train_df = pd.DataFrame(x_train, columns=column_names).assign(target=y_train)

A box plot gives detailed information about how the features relate to a sample being benign or malignant, where the outliers are, and so on.

In [None]:
# Long format: one row per (sample, feature) pair, keeping the class label.
df_melted = x_train_df.melt(
    id_vars="target",
    var_name="features",
    value_name="value",
)

# One box per feature and class, on the standardized scale.
plt.figure(figsize=(10, 5))
sns.boxplot(data=df_melted, x="features", y="value", hue="target")
plt.xticks(rotation=90)
plt.show()

# **KNN**

In [None]:
# Baseline: 2-nearest-neighbour classifier on the standardized features.
knn = KNeighborsClassifier(n_neighbors=2).fit(x_train, y_train)
y_pred = knn.predict(x_test)

cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
score = knn.score(x_test, y_test)  # mean accuracy on the test split, i.e. the same quantity as `acc`

print("Score: ", score)
print("CM: ", cm)
print("Basic KNN Acc: ", acc)

optimumKValue method finds best K value for KNN.

In [None]:
def optimumKValue(x_train, x_test, y_train, y_test, k_range=None, cv=10):
    """Grid-search KNN hyper-parameters and report train/test performance.

    Searches over the number of neighbours and the weighting scheme
    ("uniform" / "distance") with cross-validated accuracy, then prints the
    best cross-validation score, the train and test accuracy of the best
    model, and both confusion matrices.

    Parameters
    ----------
    x_train, x_test : array-like
        Feature matrices of the training and test splits.
    y_train, y_test : array-like
        Class labels matching ``x_train`` and ``x_test``.
    k_range : iterable of int, optional
        Candidate values for ``n_neighbors``. Defaults to 1..30.
    cv : int, optional
        Number of cross-validation folds. Defaults to 10.

    Returns
    -------
    GridSearchCV
        The fitted search object (its ``best_estimator_`` is the tuned model).
    """
    if k_range is None:
        k_range = range(1, 31)
    weight_options = ["uniform", "distance"]
    param_grid = dict(n_neighbors=list(k_range), weights=weight_options)

    grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=cv, scoring="accuracy")
    grid.fit(x_train, y_train)

    # best_score_ is the mean cross-validated accuracy of the best setting.
    print("Best training score: {} with parameters: {}".format(grid.best_score_, grid.best_params_))

    # GridSearchCV (refit=True by default) has already refitted the best
    # configuration on the whole training set, so reuse that estimator
    # instead of fitting an identical model a second time.
    best_knn = grid.best_estimator_

    y_pred_test = best_knn.predict(x_test)
    y_pred_train = best_knn.predict(x_train)

    cm_test = confusion_matrix(y_test, y_pred_test)
    cm_train = confusion_matrix(y_train, y_pred_train)

    acc_test = accuracy_score(y_test, y_pred_test)
    acc_train = accuracy_score(y_train, y_pred_train)
    print("Test Score: {}, Train Score: {}".format(acc_test, acc_train))
    print()
    print("CM Test: ", cm_test)
    print("CM Train: ", cm_train)

    return grid

# Tune KNN on the full standardized feature set.
grid = optimumKValue(
    x_train=x_train,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test,
)
    
    

Both the train and test scores are close to 1, but the train score is higher than the test score; this gap can be explained by overfitting.

# **PCA**

PCA (Principal Component Analysis) helps to reduce the dimensionality of the dataset, i.e. the number of features.

Standardization

In [None]:
# Standardize the whole (outlier-free) feature matrix for the projections below.
# NOTE(review): this scaler is fitted on all rows, including those that the
# later train/test splits place in the test set.
scaler = StandardScaler().fit(x)
x_scaled = scaler.transform(x)

Implementation of PCA

In [None]:
# Project the standardized features onto the first two principal components.
# random_state is fixed because PCA's automatic solver choice can fall back to
# a randomized SVD; the seed matches the one used elsewhere in the notebook.
pca = PCA(n_components = 2, random_state = 42) #n_components, desired number of components
pca.fit(x_scaled)
x_reduced_pca = pca.transform(x_scaled)

# Two-column frame (p1, p2) plus the class label, for plotting.
pca_data = pd.DataFrame(x_reduced_pca, columns = ["p1","p2"])
pca_data["target"] = y
sns.scatterplot(x = "p1", y = "p2", hue = "target", data = pca_data)
plt.title("PCA: p1 vs p2")

Train-test split of x_reduced_pca

In [None]:
# Same 67/33 split and seed as before, applied to the 2-D PCA projection.
x_train_pca, x_test_pca, y_train_pca, y_test_pca = train_test_split(
    x_reduced_pca,
    y,
    test_size=0.33,
    random_state=42,
)

Get the best K value 

In [None]:
# Tune KNN on the two PCA components.
grid_pca = optimumKValue(
    x_train=x_train_pca,
    x_test=x_test_pca,
    y_train=y_train_pca,
    y_test=y_test_pca,
)

We can visualize the predicted class of every point of a mesh grid as a colored map, with the data points drawn on top in the color of their true class. The map helps us see which points are classified incorrectly.

In [None]:
# Decision-region map of the KNN model tuned on the PCA projection.
cmap_light = ListedColormap(["orange","cornflowerblue"])
cmap_bold = ListedColormap(["darkorange","darkblue"])

h = 0.05 #step size in the mesh
# Use a dedicated name for the 2-D points: rebinding `x` here would silently
# replace the feature matrix that earlier cells read when they are re-run.
points = x_reduced_pca
x_min, x_max = points[:,0].min() - 1, points[:,0].max() + 1
y_min, y_max = points[:,1].min() - 1, points[:,1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Predict a class for every node of the mesh, then restore the mesh shape.
z = grid_pca.predict(np.c_[xx.ravel(), yy.ravel()])
z = z.reshape(xx.shape)

plt.figure()
plt.pcolormesh(xx, yy, z, cmap = cmap_light)

# Overlay every sample (train and test rows alike), coloured by its true label.
plt.scatter(points[:,0], points[:,1], c = y, cmap = cmap_bold, edgecolor = "k", s = 20)
plt.xlim(xx.min(), xx.max()) #axis' size
plt.ylim(yy.min(), yy.max())
plt.title("%i-Class classification (k = %i, weights '%s')" % (len(np.unique(y)), grid_pca.best_estimator_.n_neighbors, grid_pca.best_estimator_.weights))
plt.show()

# **NCA**

Neighborhood Components Analysis (NCA) is a supervised algorithm that learns a linear transformation of the features which improves nearest-neighbor (KNN) classification.

In [None]:
# Learn a 2-D supervised projection of the standardized features.
# NOTE(review): NCA is fitted here on every row together with its label,
# before the train/test split below. The held-out rows therefore influence
# the projection, so the reported test accuracy may be optimistic.
nca = NeighborhoodComponentsAnalysis(n_components=2, random_state=42).fit(x_scaled, y)
x_reduced_nca = nca.transform(x_scaled)

# Two-column frame (p1, p2) plus the class label, for plotting.
nca_data = pd.DataFrame(x_reduced_nca, columns=["p1", "p2"]).assign(target=y)
sns.scatterplot(data=nca_data, x="p1", y="p2", hue="target")
plt.title("NCA: p1 vs p2")

Train-test split

In [None]:
# Same 67/33 split and seed as before, applied to the 2-D NCA projection.
x_train_nca, x_test_nca, y_train_nca, y_test_nca = train_test_split(
    x_reduced_nca,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Tune KNN on the two NCA components.
grid_nca = optimumKValue(
    x_train=x_train_nca,
    x_test=x_test_nca,
    y_train=y_train_nca,
    y_test=y_test_nca,
)

In [None]:
# Decision-region map of the KNN model tuned on the NCA projection.
cmap_light = ListedColormap(["orange","cornflowerblue"])
cmap_bold = ListedColormap(["darkorange","darkblue"])

h = 0.1 #step size in the mesh
# Use a dedicated name for the 2-D points: rebinding `x` here would silently
# replace the feature matrix that earlier cells read when they are re-run.
points = x_reduced_nca
x_min, x_max = points[:,0].min() - 1, points[:,0].max() + 1
y_min, y_max = points[:,1].min() - 1, points[:,1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Predict a class for every node of the mesh, then restore the mesh shape.
z = grid_nca.predict(np.c_[xx.ravel(), yy.ravel()])
z = z.reshape(xx.shape)

plt.figure(figsize=(15,10))
plt.pcolormesh(xx, yy, z, cmap = cmap_light)

# Overlay every sample (train and test rows alike), coloured by its true label.
plt.scatter(points[:,0], points[:,1], c = y, cmap = cmap_bold, edgecolor = "k", s = 20)
plt.xlim(xx.min(), xx.max()) #axis' size
plt.ylim(yy.min(), yy.max())
plt.title("%i-Class classification (k = %i, weights '%s')" % (len(np.unique(y)), grid_nca.best_estimator_.n_neighbors, grid_nca.best_estimator_.weights))
plt.show()

As a result, I get approximately 98% accuracy, with 4 mistakes out of 183 test samples.