In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis, LocalOutlierFactor
from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings("ignore")


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<font color="red">
Content:
    
1. [Attribute Information](#22)
2. [Read the data](#1)
3. [Exploratory Data Analysis](#2)
    * [Heatmap](#23)
    * [Correlation with target](#3)
    * [Box Plot](#4)
    * [Pair Plot](#5)
4. [Outlier Detection](#6)
    * [Local Outlier Factor](#7)
    * [Drop Outliers](#8)
5. [Train - Test Split](#9)   
6. [Standardization](#10)
7. [Modeling](#11)   
    * [KNN](#12)
      * [Best KNN Parameters](#13)
      * [KNN Tuning](#14)
8. [PCA](#15)    
9. [NCA](#16)
10. [Conclusion](#17)    

<a id = "22"></a><br>
# Attribute Information

1) ID number

2) Diagnosis (M = malignant, B = benign)

Ten real-valued features are computed for each cell nucleus:

a) radius (mean of distances from center to points on the perimeter)

b) texture (standard deviation of gray-scale values)

c) perimeter

d) area

e) smoothness (local variation in radius lengths)

f) compactness (perimeter^2 / area - 1.0)

g) concavity (severity of concave portions of the contour)

h) concave points (number of concave portions of the contour)

i) symmetry

j) fractal dimension ("coastline approximation" - 1)


<a id = "1"></a><br>
# Read and check the data

In [None]:
# Load the breast cancer CSV from the Kaggle input directory and preview the first rows.
df = pd.read_csv("/kaggle/input/breast-cancer-wisconsin-data/data.csv")
df.head()

In [None]:
# We don't need the "id" and "Unnamed: 32" columns, and "diagnosis" becomes "target".
# Build a new frame instead of mutating in place so this cell can be re-run:
# errors="ignore" skips columns that are already gone, and renaming a column
# that no longer exists is a no-op.
df = (
    df.drop(columns=["id", "Unnamed: 32"], errors="ignore")
      .rename(columns={"diagnosis": "target"})
)
df.head()

In [None]:
# Class balance: number of rows for each value of the target column.
df["target"].value_counts()

In [None]:
# Name the column by keyword: since seaborn 0.12 the only valid positional
# argument of countplot is `data`, so passing the Series positionally no
# longer plots the class counts.
sns.countplot(x="target", data=df);

In [None]:
# Binarize the target: malignant (M) -> 1, benign (B) -> 0.
# Assign the result back to the column instead of calling
# replace(..., inplace=True) on `df.target`: that is chained assignment and
# does not update df when pandas Copy-on-Write is active.
df["target"] = df["target"].replace({"M": 1, "B": 0}).astype(int)
df["target"].unique()

In [None]:
# (number of rows, number of columns) of the frame.
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Count missing values in each column.
df.isnull().sum()

* There are no missing values in the data.

In [None]:
# Summary statistics, transposed so that each feature is one row.
df.describe().T

<a id = "2"></a><br>
# Exploratory Data Analysis

<a id = "23"></a><br>
## Heatmap

In [None]:
# Pairwise correlation of all columns, drawn as an annotated heatmap.
corr_matrix = df.corr()
plt.figure(figsize=(15, 6))
sns.heatmap(
    data=corr_matrix,
    annot=True,
    fmt=".2f",
    linewidths=0.5,
    cmap="YlOrRd",
)
plt.title("Correlation Matrix");

* Some features are strongly correlated with each other; these are good candidates to focus on in a feature engineering step.

<a id = "3"></a><br>
## Correlation with target

In [None]:
# Correlation of every feature with the target, shown as a bar chart.
target_corr = df.drop(columns="target").corrwith(df["target"])
target_corr.plot(
    kind='bar',
    grid=True,
    figsize=(12, 8),
    title="Correlation with target",
    color="salmon",
);

* Nearly all features are correlated with the target, except fractal_dimension_mean, texture_se and symmetry_se.

In [None]:
# Columns selected by the author as having correlation > 0.6, kept together
# with the target; this subset is what the pair plot uses.
high_corr_columns = [
    "target",
    "radius_mean", "perimeter_mean", "area_mean", "concavity_mean", "concave points_mean",
    "radius_worst", "perimeter_worst", "area_worst", "concavity_worst", "concave points_worst",
]
dfcorr = df[high_corr_columns]

<a id = "4"></a><br>
## Box Plot

In [None]:
# Box plots need long-format data: one row per (target, feature, value).
df_melted = pd.melt(
    frame=df,
    id_vars="target",
    var_name="features",
    value_name="value",
)

plt.figure(figsize=(10, 6))
sns.boxplot(data=df_melted, x="features", y="value", hue="target")
plt.xticks(rotation=90)
plt.title("Box plot");

* The box plot tells us very little at this point, because the data has not been standardized yet. We will standardize it later and plot again.

<a id = "5"></a><br>
## Pair Plot

In [None]:
# Pairwise scatter plots of the selected features, colored by target,
# with KDE curves on the diagonal.
sns.pairplot(
    data=dfcorr,
    hue="target",
    diag_kind="kde",
    markers="+",
);

* According to the density plots on the diagonal, several features are positively skewed.
* We could use a log(1+x) transformation to reduce the positive skewness. (I will not do any transformation in this kernel.)

<a id = "6"></a><br>
# Outlier Detection

<a id = "7"></a><br>
## Local Outlier Factor

In [None]:
# Separate the features (X) from the labels (y) before running LOF,
# and remember the full list of column names.
X = df.drop(columns="target")
y = df["target"]
columns = list(df.columns)

In [None]:
# fit_predict labels every row:  1 = inlier, -1 = outlier
lof = LocalOutlierFactor()
y_pred = lof.fit_predict(X)
y_pred[:10]

In [None]:
# One LOF score per row, stored in a frame.
x_score = lof.negative_outlier_factor_
outlier_score = pd.DataFrame({"score": x_score})

# Rows whose score falls below the threshold are treated as outliers.
lofthreshold = -2.5
loffilter = outlier_score["score"] < lofthreshold
outlier_index = outlier_score.index[loffilter].tolist()

In [None]:
plt.figure(figsize=(12, 6))

# Feature columns 0 and 4 of X are used as the two plot axes.
feature_a = X.iloc[:, 0]
feature_b = X.iloc[:, 4]

# Highlight the rows flagged as outliers, then overlay every data point.
plt.scatter(feature_a.iloc[outlier_index], feature_b.iloc[outlier_index],
            color="darkblue", s=50, label="outliers")
plt.scatter(feature_a, feature_b, color="k", s=3, label="Data Points")

# Min-max scale the LOF scores so the row with the lowest score gets radius 1
# and the row with the highest score gets radius 0.
score_max = x_score.max()
radius = (score_max - x_score) / (score_max - x_score.min())
outlier_score["radius"] = radius
plt.scatter(feature_a, feature_b, s=1000 * radius,
            edgecolors="r", facecolors="none", label="outlier scores")
plt.legend();

* We detected some outliers; let's drop them.

<a id = "8"></a><br>
## Drop Outliers

In [None]:
# Remove the rows flagged by LOF. X and y are rebuilt from df rather than
# from their previous values so that this cell can be re-run: on a second
# run the old X no longer contains the dropped labels (KeyError) and the old
# y is already a NumPy array without a .drop method.
X = df.drop(columns="target").drop(index=outlier_index)
y = df["target"].drop(index=outlier_index).values

<a id = "9"></a><br>
# Train - Test Split

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split reproducible.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

<a id = "10"></a><br>
# Standardization

In [None]:
# Learn the scaling statistics from the training set only, then apply the
# same transformation to the test set (never fit the scaler on X_test).
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Label the scaled training array with the feature names. The list is built
# without mutating `columns`: the previous `del(columns[0])` removed one more
# name every time the cell was run, so a re-run broke the DataFrame creation.
feature_columns = [name for name in columns if name != "target"]
X_train_df = pd.DataFrame(X_train, columns=feature_columns)
X_train_df.head()

* Let's take another look at the box plot, now with standardized features.

In [None]:
# Add the labels back so the standardized features can be plotted per class.
X_train_df["target"] = y_train
df_melted = pd.melt(
    frame=X_train_df,
    id_vars="target",
    var_name="features",
    value_name="value",
)

plt.figure(figsize=(20, 6))
sns.boxplot(data=df_melted, x="features", y="value", hue="target")
plt.xticks(rotation=90)
plt.title("Box plot");

<a id = "11"></a><br>
# Modeling

<a id = "12"></a><br>
## KNN

In [None]:
# Baseline: a 2-nearest-neighbour classifier evaluated on the test set.
knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print("Knn Confusion Matrix:\n",cm)
print("Basic acc. score:",acc)

<a id = "13"></a><br>
### Best KNN Parameters

<a id = "14"></a><br>
### KNN Tuning

In [None]:
def knn_best_params(X_train,X_test,y_train,y_test):
    """Grid-search KNN hyper-parameters and report train/test performance.

    Runs a 10-fold cross-validated grid search, scored by accuracy, over
    ``n_neighbors`` (1 to 30) and ``weights`` ("uniform", "distance") on the
    training data. Prints the best cross-validation score and parameters,
    then the accuracy and confusion matrix of the best model on both the
    test set and the training set.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices of the training and test sets.
    y_train, y_test : array-like
        Labels of the training and test sets.

    Returns
    -------
    GridSearchCV
        The fitted grid-search object.
    """
    param_grid = {
        "n_neighbors": list(range(1, 31)),
        "weights": ["uniform", "distance"],
    }
    print()

    grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=10, scoring="accuracy")
    grid.fit(X_train, y_train)
    print("Best acc. score: {}\n Best parameters {}".format(grid.best_score_,grid.best_params_))
    print()

    # GridSearchCV refits the best parameter combination on the whole training
    # set by default (refit=True), so reuse that model instead of building and
    # fitting an identical KNeighborsClassifier a second time.
    knn = grid.best_estimator_
    y_pred_test = knn.predict(X_test)
    y_pred_train = knn.predict(X_train)

    cm_test = confusion_matrix(y_test, y_pred_test)
    cm_train = confusion_matrix(y_train, y_pred_train)

    acc_test = accuracy_score(y_test, y_pred_test)
    acc_train = accuracy_score(y_train, y_pred_train)
    print("Test score {}\n Train score {}".format(acc_test,acc_train))

    print("CM Test\n",cm_test)
    print("CM Train\n",cm_train)
    return grid

In [None]:
# Tune KNN on the standardized train/test split created above.
grid= knn_best_params(X_train,X_test,y_train,y_test)

<a id = "15"></a><br>
# PCA

In [None]:
# Standardize every row for the PCA / NCA sections, but learn the mean and
# variance from the training rows only. Splitting X with the same test_size
# and random_state as the splits used for evaluation selects the same rows,
# so test-set statistics do not leak into the scaled features.
X_train_raw, _ = train_test_split(X, test_size=0.3, random_state=42)
scaler = StandardScaler().fit(X_train_raw)
x_scaled = scaler.transform(X)

In [None]:
# Fit PCA on the training rows only (same test_size / random_state as the
# evaluation split, hence the same rows), so the projection is not learned
# from rows that are later used for testing.
x_pca_fit, _ = train_test_split(x_scaled, test_size=0.3, random_state=42)
pca = PCA(n_components=2)
pca.fit(x_pca_fit)

# Project every row (train and test) onto the two fitted components.
x_reduced_pca = pca.transform(x_scaled)
pcadata = pd.DataFrame(x_reduced_pca, columns=["p1", "p2"])
pcadata["target"] = y
sns.scatterplot(x="p1", y="p2", hue="target", data=pcadata)
plt.title("p1 vs p2");

In [None]:
# Split the 2-D PCA projection (30% test, fixed seed) and tune KNN on it.
pca_split = train_test_split(x_reduced_pca, y, test_size=0.3, random_state=42)
X_train_pca, X_test_pca, y_train_pca, y_test_pca = pca_split

grid_pca = knn_best_params(X_train_pca, X_test_pca, y_train_pca, y_test_pca)

In [None]:
# Decision regions of the tuned KNN model in the 2-D PCA space.
cmap_light = ListedColormap(['salmon',  'violet'])
cmap_bold = ListedColormap(['darksalmon', 'purple'])

h = .05 # step size in the mesh
# Use a dedicated name instead of rebinding X, so the global feature matrix
# is not silently replaced by the PCA projection.
points_pca = x_reduced_pca
x_min, x_max = points_pca[:, 0].min() - 1, points_pca[:, 0].max() + 1
y_min, y_max = points_pca[:, 1].min() - 1, points_pca[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

# Predict a class for every mesh point, then reshape back to the mesh grid.
Z = grid_pca.predict(np.c_[xx.ravel(), yy.ravel()])

Z = Z.reshape(xx.shape)
plt.figure(figsize=(16,8))
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

# Overlay all projected data points (train and test), colored by true label
plt.scatter(points_pca[:, 0], points_pca[:, 1], c=y, cmap=cmap_bold,
            edgecolor='k', s=20)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("%i-Class classification (k = %i, weights = '%s')"
          % (len(np.unique(y)),grid_pca.best_estimator_.n_neighbors, grid_pca.best_estimator_.weights));

<a id = "16"></a><br>
# NCA

In [None]:
# NCA is supervised, so fit it on the training rows only. Splitting with the
# same test_size and random_state as the split used for evaluation selects
# the same rows, which keeps the test labels out of the learned projection
# (fitting on all rows would inflate the reported test accuracy).
x_nca_fit, _, y_nca_fit, _ = train_test_split(x_scaled, y, test_size=0.3, random_state=42)
nca = NeighborhoodComponentsAnalysis(n_components=2, random_state=42)
nca.fit(x_nca_fit, y_nca_fit)

# Project every row (train and test) with the fitted transformation.
x_reduced_nca = nca.transform(x_scaled)
nca_data = pd.DataFrame(x_reduced_nca, columns=["p1", "p2"])
nca_data["target"] = y

plt.figure(figsize=(12,5))
sns.scatterplot(x="p1", y="p2", hue="target", data=nca_data)
plt.title("p1 vs p2");

In [None]:
# Split the 2-D NCA projection (30% test, fixed seed) and tune KNN on it.
nca_split = train_test_split(x_reduced_nca, y, test_size=0.3, random_state=42)
X_train_nca, X_test_nca, y_train_nca, y_test_nca = nca_split

grid_nca = knn_best_params(X_train_nca, X_test_nca, y_train_nca, y_test_nca)

In [None]:
# Decision regions of the tuned KNN model in the 2-D NCA space.
cmap_light = ListedColormap(['salmon',  'violet'])
cmap_bold = ListedColormap(['darksalmon', 'purple'])

h = .2 # step size in the mesh
# Use a dedicated name instead of rebinding X, so the global feature matrix
# is not silently replaced by the NCA projection.
points_nca = x_reduced_nca
x_min, x_max = points_nca[:, 0].min() - 1, points_nca[:, 0].max() + 1
y_min, y_max = points_nca[:, 1].min() - 1, points_nca[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

# Predict a class for every mesh point, then reshape back to the mesh grid.
Z = grid_nca.predict(np.c_[xx.ravel(), yy.ravel()])

Z = Z.reshape(xx.shape)
plt.figure(figsize=(16,8))
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

# Overlay all projected data points (train and test), colored by true label
plt.scatter(points_nca[:, 0], points_nca[:, 1], c=y, cmap=cmap_bold,
            edgecolor='k', s=20)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("%i-Class classification (k = %i, weights = '%s')"
          % (len(np.unique(y)),grid_nca.best_estimator_.n_neighbors, grid_nca.best_estimator_.weights));

<a id = "17"></a><br>
# Conclusion

In [None]:
# let's find the wrong classifications we made.
knn = KNeighborsClassifier(**grid_nca.best_params_)
knn.fit(X_train_nca,y_train_nca)
y_pred_nca = knn.predict(X_test_nca)
acc_test_nca = accuracy_score(y_pred_nca,y_test_nca)
print("Score:   {}".format(knn.score(X_test_nca,y_test_nca)))

In [None]:
# Collect the test-set NCA coordinates together with predicted and true labels.
test_data = pd.DataFrame({
    "X_test_nca_p1": X_test_nca[:, 0],
    "X_test_nca_p2": X_test_nca[:, 1],
    "y_pred_nca": y_pred_nca,
    "y_test_nca": y_test_nca,
})

plt.figure(figsize=(10, 7))

# Row positions where the prediction disagrees with the true label;
# these are drawn as large, faint markers.
mismatch_mask = y_pred_nca != y_test_nca
diff = np.where(mismatch_mask)[0]
wrong_rows = test_data.iloc[diff]
plt.scatter(wrong_rows.iloc[:, 0], wrong_rows.iloc[:, 1],
            label="Wrong Classified", alpha=0.2, color="k", s=1000)

sns.scatterplot(x="X_test_nca_p1", y="X_test_nca_p2", hue="y_test_nca", data=test_data);

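* Our final accuracy score with NCA + KNN was 0.9941520467836257 when this notebook was written, which is pretty good; the score printed above shows the value for the current run.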
* Thank you for your time.


### If you liked this notebook please upvote :)