In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.neighbors import KNeighborsClassifier,NeighborhoodComponentsAnalysis,LocalOutlierFactor
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.decomposition import PCA
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import warnings
warnings.filterwarnings('ignore')
import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the breast-cancer CSV from the Kaggle input directory and preview the first rows.
csv_path = '/kaggle/input/breast-cancer-wisconsin-data/data.csv'
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Data analysis

In [None]:
# Remove the 'id' and 'Unnamed: 32' columns before the analysis.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution no longer raises KeyError because the
# columns are already gone.
data = data.drop(columns=['id', 'Unnamed: 32'], errors='ignore')

In [None]:
# Dimensions of the frame as (rows, columns).
data.shape

In [None]:
# Per-column summary of the frame (dtype and non-null information).
data.info()

In [None]:
# Row index of the frame.
data.index

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Column labels of the frame.
data.columns

In [None]:
# Class balance of the target: print the raw counts, then draw them as bars.
print(data['diagnosis'].value_counts())
# The column must be passed through the `x` keyword: current seaborn releases take
# `data` as the first parameter and make the plotting variables keyword-only, so
# countplot('diagnosis', data=data) fails with a TypeError.
sns.countplot(x='diagnosis', data=data);

In [None]:
# Binary target column: 1 where the whitespace-stripped diagnosis label is "M", otherwise 0.
data['classes'] = data['diagnosis'].map(lambda label: 1 if label.strip() == "M" else 0)

In [None]:
# Replace the diagnosis labels with their integer codes, then preview the frame.
lb = LabelEncoder().fit(data['diagnosis'])
data['diagnosis'] = lb.transform(data['diagnosis'])
data.head()

In [None]:
# Summary statistics, transposed so that every feature occupies one row.
data.describe().transpose()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
fig, ax = plt.subplots(figsize=(25, 10))
sns.heatmap(data.corr(), annot=True, ax=ax);

In [None]:
# Correlation matrix drawn as a clustered heatmap.
corr_matrix = data.corr()
# clustermap always builds its own figure, so a preceding plt.figure(figsize=...)
# only produced an extra blank figure and its size was ignored. The size has to be
# handed to clustermap itself.
sns.clustermap(corr_matrix, annot=True, fmt='.2f', figsize=(25, 20));

In [None]:
# Columns whose absolute correlation with the diagnosis exceeds the threshold.
treshold = 0.75
# Named `strong_corr` rather than `filter` so the builtin filter() is not shadowed
# for the rest of the session.
strong_corr = np.abs(corr_matrix['diagnosis']) > treshold
corr_treshold = corr_matrix.columns[strong_corr]
# clustermap creates its own figure; pass the size directly instead of calling
# plt.figure(), which only left an empty figure behind.
sns.clustermap(data[corr_treshold].corr(), annot=True, fmt='.2f', figsize=(20, 15));

In [None]:
# Pair plot of the columns whose absolute correlation with the diagnosis exceeds 0.50.
treshold = 0.50
# Named `strong_corr` rather than `filter` so the builtin filter() is not shadowed.
strong_corr = np.abs(corr_matrix['diagnosis']) > treshold
corr_treshold = corr_matrix.columns[strong_corr]
# 'classes' is derived from the same diagnosis labels that are used as the hue, so
# plotting it as a variable adds nothing; leave it out of the grid.
pair_cols = [col for col in corr_treshold if col != 'classes']
sns.pairplot(data[pair_cols], diag_kind='kde', hue='diagnosis');

In [None]:
# Outlier detection
# Our data is skewed, so we use LOF (Local Outlier Factor).

In [None]:
# Target: the binary 'classes' column, kept as a single-column DataFrame.
y = data[['classes']]

# Features: every column except the two label columns.
x = data.drop(columns=['diagnosis', 'classes'])

In [None]:
# Fit the Local Outlier Factor model on the feature matrix.
lof = LocalOutlierFactor()
preds = lof.fit_predict(x)
# negative_outlier_factor_ is read only after fitting; the following cells flag the
# rows whose score falls below a cut-off as outliers.
score = lof.negative_outlier_factor_

In [None]:
# Range of the scores — presumably inspected to choose the cut-off used below.
score.min(),score.max()

In [None]:
# Rows whose LOF score is below this cut-off are treated as outliers.
treshold = -2.0
# Key the scores by x's own row labels. The previous version used a fresh 0..n-1
# index and later passed those positions to drop() as labels, which is only correct
# while x happens to carry a default RangeIndex. If x and score ever get out of
# sync (e.g. the cell is re-run after rows were dropped) this now fails loudly.
score_df = pd.DataFrame({'score': score}, index=x.index)
filter_score = score_df['score'] < treshold
outlier_index = score_df[filter_score].index.tolist()

In [None]:
outlier_index

In [None]:
x = x.drop(outlier_index)
y = y.drop(outlier_index)

In [None]:
# train test split

In [None]:
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42
)

In [None]:
# Standardization

In [None]:
sc = StandardScaler()

In [None]:
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [None]:
# KNN Classifier

In [None]:
# Baseline k-nearest-neighbours classifier with the library's default settings.
knn = KNeighborsClassifier()
# y_train is a single-column DataFrame; flatten it to a 1-D label vector so fit()
# does not have to convert it (the resulting DataConversionWarning was only hidden
# by the notebook-wide warnings filter).
knn.fit(x_train, np.ravel(y_train))

In [None]:
from sklearn.metrics import confusion_matrix

# Score the baseline model on both splits and tabulate its test-set errors.
score_train = knn.score(x_train, y_train)
score_test = knn.score(x_test, y_test)
y_pred = knn.predict(x_test)
cm = confusion_matrix(y_test, y_pred)

report = (
    ('Test score without evaluate: ', score_test),
    ('Train score : ', score_train),
    ('Confusion metrix reusult: ', cm),
)
for label, value in report:
    print(label, value)

In [None]:
# Best Parameters

In [None]:
# confusion_matrix is imported here as well, so this cell no longer depends on an
# import made in a different cell.
from sklearn.metrics import accuracy_score, confusion_matrix


def best_params(x_train,x_test,y_train,y_test):
    """Tune a KNN classifier with a grid search and report train/test performance.

    The search covers n_neighbors 1..20, both weighting schemes and p in {1, 2},
    scored by accuracy with 10-fold cross-validation on the training data. The
    best cross-validation score and parameters are printed, followed by accuracy
    and confusion matrix on the test and training sets.

    Parameters
    ----------
    x_train, x_test : array-like
        Feature matrices for the training and test splits.
    y_train, y_test : array-like
        Labels for the two splits. Single-column 2-D inputs (for example a
        one-column DataFrame) are flattened to 1-D.

    Returns
    -------
    GridSearchCV
        The fitted search object; with refit=True its best_estimator_ is already
        trained on the whole of x_train.
    """
    # Flatten column-vector labels so the estimators receive the 1-D targets they expect.
    y_train = np.ravel(y_train)
    y_test = np.ravel(y_test)

    params = dict(
        n_neighbors=list(range(1, 21)),
        weights=['uniform', 'distance'],
        p=[1, 2],
    )

    grid = GridSearchCV(
        KNeighborsClassifier(),
        param_grid=params,
        cv=10,
        scoring='accuracy',
        refit=True,
    )
    grid.fit(x_train, y_train)

    print('Best Training score {} and parameters {}'.format(grid.best_score_,grid.best_params_))

    # refit=True already retrained the best configuration on all of x_train, so
    # reuse that model instead of building and fitting a second identical one.
    best_knn = grid.best_estimator_

    y_pred_test = best_knn.predict(x_test)
    y_pred_train = best_knn.predict(x_train)

    acc_test = accuracy_score(y_test, y_pred_test)
    acc_train = accuracy_score(y_train, y_pred_train)

    cm_test = confusion_matrix(y_test, y_pred_test)
    cm_train = confusion_matrix(y_train, y_pred_train)

    print('Accuracy Test: ',acc_test)
    print('CM Test: ',cm_test)
    print()
    print('Accuracy Train: ',acc_train)
    print('CM Train: ',cm_train)

    return grid

In [None]:
# Tune KNN on the standardized feature splits; the returned grid-search object is
# shown as the cell output.
best_params(x_train,x_test,y_train,y_test)

In [None]:
# PCA 

In [None]:
# Standardize the complete outlier-free feature matrix for the projections that follow.
# NOTE(review): the scaler is fitted on every row, including rows that are later
# placed in a test split — confirm this is acceptable for the evaluation.
scaler = StandardScaler().fit(x)
x_scaled = scaler.transform(x)

In [None]:
pca = PCA(n_components=2)
pca.fit(x_scaled)
x_pca = pca.transform(x_scaled)
pca_data = pd.DataFrame(x_pca,columns=['p1','p2'])
pca_data['classes'] = y

In [None]:
sns.scatterplot(x='p1',y='p2',hue='classes',data=pca_data);

In [None]:
# Split the 2-D PCA projection (25% test, fixed seed) and tune KNN on it.
X_train_pca, X_test_pca, Y_train_pca, Y_test_pca = train_test_split(
    x_pca, y, test_size=0.25, random_state=42
)

grid_pca = best_params(X_train_pca, X_test_pca, Y_train_pca, Y_test_pca)


In [None]:
# Decision regions of the tuned KNN model in the 2-D PCA space.
cmap_light = ListedColormap(['orange',  'cornflowerblue'])
cmap_bold = ListedColormap(['darkorange', 'darkblue'])

h = .05 # step size in the mesh
X = pca_data
x_min, x_max = X['p1'].min() - 1, X['p1'].max() + 1
y_min, y_max = X['p2'].min() - 1, X['p2'].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

# Classify every mesh node, then fold the predictions back into the grid shape.
Z = grid_pca.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light);

# Overlay all projected samples. The colours are taken positionally from y, whose
# rows are in the same order as x_pca, rather than relying on index alignment
# between y and the projection frame.
plt.scatter(X['p1'].values, X['p2'].values, c=np.ravel(y), cmap=cmap_bold,edgecolor='k', s=20)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("%i-Class classification (k = %i, weights = '%s')"
          % (len(np.unique(y)),grid_pca.best_estimator_.n_neighbors, grid_pca.best_estimator_.weights));

In [None]:
# Preview the PCA projection frame (columns 'p1', 'p2' and 'classes').
pca_data.head()

In [None]:
# NCA - NeighborhoodComponentsAnalysis

In [None]:
# Learn a supervised 2-D projection of the standardized features with NCA and apply it.
# NOTE(review): NCA is fitted with the labels of every row, including rows that end
# up in the test split created later — confirm this is intended, as it can make the
# test score optimistic.
nca = NeighborhoodComponentsAnalysis(n_components=2, random_state=42)
x_nca = nca.fit_transform(x_scaled, y)

In [None]:
nca_data = pd.DataFrame(x_nca,columns=['p1','p2'])
nca_data['classes'] =  y

In [None]:
sns.scatterplot(x = 'p1',y = 'p2', hue = 'classes',data = nca_data);

In [None]:
# Split the 2-D NCA projection (25% test, fixed seed) and tune KNN on it.
X_train_nca, X_test_nca, Y_train_nca, Y_test_nca = train_test_split(
    x_nca, y, test_size=0.25, random_state=42
)

grid_nca = best_params(X_train_nca, X_test_nca, Y_train_nca, Y_test_nca)

In [None]:
# Decision regions of the tuned KNN model in the 2-D NCA space.
cmap_light = ListedColormap(['orange',  'cornflowerblue'])
cmap_bold = ListedColormap(['darkorange', 'darkblue'])

h = .2 # step size in the mesh
X = x_nca
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

# Classify every mesh node, then fold the predictions back into the grid shape.
Z = grid_nca.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

# Overlay all projected samples. The colours are taken positionally from y, whose
# rows are in the same order as x_nca, rather than relying on index alignment
# between y and the projection frame.
plt.scatter(X[:, 0], X[:, 1], c=np.ravel(y), cmap=cmap_bold,
            edgecolor='k', s=20)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
# Trailing semicolon keeps the Text(...) repr out of the cell output.
plt.title("%i-Class classification (k = %i, weights = '%s')"
          % (len(np.unique(y)),grid_nca.best_estimator_.n_neighbors, grid_nca.best_estimator_.weights));

In [None]:
# %% find wrong decision
# Refit KNN with the best NCA parameters and highlight the misclassified test points.
knn = KNeighborsClassifier(**grid_nca.best_params_)

# The label splits are single-column DataFrames that still carry the original,
# shuffled row labels. Flatten them to plain 1-D arrays so that every comparison and
# column assignment below is positional, matching the rows of X_test_nca.
y_train_true = np.ravel(Y_train_nca)
y_test_true = np.ravel(Y_test_nca)

knn.fit(X_train_nca, y_train_true)
y_pred_nca = knn.predict(X_test_nca)
# accuracy_score takes (y_true, y_pred); the discarded knn.score() call that
# computed the same number has been removed.
acc_test_nca = accuracy_score(y_test_true, y_pred_nca)

# Building the frame from arrays avoids the index alignment that previously filled
# "Y_test_nca" with labels of unrelated rows or NaN.
test_data = pd.DataFrame({
    "X_test_nca_p1": X_test_nca[:, 0],
    "X_test_nca_p2": X_test_nca[:, 1],
    "y_pred_nca": y_pred_nca,
    "Y_test_nca": y_test_true,
})

plt.figure()
sns.scatterplot(x="X_test_nca_p1", y="X_test_nca_p2", hue="Y_test_nca",data=test_data)

# Positions of the test rows whose prediction differs from the true class.
diff = np.where(y_pred_nca != y_test_true)[0]
plt.scatter(test_data.iloc[diff,0],test_data.iloc[diff,1],label = "Wrong Classified",alpha = 0.2,color = "red",s = 1000);