## KNN Algorithms PCA & NCA's Best Parameters for Prediction

----Content

1-Import Dataset

2-Dataset Investigation & Visualization

3-Exploratory Data Analysis

4-Feature Investigation 

5-Drop Outliers with LocalOutlierFactor

6-Standardization

7- KNN Prediction with Best Parameters

8- After the PCA, accuracy detection

9- After the NCA, accuracy detection

10- Results Evaluation

## Import Libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#Visualization
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap 
#Libraries for ML
from sklearn.preprocessing import StandardScaler #Standardizasyon için
from sklearn.model_selection import train_test_split, GridSearchCV #GridSearchCV: KNN ile ilgili en iyi parametreleri belirlemek
from sklearn.metrics import accuracy_score ,confusion_matrix #Sonuç değerlendirme
from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis, LocalOutlierFactor #Trainin algoritması ve NCA ve Outlier değerler için
from sklearn.decomposition import PCA #PCA için
#Others
import warnings
warnings.filterwarnings('ignore') #Uyarıları kapatmak

## Data Reading & Pre-editing

In [None]:
# Read the CSV, drop the 'Unnamed: 32' and 'id' columns, and expose the
# diagnosis label under the name 'target' (the name every later cell uses).
data = (
    pd.read_csv('/kaggle/input/breast-cancer-wisconsin-data/data.csv')
    .drop(columns=['Unnamed: 32', 'id'])
    .rename(columns={'diagnosis': 'target'})
)

## Target Class Visualization & Data Description

In [None]:
# Class balance of the raw diagnosis labels, plotted before they are encoded.
# The column is passed by keyword: recent seaborn releases bind the first
# positional argument to `data`, not `x`, so the positional form is fragile.
sns.countplot(x='target', data=data)
print(data.target.value_counts())

# Encode the label as an integer: 'M' -> 1, anything else -> 0.
data['target'] = [1 if i.strip() == 'M' else 0 for i in data.target]

print(len(data))
print('Data Shape',data.shape)
# info() prints its own report and returns None; wrapping it in print()
# would add a stray "None" line to the output.
data.info()
describe = data.describe()

## Exploratory Data Analysis

In [None]:
# Pairwise correlation of every column (the encoded target included).
corr_data = data.corr()

# clustermap builds its own multi-axes figure; plt.title would act on whichever
# axes is current rather than on the figure, so the title is set on the figure.
corr_grid = sns.clustermap(corr_data, annot=True, fmt='.2f')
corr_grid.fig.suptitle('Correlation Between Features', y=1.02)
plt.show();

## Correlation Matrix with threshold

In [None]:
# Only features whose absolute correlation with the target exceeds this value
# are kept for the second correlation plot.
threshold = 0.75

# Correlations can be negative, so the absolute value is compared to the threshold.
filtre = np.abs(corr_data['target']) > threshold
# Column names that pass the filter. 'target' itself is among them (its
# self-correlation is 1), which the later pair plots rely on for hue='target'.
corr_feature = corr_data.columns[filtre].tolist()

# Correlation matrix restricted to the selected features.
# The title goes on the clustermap's figure (plt.title would act on whichever
# axes is current) and is built from `threshold` so the two cannot drift apart.
corr_grid_thr = sns.clustermap(data[corr_feature].corr(), annot=True, fmt='.2f')
corr_grid_thr.fig.suptitle(
    'Correlation Between Features with threshold {}'.format(threshold), y=1.02
)
plt.show();

## Feature Visualization with Box Plot [Before Standardization]

In [None]:
# A grouped box plot needs long-form data: one row per (sample, feature) pair,
# with the class label kept as an identifier column.
data_melted = pd.melt(
    data,
    id_vars='target',
    var_name='Features',
    value_name='Value',
)

plt.figure()
# One box per feature, split by target class.
sns.boxplot(data=data_melted, x='Features', y='Value', hue='target')
# Tilt the feature names by 75 degrees so they stay readable.
plt.xticks(rotation=75)
plt.show()

"""
We will need standardization.
"""

## Feature Visualization with Pair Plot [Before Standardization]

In [None]:
# Pairwise scatter plots of the target-correlated features, coloured by class,
# with kernel-density estimates on the diagonal.
sns.pairplot(
    data[corr_feature],
    hue='target',
    diag_kind='kde',
    markers='+',
)
plt.show()

"""
Data has skewness. We will handle it.
"""

## Outlier Detection

In [None]:
# Separate the features from the label.
x = data.drop(['target'],axis=1)
y = data.target
columns = x.columns.tolist()  # feature names, reused later to label the scaled frame

# Local Outlier Factor with its default neighbourhood size (n_neighbors=20).
clf = LocalOutlierFactor()
y_pred = clf.fit_predict(x)

# The more negative the factor, the more anomalous the sample.
x_score = clf.negative_outlier_factor_
outlier_score = pd.DataFrame()
outlier_score['score'] = x_score

# Samples scoring below this cut-off are treated as outliers.
threshold_outliers = -2
filtre = outlier_score['score'] < threshold_outliers
outlier_index = outlier_score[filtre].index.tolist()  # row positions of the flagged samples


plt.figure()
# Flagged outliers (blue), then every sample (small black dots), both drawn
# on the first two feature columns.
plt.scatter(x.iloc[outlier_index,0], x.iloc[outlier_index,1],color = 'blue',s=50,label='outliers')
plt.scatter(x.iloc[:,0]
            ,x.iloc[:,1],color='k',s=3,label='data_point')  # s: marker size

# Min-max rescale the scores to [0, 1] so they can be used as circle sizes;
# the most anomalous sample gets the largest circle.
radius = (x_score.max() - x_score ) / (x_score.max() - x_score.min() )
# Column key fixed: it used to be 'radius ' with a trailing space, which made
# any lookup by the name 'radius' fail.
outlier_score['radius'] = radius

plt.scatter(x.iloc[:,0], x.iloc[:,1], s=1000*radius, edgecolors='r',facecolor='none',label='Outlier skores')
plt.legend()
plt.show();

## Drop Outliers

In [None]:
# Remove the rows flagged by LOF from both the features and the labels.
# y becomes a plain NumPy array here.
# NOTE: x and y are reassigned from themselves, so this cell must run exactly
# once after the outlier-detection cell.
x = x.drop(index=outlier_index)
y = y.drop(index=outlier_index).values

## Train - Test Split

In [None]:
# Hold out 30% of the outlier-free rows for testing. The same test_size is
# reused by the PCA and NCA splits further down.
test_size = 0.3

# Fixed seed so the partition is reproducible.
raw_split = train_test_split(x, y, test_size=test_size, random_state=42)
x_train, x_test, y_train, y_test = raw_split

## Standardization

In [None]:
# Learn mean and variance from the training rows only, then apply the same
# transformation to both partitions.
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# DataFrame view of the scaled training data for the plots that follow. The
# summary statistics are captured before the label column is attached.
x_train_df = pd.DataFrame(data=x_train, columns=columns)
x_train_df_describe = x_train_df.describe()
x_train_df['target'] = y_train

## Box Plot Visualization after the Standardization

In [None]:
# Long-form copy of the scaled training data: one row per (sample, feature) pair.
data_melted = pd.melt(
    x_train_df,
    id_vars='target',
    var_name='Features',
    value_name='Value',
)

plt.figure()
# One box per feature, split by target class.
sns.boxplot(data=data_melted, x='Features', y='Value', hue='target')
# Tilt the feature names by 75 degrees so they stay readable.
plt.xticks(rotation=75)
plt.show()


## Pair Plot Visualization after the Standardization

In [None]:

# Same pair plot as before, now on the standardized training data.
sns.pairplot(
    x_train_df[corr_feature],
    hue='target',
    diag_kind='kde',
    markers='+',
)
plt.show()

## KNN Implementation & Obtaining First Accuracy Score

In [None]:
# Baseline: a 2-nearest-neighbour classifier with no tuning.
knn = KNeighborsClassifier(n_neighbors=2).fit(x_train, y_train)

# Predict the held-out rows and score the predictions.
y_predict = knn.predict(x_test)
acc = accuracy_score(y_test, y_predict)
cm = confusion_matrix(y_test, y_predict)
# knn.score reports accuracy as well; it is kept as a cross-check against acc.
score = knn.score(x_test, y_test)

print("Score:", score)
print("CM:", cm)
print("Basic KNN Acc:", acc)

## Choose Best Parameters with GridSearchCrossValidation (Function)

In [None]:
def KNN_best_parameters(x_train,x_test,y_train,y_test):
    """Tune a KNN classifier with a grid search and report its accuracy.

    The search covers n_neighbors 1..50, both weighting schemes and Minkowski
    power p in {1, 2}, using 10-fold cross-validation on the training data
    scored by accuracy. The best mean cross-validation score, the train/test
    accuracies and both confusion matrices are printed.

    Parameters
    ----------
    x_train, y_train : training features and labels used for the search.
    x_test, y_test : held-out features and labels, used only for reporting.

    Returns
    -------
    The fitted GridSearchCV object.
    """
    k_range = list(range(1,51))               # candidate neighbour counts
    weight_options = ['uniform','distance']   # candidate weighting schemes
    # Minkowski power parameter: 1 = Manhattan distance, 2 = Euclidean distance
    distance_options = [1,2]
    print()
    param_grid = dict(n_neighbors=k_range,weights=weight_options,p=distance_options)

    grid = GridSearchCV(KNeighborsClassifier(),param_grid,cv=10,scoring='accuracy')
    grid.fit(x_train, y_train)

    # best_score_ is the mean cross-validated accuracy of the best setting.
    print('Best training score: {} with parametres: {}'.format(grid.best_score_,grid.best_params_))
    print()

    # With the default refit=True, the search has already re-trained the best
    # setting on all of x_train, so there is no need to fit a second model.
    knn = grid.best_estimator_

    y_predict_test = knn.predict(x_test)
    y_predict_train = knn.predict(x_train)

    cm_test = confusion_matrix(y_test,y_predict_test)
    cm_train = confusion_matrix(y_train,y_predict_train)

    acc_test = accuracy_score(y_test,y_predict_test)
    acc_train = accuracy_score(y_train,y_predict_train)

    print('Test Score: {}, Train Score: {}'.format(acc_test,acc_train))
    print()
    print('CM Test:',cm_test)
    print('CM Train:',cm_train)

    return grid

## Obtaining the KNN Score with the Best Parameters

In [None]:
# Grid-search KNN on the standardized full feature set; the function prints
# the best parameters plus the train/test accuracy and confusion matrices.
grid = KNN_best_parameters(x_train,x_test,y_train,y_test)

In [None]:
# Mean 10-fold cross-validation accuracy of the best setting, measured on the
# training split (not the held-out test accuracy). Used in the final comparison chart.
basic_best_acc =grid.best_score_
basic_best_acc

## Principal Component Analysis

In [None]:
#%% PCA

# Scale the complete (outlier-free) feature matrix with its own scaler.
# The earlier code created `scale` but then called `scaler.fit_transform`,
# which silently re-fitted the train-only scaler from the standardization cell.
scale = StandardScaler()
x_scaled = scale.fit_transform(x)

# Project the scaled features onto the first two principal components.
pca = PCA(n_components=2)
pca.fit(x_scaled)
x_reduce_pca = pca.transform(x_scaled)

# DataFrame view of the projection, with the label attached for colouring.
pca_data =pd.DataFrame(x_reduce_pca,columns=['p1','p2'])
pca_data['target'] = y

# Scatter plot of the two components, coloured by class.
sns.scatterplot(x='p1',y='p2',hue='target',data=pca_data)
plt.title('PCA: P1 Vs P2')

## After PCA: Obtaining the KNN Score with the Best Parameters

In [None]:
# Split the 2-D PCA projection with the same test fraction and seed as before.
pca_split = train_test_split(x_reduce_pca, y, test_size=test_size, random_state=42)
x_train_pca, x_test_pca, y_train_pca, y_test_pca = pca_split

# Run the same KNN grid search on the PCA-reduced data.
grid_pca = KNN_best_parameters(*pca_split)

In [None]:
# Mean 10-fold cross-validation accuracy of the best KNN setting on the PCA
# training split; used in the final comparison chart.
pca_best_acc = grid_pca.best_score_
pca_best_acc

## Wrong classification Visualization in PCA_KNN

In [None]:
# Light colours fill the predicted regions, bold colours mark the samples.
cmap_light = ListedColormap(['orange',  'cornflowerblue'])
cmap_bold = ListedColormap(['darkorange', 'darkblue'])

h = .05  # step size in the mesh
X = x_reduce_pca
first_comp, second_comp = X[:, 0], X[:, 1]

# Plot window: the data range padded by one unit on every side.
x_min, x_max = first_comp.min() - 1, first_comp.max() + 1
y_min, y_max = second_comp.min() - 1, second_comp.max() + 1
xx, yy = np.meshgrid(
    np.arange(x_min, x_max, h),
    np.arange(y_min, y_max, h),
)

# Classify every mesh node with the tuned KNN and fold the labels back into
# the grid shape so they can be drawn as a colour map.
mesh_points = np.c_[xx.ravel(), yy.ravel()]
Z = grid_pca.predict(mesh_points).reshape(xx.shape)

plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

# Overlay every sample (training and test rows alike), coloured by true class.
plt.scatter(first_comp, second_comp, c=y, cmap=cmap_bold, edgecolor='k', s=20)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())

best_knn_pca = grid_pca.best_estimator_
plt.title("%i-Class classification (k = %i, weights = '%s')"
          % (len(np.unique(y)), best_knn_pca.n_neighbors, best_knn_pca.weights))

## Neighborhood Components Analysis

In [None]:
# Unlike PCA, NCA is supervised: it needs the labels to learn its projection.
nca = NeighborhoodComponentsAnalysis(n_components=2,random_state=42)

# Fit NCA only on the rows that the train/test split in the following cell
# assigns to training. That split uses the same test_size and random_state on
# an array of the same length, so it selects exactly these rows. Fitting on
# every row would let the labels of the held-out test rows shape the
# projection and inflate the reported test accuracy.
# NOTE: the cross-validation score inside the grid search still sees an NCA
# fitted on all training rows, so it remains somewhat optimistic.
nca_train_rows, _ = train_test_split(np.arange(len(x_scaled)),test_size=test_size,random_state=42)
nca.fit(x_scaled[nca_train_rows],y[nca_train_rows])

# Project every row (training and test) with the learned transformation.
x_reduce_nca = nca.transform(x_scaled)
nca_data = pd.DataFrame(x_reduce_nca,columns=['p1','p2'])
nca_data['target'] = y

sns.scatterplot(x='p1',y='p2',hue='target',data=nca_data)
plt.title('NCA: P1 vs P2')


## After NCA: Obtaining the KNN Score with the Best Parameters

In [None]:
# Split the 2-D NCA projection with the same test fraction and seed as before.
nca_split = train_test_split(x_reduce_nca, y, test_size=test_size, random_state=42)
x_train_nca, x_test_nca, y_train_nca, y_test_nca = nca_split

# Run the same KNN grid search on the NCA-reduced data.
grid_nca = KNN_best_parameters(*nca_split)

In [None]:
# Mean 10-fold cross-validation accuracy of the best KNN setting on the NCA
# training split; used in the final comparison chart.
nca_best_acc = grid_nca.best_score_

## Wrong classification in NCA_KNN

In [None]:
# Light colours fill the predicted regions, bold colours mark the samples.
cmap_light = ListedColormap(['orange',  'cornflowerblue'])
cmap_bold = ListedColormap(['darkorange', 'darkblue'])

h = .2  # step size in the mesh
X = x_reduce_nca
first_comp, second_comp = X[:, 0], X[:, 1]

# Plot window: the data range padded by one unit on every side.
x_min, x_max = first_comp.min() - 1, first_comp.max() + 1
y_min, y_max = second_comp.min() - 1, second_comp.max() + 1
xx, yy = np.meshgrid(
    np.arange(x_min, x_max, h),
    np.arange(y_min, y_max, h),
)

# Classify every mesh node with the tuned KNN and fold the labels back into
# the grid shape so they can be drawn as a colour map.
mesh_points = np.c_[xx.ravel(), yy.ravel()]
Z = grid_nca.predict(mesh_points).reshape(xx.shape)

plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

# Overlay every sample (training and test rows alike), coloured by true class.
plt.scatter(first_comp, second_comp, c=y, cmap=cmap_bold, edgecolor='k', s=20)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())

best_knn_nca = grid_nca.best_estimator_
plt.title("%i-Class classification (k = %i, weights = '%s')"
          % (len(np.unique(y)), best_knn_nca.n_neighbors, best_knn_nca.weights))

# Results Evaluation

In [None]:
# Plotly setup for the final comparison chart.
from plotly.offline import init_notebook_mode, iplot
# Initialise plotly's notebook rendering before iplot is called in the next cell.
init_notebook_mode(connected=True)
import plotly.graph_objs as go
# NOTE(review): itertools is not referenced in the visible cells; confirm before removing.
import itertools
# Switches matplotlib's global style; the chart below is drawn with plotly.
plt.style.use('fivethirtyeight')

In [None]:
# Labels and scores for the three pipelines. Each score is the mean 10-fold
# cross-validation accuracy (GridSearchCV.best_score_) collected above.
AlgorthmsName = ['KNN_Training_Accuracy_with_Best_Parameters','KNN-PCA_Training_Accuracy_with_Best_Parameters','KNN-NCA_Training_Accuracy_with_Best_Parameters']
scoresf1=[basic_best_acc,pca_best_acc,nca_best_acc]

# One scatter trace with a point per pipeline.
trace1 = go.Scatter(
    x = AlgorthmsName,
    y= scoresf1,
    name='Algortms Name',
    marker =dict(color='rgba(225,126,0,0.5)',
               line =dict(color='rgb(0,0,0)',width=2)),
                text=AlgorthmsName
)
# The trace list is named `traces`, not `data`: assigning to `data` would
# overwrite the DataFrame that the earlier cells read, breaking any re-run
# of those cells in the same kernel.
traces = [trace1]

layout = go.Layout(barmode = "group",
                  xaxis= dict(title= 'Traning Type',ticklen= 5,zeroline= False),
              yaxis= dict(title= 'Prediction Scores(Acc)',ticklen= 5,zeroline= False))
fig = go.Figure(data = traces, layout = layout)
iplot(fig)

# If you like this kernel, Please Upvote :) Thanks

**Burak Kahveci**

* My Linkedin Account: https://www.linkedin.com/in/kahveciburak/
* My Twitter Account: https://twitter.com/ImpartialBrain