## Content:

- <a href='#1.'> 1. Importing Libraries</a>
- <a href='#2.'> 2. Loading and Checking Data</a>
- <a href='#3.'> 3. Exploratory Data Analysis</a>
- <a href='#4.'> 4. Outlier Detection</a>
- <a href='#5.'> 5. Train-Test-Split</a>
- <a href='#6.'> 6. Standardization</a>
- <a href='#7.'> 7. KNN Implementation</a>
- <a href='#8.'> 8. Choosing KNN Best Parameters</a>
- <a href='#9.'> 9. Principal Component Analysis</a>
- <a href='#10.'> 10. Neighborhood Component Analysis</a>
- <a href='#11.'> 11. Evaluating Results</a>
- <a href='#12.'> 12. References</a>

## <a id='1.'>1. Importing Libraries</a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import ListedColormap

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier,NeighborhoodComponentsAnalysis, LocalOutlierFactor
from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings("ignore")

## <a id='2.'>2. Loading and Checking Data</a>

In [None]:
# Read the CSV at this relative path into a DataFrame.
data = pd.read_csv("../input/breast-cancer-wisconsin-data/data.csv")

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# Drop the columns that are not used as features.
# Reassigning (instead of inplace=True) and ignoring labels that are already
# gone keeps this cell safe to re-run without a KeyError.
data = data.drop(columns=["Unnamed: 32", "id"], errors="ignore")

In [None]:
# (rows, columns) of the frame at this point in the notebook.
data.shape

In [None]:
# Call the label column "target" from here on.
data = data.rename({"diagnosis": "target"}, axis="columns")

In [None]:
# Show the class balance as a bar chart and as raw counts.
# The column is passed by keyword: recent seaborn releases interpret the first
# positional argument of countplot as `data`, not as the x variable.
sns.countplot(x=data["target"])
print(data["target"].value_counts())

In [None]:
# Encode the label: 1 when the stripped value is "M", 0 for anything else.
data["target"] = data["target"].map(lambda label: 1 if label.strip() == "M" else 0)

In [None]:
# Number of rows in the frame.
print(len(data))

In [None]:
# (rows, columns) after dropping the unused columns above.
print("Data shape:", data.shape)

In [None]:
# Column dtypes and non-null counts (used below to check for missing values).
data.info()

There are no missing values.

In [None]:
# Summary statistics per column; used to compare the value ranges of the features.
data.describe()

The features are on very different scales, so we need standardization.

## <a id='3.'>3. Exploratory Data Analysis</a>

In [None]:
# Clustered heatmap of the pairwise correlations between all columns.
corr_matrix = data.corr()
cluster_grid = sns.clustermap(corr_matrix, annot=True, fmt=".1f")
# clustermap builds its own multi-axes figure, so plt.title() would label
# whichever sub-axes happens to be current. Title the figure itself instead.
cluster_grid.fig.suptitle("Correlation Between Features", y=1.02)
plt.show()

In [None]:
# Keep the columns whose absolute correlation with the target exceeds the cut-off
# ("target" itself is included, since it correlates perfectly with itself).
threshold = 0.5
target_corr = corr_matrix["target"].abs()
corr_features = target_corr[target_corr > threshold].index.tolist()
sns.clustermap(data.loc[:, corr_features].corr(), annot=True, fmt=".2f")

There are some correlated features.

In [None]:
# Box plot of every feature, split by target class (raw, unscaled values).
# The frame is reshaped to long form: one row per (sample, feature) pair.
data_melted = data.melt(id_vars="target", var_name="features", value_name="value")
plt.figure()
sns.boxplot(data=data_melted, x="features", y="value", hue="target")
plt.xticks(rotation=90)
plt.show()

To get a meaningful visual, we need standardization.

In [None]:
# Pairwise scatter plots (KDE on the diagonal) of the target-correlated columns.
pair_frame = data[corr_features]
sns.pairplot(pair_frame, hue="target", diag_kind="kde", markers="+")
plt.show()

- The plotted features show positive (right) skewness.

## <a id='4.'>4. Outlier Detection</a>

In [None]:
# Separate the label column from the feature columns and remember the feature names.
y = data["target"]
x = data.drop(columns="target")
columns = list(x.columns)

In [None]:
# Fit Local Outlier Factor (default settings) on the feature matrix and
# display the per-sample labels it returns.
clf = LocalOutlierFactor()
y_pred = clf.fit_predict(x)
y_pred

In [None]:
# Collect the fitted LOF scores into a one-column frame for filtering below.
x_score = clf.negative_outlier_factor_
outlier_score = pd.DataFrame({"score": x_score})

In [None]:
# Inspect the LOF scores.
outlier_score["score"]

In [None]:
# Samples whose LOF score falls below this cut-off are treated as outliers.
threshold = -2.5
filtre = outlier_score["score"] < threshold
outlier_index = outlier_score.loc[filtre].index.tolist()

# Only the first two feature columns are used for the 2-D visualisation.
first_feature = x.iloc[:, 0]
second_feature = x.iloc[:, 1]

plt.figure()
plt.scatter(first_feature.iloc[outlier_index], second_feature.iloc[outlier_index],
            color="blue", s=50, label="Outliers")
plt.scatter(first_feature, second_feature, color="k", s=3, label="Data Points")

# Min-max rescale the scores: the lowest score maps to radius 1, the highest to 0.
score_max = x_score.max()
radius = (score_max - x_score) / (score_max - x_score.min())
outlier_score["radius"] = radius
plt.scatter(first_feature, second_feature, edgecolors="r", s=1000*radius,
            facecolors="none", label="Outlier Scores")
plt.legend()
plt.show()

In [None]:
# Drop the flagged outlier rows.
# x and y are rebuilt from `data` rather than from their current values, so
# re-running this cell neither drops already-removed labels (KeyError) nor
# calls .drop on the NumPy array that y becomes on the first run.
x = data.drop(columns="target").drop(index=outlier_index)
y = data["target"].drop(index=outlier_index).values

## <a id='5.'>5. Train-Test-Split</a>

In [None]:
# Hold out 30% of the samples as the test split; the fixed random_state makes
# the split repeatable. test_size is reused by the later PCA / NCA splits.
test_size = 0.3
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=test_size, random_state=42)

## <a id='6.'>6. Standardization</a>

In [None]:
# Fit the scaler on the training split only, then apply that same fitted
# transformation to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Wrap the scaled training matrix in a labelled frame; the summary statistics
# are taken before the target column is attached.
X_train_df = pd.DataFrame(data=X_train, columns=columns)
X_train_df_describe = X_train_df.describe()
X_train_df = X_train_df.assign(target=Y_train)

In [None]:
# Box plot of the standardized training features, split by target class.
data_melted = X_train_df.melt(id_vars="target", var_name="features", value_name="value")
plt.figure(figsize=(20,10))
sns.boxplot(data=data_melted, x="features", y="value", hue="target")
plt.xticks(rotation=90)
plt.legend()
plt.show()

In [None]:
# Same pair plot as before, now on the standardized training data.
scaled_pair_frame = X_train_df[corr_features]
sns.pairplot(scaled_pair_frame, hue="target", diag_kind="kde", markers="+")
plt.show()

## <a id='7.'>7. KNN Implementation</a>

In [None]:
# Baseline: KNN with a fixed k=2, evaluated on the held-out test split.
knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train, Y_train)

y_pred = knn.predict(X_test)

# `score` and `acc` are both accuracies of the same model on the same test data.
score = knn.score(X_test, Y_test)
cm = confusion_matrix(Y_test, y_pred)
acc = accuracy_score(Y_test, y_pred)

print("Score: ", score)
print("CM:", cm)
print("Basic KNN Acc: ", acc)


## <a id='8.'>8. Choosing KNN Best Parameters</a>

In [None]:
def KNN_Best_Params(x_train, x_test, y_train, y_test, cv=10, max_k=30):
    """Grid-search KNN hyper-parameters and report train/test performance.

    Searches ``n_neighbors`` in ``1..max_k`` and ``weights`` in
    ``{"uniform", "distance"}`` with ``cv``-fold cross-validation scored by
    accuracy. The best model is then evaluated on both the training and the
    test split, and accuracies and confusion matrices are printed.

    Parameters
    ----------
    x_train, x_test : array-like
        Feature matrices for the training and test splits.
    y_train, y_test : array-like
        Labels for the training and test splits.
    cv : int, default 10
        Number of cross-validation folds passed to GridSearchCV.
    max_k : int, default 30
        Largest ``n_neighbors`` value included in the search.

    Returns
    -------
    GridSearchCV
        The fitted search object (callers use ``predict``, ``best_params_``
        and ``best_estimator_`` on it).
    """
    k_range = list(range(1, max_k + 1))
    weight_options = ["uniform", "distance"]
    print()
    param_grid = dict(n_neighbors=k_range, weights=weight_options)

    grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=cv, scoring="accuracy")
    grid.fit(x_train, y_train)

    # Note: best_score_ is the mean cross-validated accuracy of the best
    # parameter set, not the accuracy on the full training data.
    print("Best training score: {} with parameters: {}".format(grid.best_score_, grid.best_params_))
    print()

    # GridSearchCV refits the best configuration on all of x_train by default,
    # so reuse that estimator instead of constructing and fitting it again.
    knn = grid.best_estimator_

    y_pred_test = knn.predict(x_test)
    y_pred_train = knn.predict(x_train)

    cm_test = confusion_matrix(y_test, y_pred_test)
    cm_train = confusion_matrix(y_train, y_pred_train)

    acc_test = accuracy_score(y_test, y_pred_test)
    acc_train = accuracy_score(y_train, y_pred_train)
    print("Test Score: {}, Train Score: {}".format(acc_test, acc_train))
    print()
    print("CM Test: ", cm_test)
    print("CM Train: ", cm_train)

    return grid

In [None]:
# Tune KNN on the standardized full-dimensional features.
grid = KNN_Best_Params(X_train, X_test, Y_train, Y_test)

## <a id='9.'>9. Principal Component Analysis</a>

In [None]:
# Recover the row partition that the train/test split of the PCA features uses:
# the same sample count, test_size and random_state produce the same shuffle.
# The scaler and PCA are then fitted on the training rows only. Fitting them on
# every row would let test-set statistics leak into the features.
train_idx_pca, _ = train_test_split(np.arange(len(y)), test_size=test_size, random_state=42)

scaler = StandardScaler()
scaler.fit(x.iloc[train_idx_pca])
x_scaled = scaler.transform(x)

pca = PCA(n_components=2)
pca.fit(x_scaled[train_idx_pca])
# All rows are projected so that the scatter plot and the later split see every sample.
X_reduced_pca = pca.transform(x_scaled)
pca_data = pd.DataFrame(X_reduced_pca, columns=["p1", "p2"])
pca_data["target"] = y
plt.figure(figsize=(20,10))
sns.scatterplot(x="p1", y="p2" , hue="target", data=pca_data)
plt.title("PCA: p1 vs p2")
plt.show()

In [None]:
# Split the 2-D PCA features with the same test_size and random_state as the main split.
X_train_pca, X_test_pca, Y_train_pca, Y_test_pca = train_test_split(X_reduced_pca, y, test_size=test_size, random_state=42)

In [None]:
# Tune KNN on the two PCA components.
grid_pca = KNN_Best_Params(X_train_pca, X_test_pca, Y_train_pca, Y_test_pca)

In [None]:
# Decision regions of the tuned KNN in the 2-D PCA space.
cmap_light = ListedColormap(["orange", "cornflowerblue"])
cmap_bold = ListedColormap(["darkorange", "darkblue"])

h = 0.05  # step size of the evaluation mesh
X = X_reduced_pca
first_axis, second_axis = X[:,0], X[:,1]
x_min, x_max = first_axis.min() - 1, first_axis.max() + 1
y_min, y_max = second_axis.min() - 1, second_axis.max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

# Predict a class for every mesh point, then fold the result back onto the mesh.
mesh_points = np.column_stack([xx.ravel(), yy.ravel()])
Z = grid_pca.predict(mesh_points).reshape(xx.shape)

plt.figure(figsize=(12,9))
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

# Overlay the actual samples, coloured by their true class.
plt.scatter(first_axis, second_axis, c=y, cmap=cmap_bold,
            edgecolors="k", s=20)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())

best_knn_pca = grid_pca.best_estimator_
plt.title("%i-Class classification (k = %i, weights = %s)"
          % (len(np.unique(y)), best_knn_pca.n_neighbors, best_knn_pca.weights))

## <a id='10.'>10. Neighborhood Component Analysis</a>

In [None]:
# NCA is supervised: fitting it on every row would expose the test labels to
# the transformation and inflate the test score. Recover the training rows of
# the NCA train/test split (same sample count, test_size and random_state give
# the same shuffle) and fit on those only.
train_idx_nca, _ = train_test_split(np.arange(len(y)), test_size=test_size, random_state=42)

nca = NeighborhoodComponentsAnalysis(n_components=2, random_state=42)
nca.fit(x_scaled[train_idx_nca], np.asarray(y)[train_idx_nca])
# All rows are projected so that the scatter plot and the later split see every sample.
X_reduced_nca = nca.transform(x_scaled)
nca_data = pd.DataFrame(X_reduced_nca, columns=["p1", "p2"])
nca_data["target"] = y
sns.scatterplot(x="p1", y="p2", hue="target", data=nca_data)
plt.title("NCA: p1 vs p2")
plt.show()

In [None]:
# Split the 2-D NCA features with the same test_size and random_state as the main split.
X_train_nca, X_test_nca, Y_train_nca, Y_test_nca = train_test_split(X_reduced_nca, y, test_size=test_size, random_state=42)

In [None]:
# Tune KNN on the two NCA components.
grid_nca = KNN_Best_Params(X_train_nca, X_test_nca, Y_train_nca, Y_test_nca)

In [None]:
# Decision regions of the tuned KNN in the 2-D NCA space.
cmap_light = ListedColormap(["orange", "cornflowerblue"])
cmap_bold = ListedColormap(["darkorange", "darkblue"])

h = 0.2  # step size of the evaluation mesh
X = X_reduced_nca
x_min = X[:,0].min() - 1
x_max = X[:,0].max() + 1
y_min = X[:,1].min() - 1
y_max = X[:,1].max() + 1
x_ticks = np.arange(x_min, x_max, h)
y_ticks = np.arange(y_min, y_max, h)
xx, yy = np.meshgrid(x_ticks, y_ticks)

# Classify every mesh point and fold the predictions back onto the mesh.
Z = grid_nca.predict(np.column_stack((xx.ravel(), yy.ravel())))
Z = Z.reshape(xx.shape)

plt.figure(figsize=(12,9))
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

# Overlay the actual samples, coloured by their true class.
plt.scatter(X[:,0], X[:,1], c=y, cmap=cmap_bold,
            edgecolors="k", s=20)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())

n_classes = len(np.unique(y))
best_knn_nca = grid_nca.best_estimator_
plt.title("%i-Class classification (k = %i, weights = %s)"
          % (n_classes, best_knn_nca.n_neighbors, best_knn_nca.weights))

## <a id='11.'>11. Evaluating Results</a>

In [None]:
# GridSearchCV refits the best configuration on the data it was fitted with
# (X_train_nca / Y_train_nca) by default, so reuse that model instead of fitting again.
knn = grid_nca.best_estimator_
y_pred_nca = knn.predict(X_test_nca)
# accuracy_score expects (y_true, y_pred); the value is printed so it is not silently discarded.
acc_test_nca = accuracy_score(Y_test_nca, y_pred_nca)
print("NCA + KNN test accuracy: {}".format(acc_test_nca))

# One row per test sample: the two NCA components, the prediction and the true label.
test_data = pd.DataFrame({
    "X_test_nca_p1": X_test_nca[:,0],
    "X_test_nca_p2": X_test_nca[:,1],
    "y_pred_nca": y_pred_nca,
    "Y_test_nca": Y_test_nca,
})

plt.figure()
sns.scatterplot(x="X_test_nca_p1", y="X_test_nca_p2", hue="Y_test_nca", data=test_data)

# Highlight the misclassified test samples with large translucent markers.
diff = np.where(y_pred_nca != Y_test_nca)[0]
plt.scatter(test_data.iloc[diff,0], test_data.iloc[diff,1], label="Wrong Classified", alpha=0.2, color="red", s=1000)
# Rebuild the legend after the second scatter so that its label is actually shown.
plt.legend()
plt.title("NCA test set: true classes and misclassified samples")
plt.show()

## <a id='12.'>12. References</a>

- https://www.kaggle.com/kanncaa1

- https://www.udemy.com/course/machine-learning-ve-python-adan-zye-makine-ogrenmesi-4/learn/lecture/17895310#overview

- https://www.udemy.com/course/python-ile-makine-ogrenmesi-yapay-zeka-projeleri-52/

- https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.LocalOutlierFactor.html

- https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

- https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html

- https://scikit-learn.org/stable/auto_examples/neighbors/plot_nca_dim_reduction.html