# Import Libs and Data

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import ListedColormap

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis, LocalOutlierFactor
from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the dataset CSV into a DataFrame and preview its first rows.
csv_path = '../input/breast-cancer-wisconsin-data/data.csv'
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Remove the `id` and `Unnamed: 32` columns in place.
data.drop(columns=["id", "Unnamed: 32"], inplace=True)

In [None]:
# Rename the label column so later cells can refer to it as "target".
data = data.rename({"diagnosis": "target"}, axis="columns")

In [None]:
# Class balance of the label column: bar chart plus the raw counts.
plt.figure(figsize=(10, 8))
# Pass the vector by keyword: current seaborn releases interpret a lone
# positional argument as `data`, not as the variable to count.
sns.countplot(x=data['target'])
print(data.target.value_counts())

In [None]:
# Encode the label as an integer: 1 for 'M', 0 for any other value.
data['target'] = [int(label.strip() == 'M') for label in data['target']]

In [None]:
# Print the column dtypes and non-null counts of the prepared frame.
data.info()

In [None]:
# Summary statistics for every numeric column (the encoded target included).
data.describe()

# EDA

In [None]:
# Pairwise correlations of all columns, shown as a clustered heat map.
corr_matrix = data.corr()
# clustermap is a figure-level function that creates its own figure, so the
# size has to be given through `figsize`; a preceding plt.figure(...) would
# only open an extra, empty figure.
sns.clustermap(corr_matrix, annot=True, fmt=".2f", figsize=(30, 20))
plt.show()

In [None]:
# Keep only the columns whose absolute correlation with the label exceeds the
# threshold (the label itself always qualifies) and cluster their correlations.
threshold = 0.7
filtre = np.abs(corr_matrix['target']) > threshold
corr_features = corr_matrix.columns[filtre].tolist()
# clustermap creates its own figure: size it through `figsize` and title the
# figure as a whole, because plt.title would label only the axes that
# happened to be created last.
sns.clustermap(data[corr_features].corr(), annot=True, fmt=".2f", figsize=(15, 15))
plt.suptitle("Correlation Between Features w Corr Threshold 0.70")
plt.show()

In [None]:
# Reshape to long form: one row per (sample, feature) value, label kept alongside.
melt_options = dict(id_vars="target", var_name="features", value_name="value")
data_melted = pd.melt(data, **melt_options)
data_melted.head(10)

In [None]:
# Box plot of every feature's values, split by class.
plt.figure(figsize=(20, 15))
sns.boxplot(data=data_melted, x="features", y="value", hue="target")
plt.xticks(rotation=90)  # draw the x tick labels vertically
plt.show()

In [None]:
# Pairwise scatter/KDE plots of the label-correlated features, coloured by class.
# pairplot creates and sizes its own figure, so no plt.figure(...) call is
# made beforehand; it would only add a blank figure to the output.
sns.pairplot(data[corr_features], diag_kind="kde", markers="+", hue="target")
plt.show()

In [None]:
# Separate the features from the label, then score every sample with
# LocalOutlierFactor.
X = data.drop(columns=['target'])
y = data['target']
columns = list(X.columns)

out_detect = LocalOutlierFactor()
y_pred = out_detect.fit_predict(X)
X_score = out_detect.negative_outlier_factor_

# Keep the scores in a frame so that later cells can filter on them.
outlier_score = pd.DataFrame({'score': X_score})

In [None]:
# Display the labels returned by LocalOutlierFactor.fit_predict.
y_pred  # -1 = outlier | 1 = inlier

In [None]:
# Display the negative_outlier_factor_ value of every sample.
X_score

In [None]:
# Scatter of the first two feature columns.
first_col, second_col = X.iloc[:, 0], X.iloc[:, 1]
plt.figure(figsize=(10, 8))
plt.scatter(first_col, second_col, s=3, color='k', label='Data Points')

In [None]:
# Rescale the LOF scores to [0, 1] marker radii: the lowest score maps to 1,
# the highest score maps to 0.
score_max, score_min = X_score.max(), X_score.min()
radius = (score_max - X_score) / (score_max - score_min)
outlier_score["radius"] = radius

# Draw each sample as a dot with a red ring whose area grows with its radius.
feat_a, feat_b = X.iloc[:, 0], X.iloc[:, 1]
plt.figure(figsize=(15, 12))
plt.scatter(feat_a, feat_b, color='k', s=3, label='Data Points')
plt.scatter(feat_a, feat_b, s=1000 * radius, edgecolors="r", facecolors="none", label="Outlier Score")
plt.legend()
plt.show()

In [None]:
# Flag the samples whose LOF score is below the cut-off and highlight them
# on top of the ring plot.
threshold = -2
filtre = outlier_score["score"] < threshold
outlier_index = outlier_score.loc[filtre].index.tolist()

col_0, col_1 = X.iloc[:, 0], X.iloc[:, 1]
plt.figure(figsize=(15, 12))
plt.scatter(col_0.iloc[outlier_index], col_1.iloc[outlier_index], color='blue', facecolors="blue", label='Outlier')
plt.scatter(col_0, col_1, color='k', s=3, label='Data Points')
plt.scatter(col_0, col_1, s=1000 * radius, edgecolors="r", facecolors="none", label="Outlier Score")
plt.legend()
plt.show()

In [None]:
# Discard the flagged rows from the features and the labels; y is kept as a
# plain NumPy array from here on.
X = X.drop(index=outlier_index)
y = y.drop(index=outlier_index).values

# Train - Test Split

In [None]:
# Hold out 20% of the samples for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=66,
)

## Scale Data

In [None]:
# Standardise the features: the statistics are learned from the training
# split only and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Wrap the scaled training matrix in a frame with the original column names
# and summarise it.
X_train_df = pd.DataFrame(data=X_train, columns=columns)
X_train_df.describe()

In [None]:
# Attach the labels to the scaled training frame and reshape it to long form.
X_train_df['target'] = y_train
melt_options = dict(id_vars="target", var_name="features", value_name="value")
data_melted = pd.melt(X_train_df, **melt_options)

# Box plot of every scaled feature, split by class.
plt.figure(figsize=(16, 12))
sns.boxplot(data=data_melted, x="features", y="value", hue="target")
plt.xticks(rotation=90)
plt.show()

In [None]:
# Pair plot of the label-correlated features after scaling, coloured by class.
# pairplot creates and sizes its own figure, so no plt.figure(...) call is
# made beforehand; it would only add a blank figure to the output.
sns.pairplot(X_train_df[corr_features], diag_kind="kde", markers="+", hue="target")
plt.show()

# Base KNN Model

In [None]:
# Baseline: a 2-nearest-neighbour classifier trained on the scaled features
# and evaluated on the held-out split.
knn = KNeighborsClassifier(n_neighbors=2)
y_pred = knn.fit(X_train, y_train).predict(X_test)

acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print("Accuracy : ", acc)
print("Confusion Matrix : ",cm)

### Best Params

In [None]:
def KNN_Best_Params(x_train,x_test,y_train,y_test):
    """Grid-search a KNN classifier and print its train/test performance.

    The search covers ``n_neighbors`` in 1..29 and ``weights`` in
    {"uniform", "distance"} using 10-fold cross-validation scored by
    accuracy. The best cross-validation score and parameters are printed,
    followed by the accuracy and confusion matrix of the winning model on
    both the training and the test split.

    Parameters
    ----------
    x_train, y_train
        Training features and labels; the grid search is run on these.
    x_test, y_test
        Held-out features and labels; used only for the printed report.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    param_grid = {
        "n_neighbors": list(range(1, 30)),
        "weights": ["uniform", "distance"],
    }
    print()

    grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=10, scoring="accuracy")
    grid.fit(x_train, y_train)

    print("Best training Score : {} with parameters : {}".format(grid.best_score_, grid.best_params_))
    print()

    # With the default refit=True the search has already retrained the winning
    # configuration on all of x_train, so reuse that model instead of fitting
    # an identical one a second time.
    best_knn = grid.best_estimator_

    y_pred_train = best_knn.predict(x_train)
    y_pred_test = best_knn.predict(x_test)

    cm_train = confusion_matrix(y_train, y_pred_train)
    cm_test = confusion_matrix(y_test, y_pred_test)

    acc_train = accuracy_score(y_train, y_pred_train)
    acc_test = accuracy_score(y_test, y_pred_test)

    print("Train Acc : {} , Test Acc : {}".format(acc_train, acc_test))
    print()
    print("Train Confusion Matrix ", cm_train)
    print()
    print("Test Confusion Matrix ", cm_test)

    return grid

In [None]:
# Tune KNN on the scaled full-feature split; the call prints the report and
# the fitted search object is kept in `grid`.
grid = KNN_Best_Params(X_train, X_test, y_train, y_test)

# PCA

In [None]:
# Learn the scaling and the PCA projection from the training rows only, then
# project every row. Fitting them on the full matrix would let the test
# samples shape the features the classifier is later evaluated on.
# NOTE: test_size and random_state mirror the split applied to X_reduced_pca
# below; the shuffle depends only on the row count and the seed, so these are
# exactly the rows that end up in X_train_pca.
train_rows, held_out_rows = train_test_split(
    np.arange(len(X)), test_size=0.2, random_state=66
)

scaler = StandardScaler()
scaler.fit(X.iloc[train_rows])
X_scaled = scaler.transform(X)

pca = PCA(n_components=2)
pca.fit(X_scaled[train_rows])
X_reduced_pca = pca.transform(X_scaled)

# Two-column frame of the projected samples with their labels, for plotting.
pca_data = pd.DataFrame(X_reduced_pca, columns=["p1", "p2"])
pca_data['target'] = y

In [None]:
# Samples in the plane of the two principal components, coloured by class.
plt.figure(figsize=(20, 10))
sns.scatterplot(data=pca_data, x="p1", y="p2", hue="target")

## PCA Train - Test Split

In [None]:
# Split the PCA-reduced samples with the same proportions and seed as the
# full-feature split.
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(
    X_reduced_pca,
    y,
    test_size=0.2,
    random_state=66,
)

In [None]:
# Tune KNN on the two PCA components; the fitted search object is reused by
# the decision-region plot.
grid_pca = KNN_Best_Params(X_train_pca, X_test_pca, y_train_pca, y_test_pca)

In [None]:
# visualize: decision regions of the tuned KNN in the 2-D PCA plane
cmap_light = ListedColormap(['orange',  'cornflowerblue'])
cmap_bold = ListedColormap(['darkorange', 'darkblue'])

h = .05 # step size in the mesh
# Use a dedicated name for the projected points: rebinding the global X here
# would replace the feature table that earlier cells read when re-run.
points_2d = X_reduced_pca
x_min, x_max = points_2d[:, 0].min() - 1, points_2d[:, 0].max() + 1
y_min, y_max = points_2d[:, 1].min() - 1, points_2d[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

# Classify every node of the mesh with the tuned model
Z = grid_pca.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(figsize=(20,12))
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

# Overlay every sample (train and test alike), coloured by its true label
plt.scatter(points_2d[:, 0], points_2d[:, 1], c=y, cmap=cmap_bold,
            edgecolor='k', s=20)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("%i-Class classification (k = %i, weights = '%s')"
          % (len(np.unique(y)),grid_pca.best_estimator_.n_neighbors, grid_pca.best_estimator_.weights))
plt.show()

# NCA

In [None]:
# NCA is supervised, so it may only see training labels: fit it on the
# training rows and then project every row. Fitting on all samples would leak
# the test labels into the features the classifier is evaluated on.
# NOTE: test_size and random_state mirror the split applied to X_reduced_nca
# below; the shuffle depends only on the row count and the seed, so these are
# exactly the rows that end up in X_train_nca.
train_rows, held_out_rows = train_test_split(
    np.arange(len(y)), test_size=0.2, random_state=66
)

nca = NeighborhoodComponentsAnalysis(n_components=2, random_state=42)
nca.fit(X_scaled[train_rows], np.asarray(y)[train_rows])
X_reduced_nca = nca.transform(X_scaled)

# Two-column frame of the projected samples with their labels, for plotting.
nca_data = pd.DataFrame(X_reduced_nca, columns=["p1", "p2"])
nca_data['target'] = y

In [None]:
# Samples in the plane of the two NCA components, coloured by class.
plt.figure(figsize=(20, 10))
sns.scatterplot(data=nca_data, x="p1", y="p2", hue="target")

In [None]:
# Split the NCA-reduced samples with the same proportions and seed as the
# other splits.
X_train_nca, X_test_nca, Y_train_nca, Y_test_nca = train_test_split(
    X_reduced_nca,
    y,
    test_size=0.2,
    random_state=66,
)

In [None]:
# Tune KNN on the two NCA components; the fitted search object is reused by
# the decision-region plot.
grid_nca = KNN_Best_Params(X_train_nca, X_test_nca, Y_train_nca, Y_test_nca)

In [None]:
# visualize: decision regions of the tuned KNN in the 2-D NCA plane
cmap_light = ListedColormap(['orange',  'cornflowerblue'])
cmap_bold = ListedColormap(['darkorange', 'darkblue'])

h = .2 # step size in the mesh
# Use a dedicated name for the projected points: rebinding the global X here
# would replace the feature table that earlier cells read when re-run.
points_2d = X_reduced_nca
x_min, x_max = points_2d[:, 0].min() - 1, points_2d[:, 0].max() + 1
y_min, y_max = points_2d[:, 1].min() - 1, points_2d[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

# Classify every node of the mesh with the tuned model
Z = grid_nca.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(figsize = (20,12))
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

# Overlay every sample (train and test alike), coloured by its true label
plt.scatter(points_2d[:, 0], points_2d[:, 1], c=y, cmap=cmap_bold,
            edgecolor='k', s=20)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("%i-Class classification (k = %i, weights = '%s')"
          % (len(np.unique(y)),grid_nca.best_estimator_.n_neighbors, grid_nca.best_estimator_.weights))