In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from warnings import filterwarnings
filterwarnings('ignore')

In [None]:
# Read the raw CSV, then discard the 'Unnamed: 32' column.
df = pd.read_csv('../input/breast-cancer-wisconsin-data/data.csv')
df = df.drop(columns=['Unnamed: 32'])

In [None]:
# Use the 'id' column as the row index so it is not treated as a feature.
df = df.set_index('id')

In [None]:
# Display the frame's dimensions as a (rows, columns) tuple.
df.shape

In [None]:
# Preview the first five rows (five is the default for head()).
df.head()

In [None]:
# Print the per-column summary (dtype and non-null count) of the frame.
df.info()

# EDA

In [None]:
# Independent copy for plotting, so relabelling 'diagnosis' for the charts
# does not alter df.
data = df.copy(deep=True)

In [None]:
# Spell out the diagnosis codes so the chart shows readable category names.
data['diagnosis'] = data['diagnosis'].replace({'M': 'Malignant', 'B': 'Benign'})

# Count of cases per diagnosis.
plt.figure(figsize=(5, 4))
sns.histplot(data=data, x='diagnosis')
plt.title('Diagnosis distribution')
plt.xlabel('Diagnosis')
plt.ylabel('Number of Cases')
plt.show()

In [None]:
# Histogram of every column after the first, coloured by diagnosis, drawn on
# a 10 x 3 grid of axes that is filled row by row.
# NOTE(review): columns[1:] presumably skips 'diagnosis' itself — confirm
# against the CSV column order.
fig, ax = plt.subplots(nrows=10, ncols=3, figsize=(25, 45))
for idx, col in enumerate(data.columns[1:]):
    row, col_ = divmod(idx, 3)
    sns.histplot(data, x=data[col], hue="diagnosis", element="poly", stat="count",
                 palette='rocket', ax=ax[row][col_])

# Outlier Analysis

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
df.describe().transpose()

In [None]:
# Box plot of every column after the first, drawn on a 10 x 3 grid of axes
# that is filled row by row.
# The series is passed explicitly as ``x``: a positional first argument is
# interpreted as ``x`` by older seaborn releases but as ``data`` by newer
# ones, which would change the orientation of the plots between versions.
fig, ax = plt.subplots(10, 3, figsize=(25, 45))
for idx, col in enumerate(df.columns[1:]):
    sns.boxplot(x=df[col], ax=ax.flat[idx])

In [None]:
def thresholds(df, col_name, q1=0.05, q3=0.90):
    """Cap one column of ``df`` at quantile-based outlier fences.

    The lower fence sits 1.5 inter-quantile ranges below the ``q1`` quantile
    of ``df[col_name]`` and the upper fence 1.5 ranges above the ``q3``
    quantile. Values beyond a fence are overwritten with the fence value.

    Parameters
    ----------
    df : DataFrame whose column is capped. It is modified in place.
    col_name : label of the column to cap.
    q1, q3 : lower and upper quantile levels used to build the fences.

    Returns
    -------
    The same ``df`` object that was passed in.
    """
    lower_q = df[col_name].quantile(q1)
    upper_q = df[col_name].quantile(q3)
    fence_width = 1.5 * (upper_q - lower_q)
    floor = lower_q - fence_width
    ceiling = upper_q + fence_width

    # Raise the low values first; the high mask is evaluated afterwards on
    # the already-updated column.
    too_low = df[col_name] < floor
    df.loc[too_low, col_name] = floor
    too_high = df[col_name] > ceiling
    df.loc[too_high, col_name] = ceiling
    return df

In [None]:
# Cap every column after the first, one at a time.
# thresholds() edits df in place and hands the same frame back, so once the
# loop finishes df1 is just another name for df, not an independent copy.
for col in df.columns[1:]:
    df1 = thresholds(df, col, q1=0.05, q3=0.90)

In [None]:
# Box plot of every column of the capped frame after the first, drawn on a
# 10 x 3 grid of axes that is filled row by row.
# The series is passed explicitly as ``x``: a positional first argument is
# interpreted as ``x`` by older seaborn releases but as ``data`` by newer
# ones, which would change the orientation of the plots between versions.
fig, ax = plt.subplots(10, 3, figsize=(25, 45))
for idx, col in enumerate(df1.columns[1:]):
    sns.boxplot(x=df1[col], ax=ax.flat[idx])

# Local Outlier Factor

In [None]:
# Encode the target as real integers (M -> 1, B -> 0) rather than the strings
# '1'/'0', so later numeric steps do not depend on implicit string-to-number
# coercion. astype(int) fails loudly if any other label is present.
# Re-running the cell is harmless: already-encoded values pass through as-is.
df['diagnosis'] = df['diagnosis'].replace({'M': 1, 'B': 0}).astype(int)

In [None]:
from sklearn.neighbors import LocalOutlierFactor

In [None]:
def local_outlier_factor(df, n_neighs=10):
    """Return ``df`` without the rows Local Outlier Factor marks as outliers.

    A ``LocalOutlierFactor`` using ``n_neighs`` neighbours is fitted on every
    column of ``df``. Rows predicted as ``-1`` are removed by index label and
    the number of flagged rows is printed. ``df`` itself is left unchanged;
    a new frame is returned.
    """
    detector = LocalOutlierFactor(n_neighbors=n_neighs)
    labels = detector.fit_predict(df)
    flagged = df[labels == -1]
    kept = df.drop(index=flagged.index.tolist())
    print(f'{flagged.shape[0]} rows deleted')
    return kept

In [None]:
# Remove the rows flagged by LOF, using the function's default neighbour count.
# NOTE(review): df still carries the 'diagnosis' column at this point, so the
# label is passed to the outlier detector alongside the features — confirm
# that this is intended.
lof_data=local_outlier_factor(df)

In [None]:
# Dimensions (rows, columns) of the frame that remains after LOF filtering.
lof_data.shape

# Scaling

In [None]:
from sklearn.preprocessing import RobustScaler

In [None]:
def scaler(df):
    """Return a copy of ``df`` with every non-target column robust-scaled.

    All columns except ``'diagnosis'`` are passed through a freshly fitted
    ``RobustScaler``; the ``'diagnosis'`` column is carried over unchanged.

    The input frame is not modified: the scaled values are written into a
    copy, so the caller's unscaled frame stays available and re-running the
    calling cell always starts from the same data.

    Parameters
    ----------
    df : DataFrame holding the feature columns and, optionally, 'diagnosis'.

    Returns
    -------
    A new DataFrame with the same index and columns as ``df``.
    """
    col_names = [col for col in df.columns if col != 'diagnosis']
    scaled = df.copy()
    if not col_names:
        # Nothing to scale; fitting on an empty column selection would fail.
        return scaled
    rs = RobustScaler()
    scaled[col_names] = rs.fit_transform(df[col_names])
    return scaled

In [None]:
# Scale the feature columns of the LOF-filtered frame; 'diagnosis' is not scaled.
scaled_data=scaler(lof_data)

In [None]:
# Dimensions (rows, columns) of the scaled frame.
scaled_data.shape

# Building Model

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score

In [None]:
# Separate the target column from the feature matrix and show both shapes.
X = scaled_data.drop(columns=['diagnosis'])
Y = scaled_data['diagnosis']
print(Y.shape, X.shape)

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Hyper-parameter grid explored for the k-nearest-neighbours classifier.
knn_params = dict(
    n_neighbors=[k for k in range(2, 6)],
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
    leaf_size=[size for size in range(1, 6)],
)

In [None]:
# Search the KNN grid with 5-fold cross-validation on the training split.
clf = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=knn_params,
    cv=5,
    return_train_score=True,
)
clf.fit(X_train, Y_train)

# Score the fitted search object on the held-out test split.
Y_test_pred = clf.predict(X_test)
acc_score = accuracy_score(y_true=Y_test, y_pred=Y_test_pred)
print(f'Accuracy Score: {np.round(acc_score*100,2)}%')
print(f'Best params: {clf.best_params_}')