In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

In [2]:
df = pd.read_csv("Breast-Cancer-Dataset.csv",header=None)


In [3]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [4]:
df.describe()

Unnamed: 0,0,1,2,3,4,5,7,8,9,10
count,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0
mean,1071704.0,4.41774,3.134478,3.207439,2.806867,3.216023,3.437768,2.866953,1.589413,2.689557
std,617095.7,2.815741,3.051459,2.971913,2.855379,2.2143,2.438364,3.053634,1.715078,0.951273
min,61634.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,870688.5,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0
50%,1171710.0,4.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0,2.0
75%,1238298.0,6.0,5.0,5.0,4.0,4.0,5.0,4.0,1.0,4.0
max,13454350.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


In [5]:
col_names = ['id', 'thickness', 'cell_size', 'cell_shape', 'marginal_adhesion', 
             'eingle_epithelial_cell_size', 'bare_nuclei', 'bland_chromatin', 'normal_nucleoli', 'mitoses', 'Class']

In [6]:
df.columns = col_names

In [7]:
df.head()

Unnamed: 0,id,thickness,cell_size,cell_shape,marginal_adhesion,eingle_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [8]:
df.isna().sum()

id                             0
thickness                      0
cell_size                      0
cell_shape                     0
marginal_adhesion              0
eingle_epithelial_cell_size    0
bare_nuclei                    0
bland_chromatin                0
normal_nucleoli                0
mitoses                        0
Class                          0
dtype: int64

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   id                           699 non-null    int64 
 1   thickness                    699 non-null    int64 
 2   cell_size                    699 non-null    int64 
 3   cell_shape                   699 non-null    int64 
 4   marginal_adhesion            699 non-null    int64 
 5   eingle_epithelial_cell_size  699 non-null    int64 
 6   bare_nuclei                  699 non-null    object
 7   bland_chromatin              699 non-null    int64 
 8   normal_nucleoli              699 non-null    int64 
 9   mitoses                      699 non-null    int64 
 10  Class                        699 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 60.2+ KB


In [10]:
df['bare_nuclei'] = pd.to_numeric(df['bare_nuclei'], errors='coerce')

In [11]:
df.dtypes

id                               int64
thickness                        int64
cell_size                        int64
cell_shape                       int64
marginal_adhesion                int64
eingle_epithelial_cell_size      int64
bare_nuclei                    float64
bland_chromatin                  int64
normal_nucleoli                  int64
mitoses                          int64
Class                            int64
dtype: object

In [12]:
df.isna().sum()

id                              0
thickness                       0
cell_size                       0
cell_shape                      0
marginal_adhesion               0
eingle_epithelial_cell_size     0
bare_nuclei                    16
bland_chromatin                 0
normal_nucleoli                 0
mitoses                         0
Class                           0
dtype: int64

In [13]:
df['bare_nuclei'] = df['bare_nuclei'].fillna(df['bare_nuclei'].median())

In [14]:
df.isna().sum()

id                             0
thickness                      0
cell_size                      0
cell_shape                     0
marginal_adhesion              0
eingle_epithelial_cell_size    0
bare_nuclei                    0
bland_chromatin                0
normal_nucleoli                0
mitoses                        0
Class                          0
dtype: int64

In [15]:
X = df.drop(['Class'], axis=1)
Y = df['Class']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

In [17]:
X_train.shape

(559, 10)

In [18]:
cols = X_train.columns

In [19]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [20]:
X_train

array([[-1.08564892,  2.02838346,  0.2995061 , ..., -0.57777378,
         0.04124139, -0.32425801],
       [ 0.13804167,  1.66945141,  2.25768045, ..., -0.1599529 ,
         0.04124139, -0.32425801],
       [-0.13312821, -1.20200501, -0.67958108, ..., -0.99559467,
        -0.60816532, -0.32425801],
       ...,
       [ 0.38253213, -0.12520886, -0.67958108, ..., -0.99559467,
        -0.60816532, -0.32425801],
       [ 0.62721986,  0.2337232 , -0.67958108, ..., -0.57777378,
        -0.60816532, -0.32425801],
       [-1.4049915 , -1.20200501, -0.67958108, ..., -0.99559467,
        -0.60816532, -0.32425801]])

In [21]:
X_train = pd.DataFrame(X_train, columns=[cols])

In [22]:
X_test = pd.DataFrame(X_test, columns=[cols])

In [23]:
X_train.head()

Unnamed: 0,id,thickness,cell_size,cell_shape,marginal_adhesion,eingle_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses
0,-1.085649,2.028383,0.299506,0.289573,1.119077,-0.546543,1.858357,-0.577774,0.041241,-0.324258
1,0.138042,1.669451,2.25768,2.304569,-0.622471,3.106879,1.297589,-0.159953,0.041241,-0.324258
2,-0.133128,-1.202005,-0.679581,-0.717925,0.074148,-1.00322,-0.104329,-0.995595,-0.608165,-0.324258
3,0.474433,-0.125209,-0.026856,-0.04626,-0.622471,-0.546543,-0.665096,-0.159953,0.041241,-0.324258
4,-0.684445,0.233723,-0.353219,-0.382092,-0.274161,-0.546543,-0.665096,-0.577774,-0.283462,-0.324258


In [24]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

In [25]:
y_pred = knn.predict(X_test)
y_pred

array([2, 2, 4, 2, 4, 2, 4, 2, 4, 4, 2, 2, 4, 4, 4, 2, 2, 4, 4, 2, 4, 4,
       2, 2, 2, 4, 2, 2, 4, 4, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2,
       4, 4, 2, 4, 2, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 4, 4, 4,
       4, 2, 2, 4, 2, 2, 4, 4, 2, 2, 2, 2, 4, 2, 2, 2, 4, 2, 2, 2, 4, 2,
       4, 4, 2, 2, 2, 4, 2, 2, 2, 4, 2, 4, 4, 2, 2, 2, 4, 2, 2, 2, 2, 2,
       4, 4, 4, 2, 2, 2, 2, 2, 4, 2, 4, 4, 2, 4, 2, 2, 4, 4, 4, 4, 4, 2,
       2, 4, 4, 2, 2, 4, 2, 2])

In [26]:
print('Training set score: {:.4f}'.format(knn.score(X_train, y_train)))
print('Test set score: {:.4f}'.format(knn.score(X_test, y_test)))

Training set score: 0.9785
Test set score: 0.9643
