In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


In [10]:
df= pd.read_csv('breast-cancer-wisconsin.data',header=None) 
df.columns =['id','clump_thickness','unif_cell_Size','unif_cell_shape','Marginal_adhesion','single_epi_cell_size','Bare_nuclei','Bland_chromatin',' Normal_Nucleoli','Mitoses','Class']
#index_col =0 means it will  set the first column as the index
#header = none means no header is used for the data
#we can add the header to the data later

# Attribute Information:

#1. Sample code number: id number
#2. Clump Thickness: 1 - 10
#3. Uniformity of Cell Size: 1 - 10
#4. Uniformity of Cell Shape: 1 - 10
#5. Marginal Adhesion: 1 - 10
#6. Single Epithelial Cell Size: 1 - 10
#7. Bare Nuclei: 1 - 10
#8. Bland Chromatin: 1 - 10
#9. Normal Nucleoli: 1 - 10
#10. Mitoses: 1 - 10
#11. Class: (2 for benign, 4 for malignant)

# all data scaled to a range of 1-10
#Benign-458(65.5%),Malignant-241(34.5%)

In [9]:
df.head()

Unnamed: 0,id,clump_thickness,unif_cell_Size,unif_cell_shape,Marginal_adhesion,single_epi_cell_size,Bare_nuclei,Bland_chromatin,Normal_Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [34]:
df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 10 columns):
clump_thickness         699 non-null int64
unif_cell_Size          699 non-null int64
unif_cell_shape         699 non-null int64
Marginal_adhesion       699 non-null int64
single_epi_cell_size    699 non-null int64
Bare_nuclei             699 non-null object
Bland_chromatin         699 non-null int64
 Normal_Nucleoli        699 non-null int64
Mitoses                 699 non-null int64
Class                   699 non-null int64
dtypes: int64(9), object(1)
memory usage: 54.7+ KB


In [11]:
#Preprocessing steps

#remove the ? with a value of -99999 to denote the missing #16 values 
#remove the unwanted columnns of data like 
#--Id -> has no effect on cancer

In [15]:
df.replace('?',-99999,inplace=True) #basically replacing the missing data with a large outlier

In [13]:
df.drop(['id'],1,inplace=True)

In [14]:
df.head()

Unnamed: 0,clump_thickness,unif_cell_Size,unif_cell_shape,Marginal_adhesion,single_epi_cell_size,Bare_nuclei,Bland_chromatin,Normal_Nucleoli,Mitoses,Class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2


In [19]:
X=np.array(df.drop(['Class'],1)) #every cols except class
y=np.array(df['Class'])

In [20]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test =train_test_split(X,y,test_size =0.20)

In [23]:
from sklearn.neighbors import KNeighborsClassifier
knn=KNeighborsClassifier(n_neighbors=1)  # use the value of K =1

In [31]:
clf = knn.fit(X_train,y_train) #clf is the classifier

In [25]:
#evaluation
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import cross_val_score

In [32]:
predictions=knn.predict(X_test)
print(confusion_matrix(y_test,predictions))


[[92  3]
 [ 3 42]]


In [30]:
clf.score(X_test,y_test)  #accuracy

0.9571428571428572

In [33]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           2       0.97      0.97      0.97        95
           4       0.93      0.93      0.93        45

   micro avg       0.96      0.96      0.96       140
   macro avg       0.95      0.95      0.95       140
weighted avg       0.96      0.96      0.96       140



In [36]:
#test on some unknown data set
example_measures= np.array([[4,2,1,1,1,2,3,2,1],[4,2,1,2,2,2,3,2,1]])
print(example_measures)

[[4 2 1 1 1 2 3 2 1]
 [4 2 1 2 2 2 3 2 1]]


In [38]:
prediction = clf.predict(example_measures)
print(prediction)

[2 2]
