In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly_express as px
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import zscore
%matplotlib inline

In [2]:
# Setting nearest neighbors as 5 and weight as distance for the KNN model

knn = KNeighborsClassifier(n_neighbors = 5, weights = 'distance')

In [3]:
df = pd.read_csv('wisc_bc_data.csv')

In [4]:
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,points_worst,symmetry_worst,dimension_worst
0,87139402,B,12.32,12.39,78.85,464.1,0.1028,0.06981,0.03987,0.037,...,13.5,15.64,86.97,549.1,0.1385,0.1266,0.1242,0.09391,0.2827,0.06771
1,8910251,B,10.6,18.95,69.28,346.4,0.09688,0.1147,0.06387,0.02642,...,11.88,22.94,78.28,424.8,0.1213,0.2515,0.1916,0.07926,0.294,0.07587
2,905520,B,11.04,16.83,70.92,373.2,0.1077,0.07804,0.03046,0.0248,...,12.41,26.44,79.93,471.4,0.1369,0.1482,0.1067,0.07431,0.2998,0.07881
3,868871,B,11.28,13.39,73.0,384.8,0.1164,0.1136,0.04635,0.04796,...,11.92,15.77,76.53,434.0,0.1367,0.1822,0.08669,0.08611,0.2102,0.06784
4,9012568,B,15.19,13.21,97.65,711.8,0.07963,0.06934,0.03393,0.02657,...,16.2,15.73,104.5,819.1,0.1126,0.1737,0.1362,0.08178,0.2487,0.06766


In [5]:
df.shape

(569, 32)

In [6]:
df['diagnosis'].value_counts()

B    357
M    212
Name: diagnosis, dtype: int64

In [7]:
df.dtypes

id                     int64
diagnosis             object
radius_mean          float64
texture_mean         float64
perimeter_mean       float64
area_mean            float64
smoothness_mean      float64
compactness_mean     float64
concavity_mean       float64
points_mean          float64
symmetry_mean        float64
dimension_mean       float64
radius_se            float64
texture_se           float64
perimeter_se         float64
area_se              float64
smoothness_se        float64
compactness_se       float64
concavity_se         float64
points_se            float64
symmetry_se          float64
dimension_se         float64
radius_worst         float64
texture_worst        float64
perimeter_worst      float64
area_worst           float64
smoothness_worst     float64
compactness_worst    float64
concavity_worst      float64
points_worst         float64
symmetry_worst       float64
dimension_worst      float64
dtype: object

In [8]:
# Changing the datatype of 'diagnosis' to category which will classify its value as 0 or 1 in the backend

df['diagnosis'] = df['diagnosis'].astype('category')

In [9]:
df.dtypes

id                      int64
diagnosis            category
radius_mean           float64
texture_mean          float64
perimeter_mean        float64
area_mean             float64
smoothness_mean       float64
compactness_mean      float64
concavity_mean        float64
points_mean           float64
symmetry_mean         float64
dimension_mean        float64
radius_se             float64
texture_se            float64
perimeter_se          float64
area_se               float64
smoothness_se         float64
compactness_se        float64
concavity_se          float64
points_se             float64
symmetry_se           float64
dimension_se          float64
radius_worst          float64
texture_worst         float64
perimeter_worst       float64
area_worst            float64
smoothness_worst      float64
compactness_worst     float64
concavity_worst       float64
points_worst          float64
symmetry_worst        float64
dimension_worst       float64
dtype: object

In [10]:
# Getting the statistics

df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,569.0,30371830.0,125020600.0,8670.0,869218.0,906024.0,8813129.0,911320500.0
radius_mean,569.0,14.12729,3.524049,6.981,11.7,13.37,15.78,28.11
texture_mean,569.0,19.28965,4.301036,9.71,16.17,18.84,21.8,39.28
perimeter_mean,569.0,91.96903,24.29898,43.79,75.17,86.24,104.1,188.5
area_mean,569.0,654.8891,351.9141,143.5,420.3,551.1,782.7,2501.0
smoothness_mean,569.0,0.09636028,0.01406413,0.05263,0.08637,0.09587,0.1053,0.1634
compactness_mean,569.0,0.104341,0.05281276,0.01938,0.06492,0.09263,0.1304,0.3454
concavity_mean,569.0,0.08879932,0.07971981,0.0,0.02956,0.06154,0.1307,0.4268
points_mean,569.0,0.04891915,0.03880284,0.0,0.02031,0.0335,0.074,0.2012
symmetry_mean,569.0,0.1811619,0.02741428,0.106,0.1619,0.1792,0.1957,0.304


In [11]:
df.groupby(df['diagnosis']).count()

Unnamed: 0_level_0,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,points_worst,symmetry_worst,dimension_worst
diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B,357,357,357,357,357,357,357,357,357,357,...,357,357,357,357,357,357,357,357,357,357
M,212,212,212,212,212,212,212,212,212,212,...,212,212,212,212,212,212,212,212,212,212


In [12]:
# Dropping the 'id' column as it is not required

df = df.drop('id', axis = 1)

In [13]:
df.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,points_worst,symmetry_worst,dimension_worst
0,B,12.32,12.39,78.85,464.1,0.1028,0.06981,0.03987,0.037,0.1959,...,13.5,15.64,86.97,549.1,0.1385,0.1266,0.1242,0.09391,0.2827,0.06771
1,B,10.6,18.95,69.28,346.4,0.09688,0.1147,0.06387,0.02642,0.1922,...,11.88,22.94,78.28,424.8,0.1213,0.2515,0.1916,0.07926,0.294,0.07587
2,B,11.04,16.83,70.92,373.2,0.1077,0.07804,0.03046,0.0248,0.1714,...,12.41,26.44,79.93,471.4,0.1369,0.1482,0.1067,0.07431,0.2998,0.07881
3,B,11.28,13.39,73.0,384.8,0.1164,0.1136,0.04635,0.04796,0.1771,...,11.92,15.77,76.53,434.0,0.1367,0.1822,0.08669,0.08611,0.2102,0.06784
4,B,15.19,13.21,97.65,711.8,0.07963,0.06934,0.03393,0.02657,0.1721,...,16.2,15.73,104.5,819.1,0.1126,0.1737,0.1362,0.08178,0.2487,0.06766


In [14]:
# Creating a new dataframe which contains all the independent variable

df_i = df.drop('diagnosis', axis = 1)
df_i.head()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,points_mean,symmetry_mean,dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,points_worst,symmetry_worst,dimension_worst
0,12.32,12.39,78.85,464.1,0.1028,0.06981,0.03987,0.037,0.1959,0.05955,...,13.5,15.64,86.97,549.1,0.1385,0.1266,0.1242,0.09391,0.2827,0.06771
1,10.6,18.95,69.28,346.4,0.09688,0.1147,0.06387,0.02642,0.1922,0.06491,...,11.88,22.94,78.28,424.8,0.1213,0.2515,0.1916,0.07926,0.294,0.07587
2,11.04,16.83,70.92,373.2,0.1077,0.07804,0.03046,0.0248,0.1714,0.0634,...,12.41,26.44,79.93,471.4,0.1369,0.1482,0.1067,0.07431,0.2998,0.07881
3,11.28,13.39,73.0,384.8,0.1164,0.1136,0.04635,0.04796,0.1771,0.06072,...,11.92,15.77,76.53,434.0,0.1367,0.1822,0.08669,0.08611,0.2102,0.06784
4,15.19,13.21,97.65,711.8,0.07963,0.06934,0.03393,0.02657,0.1721,0.05544,...,16.2,15.73,104.5,819.1,0.1126,0.1737,0.1362,0.08178,0.2487,0.06766


In [15]:
# Creating a new dataframe which contains the dependent variable

df_d = df['diagnosis']
df_d.head()

0    B
1    B
2    B
3    B
4    B
Name: diagnosis, dtype: category
Categories (2, object): ['B', 'M']

In [16]:
# Converting the featues into zscore as we don't know the unit in which they were stored
# Storing them in new dataframe

df_z = df_i.apply(zscore)
df_z.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
radius_mean,569.0,-1.576556e-16,1.00088,-2.029648,-0.689385,-0.215082,0.469393,3.971288
texture_mean,569.0,-7.711075e-16,1.00088,-2.229249,-0.725963,-0.104636,0.584176,4.651889
perimeter_mean,569.0,-1.186319e-16,1.00088,-1.984504,-0.691956,-0.23598,0.499677,3.97613
area_mean,569.0,-2.02923e-16,1.00088,-1.454443,-0.667195,-0.295187,0.363507,5.250529
smoothness_mean,569.0,1.717041e-16,1.00088,-3.112085,-0.710963,-0.034891,0.636199,4.770911
compactness_mean,569.0,1.935573e-16,1.00088,-1.610136,-0.747086,-0.22194,0.493857,4.568425
concavity_mean,569.0,4.05846e-17,1.00088,-1.114873,-0.743748,-0.34224,0.526062,4.243589
points_mean,569.0,-5.619407e-17,1.00088,-1.26182,-0.737944,-0.397721,0.646935,3.92793
symmetry_mean,569.0,1.638994e-16,1.00088,-2.744117,-0.70324,-0.071627,0.530779,4.484751
dimension_mean,569.0,-1.492265e-15,1.00088,-1.819865,-0.722639,-0.178279,0.470983,4.910919


In [17]:
# Storing the independent and dependent variables in numpy array

x = np.array(df_i)
y = np.array(df_d)

In [18]:
x.shape

(569, 30)

In [19]:
y.shape

(569,)

In [20]:
# Splitting the dataset into test and train

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.30, random_state = 1)

In [21]:
# Training the model

knn.fit(x_train, y_train)

KNeighborsClassifier(weights='distance')

In [22]:
# Predicting the accuracy

predict = knn.predict(x_test)
knn.score(x_test, y_test)

0.9239766081871345

In [23]:
# Printing the confusion matrix

from sklearn import metrics
print (metrics.confusion_matrix(y_test, predict))

[[108   6]
 [  7  50]]


# Iteration - 2

In [24]:
# For better accuracy
# We may drop 'radius_mean' as area and perimeter are the functions of radius

df_r = df_i.drop('radius_mean', axis = 1)

In [25]:
x = np.array(df_r)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 1)

In [26]:
knn = KNeighborsClassifier(n_neighbors = 20, weights = 'distance')
knn.fit(x_train, y_train)

KNeighborsClassifier(n_neighbors=20, weights='distance')

In [27]:
predict = knn.predict(x_test)

In [28]:
knn.score(x_test, y_test)

0.9415204678362573

In [29]:
# Printing the confusion matrix

print (pd.DataFrame(metrics.confusion_matrix(y_test, predict, labels = ['M', 'B']), index = ['True:Yes', 'True:No'], columns = ['Pred:Yes', 'Pred:No']))

          Pred:Yes  Pred:No
True:Yes        51        6
True:No          4      110


In [30]:
# Performance dropped

In [49]:
# Selecting the best k-value for the model

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
l = list(range(1, 50))  # List having nos. between 1 to 50
cv_score = []
k_neigh = []
for i in l:
    k = KNeighborsClassifier(n_neighbors = i)      # Check the model for eack i value 
    scores = cross_val_score(k, x_train, y_train, cv = 10, scoring = 'accuracy')  # Get the model score
    cv_score.append(scores.mean())  # Append cv_score with mean values
    k_neigh.append(i)               # Append k_neigh with i values

In [51]:
# Calculating mean squared error (MSE)

mse = [1 - x for x in cv_score]  # Error = 1 - Accuracy
min(mse)
mse.index(min(mse))
best_k = l[mse.index(min(mse))]
print ("Best k values is:", best_k)

Best k values is: 22


In [58]:
# Plot between k_neigh and MSE to see the minimum value of 'k' on graph

px.line(x = k_neigh, y = mse)