In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix

# Read the dataset with the 'id' column as the index and discard the
# 'Unnamed: 32' column right away.
df = pd.read_csv("../input/data.csv", index_col='id').drop(columns='Unnamed: 32')

# Encode the target numerically: 'M' -> 1, 'B' -> 0.
df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0})

# Snapshot of all feature columns, taken before any feature is removed.
X = df.drop(columns='diagnosis')

# Both name lists are collected first, then removed in a single drop.
perimeters = [name for name in df.columns if 'perimeter' in name]
areas = [name for name in df.columns if 'area' in name]
df = df.drop(columns=perimeters + areas)

# Finally remove every remaining column whose name ends in '_worst'.
worst = [name for name in df.columns if name.endswith('_worst')]
df = df.drop(columns=worst)

Hello Kagglers! This work is part of my ongoing project in Predictive Analytics to classify Breast Cancer tumors as either Malignant or Benign. The first part, which covers the Exploratory Data Analysis and data visualization, was done [here](http://www.kaggle.com/sulianova/feature-explanation-and-eda). For the PCA and Random Forest application, please see [this page](http://www.kaggle.com/sulianova/pca-logistic-regression-and-random-forest).

# kNN

Features with a larger range of values can dominate the distance metric relative to features that have a smaller range, so feature scaling is important. For continuous data, kNN uses a distance metric such as the Euclidean or Minkowski distance. As all features are numerical, we do not need to change the default metric, which is 'minkowski'.

In [None]:
# Target vector, and the remaining columns as a plain NumPy feature matrix.
y = df['diagnosis']
X = df.drop(columns=['diagnosis']).to_numpy()

# Standardise the features; the scaler is fitted on every row of X.
X_scaled = StandardScaler().fit_transform(X)

# Define a 10-neighbour k-NN classifier and train it on the scaled dataset.
# fit() returns the estimator itself, so it can be chained and then displayed.
knn = KNeighborsClassifier(n_neighbors=10).fit(X_scaled, y)
knn

To decide the class when the neighbors do not all belong to the same class, we can set the `weights` parameter:

1. `weights='uniform'` takes a simple majority vote among the neighbors. Whichever class has the greatest number of votes becomes the class of the new data point.
2. `weights='distance'` takes a similar vote but gives a heavier weight to the neighbors that are closer. For example, if a neighbor is 5 units away, its vote is weighted by 1/5. The further away a neighbor is, the smaller its weight.

Let's find out which parameter is better for our dataset:

In [None]:
# Hyper-parameter grid: k from 1 to 10, with both voting schemes.
knn_params = {'n_neighbors': range(1, 11), 'weights': ['uniform', 'distance']}

# Split the *unscaled* features and fit the scaler on the training part only.
# Splitting an array that was standardised on all rows would leak the
# hold-out set's mean/variance into the preprocessing and make the hold-out
# scores optimistic. The same random_state on the same number of rows keeps
# the row partition unchanged.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.3, random_state=17
)
scaler = StandardScaler().fit(X_train)
X_scaled_train = scaler.transform(X_train)
X_scaled_holdout = scaler.transform(X_holdout)

# 10-fold cross-validated grid search on the training split, optimising
# recall. GridSearchCV clones `knn`, so the estimator passed in is not
# modified.
knn_grid = GridSearchCV(knn, knn_params, cv=10, n_jobs=-1, scoring='recall')
knn_grid.fit(X_scaled_train, y_train)

# Best parameter combination and its mean cross-validated recall.
knn_grid.best_params_, knn_grid.best_score_

In [None]:
# Hold-out predictions from the best estimator found by the grid search.
pred = knn_grid.best_estimator_.predict(X_scaled_holdout)

# (label, metric function) pairs, reported in this order.
holdout_report = (
    ("Accuracy Score : ", accuracy_score),
    ("Recall Score (how much of malignant tumours were predicted correctly) : ", recall_score),
    ("Precision Score (how much of tumours, which were predicted as 'malignant', were actually 'malignant'): ", precision_score),
)
for label, metric in holdout_report:
    print(label, metric(y_holdout, pred))

In [None]:
# Confusion matrix of the hold-out predictions; in scikit-learn's layout the
# rows correspond to the true classes and the columns to the predicted ones.
cm = confusion_matrix(y_true=y_holdout, y_pred=pred)
cm

Let's compare how kNN performs, if we select 3 and 5 closest neighbors:

In [None]:
from matplotlib.colors import ListedColormap

h = .02  # step size in the mesh
weights = 'uniform'

# Colour maps: pale shades for the decision regions, saturated ones for the
# data points.
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

# Only two columns of the scaled matrix are used so the decision surface can
# be drawn in a plane: column 0 (x axis, labelled "radius") and column 5
# (y axis, labelled "concave points").
plane_cols = [0, 5]
X_plane = X_scaled[:, plane_cols]

# The mesh [x_min, x_max] x [y_min, y_max] depends only on the data, so it is
# built once and shared by both plots. Each axis is padded by 1 on both sides.
x_min, x_max = X_plane[:, 0].min() - 1, X_plane[:, 0].max() + 1
y_min, y_max = X_plane[:, 1].min() - 1, X_plane[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
grid_points = np.column_stack((xx.ravel(), yy.ravel()))

for n_neighbors in (3, 5):
    # Fit a k-NN classifier on the two selected features.
    clf = KNeighborsClassifier(n_neighbors, weights=weights).fit(X_plane, y)

    # Predict a class for every mesh point and fold the result back into the
    # mesh shape so it can be drawn as a colour plot.
    Z = clf.predict(grid_points).reshape(xx.shape)

    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

    # Overlay the data points the classifier was fitted on.
    plt.scatter(X_plane[:, 0], X_plane[:, 1], c=y, cmap=cmap_bold,
                edgecolor='k', s=20)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title("2-Class classification (k = %i, weights = '%s')"
              % (n_neighbors, weights))
    plt.xlabel("radius")
    plt.ylabel("concave points")

plt.show()

Alternatively, we can compare the scores on the train and test sets for different numbers of neighbors:

In [None]:
from sklearn.model_selection import cross_val_score

# Candidate neighbourhood sizes; kept as a list so the same values drive both
# the loop and the x axis of the plot.
k_values = list(range(1, 15))

test_scores = []
train_scores = []

for i in k_values:
    knn = KNeighborsClassifier(i)

    # "Train" curve: mean 10-fold cross-validated recall on the training split.
    train_scores.append(
        cross_val_score(knn, X_scaled_train, y_train, cv=10, scoring='recall').mean()
    )

    # "Test" curve: recall on the hold-out split of the model fitted on the
    # training split. Cross-validating on the hold-out rows alone would train
    # separate models on the test data and never evaluate the model learned
    # from the training data.
    knn.fit(X_scaled_train, y_train)
    test_scores.append(recall_score(y_holdout, knn.predict(X_scaled_holdout)))

# x and y are passed by keyword: they are keyword-only in current seaborn
# releases, which reject the positional form.
fig, ax = plt.subplots(figsize=(12, 5))
sns.lineplot(x=k_values, y=train_scores, marker='*', label='Train Score', ax=ax)
sns.lineplot(x=k_values, y=test_scores, marker='o', label='Test Score', ax=ax)
ax.set_xlabel("Neighbours")
ax.set_ylabel("Recall");

As we can see, the best number of neighbours for the training data is 5.

# Naïve Bayes

In [None]:
# Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB

# Create the classifier and train it on the scaled training split in one
# step; fit() returns the fitted estimator.
gnb = GaussianNB().fit(X_scaled_train, y_train)

# Class predictions for the hold-out split
y_pred = gnb.predict(X_scaled_holdout)


In [None]:
# scikit-learn metrics module, used for the accuracy calculation
from sklearn import metrics

# (label, metric function) pairs for the Naive Bayes hold-out predictions.
# Accuracy answers: how often is the classifier correct?
nb_report = (
    ("Accuracy:", metrics.accuracy_score),
    ("Recall Score (how much of malignant tumours were predicted correctly) : ", recall_score),
    ("Precision Score (how much of tumours, which were predicted as 'malignant', were actually 'malignant'): ", precision_score),
)
for label, metric in nb_report:
    print(label, metric(y_holdout, y_pred))

Compared with kNN, Naïve Bayes slightly improved the accuracy and precision scores.