In [2]:
#Prediction of Breast Cancer using SVM with 99% accuracy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import time

# Load the dataset. A raw string keeps the backslashes of the Windows path
# literal instead of relying on '\M', '\F', '\R' and '\D' being unrecognised
# (and therefore deprecated) escape sequences.
data = pd.read_csv(r'F:\MachineLearning\FTSamples-PR\RSamples\Dataset-c.csv',
                   index_col=False)
# Bare expression: only rendered when it is the last statement of a notebook cell.
data.head(5)

print(data.shape)
# Bare expression as well; see the remark on data.head(5).
data.describe()

# Data visualisation and pre-processing
# First, encode the diagnosis column as string labels: 'M' becomes '1' and
# every other value becomes '0'.
# Then set the ID column as the index of the dataframe; after all, the ID
# column will not be used for machine learning.
data['diagnosis'] = data['diagnosis'].apply(lambda x: '1' if x == 'M' else '0')
data = data.set_index('id')
print(data)

# Number of rows per diagnosis label.
print(data.groupby('diagnosis').size())

# y holds the labels, X every remaining column as a plain array.
y = data['diagnosis'].values
X = data.drop('diagnosis', axis=1).values

# Hold out 20% of the rows for the final evaluation; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

# Baseline algorithm checking: cross-validate each classifier on the
# unscaled training data. CART uses the entropy criterion; the other models
# use their default settings.
models_list = [
    ('CART', DecisionTreeClassifier(criterion='entropy')),
    ('SVM', SVC()),
    ('NB', GaussianNB()),
    ('KNN', KNeighborsClassifier()),
]

num_folds = 10
results = []
names = []

# The splitter does not depend on the model, so it is built once. No
# random_state is passed because the folds are not shuffled: the seed has no
# effect in that case, and recent scikit-learn releases raise ValueError when
# random_state is combined with shuffle=False.
kfold = KFold(n_splits=num_folds)

for name, model in models_list:
    # Time the whole cross-validation run for this model.
    start = time.time()
    cv_results = cross_val_score(model, X_train, y_train,
                                 cv=kfold, scoring='accuracy')
    end = time.time()
    results.append(cv_results)
    names.append(name)
    # Report mean accuracy, its standard deviation across folds, and wall time.
    print("%s: %f (%f) (run time: %f)" % (name, cv_results.mean(), cv_results.std(), end-start))

# From the initial run, it looks like GaussianNB, KNN and CART performed the
# best given the dataset (all above 92% mean accuracy).
# Support Vector Machine has a surprisingly bad performance here.
# However, if we standardise the input dataset, its performance should improve.

# Evaluation of algorithms on standardised data
# The performance of some of the machine learning algorithms could be
# improved if a standardised dataset is used.
# The improvement is likely for all the models.
# The pipelines below standardise the data and build the model inside each
# fold of the cross-validation test harness.
# GaussianNB is left out of this comparison.
# NOTE(review): ScaledCART uses the default split criterion, whereas the
# unscaled CART baseline was created with criterion='entropy' - confirm the
# difference is intended.
pipelines = [
    ('ScaledCART', Pipeline([('Scaler', StandardScaler()),
                             ('CART', DecisionTreeClassifier())])),
    ('ScaledSVM', Pipeline([('Scaler', StandardScaler()),
                            ('SVM', SVC())])),
    ('ScaledKNN', Pipeline([('Scaler', StandardScaler()),
                            ('KNN', KNeighborsClassifier())])),
]

results = []
names = []
# No random_state: the folds are not shuffled, so the seed has no effect, and
# recent scikit-learn releases raise ValueError when random_state is combined
# with shuffle=False.
kfold = KFold(n_splits=num_folds)
for name, model in pipelines:
    # Time the whole cross-validation run for this pipeline.
    start = time.time()
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
    end = time.time()
    results.append(cv_results)
    names.append(name)
    # Report mean accuracy, its standard deviation across folds, and wall time.
    print( "%s: %f (%f) (run time: %f)" % (name, cv_results.mean(), cv_results.std(), end-start))

# Algorithm Tuning - Tuning SVM
# We can tune two key parameters of the SVM algorithm:
# the value of C and the type of kernel.
# The default C for SVM is 1.0 and the kernel is Radial Basis Function (RBF).
# We will use the grid search method using 10-fold cross-validation
# with a standardized copy of the sample training dataset.
# We will try a combination of C values and the following kernel types:
# 'linear', 'poly', 'rbf' and 'sigmoid'.
# NOTE(review): the scaler is fitted on the whole training set before the
# cross-validation, so every validation fold has influenced the scaling
# statistics; searching over a Pipeline(StandardScaler, SVC) would avoid that.
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
c_values = [0.1, 0.3, 0.5, 0.7, 0.9, 1.0, 1.3, 1.5, 1.7, 2.0]
kernel_values = ['linear', 'poly', 'rbf', 'sigmoid']
param_grid = {'C': c_values, 'kernel': kernel_values}
model = SVC()
# No random_state: the folds are not shuffled, so the seed has no effect, and
# recent scikit-learn releases raise ValueError when random_state is combined
# with shuffle=False.
kfold = KFold(n_splits=num_folds)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring='accuracy',
                    cv=kfold)
grid_result = grid.fit(rescaledX, y_train)
print("Best: %f using %s" % (grid_result.best_score_,grid_result.best_params_))
# Per-combination summary: mean and standard deviation of the fold scores.
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# Final model: standardise the training data, fit an SVM on it and time the fit.
# NOTE(review): C and kernel are hard-coded here rather than read from the
# grid search's best_params_ - confirm this choice is intended.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
model = SVC(kernel='rbf', C=2.0)
start = time.time()
model.fit(X_train_scaled, y_train)
end = time.time()
elapsed = end - start
print( "Run Time: %f" % elapsed)

# Estimate accuracy on the held-out test set, scaled with the statistics
# learned from the training data.
X_test_scaled = scaler.transform(X_test)
predictions = model.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, predictions)
print("Accuracy score %f" % test_accuracy)
print(classification_report(y_test, predictions))
print(confusion_matrix(y_test, predictions))

(569, 32)
         diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
id                                                                         
842302           1       17.990         10.38          122.80     1001.0   
842517           1       20.570         17.77          132.90     1326.0   
84300903         1       19.690         21.25          130.00     1203.0   
84348301         1       11.420         20.38           77.58      386.1   
84358402         1       20.290         14.34          135.10     1297.0   
843786           1       12.450         15.70           82.57      477.1   
844359           1       18.250         19.98          119.60     1040.0   
84458202         1       13.710         20.83           90.20      577.9   
844981           1       13.000         21.82           87.50      519.8   
84501001         1       12.460         24.04           83.97      475.9   
845636           1       16.020         23.24          102.70      797.8   
84

CART: 0.923237 (0.038003) (run time: 0.099128)
SVM: 0.637101 (0.050791) (run time: 0.240753)
NB: 0.945121 (0.034079) (run time: 0.016639)
KNN: 0.925266 (0.031306) (run time: 0.024442)
ScaledCART: 0.918841 (0.037656) (run time: 0.090332)
ScaledSVM: 0.975894 (0.024847) (run time: 0.056029)
ScaledKNN: 0.967053 (0.022492) (run time: 0.048453)
Best: 0.980220 using {'C': 0.5, 'kernel': 'linear'}
0.978022 (0.021860) with: {'C': 0.1, 'kernel': 'linear'}
0.828571 (0.039198) with: {'C': 0.1, 'kernel': 'poly'}
0.945055 (0.031128) with: {'C': 0.1, 'kernel': 'rbf'}
0.947253 (0.024308) with: {'C': 0.1, 'kernel': 'sigmoid'}
0.975824 (0.024933) with: {'C': 0.3, 'kernel': 'linear'}
0.872527 (0.043785) with: {'C': 0.3, 'kernel': 'poly'}
0.958242 (0.024765) with: {'C': 0.3, 'kernel': 'rbf'}
0.956044 (0.023861) with: {'C': 0.3, 'kernel': 'sigmoid'}
0.980220 (0.022960) with: {'C': 0.5, 'kernel': 'linear'}
0.885714 (0.030900) with: {'C': 0.5, 'kernel': 'poly'}
0.969231 (0.024347) with: {'C': 0.5, 'kernel': 