In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

In [30]:
col=['Id','ClumpThickness','UniformityofCellSize','UniformityofCellShape','MarginalAdhesion',
     'SingleEpithelialCellSize','BareNuclei','BlandChromatin','NormalNucleoli','Mitoses','Class','formalignant']
dat = pd.read_csv('breast-cancer-wisconsin.data',names=col)
dat.pop('formalignant');
dat.head()

Unnamed: 0,Id,ClumpThickness,UniformityofCellSize,UniformityofCellShape,MarginalAdhesion,SingleEpithelialCellSize,BareNuclei,BlandChromatin,NormalNucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


# Data PreProcessing

In [31]:
np.where(dat.isnull())

(array([], dtype=int64), array([], dtype=int64))

In [32]:
dat['BareNuclei'].replace('?',np.NaN,inplace=True)
dat.dropna(inplace=True)

In [33]:
dat['Class'] = dat['Class'] / 2 - 1
dat.head()

Unnamed: 0,Id,ClumpThickness,UniformityofCellSize,UniformityofCellShape,MarginalAdhesion,SingleEpithelialCellSize,BareNuclei,BlandChromatin,NormalNucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,0.0
1,1002945,5,4,4,5,7,10,3,2,1,0.0
2,1015425,3,1,1,1,2,2,3,1,1,0.0
3,1016277,6,8,8,1,3,4,3,7,1,0.0
4,1017023,4,1,1,3,2,1,3,1,1,0.0


In [34]:
X = dat.drop(['Id','Class'],axis=1)
X_col = X.columns

In [36]:
y = dat['Class']

In [49]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import cross_val_score,cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [41]:
X = StandardScaler().fit_transform(X.values)

AttributeError: 'numpy.ndarray' object has no attribute 'values'

In [45]:
df1 = pd.DataFrame(X,columns=X_col)
df1.head()

Unnamed: 0,ClumpThickness,UniformityofCellSize,UniformityofCellShape,MarginalAdhesion,SingleEpithelialCellSize,BareNuclei,BlandChromatin,NormalNucleoli,Mitoses
0,0.197905,-0.702212,-0.741774,-0.639366,-0.555608,-0.698853,-0.181827,-0.612927,-0.3484
1,0.197905,0.277252,0.262783,0.758032,1.695166,1.772867,-0.181827,-0.285105,-0.3484
2,-0.511643,-0.702212,-0.741774,-0.639366,-0.555608,-0.424217,-0.181827,-0.612927,-0.3484
3,0.552679,1.583204,1.602192,-0.639366,-0.105454,0.125054,-0.181827,1.354008,-0.3484
4,-0.156869,-0.702212,-0.741774,0.059333,-0.555608,-0.698853,-0.181827,-0.612927,-0.3484


In [46]:
X_train,X_test,y_train,y_test = train_test_split(df1,y,train_size=0.7,random_state=42)



In [47]:
knn = KNeighborsClassifier(n_neighbors=5,p=2,metric='minkowski')

In [48]:
knn.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [50]:
def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    '''
    print the accuracy score, classification report and confusion matrix of classifier
    '''
    if train:
        '''
        training performance
        '''
        print("Train Result:\n")
        print("accuracy score: {0:.4f}\n".format(accuracy_score(y_train, clf.predict(X_train))))
        print("Classification Report: \n {}\n".format(classification_report(y_train, clf.predict(X_train))))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_train, clf.predict(X_train))))

        res = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
        print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
        print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))
        
    elif train==False:
        '''
        test performance
        '''
        print("Test Result:\n")        
        print("accuracy score: {0:.4f}\n".format(accuracy_score(y_test, clf.predict(X_test))))
        print("Classification Report: \n {}\n".format(classification_report(y_test, clf.predict(X_test))))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_test, clf.predict(X_test))))

In [51]:
print_score(knn, X_train, y_train, X_test, y_test, train=True)

Train Result:

accuracy score: 0.9749

Classification Report: 
              precision    recall  f1-score   support

        0.0       0.98      0.98      0.98       317
        1.0       0.96      0.97      0.96       161

avg / total       0.98      0.97      0.97       478


Confusion Matrix: 
 [[310   7]
 [  5 156]]

Average Accuracy: 	 0.9644
Accuracy SD: 		 0.0189


In [52]:
print_score(knn, X_train, y_train, X_test, y_test, train=False)

Test Result:

accuracy score: 0.9561

Classification Report: 
              precision    recall  f1-score   support

        0.0       0.95      0.98      0.97       127
        1.0       0.97      0.91      0.94        78

avg / total       0.96      0.96      0.96       205


Confusion Matrix: 
 [[125   2]
 [  7  71]]



# Grid Search

In [53]:
from sklearn.grid_search import GridSearchCV

In [54]:
knn.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': 1,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [57]:
params = {'n_neighbors':[1,2,3,4,5,6,7,8,9,10]}
grid_search_cv = GridSearchCV(KNeighborsClassifier(),params,n_jobs=-1,verbose=1)

In [58]:
grid_search_cv.fit(X_train,y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.1s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=1)

In [59]:
grid_search_cv.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='uniform')

In [60]:
print_score(grid_search_cv, X_train, y_train, X_test, y_test, train=True)

Train Result:

accuracy score: 0.9707

Classification Report: 
              precision    recall  f1-score   support

        0.0       0.98      0.98      0.98       317
        1.0       0.96      0.96      0.96       161

avg / total       0.97      0.97      0.97       478


Confusion Matrix: 
 [[310   7]
 [  7 154]]

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.1s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.1s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.1s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.2s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.1s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.1s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.1s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.2s finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.1s finished


Average Accuracy: 	 0.9644
Accuracy SD: 		 0.0231


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.1s finished


In [61]:
print_score(grid_search_cv, X_train, y_train, X_test, y_test, train=False)

Test Result:

accuracy score: 0.9561

Classification Report: 
              precision    recall  f1-score   support

        0.0       0.95      0.98      0.97       127
        1.0       0.97      0.91      0.94        78

avg / total       0.96      0.96      0.96       205


Confusion Matrix: 
 [[125   2]
 [  7  71]]



# XGB

In [62]:
import xgboost as xgb

In [63]:
clf = xgb.XGBClassifier()

In [64]:
clf.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [65]:
print_score(clf, X_train, y_train, X_test, y_test, train=True)

Train Result:

accuracy score: 0.9916

Classification Report: 
              precision    recall  f1-score   support

        0.0       1.00      0.99      0.99       317
        1.0       0.98      0.99      0.99       161

avg / total       0.99      0.99      0.99       478


Confusion Matrix: 
 [[314   3]
 [  1 160]]

Average Accuracy: 	 0.9562
Accuracy SD: 		 0.0195


In [66]:
print_score(clf, X_train, y_train, X_test, y_test, train=False)

Test Result:

accuracy score: 0.9512

Classification Report: 
              precision    recall  f1-score   support

        0.0       0.94      0.98      0.96       127
        1.0       0.97      0.90      0.93        78

avg / total       0.95      0.95      0.95       205


Confusion Matrix: 
 [[125   2]
 [  8  70]]



# SVM

In [67]:
from sklearn import svm

In [69]:
svm_clf = svm.SVC(kernel='rbf')
svm_clf.fit(X_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [70]:
print_score(svm_clf, X_train, y_train, X_test, y_test, train=True)

Train Result:

accuracy score: 0.9770

Classification Report: 
              precision    recall  f1-score   support

        0.0       0.98      0.98      0.98       317
        1.0       0.96      0.97      0.97       161

avg / total       0.98      0.98      0.98       478


Confusion Matrix: 
 [[311   6]
 [  5 156]]

Average Accuracy: 	 0.9686
Accuracy SD: 		 0.0234


In [71]:
print_score(svm_clf, X_train, y_train, X_test, y_test, train=False)

Test Result:

accuracy score: 0.9610

Classification Report: 
              precision    recall  f1-score   support

        0.0       0.96      0.98      0.97       127
        1.0       0.96      0.94      0.95        78

avg / total       0.96      0.96      0.96       205


Confusion Matrix: 
 [[124   3]
 [  5  73]]

