# Project Cancer Detection

## Breast Cancer Winconsin (Disgnotic) Data set
[Source:UCI](http://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29)

[Data Set info](http://archive.ics.uci.edu/ml/machine-learning-database/breast-cancer-wisconsin/breast-cancer-wisconsin.names)

In [1]:
import numpy as np
import pandas as pd

In [2]:
col = ['id','Clump Thickness','Uniformity of Cellsize',
      'Uniformity of Cell Shape','Marginal Adhesion',
      'Single Epithelial Cell Size','Bare Nuclei','Bland Chromatin',
      'Normal Nucleoli','Mitoses','Class']
df = pd.read_csv("breast-cancer-wisconsin.data",names=col,
                 header=None)
df.head()

Unnamed: 0,id,Clump Thickness,Uniformity of Cellsize,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


# Data Pre-processing

In [3]:
np.where(df.isnull())

(array([], dtype=int64), array([], dtype=int64))

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   id                           699 non-null    int64 
 1   Clump Thickness              699 non-null    int64 
 2   Uniformity of Cellsize       699 non-null    int64 
 3   Uniformity of Cell Shape     699 non-null    int64 
 4   Marginal Adhesion            699 non-null    int64 
 5   Single Epithelial Cell Size  699 non-null    int64 
 6   Bare Nuclei                  699 non-null    object
 7   Bland Chromatin              699 non-null    int64 
 8   Normal Nucleoli              699 non-null    int64 
 9   Mitoses                      699 non-null    int64 
 10  Class                        699 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 60.2+ KB


In [5]:
df['Bare Nuclei'].describe()

count     699
unique     11
top         1
freq      402
Name: Bare Nuclei, dtype: object

In [6]:
df['Bare Nuclei'].value_counts()

1     402
10    132
2      30
5      30
3      28
8      21
4      19
?      16
9       9
7       8
6       4
Name: Bare Nuclei, dtype: int64

In [7]:
df['Bare Nuclei']=='?'

0      False
1      False
2      False
3      False
4      False
       ...  
694    False
695    False
696    False
697    False
698    False
Name: Bare Nuclei, Length: 699, dtype: bool

how do we drop the?

In [8]:
df[df['Bare Nuclei']=='?']

Unnamed: 0,id,Clump Thickness,Uniformity of Cellsize,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
23,1057013,8,4,5,1,2,?,7,3,1,4
40,1096800,6,6,6,9,6,?,7,8,1,2
139,1183246,1,1,1,1,1,?,2,1,1,2
145,1184840,1,1,3,1,2,?,2,1,1,2
158,1193683,1,1,2,1,3,?,1,1,1,2
164,1197510,5,1,1,1,2,?,3,1,1,2
235,1241232,3,1,4,1,2,?,3,1,1,2
249,169356,3,1,1,1,2,?,3,1,1,2
275,432809,3,1,3,1,2,?,2,1,1,2
292,563649,8,8,8,1,2,?,6,10,1,4


In [9]:
df['Class'].value_counts()

2    458
4    241
Name: Class, dtype: int64

In [10]:
df['Bare Nuclei'].replace("?",np.NAN,inplace=True)
df = df.dropna()

Note that for class:2 is benign,4 is for malignant

$$\frac{\text{df["class"]}}{2}-1$$

In [11]:
df['Bare Nuclei'].value_counts()

1     402
10    132
2      30
5      30
3      28
8      21
4      19
9       9
7       8
6       4
Name: Bare Nuclei, dtype: int64

In [12]:
df['Class']=df['Class']/2-1   # here we are trying to assign class value 1&0

In [13]:
df['Class'].value_counts()

0.0    444
1.0    239
Name: Class, dtype: int64

In [14]:
df.columns

Index(['id', 'Clump Thickness', 'Uniformity of Cellsize',
       'Uniformity of Cell Shape', 'Marginal Adhesion',
       'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
       'Normal Nucleoli', 'Mitoses', 'Class'],
      dtype='object')

In [15]:
X=df.drop(['id','Class'],axis=1)

In [16]:
X_col = X.columns
X_col

Index(['Clump Thickness', 'Uniformity of Cellsize', 'Uniformity of Cell Shape',
       'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei',
       'Bland Chromatin', 'Normal Nucleoli', 'Mitoses'],
      dtype='object')

In [17]:
y = df['Class']

**Training**

In [18]:
from sklearn.preprocessing import StandardScaler

In [19]:
X = StandardScaler().fit_transform(X.values)

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
df1 = pd.DataFrame(X,columns=X_col)

In [22]:
df1.head()

Unnamed: 0,Clump Thickness,Uniformity of Cellsize,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses
0,0.197905,-0.702212,-0.741774,-0.639366,-0.555608,-0.698853,-0.181827,-0.612927,-0.3484
1,0.197905,0.277252,0.262783,0.758032,1.695166,1.772867,-0.181827,-0.285105,-0.3484
2,-0.511643,-0.702212,-0.741774,-0.639366,-0.555608,-0.424217,-0.181827,-0.612927,-0.3484
3,0.552679,1.583204,1.602192,-0.639366,-0.105454,0.125054,-0.181827,1.354008,-0.3484
4,-0.156869,-0.702212,-0.741774,0.059333,-0.555608,-0.698853,-0.181827,-0.612927,-0.3484


In [23]:
y.shape

(683,)

In [24]:
X_train,X_test,y_train,y_test = train_test_split(df1,y,train_size=.8,
                                                random_state=42)

In [25]:
from sklearn.preprocessing import MinMaxScaler
pd.DataFrame(MinMaxScaler().fit_transform(df.drop(['id','Class'],axis=1).values),columns=X_col).head()

Unnamed: 0,Clump Thickness,Uniformity of Cellsize,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses
0,0.444444,0.0,0.0,0.0,0.111111,0.0,0.222222,0.0,0.0
1,0.444444,0.333333,0.333333,0.444444,0.666667,1.0,0.222222,0.111111,0.0
2,0.222222,0.0,0.0,0.0,0.111111,0.111111,0.222222,0.0,0.0
3,0.555556,0.777778,0.777778,0.0,0.222222,0.333333,0.222222,0.666667,0.0
4,0.333333,0.0,0.0,0.222222,0.111111,0.0,0.222222,0.0,0.0


# choosing kNN model

In [26]:
from sklearn.neighbors import KNeighborsClassifier

In [27]:
knn = KNeighborsClassifier(n_neighbors=5,
                          p=2, metric = 'minkowski')
#conversion of minkowski distance to eucladian dis.

In [28]:
knn.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [29]:
from sklearn.model_selection import cross_val_predict,cross_val_score
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix 

In [30]:
def print_score(clf,X_train,X_test,y_train,y_test,train=True):
    '''
    print the accuracy score,classification report ,and confusion matrix'''
    if train:
        '''
        training per formance'''
        print("Train Result:\n")
        print("accuracy score: {0:.4f}\n".format(accuracy_score(y_train,clf.predict(X_train))))
        print("Classifier_report:\n {} \n".format(classification_report(y_train,clf.predict(X_train))))
        print("Confusion_matrix: \n {} \n".format(confusion_matrix(y_train,clf.predict(X_train))))
        
        res=cross_val_score(clf,X_train,y_train,cv=10,scoring='accuracy')
        print('Average Accuracy: \t {0:.4f}'.format(np.mean(res)))
        print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))
    elif train==False:
        '''
        test performance'''
        print("Test Result:\n")
        print("accuracy score: {0:.4f}\n".format(accuracy_score(y_test,clf.predict(X_test))))
        print("Classifier_report:\n {} \n".format(classification_report(y_test,clf.predict(X_test))))
        print("Confusion_matrix: \n {} \n".format(confusion_matrix(y_test,clf.predict(X_test))))
        
        
        

In [31]:
print_score(knn,X_train,X_test,y_train,y_test,train=True)


Train Result:

accuracy score: 0.9725

Classifier_report:
               precision    recall  f1-score   support

         0.0       0.98      0.98      0.98       365
         1.0       0.96      0.96      0.96       181

    accuracy                           0.97       546
   macro avg       0.97      0.97      0.97       546
weighted avg       0.97      0.97      0.97       546
 

Confusion_matrix: 
 [[358   7]
 [  8 173]] 

Average Accuracy: 	 0.9634
Accuracy SD: 		 0.0200


In [32]:
print_score(knn,X_train,X_test,y_train,y_test,train=False)


Test Result:

accuracy score: 0.9562

Classifier_report:
               precision    recall  f1-score   support

         0.0       0.94      0.99      0.96        79
         1.0       0.98      0.91      0.95        58

    accuracy                           0.96       137
   macro avg       0.96      0.95      0.95       137
weighted avg       0.96      0.96      0.96       137
 

Confusion_matrix: 
 [[78  1]
 [ 5 53]] 



# Grid Search

In [33]:
from sklearn.model_selection import GridSearchCV

In [34]:
knn.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [35]:
params = {'n_neighbors':[1,2,3,4,5,6,7,8,9,10]}

In [36]:
grid_search_cv = GridSearchCV(KNeighborsClassifier(),params,
                             n_jobs=-1,verbose=1)

In [37]:
grid_search_cv.fit(X_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    3.1s finished


GridSearchCV(cv=None, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=-1,
             param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=1)

In [38]:
grid_search_cv.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=8, p=2,
                     weights='uniform')

In [39]:
print_score(grid_search_cv,X_train,X_test,y_train,y_test,train=True)


Train Result:

accuracy score: 0.9707

Classifier_report:
               precision    recall  f1-score   support

         0.0       0.98      0.98      0.98       365
         1.0       0.96      0.95      0.96       181

    accuracy                           0.97       546
   macro avg       0.97      0.97      0.97       546
weighted avg       0.97      0.97      0.97       546
 

Confusion_matrix: 
 [[358   7]
 [  9 172]] 

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  43 out of  50 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Average Accuracy: 	 0.9634
Accuracy SD: 		 0.0231


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.3s finished


In [40]:
print_score(grid_search_cv,X_train,X_test,y_train,y_test,train=False)


Test Result:

accuracy score: 0.9489

Classifier_report:
               precision    recall  f1-score   support

         0.0       0.93      0.99      0.96        79
         1.0       0.98      0.90      0.94        58

    accuracy                           0.95       137
   macro avg       0.95      0.94      0.95       137
weighted avg       0.95      0.95      0.95       137
 

Confusion_matrix: 
 [[78  1]
 [ 6 52]] 



In [41]:
grid_search_cv.best_params_

{'n_neighbors': 8}

In [42]:
grid_search_cv.cv_results_

{'mean_fit_time': array([0.00757504, 0.00878839, 0.00692391, 0.00773091, 0.01360316,
        0.00736504, 0.00469604, 0.0057086 , 0.00569329, 0.00892906]),
 'std_fit_time': array([0.00187863, 0.0049952 , 0.00675511, 0.00286046, 0.01805347,
        0.00334513, 0.00367321, 0.00314355, 0.00221199, 0.00241107]),
 'mean_score_time': array([0.01386733, 0.0159255 , 0.01795731, 0.01995616, 0.01752338,
        0.01509833, 0.01924515, 0.01640821, 0.0166285 , 0.01479092]),
 'std_score_time': array([0.00251036, 0.00376547, 0.00606049, 0.00658767, 0.00768004,
        0.00427413, 0.00625534, 0.00336244, 0.00397705, 0.00421603]),
 'param_n_neighbors': masked_array(data=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'params': [{'n_neighbors': 1},
  {'n_neighbors': 2},
  {'n_neighbors': 3},
  {'n_neighbors': 4},
  {'n_neighbors': 5},
  {'n_neighbors': 6},


SVM,Random Forest,XGBoost

In [43]:
from sklearn import svm
clf = svm.SVC(kernel='rbf')
clf.fit(X_train, y_train)
print_score(clf,X_train,X_test,y_train,y_test,train=True)
print_score(clf,X_train,X_test,y_train,y_test,train=False)


Train Result:

accuracy score: 0.9799

Classifier_report:
               precision    recall  f1-score   support

         0.0       0.99      0.98      0.98       365
         1.0       0.96      0.98      0.97       181

    accuracy                           0.98       546
   macro avg       0.98      0.98      0.98       546
weighted avg       0.98      0.98      0.98       546
 

Confusion_matrix: 
 [[358   7]
 [  4 177]] 

Average Accuracy: 	 0.9634
Accuracy SD: 		 0.0244
Test Result:

accuracy score: 0.9635

Classifier_report:
               precision    recall  f1-score   support

         0.0       0.96      0.97      0.97        79
         1.0       0.96      0.95      0.96        58

    accuracy                           0.96       137
   macro avg       0.96      0.96      0.96       137
weighted avg       0.96      0.96      0.96       137
 

Confusion_matrix: 
 [[77  2]
 [ 3 55]] 



In [44]:
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train,y_train)
print_score(rf_clf,X_train,X_test,y_train,y_test,train=True)
print_score(rf_clf,X_train,X_test,y_train,y_test,train=False)


Train Result:

accuracy score: 1.0000

Classifier_report:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       365
         1.0       1.00      1.00      1.00       181

    accuracy                           1.00       546
   macro avg       1.00      1.00      1.00       546
weighted avg       1.00      1.00      1.00       546
 

Confusion_matrix: 
 [[365   0]
 [  0 181]] 

Average Accuracy: 	 0.9634
Accuracy SD: 		 0.0200
Test Result:

accuracy score: 0.9562

Classifier_report:
               precision    recall  f1-score   support

         0.0       0.94      0.99      0.96        79
         1.0       0.98      0.91      0.95        58

    accuracy                           0.96       137
   macro avg       0.96      0.95      0.95       137
weighted avg       0.96      0.96      0.96       137
 

Confusion_matrix: 
 [[78  1]
 [ 5 53]] 



In [45]:
import xgboost as xgb
xg_clf = xgb.XGBClassifier()
xg_clf.fit(X_train,y_train)
print_score(xg_clf,X_train,X_test,y_train,y_test,train=True)
print_score(xg_clf,X_train,X_test,y_train,y_test,train=False)


Train Result:

accuracy score: 1.0000

Classifier_report:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       365
         1.0       1.00      1.00      1.00       181

    accuracy                           1.00       546
   macro avg       1.00      1.00      1.00       546
weighted avg       1.00      1.00      1.00       546
 

Confusion_matrix: 
 [[365   0]
 [  0 181]] 

Average Accuracy: 	 0.9653
Accuracy SD: 		 0.0237
Test Result:

accuracy score: 0.9562

Classifier_report:
               precision    recall  f1-score   support

         0.0       0.94      0.99      0.96        79
         1.0       0.98      0.91      0.95        58

    accuracy                           0.96       137
   macro avg       0.96      0.95      0.95       137
weighted avg       0.96      0.96      0.96       137
 

Confusion_matrix: 
 [[78  1]
 [ 5 53]] 



In [46]:
y.dtype

dtype('float64')