# Classifying Malignant Tumors

In [279]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from matplotlib import pyplot
import seaborn as sns
sns.set()

   #  Attribute                     Domain
   -- -----------------------------------------
   1. Sample code number            id number
   2. Clump Thickness               1 - 10
   3. Uniformity of Cell Size       1 - 10
   4. Uniformity of Cell Shape      1 - 10
   5. Marginal Adhesion             1 - 10
   6. Single Epithelial Cell Size   1 - 10
   7. Bare Nuclei                   1 - 10
   8. Bland Chromatin               1 - 10
   9. Normal Nucleoli               1 - 10
  10. Mitoses                       1 - 10
  11. Class:                        (2 for benign, 4 for malignant)

In [None]:
##Since data did not have headers, we will create the headers here and sort so the Target variable is last

In [226]:
col = ['id', 'Clump Thickness', 'Uniformity of Cell Size', 
       'Uniformity of Cell Shape', 'Marginal Adhesion', 
       'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
       'Normal Nucleoli', 'Mitoses', 'Class']

In [None]:
##Load the Data

In [228]:
raw_data = pd.read_csv('/Users/p-73/Documents/Data Science/Cancer Detection/0805/breast-cancer-wisconsin.data.csv', 
                       names = col, header=None)

In [None]:
##Copy the raw data to a new dataframe

In [127]:
df = raw_data.copy()

In [128]:
df.describe()

Unnamed: 0,id,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bland Chromatin,Normal Nucleoli,Mitoses,Class
count,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0
mean,1071704.0,4.41774,3.134478,3.207439,2.806867,3.216023,3.437768,2.866953,1.589413,2.689557
std,617095.7,2.815741,3.051459,2.971913,2.855379,2.2143,2.438364,3.053634,1.715078,0.951273
min,61634.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,870688.5,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0
50%,1171710.0,4.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0,2.0
75%,1238298.0,6.0,5.0,5.0,4.0,4.0,5.0,4.0,1.0,4.0
max,13454350.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


In [129]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
id                             699 non-null int64
Clump Thickness                699 non-null int64
Uniformity of Cell Size        699 non-null int64
Uniformity of Cell Shape       699 non-null int64
Marginal Adhesion              699 non-null int64
Single Epithelial Cell Size    699 non-null int64
Bare Nuclei                    699 non-null object
Bland Chromatin                699 non-null int64
Normal Nucleoli                699 non-null int64
Mitoses                        699 non-null int64
Class                          699 non-null int64
dtypes: int64(10), object(1)
memory usage: 60.1+ KB


In [None]:
## The Bare Nuclei field has type object, it should be an int64 type

In [229]:
df['Bare Nuclei'].unique()

array(['1', '10', '2', '4', '3', '9', '7', '?', '5', '8', '6'],
      dtype=object)

In [239]:
df['Bare Nuclei'].value_counts()

1     402
10    132
5      30
2      30
3      28
8      21
4      19
?      16
9       9
7       8
6       4
Name: Bare Nuclei, dtype: int64

In [None]:
## It contains 16 '?' values
## We will drop these rows
## We also want to remap the 2, 4 labels to 0 and 1
## Finally, we will drop the id column

In [132]:
df['Class'] = df['Class'].map({2: 0, 4:1})

In [136]:
df = df.drop(['id'],axis=1)

In [137]:
df_clean = df[df['Bare Nuclei'] != '?']

In [None]:
## Select the features and targets into separate variables

In [138]:
features = df_clean.iloc[:,:-1]

In [241]:
targets = df_clean.iloc[:,-1:]

In [None]:
## Split the data into training and test sets

In [144]:
X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size=0.33, random_state=42)

In [None]:
## Scale the features

In [162]:
scaler = preprocessing.StandardScaler()

In [163]:
X_train_scaled = scaler.fit_transform(X_train)

In [164]:
X_test_scaled = scaler.transform(X_test)

# KNN

In [171]:
knn_model = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')

In [251]:
knn_model.fit(X_train_scaled, y_train.values.ravel())

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [252]:
# compute classification accuracy
from sklearn import metrics
y_pred = knn_model.predict(X_test_scaled)
print(metrics.accuracy_score(y_test, y_pred))

0.9557522123893806


In [253]:
## Find optimal hyperparameters

In [254]:
from sklearn.model_selection import GridSearchCV

In [255]:
knn_model.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [256]:
params = {'n_neighbors': range(1,11,1)}

In [257]:
grid_search_cv = GridSearchCV(KNeighborsClassifier(),
                             params,
                             n_jobs=-1,
                             verbose=1)

In [258]:
grid_search_cv.fit(X_train_scaled, y_train.values.ravel())

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  23 out of  30 | elapsed:    1.4s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    1.4s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=-1, param_grid={'n_neighbors': range(1, 11)},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=1)

In [259]:
grid_search_cv.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

# SVM

In [260]:
from sklearn import svm

In [261]:
svm_model = svm.SVC(kernel='rbf')

In [262]:
svm_model.fit(X_train_scaled, y_train.values.ravel())

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [263]:
y_pred = svm_model.predict(X_test_scaled)
print(metrics.accuracy_score(y_test, y_pred))

0.9557522123893806


In [264]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest

In [265]:
rf_model = RandomForestClassifier(n_estimators=100, max_depth=5,
                       random_state=42)

In [266]:
rf_model.fit(X_train_scaled, y_train.values.ravel())

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=5, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [267]:
y_pred = rf_model.predict(X_test_scaled)
print(metrics.accuracy_score(y_test, y_pred))

0.9734513274336283


In [268]:
rf_model.get_params()

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 5,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [269]:
rf_params = {'max_depth': range(3,6,1), 'min_samples_split':[2,3,4], 'n_estimators': [10,30,60,100]}

In [270]:
rf_grid_search = GridSearchCV(RandomForestClassifier(),
                             rf_params, verbose=1)

In [272]:
rf_grid_search.fit(X_train_scaled, y_train.values.ravel())

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=1)]: Done 108 out of 108 | elapsed:    3.5s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
           

In [273]:
rf_grid_search.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=3, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=4,
                       min_weight_fraction_leaf=0.0, n_estimators=30,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [274]:
y_pred = rf_grid_search.predict(X_test_scaled)
print(metrics.accuracy_score(y_test, y_pred))

0.9601769911504425


# XGBoost

In [275]:
from xgboost import XGBClassifier

In [276]:
xgb_model = XGBClassifier()

In [277]:
xgb_model.fit(X_train_scaled, y_train.values.ravel())

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [278]:
y_pred = xgb_model.predict(X_test_scaled)
print(metrics.accuracy_score(y_test, y_pred))

0.9513274336283186


# Random Forest Model has the best result at 0.96