In [188]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier

In [189]:
# Load the breast-cancer table from the CSV that sits next to the notebook.
csv_path = "breast_cancer.csv"
dataset = pd.read_csv(csv_path)

In [190]:
# Preview the first ten rows to check that the columns were parsed as expected.
dataset.head(n=10)

Unnamed: 0,Sample_code_number,Clump_Thickness,Uniformity_of_Cell_Size,Uniformity_of_Cell_Shape,Marginal_Adhesion,Single_Epithelial_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1.0,3,1,1,2
1,1002945,5,4,4,5,7,10.0,3,2,1,2
2,1015425,3,1,1,1,2,2.0,3,1,1,2
3,1016277,6,8,8,1,3,4.0,3,7,1,2
4,1017023,4,1,1,3,2,1.0,3,1,1,2
5,1017122,8,10,10,8,7,10.0,9,7,1,4
6,1018099,1,1,1,1,2,10.0,3,1,1,2
7,1018561,2,1,2,1,2,1.0,3,1,1,2
8,1033078,2,1,1,1,2,1.0,1,1,5,2
9,1033078,4,2,1,1,2,1.0,2,1,1,2


In [191]:
# Row count of the freshly loaded frame, before any cleaning is applied.
n_patients = dataset.shape[0]
print("# of patient in the original data: " + str(n_patients))

# of patient in the original data: 699


In [192]:
# Summarise column dtypes and non-null counts; a column whose non-null count is
# below the row count contains missing values.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
Sample_code_number             699 non-null int64
Clump_Thickness                699 non-null int64
Uniformity_of_Cell_Size        699 non-null int64
Uniformity_of_Cell_Shape       699 non-null int64
Marginal_Adhesion              699 non-null int64
Single_Epithelial_Cell_Size    699 non-null int64
Bare_Nuclei                    683 non-null float64
Bland_Chromatin                699 non-null int64
Normal_Nucleoli                699 non-null int64
Mitoses                        699 non-null int64
Class                          699 non-null int64
dtypes: float64(1), int64(10)
memory usage: 60.1 KB


In [193]:
# Boolean mask with the same shape as the data: True marks a missing cell.
dataset.isnull()

Unnamed: 0,Sample_code_number,Clump_Thickness,Uniformity_of_Cell_Size,Uniformity_of_Cell_Shape,Marginal_Adhesion,Single_Epithelial_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses,Class
0,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,False,False


In [194]:
# Count the missing cells in each column (sum of the boolean mask down the rows).
dataset.isnull().sum(axis=0)

Sample_code_number              0
Clump_Thickness                 0
Uniformity_of_Cell_Size         0
Uniformity_of_Cell_Shape        0
Marginal_Adhesion               0
Single_Epithelial_Cell_Size     0
Bare_Nuclei                    16
Bland_Chromatin                 0
Normal_Nucleoli                 0
Mitoses                         0
Class                           0
dtype: int64

In [195]:
# Discard every row that still has a missing value; rebinding the name keeps
# this cell a plain "input -> output" step instead of mutating in place.
dataset = dataset.dropna()

In [196]:
# Re-check the per-column missing counts; every column should now report 0.
dataset.isnull().sum(axis=0)

Sample_code_number             0
Clump_Thickness                0
Uniformity_of_Cell_Size        0
Uniformity_of_Cell_Shape       0
Marginal_Adhesion              0
Single_Epithelial_Cell_Size    0
Bare_Nuclei                    0
Bland_Chromatin                0
Normal_Nucleoli                0
Mitoses                        0
Class                          0
dtype: int64

In [197]:
# Remove the sample identifier column so it is not used as a model feature.
#
# The in-place version raised KeyError when this cell was executed a second
# time, because the column had already been removed from ``dataset``.
# ``errors="ignore"`` makes the cell safe to re-run, and rebinding the name
# avoids mutating the frame in place.
dataset = dataset.drop("Sample_code_number", axis=1, errors="ignore")

In [198]:
# Confirm the identifier column is gone by looking at the first five rows.
dataset.head(n=5)

Unnamed: 0,Clump_Thickness,Uniformity_of_Cell_Size,Uniformity_of_Cell_Shape,Marginal_Adhesion,Single_Epithelial_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses,Class
0,5,1,1,1,2,1.0,3,1,1,2
1,5,4,4,5,7,10.0,3,2,1,2
2,3,1,1,1,2,2.0,3,1,1,2
3,6,8,8,1,3,4.0,3,7,1,2
4,4,1,1,3,2,1.0,3,1,1,2


In [199]:
# Separate the target column from the feature columns.
y = dataset.loc[:, "Class"]
X = dataset.drop(labels="Class", axis=1)

In [200]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=4,
)

In [201]:
# The three classifiers being compared: logistic regression, a linear-kernel
# support vector machine, and k-nearest neighbours with k=3.
modelLRC, modelSVMC, modelKNNC = (
    LogisticRegression(),
    svm.SVC(kernel="linear"),
    KNeighborsClassifier(n_neighbors=3),
)

In [202]:
# Train logistic regression on the training split; the fitted estimator is the
# cell's displayed value.
modelLRC.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [203]:
# Train the linear-kernel SVM on the training split; the fitted estimator is
# the cell's displayed value.
modelSVMC.fit(X=X_train, y=y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [204]:
# Train the k-nearest-neighbours classifier on the training split; the fitted
# estimator is the cell's displayed value.
modelKNNC.fit(X=X_train, y=y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [205]:
# Predict the held-out labels with each fitted classifier, in the order
# logistic regression, SVM, KNN.
predictionLRC, predictionSVMC, predictionKNNC = (
    fitted_model.predict(X_test)
    for fitted_model in (modelLRC, modelSVMC, modelKNNC)
)

In [206]:
# Confusion matrix for logistic regression: rows are the true labels, columns
# the predicted labels.
print("Confusion Matrix for Logistic Regression: ")
confusion_matrix(y_true=y_test, y_pred=predictionLRC)

Confusion Matrix for Logistic Regression: 


array([[125,   7],
       [  3,  70]], dtype=int64)

In [207]:
# Confusion matrix for the SVM: rows are the true labels, columns the
# predicted labels.
print("Confusion Matrix for Support Vector Machine: ")
confusion_matrix(y_true=y_test, y_pred=predictionSVMC)

Confusion Matrix for Support Vector Machine: 


array([[122,  10],
       [  3,  70]], dtype=int64)

In [208]:
# Confusion matrix for KNN: rows are the true labels, columns the predicted
# labels.
print("Confusion Matrix for KNN where (N=3): ")
confusion_matrix(y_true=y_test, y_pred=predictionKNNC)

Confusion Matrix for KNN where (N=3): 


array([[126,   6],
       [  2,  71]], dtype=int64)

In [209]:
# Test-set accuracy of each classifier, expressed as a percentage.
LR, SVM, KNN = (
    accuracy_score(y_test, predicted_labels) * 100
    for predicted_labels in (predictionLRC, predictionSVMC, predictionKNNC)
)

In [210]:
#SVM = accuracy_score(y_test,predictionSVMC)*100

In [211]:
#KNN = accuracy_score(y_test,predictionKNNC)*100

In [214]:
# Print the three accuracy percentages under a common heading.
print("Accuracy Chart: ")
for chart_label, chart_score in (
    ("Logistic Regression: ", LR),
    ("Support Vector Machine: ", SVM),
    ("K-Nearest Neighbors (where k=3): ", KNN),
):
    print(chart_label, chart_score)

Accuracy Chart: 
Logistic Regression:  95.1219512195122
Support Vector Machine:  93.65853658536587
K-Nearest Neighbors (where k=3):  96.09756097560975


In [213]:
# Rank the three classifiers by test accuracy and report them best to worst.
#
# The earlier if/elif chain compared with strict ">" only, so when the two best
# models tied (for example LR == SVM > KNN) neither leading condition held and
# the final ``else`` announced K-Nearest Neighbor as the winner even though it
# scored lowest.  Sorting removes that failure mode.  ``sorted`` is stable, also
# with ``reverse=True``, so tied models keep the order in which they are listed
# here.  When all three scores differ the printed lines are unchanged.
#
# NOTE(review): "Vectro" is a typo for "Vector"; the label is left as-is so the
# printed text keeps matching the recorded output of this cell.
ranked_models = sorted(
    [
        ("Logistic Regression", LR),
        ("Support Vectro Machine", SVM),
        ("K-Nearest Neighbor", KNN),
    ],
    key=lambda entry: entry[1],
    reverse=True,
)

# One message suffix per rank, paired positionally with the sorted models.
rank_suffixes = (
    " has highest accuracy: ",
    " stands in the middle: ",
    " has least accuracy: ",
)

for (model_label, model_accuracy), rank_suffix in zip(ranked_models, rank_suffixes):
    print(model_label + rank_suffix, model_accuracy)

K-Nearest Neighbor has highest accuracy:  96.09756097560975
Logistic Regression stands in the middle:  95.1219512195122
Support Vectro Machine has least accuracy:  93.65853658536587
