In [259]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

# 01. Loads the data and creates a Pandas DataFrame

In [260]:
# Read the CSV (path is relative to the notebook's working directory) into a DataFrame.
dataset = pd.read_csv(filepath_or_buffer="breast_cancer_data.csv")

# 02. Prints the number of features in the dataset.

In [261]:
# Echo every column header, one per line, then report how many there are.
# The total covers every column in the frame, not just the predictor columns.
for column_name in dataset.columns:
    print(column_name)

count = len(dataset.columns)
print("\n#Number of features: ", count)

id
diagnosis
radius_mean
texture_mean
perimeter_mean
area_mean
smoothness_mean
compactness_mean
concavity_mean
concave points_mean
symmetry_mean
fractal_dimension_mean
radius_se
texture_se
perimeter_se
area_se
smoothness_se
compactness_se
concavity_se
concave points_se
symmetry_se
fractal_dimension_se
radius_worst
texture_worst
perimeter_worst
area_worst
smoothness_worst
compactness_worst
concavity_worst
concave points_worst
symmetry_worst
fractal_dimension_worst
Unnamed: 32

#Number of features:  33


# 03. Number of Samples in each class

In [262]:
# Tally how many rows carry each diagnosis label.
diagnosis_counts = dataset["diagnosis"].value_counts()
diagnosis_counts

B    357
M    212
Name: diagnosis, dtype: int64

# Data Wrangling

In [263]:
# Per-column count of missing values (isna is the same check as isnull).
dataset.isna().sum()

id                           0
diagnosis                    0
radius_mean                  0
texture_mean                 0
perimeter_mean               0
area_mean                    0
smoothness_mean              0
compactness_mean             0
concavity_mean               0
concave points_mean          0
symmetry_mean                0
fractal_dimension_mean       0
radius_se                    0
texture_se                   0
perimeter_se                 0
area_se                      0
smoothness_se                0
compactness_se               0
concavity_se                 0
concave points_se            0
symmetry_se                  0
fractal_dimension_se         0
radius_worst                 0
texture_worst                0
perimeter_worst              0
area_worst                   0
smoothness_worst             0
compactness_worst            0
concavity_worst              0
concave points_worst         0
symmetry_worst               0
fractal_dimension_worst      0
Unnamed:

In [264]:
# Discard every column that contains at least one missing value.
dataset = dataset.dropna(axis="columns")

In [265]:
# Remove the "id" column so it is not fed to the classifiers as a feature.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this
# cell idempotent: re-running it after "id" is already gone no longer raises
# KeyError.
dataset = dataset.drop(columns="id", errors="ignore")

# 04. Randomly splits the dataset into Training and Test sets

In [266]:
# Target is the diagnosis label; every remaining column is a predictor.
y = dataset["diagnosis"]
X = dataset.drop(columns="diagnosis")

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

# 05. Train different classifiers

In [267]:
def models(X_train,Y_train):
    """Fit five classifiers on the same training data.

    Parameters
    ----------
    X_train : training feature matrix passed to each estimator's ``fit``.
    Y_train : training labels passed to each estimator's ``fit``.

    Returns
    -------
    tuple
        The fitted estimators, in this fixed order: logistic regression,
        k-nearest neighbours (k=3), linear-kernel SVM, decision tree,
        Gaussian naive Bayes.
    """
    # Order matters: callers index the returned tuple by position.
    classifiers = (
        LogisticRegression(),
        KNeighborsClassifier(n_neighbors=3),
        SVC(kernel='linear'),
        DecisionTreeClassifier(),
        GaussianNB(),
    )

    for classifier in classifiers:
        classifier.fit(X_train, Y_train)

    return classifiers

In [268]:
# Tuple of fitted classifiers; position i is referred to as "Model i" below.
model = models(X_train=X_train, Y_train=y_train)

# 06. Predict the class for each test sample.

In [269]:
# Print each fitted classifier's predicted labels for the held-out rows.
for position, classifier in enumerate(model):
    print('Model', position)
    prediction = classifier.predict(X_test)
    print(prediction)
    print()

Model 0
['M' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'M' 'B' 'M' 'B' 'M'
 'M' 'M' 'M' 'M' 'B' 'B' 'M' 'B' 'B' 'M' 'B' 'M' 'B' 'M' 'B' 'M' 'B' 'M'
 'B' 'M' 'B' 'M' 'M' 'B' 'M' 'B' 'M' 'M' 'B' 'B' 'B' 'M' 'M' 'M' 'M' 'B'
 'B' 'B' 'B' 'B' 'B' 'M' 'M' 'M' 'B' 'B' 'M' 'B' 'M' 'M' 'M' 'B' 'M' 'M'
 'B' 'B' 'M' 'B' 'B' 'B' 'B' 'B' 'M' 'M' 'M' 'B' 'M' 'B' 'B' 'B' 'M' 'M'
 'B' 'M' 'M' 'M' 'B' 'B' 'M' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'M' 'B' 'M' 'B'
 'M' 'M' 'B' 'M' 'M' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'M' 'B' 'M' 'B'
 'M' 'B' 'B' 'B' 'M' 'B' 'B' 'B' 'B' 'B' 'B' 'M' 'M' 'B' 'B' 'B' 'M' 'B'
 'B' 'M' 'B' 'M' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'M' 'B' 'M' 'B' 'M' 'M' 'B'
 'B' 'M' 'B' 'M' 'M' 'M' 'B' 'B' 'B']

Model 1
['M' 'B' 'B' 'M' 'B' 'B' 'B' 'B' 'B' 'B' 'M' 'B' 'B' 'M' 'M' 'M' 'B' 'M'
 'M' 'M' 'M' 'M' 'B' 'B' 'M' 'B' 'B' 'B' 'B' 'M' 'B' 'M' 'B' 'M' 'B' 'M'
 'B' 'M' 'B' 'M' 'M' 'B' 'M' 'B' 'M' 'M' 'B' 'B' 'B' 'M' 'M' 'B' 'M' 'B'
 'B' 'B' 'B' 'B' 'B' 'M' 'M' 'M' 'B' 'B' 'M' 'B' 'M' 'M' 'M' 'B' 'B' 

# 07. Measure the accuracy of each classifier and compare them.

In [270]:
# Reuse the classifiers already fitted by models() rather than constructing and
# fitting a second set. This way the accuracies below describe the very same
# estimator instances whose predictions and confusion matrices are shown in
# the neighbouring cells, and nothing is trained twice.
modelLRC, modelKNNC, modelSVMC, modelDTC, modelNBC = model

# Accuracy (in percent) measured on the TRAINING split, i.e. the data the
# models were fitted on. NOTE(review): these numbers are not held-out
# performance; a model that memorises its training rows scores perfectly here
# while still making errors on X_test.
LR = modelLRC.score(X_train, y_train)*100
KNN = modelKNNC.score(X_train, y_train)*100
SVM = modelSVMC.score(X_train, y_train)*100
DT = modelDTC.score(X_train, y_train)*100
NB = modelNBC.score(X_train, y_train)*100

In [271]:
# Confusion matrix of true vs. predicted test labels for each fitted classifier.
for position, classifier in enumerate(model):
    cnf_matrix = confusion_matrix(y_test, classifier.predict(X_test))
    print('Model :', position)
    print(cnf_matrix)
    print()

Model : 0
[[103   5]
 [  1  62]]

Model : 1
[[101   7]
 [  7  56]]

Model : 2
[[103   5]
 [  2  61]]

Model : 3
[[98 10]
 [ 4 59]]

Model : 4
[[101   7]
 [  6  57]]



In [272]:
# Label/score pairs, listed in the same positional order as the model tuple.
accuracy_rows = (
    ('[0]Logistic Regression Training Accuracy:', LR),
    ('[1]K Nearest Neighbor Training Accuracy', KNN),
    ('[2]Support Vector Machine Training Accuracy:', SVM),
    ('[3]Decision Tree Training Accuracy:', DT),
    ('[4]Naive Bayes Training Accuracy:', NB),
)

print('#Accuracy chart: ')
print()
for label, score in accuracy_rows:
    print(label, score)

#Accuracy chart: 

[0]Logistic Regression Training Accuracy: 95.7286432160804
[1]K Nearest Neighbor Training Accuracy 95.22613065326632
[2]Support Vector Machine Training Accuracy: 96.4824120603015
[3]Decision Tree Training Accuracy: 100.0
[4]Naive Bayes Training Accuracy: 94.22110552763819


In [273]:
# Report which classifier(s) reached the highest accuracy score.
#
# The previous if/elif chain used strict ">" comparisons, so whenever two
# models tied for first place no branch matched and the final "else"
# announced Naive Bayes regardless of its score. Taking the maximum and
# printing every model that reaches it handles ties correctly and produces
# the same single line as before when there is a unique winner.
accuracy_by_model = {
    "Logistic Regression": LR,
    "K-Nearest Neighbor": KNN,
    "Support Vector Machine": SVM,
    "Decision Tree": DT,
    "Naive Bayes": NB,
}

best_accuracy = max(accuracy_by_model.values())

for model_name, accuracy in accuracy_by_model.items():
    if accuracy == best_accuracy:
        print(model_name + " has highest accuracy: ", accuracy)

Decision Tree has highest accuracy:  100.0
