In [4]:
import numpy as np
import pandas as pd
import os
from collections import Counter

import mode
from mode import process_data,visualize_data,mean_parameter,se_parameter,worst_parameter
from mode import split_model,ttdata

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import sklearn
from sklearn import metrics
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,auc,roc_curve

from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

import warnings
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this also hides any warnings raised by the estimators below.
warnings.filterwarnings(action='ignore')

# Seed NumPy's global random number generator.
np.random.seed(seed=1)

# Loading the Data

In [5]:
# Read the dataset from the CSV file, resolved relative to the working directory.
df = pd.read_csv(filepath_or_buffer='breast-cancer.csv')

In [6]:
# Show the column labels of the loaded frame.
column_names = df.columns
print("Column names in data:", column_names)

Column names in data: Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst', 'Unnamed: 32'],
      dtype='object')


# Splitting the data into training and test sets (test_size of 20 %) — mean features

In [12]:
# Partition the full frame into train and test subsets with the project helper
# imported from `mode`.
train, test = split_model(df)

train data shape (455, 31)
test data shape (114, 31)


In [13]:
# Build the train/test feature and label objects via the project helper.
# selected_parameter_mean is defined outside this view — presumably the list of
# "*_mean" feature columns; confirm against the cell that creates it.
train_X, train_y, test_X, test_y = ttdata(train, test, selected_parameter_mean)

# Logistic Regression Classifier

In [14]:
# Logistic regression on the current split: fit on the training set, score on
# the held-out test set, and run 5-fold cross-validation on the training set.
np.random.seed(1)  # reset NumPy's global RNG at the start of the cell
clf = LogisticRegression()
clf.fit(train_X, train_y)
prediction = clf.predict(test_X)

# accuracy_score is documented as (y_true, y_pred); compute it once and reuse it.
test_accuracy = accuracy_score(test_y, prediction)
scores = cross_val_score(clf, train_X, train_y, cv=5)
cv_mean = np.mean(scores)

# Record both metrics in the running result lists
# (mean_acc / mean_cvs are assumed to be initialised in a cell not shown here).
mean_acc.append(test_accuracy)
mean_cvs.append(cv_mean)

print("model accuracy: {0:.2%}".format(test_accuracy))
print("Cross validation score: {0:.2%} (+/- {1:.2%})".format(cv_mean, np.std(scores)*2))

model accuracy: 90.35%
Cross validation score: 89.25% (+/- 7.70%)


# Nearest Neighbors Classifier

In [15]:
# k-nearest-neighbours on the current split: fit on the training set, score on
# the held-out test set, and run 5-fold cross-validation on the training set.
np.random.seed(1)  # reset NumPy's global RNG at the start of the cell
clf = KNeighborsClassifier(n_neighbors=15,p=2,algorithm='kd_tree',leaf_size=20)
clf.fit(train_X, train_y)
prediction = clf.predict(test_X)

# accuracy_score is documented as (y_true, y_pred); compute it once and reuse it.
test_accuracy = accuracy_score(test_y, prediction)
scores = cross_val_score(clf, train_X, train_y, cv=5)
cv_mean = np.mean(scores)

# Record both metrics in the running result lists
# (mean_acc / mean_cvs are assumed to be initialised in a cell not shown here).
mean_acc.append(test_accuracy)
mean_cvs.append(cv_mean)

print("model accuracy: {0:.2%}".format(test_accuracy))
print("Cross validation score: {0:.2%} (+/- {1:.2%})".format(cv_mean, np.std(scores)*2))

model accuracy: 86.84%
Cross validation score: 90.57% (+/- 5.92%)


# Decision Tree Classifier

In [16]:
# Decision tree on the current split: fit on the training set, score on the
# held-out test set, and run 5-fold cross-validation on the training set.
np.random.seed(1)  # reset NumPy's global RNG at the start of the cell
clf = DecisionTreeClassifier(splitter='best')
clf.fit(train_X, train_y)
prediction = clf.predict(test_X)

# accuracy_score is documented as (y_true, y_pred); compute it once and reuse it.
test_accuracy = accuracy_score(test_y, prediction)
scores = cross_val_score(clf, train_X, train_y, cv=5)
cv_mean = np.mean(scores)

# Record both metrics in the running result lists
# (mean_acc / mean_cvs are assumed to be initialised in a cell not shown here).
mean_acc.append(test_accuracy)
mean_cvs.append(cv_mean)

print("model accuracy: {0:.2%}".format(test_accuracy))
print("Cross validation score: {0:.2%} (+/- {1:.2%})".format(cv_mean, np.std(scores)*2))

model accuracy: 85.09%
Cross validation score: 92.32% (+/- 4.72%)


# Random Forest Classifier

In [17]:
# Random forest on the current split: fit on the training set, score on the
# held-out test set, and run 5-fold cross-validation on the training set.
np.random.seed(1)  # reset NumPy's global RNG at the start of the cell
clf = RandomForestClassifier(n_estimators=100)
clf.fit(train_X, train_y)
prediction = clf.predict(test_X)

# accuracy_score is documented as (y_true, y_pred); compute it once and reuse it.
test_accuracy = accuracy_score(test_y, prediction)
scores = cross_val_score(clf, train_X, train_y, cv=5)
cv_mean = np.mean(scores)

# Record both metrics in the running result lists
# (mean_acc / mean_cvs are assumed to be initialised in a cell not shown here).
mean_acc.append(test_accuracy)
mean_cvs.append(cv_mean)

print("model accuracy: {0:.2%}".format(test_accuracy))
print("Cross validation score: {0:.2%} (+/- {1:.2%})".format(cv_mean, np.std(scores)*2))

model accuracy: 87.72%
Cross validation score: 93.19% (+/- 2.54%)


# Naive Bayes Classifier

In [18]:
# Gaussian naive Bayes on the current split: fit on the training set, score on
# the held-out test set, and run 5-fold cross-validation on the training set.
np.random.seed(1)  # reset NumPy's global RNG at the start of the cell
clf = GaussianNB(var_smoothing=1e-7)
clf.fit(train_X, train_y)
prediction = clf.predict(test_X)

# accuracy_score is documented as (y_true, y_pred); compute it once and reuse it.
test_accuracy = accuracy_score(test_y, prediction)
scores = cross_val_score(clf, train_X, train_y, cv=5)
cv_mean = np.mean(scores)

# Record both metrics in the running result lists
# (mean_acc / mean_cvs are assumed to be initialised in a cell not shown here).
mean_acc.append(test_accuracy)
mean_cvs.append(cv_mean)

print("model accuracy: {0:.2%}".format(test_accuracy))
print("Cross validation score: {0:.2%} (+/- {1:.2%})".format(cv_mean, np.std(scores)*2))

model accuracy: 89.47%
Cross validation score: 91.66% (+/- 5.59%)


# Splitting the data into training and test sets (test_size of 20 %) — standard-error (se) features

In [27]:
# Partition the full frame into train and test subsets with the project helper
# imported from `mode`.
train, test = split_model(df)

train data shape (455, 31)
test data shape (114, 31)


In [28]:
# Build the train/test feature and label objects via the project helper.
# selected_parameter_se is defined outside this view — presumably the list of
# "*_se" feature columns; confirm against the cell that creates it.
train_X, train_y, test_X, test_y = ttdata(train, test, selected_parameter_se)

# Logistic Regression Classifier

In [29]:
# Logistic regression on the current split: fit on the training set, score on
# the held-out test set, and run 5-fold cross-validation on the training set.
np.random.seed(1)  # reset NumPy's global RNG at the start of the cell
clf = LogisticRegression(tol=1e-4)
clf.fit(train_X, train_y)
prediction = clf.predict(test_X)

# accuracy_score is documented as (y_true, y_pred); compute it once and reuse it.
test_accuracy = accuracy_score(test_y, prediction)
scores = cross_val_score(clf, train_X, train_y, cv=5)
cv_mean = np.mean(scores)

# Record both metrics in the running result lists
# (se_acc / se_cvs are assumed to be initialised in a cell not shown here).
se_acc.append(test_accuracy)
se_cvs.append(cv_mean)

print("model accuracy: {0:.2%}".format(test_accuracy))
print("Cross validation score: {0:.2%} (+/- {1:.2%})".format(cv_mean, np.std(scores)*2))

model accuracy: 89.47%
Cross validation score: 80.89% (+/- 7.50%)


# Nearest Neighbors Classifier

In [30]:
# k-nearest-neighbours on the current split: fit on the training set, score on
# the held-out test set, and run 5-fold cross-validation on the training set.
np.random.seed(1)  # reset NumPy's global RNG at the start of the cell
clf = KNeighborsClassifier(n_neighbors=15,p=2,algorithm='ball_tree',leaf_size=20)
clf.fit(train_X, train_y)
prediction = clf.predict(test_X)

# accuracy_score is documented as (y_true, y_pred); compute it once and reuse it.
test_accuracy = accuracy_score(test_y, prediction)
scores = cross_val_score(clf, train_X, train_y, cv=5)
cv_mean = np.mean(scores)

# Record both metrics in the running result lists
# (se_acc / se_cvs are assumed to be initialised in a cell not shown here).
se_acc.append(test_accuracy)
se_cvs.append(cv_mean)

print("model accuracy: {0:.2%}".format(test_accuracy))
print("Cross validation score: {0:.2%} (+/- {1:.2%})".format(cv_mean, np.std(scores)*2))

model accuracy: 86.84%
Cross validation score: 81.33% (+/- 6.69%)


# Decision Tree Classifier

In [31]:
# Decision tree on the current split: fit on the training set, score on the
# held-out test set, and run 5-fold cross-validation on the training set.
np.random.seed(1)  # reset NumPy's global RNG at the start of the cell
clf = DecisionTreeClassifier(splitter='best')
clf.fit(train_X, train_y)
prediction = clf.predict(test_X)

# accuracy_score is documented as (y_true, y_pred); compute it once and reuse it.
test_accuracy = accuracy_score(test_y, prediction)
scores = cross_val_score(clf, train_X, train_y, cv=5)
cv_mean = np.mean(scores)

# Record both metrics in the running result lists
# (se_acc / se_cvs are assumed to be initialised in a cell not shown here).
se_acc.append(test_accuracy)
se_cvs.append(cv_mean)

print("model accuracy: {0:.2%}".format(test_accuracy))
print("Cross validation score: {0:.2%} (+/- {1:.2%})".format(cv_mean, np.std(scores)*2))

model accuracy: 82.46%
Cross validation score: 76.04% (+/- 2.66%)


# Random Forest Classifier

In [32]:
# Random forest on the current split: fit on the training set, score on the
# held-out test set, and run 5-fold cross-validation on the training set.
np.random.seed(1)  # reset NumPy's global RNG at the start of the cell
clf = RandomForestClassifier(n_estimators=100)
clf.fit(train_X, train_y)
prediction = clf.predict(test_X)

# accuracy_score is documented as (y_true, y_pred); compute it once and reuse it.
test_accuracy = accuracy_score(test_y, prediction)
scores = cross_val_score(clf, train_X, train_y, cv=5)
cv_mean = np.mean(scores)

# Record both metrics in the running result lists
# (se_acc / se_cvs are assumed to be initialised in a cell not shown here).
se_acc.append(test_accuracy)
se_cvs.append(cv_mean)

print("model accuracy: {0:.2%}".format(test_accuracy))
print("Cross validation score: {0:.2%} (+/- {1:.2%})".format(cv_mean, np.std(scores)*2))

model accuracy: 83.33%
Cross validation score: 80.22% (+/- 4.40%)


# Naive Bayes Classifier

In [33]:
# Gaussian naive Bayes on the current split: fit on the training set, score on
# the held-out test set, and run 5-fold cross-validation on the training set.
np.random.seed(1)  # reset NumPy's global RNG at the start of the cell
clf = GaussianNB(var_smoothing=1e-7)
clf.fit(train_X, train_y)
prediction = clf.predict(test_X)

# accuracy_score is documented as (y_true, y_pred); compute it once and reuse it.
test_accuracy = accuracy_score(test_y, prediction)
scores = cross_val_score(clf, train_X, train_y, cv=5)
cv_mean = np.mean(scores)

# Record both metrics in the running result lists
# (se_acc / se_cvs are assumed to be initialised in a cell not shown here).
se_acc.append(test_accuracy)
se_cvs.append(cv_mean)

print("model accuracy: {0:.2%}".format(test_accuracy))
print("Cross validation score: {0:.2%} (+/- {1:.2%})".format(cv_mean, np.std(scores)*2))

model accuracy: 89.47%
Cross validation score: 76.50% (+/- 5.58%)


# Splitting the data into training and test sets (test_size of 20 %) — worst-value features

In [42]:
# Partition the full frame into train and test subsets with the project helper
# imported from `mode`.
train, test = split_model(df)

train data shape (455, 31)
test data shape (114, 31)


In [43]:
# Build the train/test feature and label objects via the project helper.
# selected_parameter_worst is defined outside this view — presumably the list of
# "*_worst" feature columns; confirm against the cell that creates it.
train_X, train_y, test_X, test_y = ttdata(train, test, selected_parameter_worst)

# Logistic Regression Classifier

In [44]:
# Logistic regression on the current split: fit on the training set, score on
# the held-out test set, and run 5-fold cross-validation on the training set.
np.random.seed(1)  # reset NumPy's global RNG at the start of the cell
clf = LogisticRegression()
clf.fit(train_X, train_y)
prediction = clf.predict(test_X)

# accuracy_score is documented as (y_true, y_pred); compute it once and reuse it.
test_accuracy = accuracy_score(test_y, prediction)
scores = cross_val_score(clf, train_X, train_y, cv=5)
cv_mean = np.mean(scores)

# Record both metrics in the running result lists
# (worst_acc / worst_cvs are assumed to be initialised in a cell not shown here).
worst_acc.append(test_accuracy)
worst_cvs.append(cv_mean)

print("model accuracy: {0:.2%}".format(test_accuracy))
print("Cross validation score: {0:.2%} (+/- {1:.2%})".format(cv_mean, np.std(scores)*2))

model accuracy: 95.61%
Cross validation score: 94.07% (+/- 3.26%)


# Nearest Neighbors Classifier

In [45]:
# k-nearest-neighbours on the current split: fit on the training set, score on
# the held-out test set, and run 5-fold cross-validation on the training set.
np.random.seed(1)  # reset NumPy's global RNG at the start of the cell
clf = KNeighborsClassifier(n_neighbors=15,p=2,algorithm='kd_tree',leaf_size=20)
clf.fit(train_X, train_y)
prediction = clf.predict(test_X)

# accuracy_score is documented as (y_true, y_pred); compute it once and reuse it.
test_accuracy = accuracy_score(test_y, prediction)
scores = cross_val_score(clf, train_X, train_y, cv=5)
cv_mean = np.mean(scores)

# Record both metrics in the running result lists
# (worst_acc / worst_cvs are assumed to be initialised in a cell not shown here).
worst_acc.append(test_accuracy)
worst_cvs.append(cv_mean)

print("model accuracy: {0:.2%}".format(test_accuracy))
print("Cross validation score: {0:.2%} (+/- {1:.2%})".format(cv_mean, np.std(scores)*2))

model accuracy: 93.86%
Cross validation score: 94.95% (+/- 1.69%)


# Decision Tree Classifier

In [46]:
# Decision tree on the current split: fit on the training set, score on the
# held-out test set, and run 5-fold cross-validation on the training set.
np.random.seed(1)  # reset NumPy's global RNG at the start of the cell
clf = DecisionTreeClassifier(splitter='best')
clf.fit(train_X, train_y)
prediction = clf.predict(test_X)

# accuracy_score is documented as (y_true, y_pred); compute it once and reuse it.
test_accuracy = accuracy_score(test_y, prediction)
scores = cross_val_score(clf, train_X, train_y, cv=5)
cv_mean = np.mean(scores)

# Record both metrics in the running result lists
# (worst_acc / worst_cvs are assumed to be initialised in a cell not shown here).
worst_acc.append(test_accuracy)
worst_cvs.append(cv_mean)

print("model accuracy: {0:.2%}".format(test_accuracy))
print("Cross validation score: {0:.2%} (+/- {1:.2%})".format(cv_mean, np.std(scores)*2))

model accuracy: 89.47%
Cross validation score: 94.07% (+/- 2.20%)


# Random Forest Classifier

In [47]:
# Random forest on the current split: fit on the training set, score on the
# held-out test set, and run 5-fold cross-validation on the training set.
np.random.seed(1)  # reset NumPy's global RNG at the start of the cell
clf = RandomForestClassifier(n_estimators=100)
clf.fit(train_X, train_y)
prediction = clf.predict(test_X)

# accuracy_score is documented as (y_true, y_pred); compute it once and reuse it.
test_accuracy = accuracy_score(test_y, prediction)
scores = cross_val_score(clf, train_X, train_y, cv=5)
cv_mean = np.mean(scores)

# Record both metrics in the running result lists
# (worst_acc / worst_cvs are assumed to be initialised in a cell not shown here).
worst_acc.append(test_accuracy)
worst_cvs.append(cv_mean)

print("model accuracy: {0:.2%}".format(test_accuracy))
print("Cross validation score: {0:.2%} (+/- {1:.2%})".format(cv_mean, np.std(scores)*2))

model accuracy: 94.74%
Cross validation score: 96.26% (+/- 1.76%)


# Naive Bayes Classifier

In [48]:
# Gaussian naive Bayes on the current split: fit on the training set, score on
# the held-out test set, and run 5-fold cross-validation on the training set.
np.random.seed(1)  # reset NumPy's global RNG at the start of the cell
clf = GaussianNB(var_smoothing=1e-7)
clf.fit(train_X, train_y)
prediction = clf.predict(test_X)

# accuracy_score is documented as (y_true, y_pred); compute it once and reuse it.
test_accuracy = accuracy_score(test_y, prediction)
scores = cross_val_score(clf, train_X, train_y, cv=5)
cv_mean = np.mean(scores)

# Record both metrics in the running result lists
# (worst_acc / worst_cvs are assumed to be initialised in a cell not shown here).
worst_acc.append(test_accuracy)
worst_cvs.append(cv_mean)

print("model accuracy: {0:.2%}".format(test_accuracy))
print("Cross validation score: {0:.2%} (+/- {1:.2%})".format(cv_mean, np.std(scores)*2))

model accuracy: 95.61%
Cross validation score: 95.17% (+/- 2.61%)


In [65]:
# Report accuracy and the per-class precision/recall/F1 table for the current
# `prediction` against `test_y`.
# NOTE(review): both names are notebook globals, so this summarises whichever
# model cell assigned `prediction` most recently — confirm that is the intended model.
final_accuracy = accuracy_score(test_y, prediction)
print("Accuracy score %f" % final_accuracy)
final_report = classification_report(test_y, prediction)
print(final_report)

Accuracy score 0.956140
              precision    recall  f1-score   support

           0       1.00      0.94      0.97        88
           1       0.84      1.00      0.91        26

    accuracy                           0.96       114
   macro avg       0.92      0.97      0.94       114
weighted avg       0.96      0.96      0.96       114

