
# Breast Cancer Classifier
### Data source: 'Breast Cancer Wisconsin (Original) Data Set'


## Importing the libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Importing the dataset

In [7]:
# Column order after the skipped first column:
#   Clump Thickness, Uniformity of Cell Size, Uniformity of Cell Shape,
#   Marginal Adhesion, Single Epithelial Cell Size, Bare Nuclei,
#   Bland Chromatin, Normal Nucleoli, Mitoses, Class
# Class labels: benign=2, malignant=4.
# This copy of the data is taken to have no missing values, so nothing is imputed
# (TODO confirm with dataset.isna().sum() if the CSV is ever replaced).
dataset = pd.read_csv('breast_cancer.csv')

# Feature matrix: every column except the first and the last.
# NOTE(review): column 0 is presumably a sample identifier - confirm against the CSV header.
feature_columns = dataset.iloc[:, 1:-1]
x = feature_columns.values
# Target vector: the last column (Class).
y = dataset.iloc[:, -1].values

# Preview the first five rows of each array.
print(x[:5])
print(y[:5])

[[ 5  1  1  1  2  1  3  1  1]
 [ 5  4  4  5  7 10  3  2  1]
 [ 3  1  1  1  2  2  3  1  1]
 [ 6  8  8  1  3  4  3  7  1]
 [ 4  1  1  3  2  1  3  1  1]]
[2 2 2 2 2]


## Splitting the dataset into the Training set and Test set

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

# Report each partition's shape: train/test features first, then train/test labels.
for partition in (x_train, x_test, y_train, y_test):
    print(partition.shape)

(546, 9)
(137, 9)
(546,)
(137,)


## Training the Logistic Regression model on the Training set

In [12]:
from sklearn.linear_model import LogisticRegression

# Only random_state is set here; every other hyper-parameter is left at its default.
lgClf = LogisticRegression(random_state=0)
# Fit on the training partition only. Kept as the cell's last expression so the
# fitted estimator is echoed as the cell output.
lgClf.fit(X=x_train, y=y_train)

LogisticRegression(random_state=0)

## Predicting the Test set results

In [14]:
# Predict a class label for every held-out row.
y_pred = lgClf.predict(X=x_test)

# Print the first five predictions, then the first five true labels, for a
# quick side-by-side sanity check.
for labels in (y_pred, y_test):
    print(labels[:5])

[2 2 4 4 2]
[2 2 4 4 2]


## Making the Confusion Matrix and computing evaluation metrics

In [38]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score

# Rows are the TRUE class and columns the PREDICTED class. The label order is
# pinned to [2, 4] (benign, malignant) so the layout below does not depend on
# which labels happen to occur in y_test / y_pred. With malignant (4) as the
# positive class the matrix reads:
#                 predicted 2   predicted 4
#   true 2 (neg)  [    tn     |     fp    ]
#   true 4 (pos)  [    fn     |     tp    ]
cMatrix = confusion_matrix(y_test, y_pred, labels=[2, 4])
accSc = accuracy_score(y_test, y_pred)
# Precision, recall and F1 all treat malignant (4) as the positive class.
presSc = precision_score(y_test, y_pred, pos_label=4)
recSc = recall_score(y_test, y_pred, pos_label=4)
# F1 is the harmonic mean of precision and recall, so it reflects both.
f1Sc = f1_score(y_test, y_pred, pos_label=4)
print("Confusion Matrix:\n" , cMatrix)
print("Accuracy:\n" , accSc)
print("Precision:\n" , presSc)
print("Recall:\n" , recSc)
print("F1 Score:\n" , f1Sc)

Confusion Matrix:
 [[84  3]
 [ 3 47]]
Accuracy:
 0.9562043795620438
Precision:
 0.94
Recall:
 0.94
F1 Score:
 0.94


## Computing the accuracy with k-Fold Cross Validation

In [39]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation is a common default.
# cross_val_score does NOT reuse the fit already stored in lgClf: it clones the
# estimator and refits the clone on each fold's training part, then scores it on
# that fold's held-out part. lgClf itself is left untouched.
# The mean summarises the expected accuracy; the standard deviation shows how
# much that accuracy varies from fold to fold.
# scoring is spelled out so the metric does not silently depend on the
# estimator's default scorer.
accuracies = cross_val_score(estimator=lgClf, X=x_train, y=y_train, scoring="accuracy", cv=10)
print("Accuracies: \n", accuracies)
print("Mean Acc: \n", accuracies.mean())
print("Standard Deviation of Acc: \n", accuracies.std())

Accuracies: 
 [0.94545455 0.96363636 0.96363636 1.         0.94545455 1.
 0.96296296 0.96296296 0.98148148 0.94444444]
Mean Acc: 
 0.967003367003367
Standard Deviation of Acc: 
 0.019697976894447813
