<div>
    <img src="https://storage.googleapis.com/kaggle-datasets-images/180/384/3da2510581f9d3b902307ff8d06fe327/dataset-cover.jpg" />
</div>

In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle

<h1 id="dataset" style="color:#5f20aa; background:white; border:0.5px dotted #5f20aa;"> 
    <center>Dataset
        <a class="anchor-link" href="#dataset" target="_self">¶</a>
    </center>
</h1>

## Load dataset

In [None]:
path = "../input/breast-cancer-wisconsin-data/data.csv"
df = pd.read_csv(path)
# Shuffle with a fixed seed: the train/test split further down is positional,
# so an unseeded shuffle would produce a different split (and different
# metrics) on every run of the notebook.
df = shuffle(df, random_state=234242)
df.head()

## Drop non-useful columns

In [None]:
# Remove both unused columns from the frame in a single in-place call.
df.drop(columns=['id', 'Unnamed: 32'], inplace=True)

## Encode the diagnosis: benign (0) or malignant (1)

In [None]:
# Numeric encoding of the diagnosis label: benign -> 0, malignant -> 1.
categories = {'B':0, 'M':1}
# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: an in-place call on `df['diagnosis']` acts on an
# intermediate Series and is not guaranteed to write through to `df`.
# The cast makes the column integer-typed and fails loudly if an unmapped
# value is left over. Re-running the cell is harmless (0/1 stay 0/1).
df['diagnosis'] = df['diagnosis'].replace(categories).astype(int)
df.head()

## Columns information

In [None]:
# Summary statistics, transposed so that each column of df becomes a row.
df.describe().transpose()

In [None]:
# True if at least one cell of the frame is missing, False otherwise.
df.isna().values.any()

## Correlation heatmap

In [None]:
# Pairwise correlation between all columns of df; reused by the cells below.
corr = df.corr()

# Annotated heatmap with the column names on both axes.
plt.figure(figsize=(16, 8))
sns.heatmap(corr, annot=True, xticklabels=corr.columns, yticklabels=corr.columns)

In [None]:
# Keep the names of the columns whose correlation with 'diagnosis' exceeds 0.7.
# Only positive correlations pass this threshold, and the 'diagnosis' column
# itself is part of the result.
diagnosis_corr = corr['diagnosis']
best_correlated = diagnosis_corr[diagnosis_corr > 0.7].index.tolist()
print(best_correlated)

In [None]:
# Model inputs: every strongly correlated column except the target itself.
# Filtering by name (instead of slicing off position 0) keeps 'diagnosis' out
# of the features wherever it appears in `best_correlated`, so the label can
# never leak into the inputs.
feature_columns = [col for col in best_correlated if col != 'diagnosis']
features = df[feature_columns]
# Target vector as a NumPy array.
labels = df['diagnosis'].values

## Min-Max scaling to normalize columns

In [None]:
# Fit the scaler on the feature columns and replace `features` with the
# scaled values in one step.
# NOTE(review): the scaler is fitted on every row, including the rows that are
# later sliced off as the test set.
scaler = MinMaxScaler()
features = scaler.fit_transform(features)

## Train/Test split

In [None]:
# The first 8**3 = 512 rows form the training set (a whole number of batches
# of 8); every remaining row goes to the test set.
limit = 8 ** 3

X_train, y_train = features[:limit], labels[:limit]
X_test, y_test = features[limit:], labels[limit:]

<h1 id="loss" style="color:#5f20aa; background:white; border:0.5px dotted #5f20aa;"> 
    <center>Loss function
        <a class="anchor-link" href="#loss" target="_self">¶</a>
    </center>
</h1>

In [None]:
def cross_entropy(y_hat, y, eps=1e-12):
    """Binary cross-entropy of a predicted probability against a 0/1 label.

    Args:
        y_hat: Predicted probability of the positive class (scalar or
            NumPy array).
        y: True label; 1 selects the positive-class term, any other value
            selects the negative-class term.
        eps: Predictions are clipped to [eps, 1 - eps] before taking the
            logarithm, so a saturated prediction (exactly 0 or 1) yields a
            large finite loss instead of inf.

    Returns:
        -log(y_hat) when y == 1, otherwise -log(1 - y_hat), evaluated on the
        clipped prediction.
    """
    y_hat = np.clip(y_hat, eps, 1 - eps)
    if y == 1:
        return -np.log(y_hat)
    return -np.log(1 - y_hat)
    
def dcross_entropy(y_hat, y, eps=1e-12):
    """Derivative of the binary cross-entropy with respect to the prediction.

    Args:
        y_hat: Predicted probability of the positive class (scalar or
            NumPy array).
        y: True label; 1 selects the positive-class term, any other value
            selects the negative-class term.
        eps: Predictions are clipped to [eps, 1 - eps] before dividing, so a
            saturated prediction (exactly 0 or 1) yields a large finite
            gradient instead of a division by zero.

    Returns:
        -1 / y_hat when y == 1, otherwise 1 / (1 - y_hat), evaluated on the
        clipped prediction.
    """
    y_hat = np.clip(y_hat, eps, 1 - eps)
    if y == 1:
        return -1 / y_hat
    return 1 / (1 - y_hat)

<h1 id="activation" style="color:#5f20aa; background:white; border:0.5px dotted #5f20aa;"> 
    <center>Activation function
        <a class="anchor-link" href="#activation" target="_self">¶</a>
    </center>
</h1>

In [None]:
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), applied element-wise to x."""
    exp_neg = np.exp(-x)
    return 1 / (1 + exp_neg)

def dsigmoid(x):
    """Derivative of the logistic function, evaluated at x.

    x is the value the sigmoid is applied to (the pre-activation), not an
    already-activated sigmoid output: this function calls sigmoid(x) itself.
    """
    activated = sigmoid(x)
    return activated * (1 - activated)

<h1 id="training" style="color:#5f20aa; background:white; border:0.5px dotted #5f20aa;"> 
    <center>Training
        <a class="anchor-link" href="#training" target="_self">¶</a>
    </center>
</h1>

## Parameters

In [None]:
# Seed NumPy's global random generator before any weights are drawn.
np.random.seed(234242)

lr = 0.01        # learning rate: factor applied to the gradients in each update
epochs = 100     # number of passes over the training batches
batch_size = 8   # number of samples per batch

## Weights, biases and batches

In [None]:
# One weight per input feature. The weight vector must have as many rows as
# X_train has columns for `X @ w` to be defined; sizing it by `batch_size`
# only works when the number of selected features happens to equal the batch
# size.
n_features = X_train.shape[1]
w = np.random.uniform(low=-0.01, high=0.01, size=(n_features, 1))
# Bias column with one entry per row of a batch; it is added element-wise to
# the (batch_size, 1) output of `X @ w`.
b = np.random.uniform(low=-0.01, high=0.01, size=(batch_size, 1))

# Shuffle the training-row indices once and cut them into
# len(X_train) // batch_size groups; every epoch iterates over these groups.
batches = np.array_split(shuffle(np.arange(len(X_train))),
                         len(X_train) // batch_size)

## Training loop

In [None]:
# Per-epoch history of the mean batch loss and of the training accuracy.
total_losses = []
total_accuracies = []

for epoch in range(epochs):

    # Running sums over the batches of this epoch.
    losses = 0
    accuracies = 0

    for batch in batches:
        # `batches` holds row indices of the training split, so read the rows
        # from X_train / y_train directly.
        X, y = X_train[batch], y_train[batch]

        # Forward pass. The pre-activation is kept because the backward pass
        # needs it.
        logits = X @ w + b
        z = sigmoid(logits)
        l = np.array([cross_entropy(z[i], target).tolist()
                                              for i, target in enumerate(y)])

        # A prediction counts as correct when the rounded probability equals
        # the label.
        accuracies += np.sum(np.squeeze(np.round(z)) == y)
        losses += l.mean()

        dl = np.array([dcross_entropy(z[i], target).tolist()
                                                for i, target in enumerate(y)])

        # Chain rule: dL/dlogits = dL/dz * sigmoid'(logits). dsigmoid() applies
        # the sigmoid to its argument itself, so it must receive the logits;
        # passing the already-activated z would apply the sigmoid twice.
        dlogits = dl * dsigmoid(logits)

        # Gradients summed over the samples of the batch. The bias gradient is
        # reduced to shape (1, 1) so that every entry of `b` receives the same
        # update, independent of a sample's position inside the batch.
        dw = X.T @ dlogits
        db = dlogits.sum(axis=0, keepdims=True)

        w -= lr * dw
        b -= lr * db

    total_losses.append(losses / len(batches))
    # The batches partition the training rows, so the number of samples seen
    # in one epoch is len(X_train).
    total_accuracies.append(accuracies / len(X_train))

    if((epoch+1) % 10 == 0):
        print("Epoch:{:3d}, Loss:{:1.3f}, Accuracy:{:1.3f}"
                 .format(epoch+1, total_losses[-1], total_accuracies[-1]))

<h1 id="analysis" style="color:#5f20aa; background:white; border:0.5px dotted #5f20aa;"> 
    <center>Analysis
        <a class="anchor-link" href="#analysis" target="_self">¶</a>
    </center>
</h1>

In [None]:
# Mean training loss per epoch.
plt.figure(figsize=(14, 8))
plt.plot(total_losses, label='Losses')
plt.title("Train Losses")
plt.ylabel('Loss')
plt.xlabel('Epochs')
plt.show()

In [None]:
# Training accuracy per epoch.
plt.figure(figsize=(14, 8))
plt.plot(total_accuracies, label='Accuracy')
plt.title("Train Accuracy")
plt.ylabel('Accuracy')
plt.xlabel('Epochs')
plt.show()

## Test dataset accuracy

In [None]:
def get_test_accuracy(X_test, y_test, batch_size=8):
    """Predict the test samples batch by batch and measure the accuracy.

    Uses the module-level parameters `w` and `b` and the `sigmoid` function.
    The last sample of X_test is always left out, and only complete batches
    of `batch_size` samples taken from the remaining rows are evaluated.

    Args:
        X_test: 2-D array of test features, one row per sample.
        y_test: Array of 0/1 labels aligned with the rows of X_test.
        batch_size: Number of samples per forward pass. Defaults to 8.

    Returns:
        A tuple (y_hat, total_accuracy): y_hat is the list of rounded
        predictions for the evaluated samples, in order, and total_accuracy
        is the fraction of those predictions equal to the label. When fewer
        than `batch_size` usable samples are available, the result is
        ([], nan).
    """
    # The final sample is dropped before batching.
    n_usable = len(X_test[:-1])
    n_batches = n_usable // batch_size
    if n_batches == 0:
        # Nothing to evaluate; avoid an index error on an empty result.
        return [], float('nan')

    n_evaluated = n_batches * batch_size

    probabilities = []
    for start in range(0, n_evaluated, batch_size):
        X = X_test[start:start + batch_size]
        probabilities.append(sigmoid(X @ w + b))

    # Flatten the per-batch (batch_size, 1) outputs into one vector of rounded
    # predictions.
    predictions = np.round(np.concatenate(probabilities).ravel())

    # Divide by the number of samples actually evaluated, so the accuracy
    # stays correct when the usable sample count is not a multiple of
    # `batch_size`.
    correct = np.sum(predictions == y_test[:n_evaluated])
    total_accuracy = correct / n_evaluated

    return list(predictions), total_accuracy

In [None]:
# Predict the held-out samples and report the fraction classified correctly.
y_hat, test_accuracy = get_test_accuracy(X_test, y_test)
print(f'Accuracy:{test_accuracy:1.3f}')

## Classification metrics

In [None]:
# Display names for the classes in label order: 0 -> benign, 1 -> malignant.
target_names = ['benign', 'malignant']
# The last test label is dropped so that the labels line up with `y_hat`.
print(classification_report(y_test[:-1], y_hat, target_names=target_names))