![](https://www.foxchase.org/sites/fccc/files/breast-cancer-awareness.jpg)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the Breast Cancer Wisconsin CSV from the Kaggle input directory into a DataFrame.
cancer_dataset = pd.read_csv("/kaggle/input/breast-cancer-wisconsin-data/data.csv")

## Dataset Information

* Dataset Characteristics: Multivariate
* Attribute Characteristics: Real
* Associated Tasks: Classification
* Number of Instances: 569
* Number of Attributes: 32
* Missing Values: No

## Column Names and Meanings
* id: ID number
* diagnosis: The diagnosis of breast tissues (M = malignant, B = benign)
* radius_mean: mean of distances from center to points on the perimeter
* texture_mean: standard deviation of gray-scale values
* perimeter_mean: mean size of the core tumor
* area_mean: area of the tumor
* smoothness_mean: mean of local variation in radius lengths
* compactness_mean: mean of perimeter^2 / area - 1.0
* concavity_mean: mean of severity of concave portions of the contour
* concave_points_mean: mean for number of concave portions of the contour
* symmetry_mean
* fractal_dimension_mean: mean for "coastline approximation" - 1
* radius_se: standard error for the mean of distances from center to points on the perimeter
* texture_se: standard error for standard deviation of gray-scale values
* perimeter_se
* area_se
* smoothness_se: standard error for local variation in radius lengths
* compactness_se: standard error for perimeter^2 / area - 1.0
* concavity_se: standard error for severity of concave portions of the contour
* concave_points_se: standard error for number of concave portions of the contour
* symmetry_se
* fractal_dimension_se: standard error for "coastline approximation" - 1
* radius_worst: "worst" or largest mean value for mean of distances from center to points on the perimeter
* texture_worst: "worst" or largest mean value for standard deviation of gray-scale values
* perimeter_worst
* area_worst
* smoothness_worst: "worst" or largest mean value for local variation in radius lengths
* compactness_worst: "worst" or largest mean value for perimeter^2 / area - 1.0
* concavity_worst: "worst" or largest mean value for severity of concave portions of the contour
* concave_points_worst: "worst" or largest mean value for number of concave portions of the contour
* symmetry_worst
* fractal_dimension_worst: "worst" or largest mean value for "coastline approximation" - 1

In [None]:
cancer_dataset

![](https://miro.medium.com/max/4000/0*0XRrnsr7h5hebu8r.png)

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
cancer_dataset.info()

Since we do not need the "id" and "Unnamed: 32" columns, I am going to drop them.

In [None]:
# Remove the "id" and "Unnamed: 32" columns, which are not used as features.
# Re-binding the name (instead of inplace=True) together with errors="ignore"
# keeps this cell safe to re-run: a second execution no longer raises KeyError
# because the columns are already gone.
cancer_dataset = cancer_dataset.drop(columns=["id", "Unnamed: 32"], errors="ignore")

In [None]:
cancer_dataset

In the "diagnosis" column, instead of keeping the letters M and B, I will convert them to 1's (malignant) and 0's (benign).

In [None]:
# Encode the target as integers: malignant ("M") -> 1, anything else -> 0.
# Values that are already 1 are also mapped to 1, so re-running this cell is
# harmless (comparing against "M" alone would turn every label into 0 on a
# second run, because the column then holds integers).
cancer_dataset["diagnosis"] = cancer_dataset["diagnosis"].isin(["M", 1]).astype("int64")

In [None]:
# DataFrame.info() prints the summary itself and returns None; calling it
# directly avoids the extra "None" line that print() would add.
cancer_dataset.info()

In [None]:
cancer_dataset

In [None]:
# Feature matrix: every column except the target column "diagnosis".
x = cancer_dataset.drop(columns=["diagnosis"])

In [None]:
type(x)

In [None]:
x

In [None]:
# Target vector: the "diagnosis" column as a NumPy array.
y = cancer_dataset["diagnosis"].to_numpy()

In [None]:
type(y)

In [None]:
y

In [None]:
# Compare the distribution of each "*_mean" feature for malignant vs. benign samples.
# Only the mean features are selected here; taking every column of x and relying
# on the 5x2 grid to stop after the first ten made the name misleading.
features_mean = [col for col in x.columns if col.endswith("_mean")]
dfM = cancer_dataset[cancer_dataset["diagnosis"] == 1]
dfB = cancer_dataset[cancer_dataset["diagnosis"] == 0]
plt.rcParams.update({"font.size": 10})
fig, axes = plt.subplots(nrows = 5, ncols = 2, figsize=(15,20))
axes = axes.ravel()

# zip stops at the shorter sequence, so a mismatch between the number of axes
# and the number of features cannot raise IndexError.
for ax, feature in zip(axes, features_mean):
    # Compute the value range once per feature with vectorised Series methods
    # instead of calling the Python builtins min()/max() repeatedly.
    lowest = cancer_dataset[feature].min()
    highest = cancer_dataset[feature].max()
    # 51 evenly spaced edges -> exactly 50 bins. linspace avoids the float-step
    # rounding that can make np.arange emit one edge too many or too few.
    bin_edges = np.linspace(lowest, highest, 51)
    ax.hist([dfM[feature], dfB[feature]], bins=bin_edges, alpha=0.5, stacked=True, density=True,
            label=["M", "B"], color=["red", "green"])
    ax.legend(loc = "upper right")
    # No unit suffix: features such as area or smoothness are not lengths in mm.
    ax.set_title(feature)
plt.tight_layout()
plt.show()

## Normalization

In [None]:
# Min-max normalization per feature: (x - min(x)) / (max(x) - min(x)).
# DataFrame.min()/max() reduce column-wise by default. np.min(x)/np.max(x)
# forward axis=None to pandas, which on pandas >= 2.0 reduces over both axes and
# returns one scalar (so the trailing `.values` fails and the scaling is no
# longer per column).
x = (x - x.min()) / (x.max() - x.min())

In [None]:
x

As you can see, every value has been rescaled to lie between 0 and 1.

## Creating the Model

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows as a test set (test_size = 0.2); the fixed
# random_state makes the split the same on every run.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size = 0.2, random_state = 42)

![](https://miro.medium.com/max/2908/1*Hs7RCpyvj4NrjANdwiFHaQ@2x.jpeg)

In [None]:
# Report the shape of each of the four splits.
for caption, split in (
    ("xtrain shape:", xtrain),
    ("xtest shape:", xtest),
    ("ytrain shape:", ytrain),
    ("ytest shape:", ytest),
):
    print(caption, split.shape)

In [None]:
# Put features on the rows and samples on the columns for the matrix products below.
xtrain_transpose = xtrain.transpose()

In [None]:
xtrain_transpose.head(10)

In [None]:
# Transpose the remaining splits the same way as xtrain.
# NOTE(review): ytrain/ytest appear to be 1-D arrays (they come from a single
# column's values), in which case transposing leaves their shape unchanged.
xtest_transpose = xtest.transpose()
ytrain_transpose = ytrain.transpose()
ytest_transpose = ytest.transpose()

In [None]:
# Report the shape of each transposed split.
for caption, transposed in (
    ("Shape of the xtrain_transpose:", xtrain_transpose),
    ("Shape of the xtest_transpose:", xtest_transpose),
    ("Shape of the ytrain_transpose:", ytrain_transpose),
    ("Shape of the ytest_transpose:", ytest_transpose),
):
    print(caption, transposed.shape)

## Parameter Initialization and Sigmoid Function

In the following picture, you can see the overall idea. The only difference is we want to predict if the tumor is malignant or benign.

![](https://machinethink.net/images/tensorflow-on-ios/LogisticRegression@2x.png)

In [None]:
def inialize_weights_and_bias(dimension):
    """Build the starting parameters for logistic regression.

    Parameters
    ----------
    dimension : int
        Number of weights to create (one per feature; 30 in this notebook).

    Returns
    -------
    tuple
        ``(w, b)`` where ``w`` is a float array of shape ``(dimension, 1)``
        with every entry equal to 0.01, and ``b`` is the float ``0.0``.
    """
    initial_weights = np.ones((dimension, 1)) * 0.01
    initial_bias = 0.0
    return initial_weights, initial_bias

In [None]:
# Take the number of weights from the data (one per feature row of
# xtrain_transpose) instead of hard-coding 30, so this cell stays consistent
# with the feature matrix if the set of columns ever changes.
w,b = inialize_weights_and_bias(xtrain_transpose.shape[0])
print("w:", w)
print("")
print("b:",b)

### Sigmoid function:

![](https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcT_G9kn6bEs0wdfwt9lH4I5R7ZFHb7RHrUxqQ&usqp=CAU)

In [None]:
def sigmoid_function(z):
    """Logistic sigmoid ``1 / (1 + exp(-z))``, evaluated without overflow.

    Parameters
    ----------
    z : scalar or array-like
        Input value(s).

    Returns
    -------
    Same shape as ``z``
        Element-wise sigmoid; ``sigmoid_function(0)`` is exactly 0.5.

    Notes
    -----
    Evaluating ``np.exp(-z)`` directly overflows (RuntimeWarning) for large
    negative ``z``. Here only ``exp(-|z|)`` is computed, which always lies in
    (0, 1], and the algebraically equivalent form is chosen per sign of ``z``.
    """
    # exp(-|z|) can underflow to 0 but can never overflow.
    decay = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + e^-z)      z < 0: e^z / (1 + e^z)
    # Both share the denominator 1 + exp(-|z|); only the numerator differs.
    numerator = np.where(np.asarray(z) >= 0, 1.0, decay)
    result = numerator / (1.0 + decay)
    return result

In [None]:
sigmoid_function(0) # The result should be 0.5.

## Forward and Backward Propagation

In [None]:
w.shape

In [None]:
xtrain_transpose.shape

* w.shape = (30,1)
* xtrain_transpose.shape = (30,455)

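In order to do the matrix multiplication, we need to take the transpose of the weight vector: w.T has shape (1,30), so multiplying it by xtrain_transpose (30,455) gives one value per training sample, i.e. a result of shape (1,455).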

In [None]:
def forward_and_backward_propagation(w, b, xtrain_transpose, ytrain_transpose):
    """Run one forward and one backward pass of logistic regression.

    Parameters
    ----------
    w : ndarray, shape (n_features, 1)
        Current weights.
    b : float
        Current bias.
    xtrain_transpose : array-like, shape (n_features, n_samples)
        Training features with one column per sample.
    ytrain_transpose : array-like
        0/1 labels, broadcastable against an array of shape (1, n_samples).

    Returns
    -------
    cost_function : float
        Mean binary cross-entropy over the samples.
    gradients : dict
        ``"derivative_weight"`` (shape (n_features, 1)) and
        ``"derivative_bias"`` (scalar): gradients of the cost.
    """
    # Number of samples; every sum below is divided by it to get a mean.
    sample_count = xtrain_transpose.shape[1]

    # ---------- Forward propagation -----------
    # z = wT . xtrain + b
    z = np.dot(w.T, xtrain_transpose) + b
    y_head = sigmoid_function(z)
    # loss = -(y*log(y^) + (1-y)*log(1-y^)), written in terms of z:
    #   -log(y^)   = log(1 + e^-z) = logaddexp(0, -z)
    #   -log(1-y^) = log(1 + e^z)  = logaddexp(0,  z)
    # Taking np.log of a saturated y_head (exactly 0 or 1) would give -inf and
    # turn the cost into NaN through 0 * -inf; logaddexp stays finite.
    loss_function = (ytrain_transpose * np.logaddexp(0, -z)
                     + (1 - ytrain_transpose) * np.logaddexp(0, z))
    cost_function = np.sum(loss_function) / sample_count

    # --------- Backward Propagation ------------
    # Prediction error per sample, reused by both gradients.
    error = y_head - ytrain_transpose
    derivative_weight = np.dot(xtrain_transpose, error.T) / sample_count
    derivative_bias = np.sum(error) / sample_count
    gradients = {"derivative_weight": derivative_weight, "derivative_bias": derivative_bias}

    return cost_function, gradients

## Updating Parameters

In [None]:
def update_parameters(w, b, x_train_transpose, y_train_transpose, learning_rate, iteration_number):
    """Train the weights and bias with batch gradient descent and plot the cost.

    Parameters
    ----------
    w, b
        Initial weights and bias.
    x_train_transpose, y_train_transpose
        Training features and labels, passed unchanged to
        ``forward_and_backward_propagation``.
    learning_rate : float
        Step size applied to each gradient.
    iteration_number : int
        Number of gradient-descent steps to run.

    Returns
    -------
    parameters : dict
        ``{"weight": w, "bias": b}`` after the last update.
    gradients : dict
        Gradients from the last iteration (empty dict if no iteration ran).
    all_costs : list
        Cost recorded at every iteration.

    Side effects: prints the cost every 10th iteration and shows a plot of
    those sampled costs.
    """
    all_costs = []
    each_10_costs = []
    index = []
    # Bound before the loop so the return statement also works when
    # iteration_number is 0 (otherwise `gradients` would be undefined).
    gradients = {}

    for i in range(iteration_number):

        # --------- finding cost and gradient values -----------

        cost, gradients = forward_and_backward_propagation(w, b, x_train_transpose, y_train_transpose)
        all_costs.append(cost)

        # --------- Updating weight and bias ----------

        w = w - learning_rate * gradients["derivative_weight"]
        b = b - learning_rate * gradients["derivative_bias"]

        # Sample the cost every 10th iteration for the log and the plot.
        if i % 10 == 0:
            each_10_costs.append(cost)
            index.append(i)
            print ("Cost after iteration %i: %f" %(i, cost))

    # Learned weights and bias parameters
    parameters = {"weight": w,"bias": b}
    plt.figure(figsize=(10,6))
    plt.plot(index, each_10_costs, color = "orange")
    plt.xticks(index, rotation='vertical')
    plt.xlabel("Iteration number")
    plt.ylabel("Cost")
    plt.show()
    return parameters, gradients, all_costs

## Prediction Part

In [None]:
def predict(w, b, xtest_transpose):
    """Classify each test sample as 0 or 1 with the given weights and bias.

    Parameters
    ----------
    w : ndarray
        Weights; ``w.T`` is multiplied with ``xtest_transpose``.
    b : float
        Bias added to the linear scores.
    xtest_transpose : array-like, shape (n_features, n_samples)
        Test features with one column per sample.

    Returns
    -------
    ndarray, shape (1, n_samples)
        1.0 where the sigmoid output is greater than 0.5, otherwise 0.0.
    """
    probabilities = sigmoid_function(np.dot(w.T, xtest_transpose) + b)

    # probability <= 0.5 -> class 0, probability > 0.5 -> class 1
    # (only the first row of the sigmoid output is thresholded)
    labels = np.where(probabilities[0, :] <= 0.5, 0.0, 1.0)

    return labels.reshape(1, xtest_transpose.shape[1])

## Logistic Regression

In [None]:
def logistic_regression(xtrain_transpose, ytrain_transpose, xtest_transpose, ytest_transpose, learning_rate ,  iteration_number):
    """Train logistic regression on the training split and print test accuracy.

    The model is initialised with ``inialize_weights_and_bias``, trained with
    ``update_parameters`` and evaluated with ``predict``. Nothing is returned;
    the accuracy (in percent) on the test split is printed.
    """
    # One weight per feature row of the transposed training matrix.
    feature_count = xtrain_transpose.shape[0]
    initial_w, initial_b = inialize_weights_and_bias(feature_count)

    learned, _last_gradients, _cost_history = update_parameters(
        initial_w, initial_b, xtrain_transpose, ytrain_transpose, learning_rate, iteration_number
    )

    test_prediction = predict(learned["weight"], learned["bias"], xtest_transpose)

    # The mean absolute difference between 0/1 predictions and labels is the error rate.
    error_rate = np.mean(np.abs(test_prediction - ytest_transpose))
    print("test accuracy: {} %".format(100 - error_rate * 100))

In [None]:
# Train on the training split for 300 iterations with learning rate 1 and
# print the accuracy on the held-out test split.
logistic_regression(xtrain_transpose, ytrain_transpose, xtest_transpose, ytest_transpose, learning_rate = 1, iteration_number = 300)    