In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

data = pd.read_csv("data.csv")
data.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


We are dropping columns - 'id' and 'Unnamed: 32' as they have no role in prediction

data['diagnosis'].map(): Converts the diagnosis column, which contains 'M' (Malignant) and 'B' (Benign) into binary values. 'M' is converted to 1 and 'B' to 0 making it suitable for logistic regression.

In [2]:
# Processing Dataset
data.drop(['Unnamed: 32', 'id'], axis=1, inplace=True)
data['diagnosis'] = data['diagnosis'].map({'M': 1, 'B': 0})

# Input and Output data
y = data['diagnosis'].values
x_data = data.drop(['diagnosis'], axis=1)

In [4]:
# Normalization
x = (x_data - x_data.min()) / (x_data.max() - x_data.min())

# Splitting data for training and testing.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size = 0.15, random_state = 42)

# x_train= x_train.T: Transpose(T) to ensure that data has correct shape for matrix operations during the logistic regression.
x_train = x_train.T
x_test = x_test.T
y_train = y_train.T
y_test = y_test.T

print("x train: ", x_train.shape)
print("x test: ", x_test.shape)
print("y train: ", y_train.shape)
print("y test: ", y_test.shape)

x train:  (30, 483)
x test:  (30, 86)
y train:  (483,)
y test:  (86,)


In [8]:
# Initializing Model Architecture


#Initializing Weight and bias
def initialize_weights_and_bias(dimension):
    w = np.random.randn(dimension, 1) * 0.01  
    b = 0.0
    return w, b

# Sigmoid Function to calculate z value.
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

# Forward-Backward Propagation
def forward_backward_propagation(w, b, x_train, y_train):
    m = x_train.shape[1]
    z = np.dot(w.T, x_train) + b
    y_head = sigmoid(z)
    
   
    cost = (-1/m) * np.sum(y_train * np.log(y_head) + (1 - y_train) * np.log(1 - y_head))
    
    derivative_weight = (1/m) * np.dot(x_train, (y_head - y_train).T)
    derivative_bias = (1/m) * np.sum(y_head - y_train)
    
    gradients = {"derivative_weight": derivative_weight, "derivative_bias": derivative_bias}
    return cost, gradients

# Updating Parameters

# Weight(w) and Bias(b) are updated by subtracting the gradient scaled by the learning rate.
def update(w, b, x_train, y_train, learning_rate, num_iterations):
    costs = []
    gradients = {}
    for i in range(num_iterations):
        cost, grad = forward_backward_propagation(w, b, x_train, y_train)
        w -= learning_rate * grad["derivative_weight"]
        b -= learning_rate * grad["derivative_bias"]

        if i % 100 == 0:
            costs.append(cost)
            print(f"Cost after iteration {i}: {cost}")

    parameters = {"weight": w, "bias": b}
    return parameters, gradients, costs

Forward-Backward Propagation

np.dot(w.T, x_train): Computes the matrix multiplication of the weights and the input data.

cost = (-1/m) * np.sum(y_train * np.log(y_head) + (1 - y_train) * np.log(1 - y_head)): Measures the difference between the predicted probability (y_head) and true label (y_train).

derivative_weight = (1/m) * np.dot(x_train, (y_head - y_train).T): This calculates the gradient of the cost with respect to the weights w. It tells us how much we need to change the weights to reduce the cost.

derivative_bias = (1/m) * np.sum(y_head - y_train): This computes the gradient of the cost with respect to the bias b. It is simply the average of the difference between predicted probabilities (y_head) and actual labels (y_train).

In [9]:
# Making Predictions
def predict(w, b, x_test):
    m = x_test.shape[1]
    y_prediction = np.zeros((1, m))
    z = sigmoid(np.dot(w.T, x_test) + b)

    for i in range(z.shape[1]):
        y_prediction[0, i] = 1 if z[0, i] > 0.5 else 0

    return y_prediction

In [10]:
# Logistic Regression
def logistic_regression(x_train, y_train, x_test, y_test, learning_rate=0.01, num_iterations=1000):
    dimension = x_train.shape[0]
    w, b = initialize_weights_and_bias(dimension)
    parameters, gradients, costs = update(w, b, x_train, y_train, learning_rate, num_iterations)
    
    y_prediction_test = predict(parameters["weight"], parameters["bias"], x_test)
    y_prediction_train = predict(parameters["weight"], parameters["bias"], x_train)
    
    print(f"Train accuracy: {100 - np.mean(np.abs(y_prediction_train - y_train)) * 100}%")
    print(f"Test accuracy: {100 - np.mean(np.abs(y_prediction_test - y_test)) * 100}%")

logistic_regression(x_train, y_train, x_test, y_test, learning_rate=0.01, num_iterations=1000)

Cost after iteration 0: 0.6940653448672531
Cost after iteration 100: 0.6664684710926668
Cost after iteration 200: 0.6419426094106131
Cost after iteration 300: 0.6195184317848816
Cost after iteration 400: 0.5988649785711659
Cost after iteration 500: 0.5797903540476342
Cost after iteration 600: 0.5621407317886556
Cost after iteration 700: 0.5457801712896718
Cost after iteration 800: 0.5305863581910254
Cost after iteration 900: 0.5164492398966638
Train accuracy: 90.47619047619048%
Test accuracy: 88.37209302325581%
