## Breast Cancer ( 1 = Malignant(M) vs 0 = Benign(B) ) 

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

%matplotlib inline

### Breast cancer dataset 
This dataset consists of 10 features, each reported as its mean, standard error and worst-case value (30 feature columns in total).
The first two columns give the ID and the output itself (Malignant or Benign).

**Instruction for data-preprocessing**
- First load the data.csv file to dataframe.
- Then split it into train and test sets in an 80:20 ratio.
- Normalize the data i.e. feature normalization.

In [None]:
# Load the data from data.csv and encode the label: Malignant -> 1, Benign -> 0.
df = pd.read_csv('../input/data.csv')
df['diagnosis'] = df['diagnosis'].map({'M':1,'B':0})

# Complete data set: feature columns 2..31 and the label as an (m, 1) column vector.
X = df[df.columns[2:32]]
Y = df['diagnosis'].values.reshape(-1, 1)

# 80:20 split by row position. Integer arithmetic avoids float rounding.
# The test rows must be the ones NOT used for training; slicing both sets from
# row 0 would evaluate the model on data it has already seen.
n_train = (len(df) * 4) // 5

# train set (first 80% of the rows):
train_X = X.iloc[:n_train]
train_Y = Y[:n_train]

# test set (remaining 20% of the rows):
test_X = X.iloc[n_train:]
test_Y = Y[n_train:]


### Normalization
Let's find the mean and standard deviation of each column, then subtract the mean from each element and divide the result by the standard deviation.

In [None]:
# Standardise both sets with the TRAINING statistics only: the model is fitted
# on that scale, and computing separate statistics on the test set would put
# the two sets on different scales and leak test information into preprocessing.
mean = train_X.mean()
std_error = train_X.std()
# A constant column has zero spread; dividing by 1 instead avoids NaN/inf.
std_error = std_error.replace(0, 1)

#training set:
train_X = (train_X - mean)/std_error

#test set (reuses the training mean and standard deviation):
test_X = (test_X - mean)/std_error

### Shapes of all the sets
**Shapes**
- train_X
- test_X
- train_Y
- test_Y

In [None]:
# Report the shape of every split, one line per set.
print(
    *(
        f"{label} {data.shape}"
        for label, data in (
            ("Shape of train_X", train_X),
            ("Shape of test_X", test_X),
            ("Shape of train_Y", train_Y),
            ("Shape of test_Y", test_Y),
        )
    ),
    sep="\n",
)

### Algorithm for logistic regression
**Steps**
- Initialization of w and b (the code below starts from zeros)
- Forward propagation
- Backward propagation
- Gradient descent

### Sigmoid function
This function computes the hypothesis, i.e. y = a = sigmoid(z),
where z = w^T x + b for a single sample x (for the whole data matrix X the code computes z = Xw + b).

In [None]:
def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), evaluated element-wise.

    Parameters
    ----------
    z : scalar or array-like
        Input value(s); converted to a float numpy array.

    Returns
    -------
    numpy scalar for scalar input, otherwise an ndarray with the shape of `z`.

    Notes
    -----
    The naive form calls exp(-z), which overflows for large negative z.
    Here the exponential is only taken of -|z|, which never exceeds 1, and the
    algebraically equivalent expression is chosen per sign of z.
    """
    z = np.asarray(z, dtype=float)
    e = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z))
    out = np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
    # Indexing with an empty tuple turns a 0-d result back into a scalar and
    # leaves arrays with one or more dimensions unchanged.
    return out[()]

In [None]:
# Quick check of sigmoid on a small integer array (last expression of the cell).
sigmoid(np.array([1,2,3,3]))


### Random initialization of w and b

In [None]:
def random_init(dim):
    """Create the starting parameters for a model with `dim` features.

    Despite the name, nothing is random here: the weights start at zero.

    Returns
    -------
    (w, b) : w is a (dim, 1) array of zeros and b is the integer 0.
    """
    weights = np.zeros(shape=(dim, 1))
    bias = 0
    return weights, bias


In [None]:
# Show the initial (w, b) pair for the number of feature columns in train_X.
random_init(train_X.shape[1])

### Forward and backward propagation
The hypothesis in logistic regression is y = a = sigmoid(z) = sigmoid(w^T x + b).

In [None]:
def propo(w,b,X,Y):
    """Run one forward and backward pass of logistic regression.

    Parameters
    ----------
    w : weights, multiplied as np.dot(X, w) (expected shape (n_features, 1)).
    b : bias added to every row of np.dot(X, w).
    X : feature matrix; X.shape[0] is used as the number of samples m.
    Y : 0/1 labels, expected to have the same shape as np.dot(X, w).

    Returns
    -------
    grad : dict with "dw" (gradient w.r.t. w) and "db" (gradient w.r.t. b).
    cost : mean binary cross-entropy over the m samples.
    """
    m = X.shape[0]

    # forward propagation
    z = np.dot(X,w) + b
    a = sigmoid(z)

    # Binary cross-entropy: -(1/m) * sum(Y*log(a) + (1-Y)*log(1-a)).
    # Both log terms are ADDED inside the negated sum; subtracting the second
    # one makes the reported cost meaningless.
    # Clipping keeps log() finite when a saturates at exactly 0 or 1; it only
    # affects the reported cost, the gradient below uses the unclipped a.
    eps = 1e-15
    a_safe = np.clip(a, eps, 1 - eps)
    cost = -np.sum(Y*np.log(a_safe) + (1-Y)*np.log(1-a_safe))/m

    # backpropagation:
    dz = a-Y
    dw  = np.dot(np.transpose(X),dz)/m
    db = np.sum(dz)/m

    grad = {
        "dw":dw,
        "db":db
    }
    return grad,cost

### Gradient descent over number of iteration

In [None]:
def optim(w,b,X,Y,learning_rate,num_iteration):
    """Fit w and b with batch gradient descent.

    Parameters
    ----------
    w, b : starting weights and bias (the caller's objects are not modified;
        each update rebinds the local names to new values).
    X, Y : training features and labels, passed straight to propo().
    learning_rate : step size applied to both gradients.
    num_iteration : number of gradient-descent steps.

    Returns
    -------
    params : dict with the final "w" and "b".
    grads : dict with the "dw" and "db" computed in the last iteration
        (zeros when no iteration ran).
    costs : list with the cost of every 100th iteration (i = 0, 100, 200, ...).
    """
    costs = []

    # Defined up front so that num_iteration <= 0 returns zero gradients
    # instead of failing with UnboundLocalError when building `grads`.
    dw = np.zeros_like(w)
    db = 0.0

    for i in range(num_iteration):
        grads, cost = propo(w,b,X,Y)

        dw = grads["dw"]
        db = grads["db"]

        # updating w and b
        w = w - learning_rate*dw
        b = b - learning_rate*db

        # Keep only every 100th cost to limit the size of the history.
        if i % 100 == 0:
            costs.append(cost)

    params = {
        "w":w,
        "b":b
    }
    grads = {
        "dw":dw,
        "db":db
    }
    return params,grads,costs

In [None]:
# Starting parameters sized to the number of feature columns.
w, b = random_init(train_X.shape[1])

# Forward pass, backward pass and gradient descent on the training set.
params, grads, costs = optim(
    w,
    b,
    train_X,
    train_Y,
    learning_rate=0.01,
    num_iteration=2000,
)

# Learned parameters, last gradients and the sampled cost history.
print(params, grads, costs, sep="\n")

### Cost vs iteration graph
For checking learning rate

In [None]:
# optim() stores one cost every 100 iterations, so scale the sample index back
# to iteration numbers; otherwise the x-axis would not match its label.
COST_LOG_INTERVAL = 100  # keep in sync with the `i % 100` check in optim()

# atleast_1d keeps a single recorded cost plottable (squeeze would make it 0-d).
costs = np.atleast_1d(np.squeeze(costs))
plt.plot(np.arange(len(costs)) * COST_LOG_INTERVAL, costs)
plt.xlabel('No. of iteration')
plt.ylabel('Cost')
plt.show()

In [None]:
def predict(w,b,X):
    """Return sigmoid(np.dot(X, w) + b), the model output for the rows of X.

    The values are raw sigmoid outputs, not thresholded 0/1 labels.
    """
    linear_part = np.dot(X, w) + b
    return sigmoid(linear_part)

In [None]:
def oneORzero(x):
    """Threshold a value at 0.5: return 0 below it, 1 at or above it.

    A value that satisfies neither comparison (e.g. NaN) yields None.
    """
    if x < 0.5:
        return 0
    if x >= 0.5:
        return 1
    return None


### Prediction accuracy for Train and test set

In [None]:
# Turn the sigmoid outputs into hard 0/1 labels with one vectorised comparison
# (>= 0.5 -> 1, otherwise 0) instead of a Python-level map over single-element
# rows. reshape(-1, 1) keeps the predictions as a column so the subtraction
# below lines up element-wise with the (m, 1) label arrays.

# Predictions for training set:
train_prediction = (predict(params["w"],params["b"],train_X) >= 0.5).astype(int).reshape(-1, 1)

# Predictions for test set:
test_prediction = (predict(params["w"],params["b"],test_X) >= 0.5).astype(int).reshape(-1, 1)

# |prediction - label| is 1 for each mistake, so its mean is the error rate.
print("Training set accuracy = ",(100 - np.mean(np.abs(train_prediction - train_Y))*100))
print("Test set accuracy = ",(100 - np.mean(np.abs(test_prediction - test_Y))*100))