In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.cross_validation import cross_val_score, train_test_split



# Data Loading

In [2]:
# Read the WDBC file straight from the UCI repository. The file has no header
# row, so pandas labels the columns with integers (0, 1, 2, ...).
data = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/'
    'breast-cancer-wisconsin/wdbc.data',
    header=None,
)

# Normalizing the Features and Encoding the Target Variable

In [3]:
# Min-max scale every feature column (position 2 onward) into the [0, 1] range.
# NOTE(review): the min/max are taken over the full dataset, i.e. before the
# train/test split performed later in the notebook.
data.iloc[:, 2:] = data.iloc[:, 2:].apply(
    lambda column: (column - column.min()) / (column.max() - column.min())
)

In [4]:
# Encode the label held in column 1: Malignant ('M') becomes 0.
data.loc[data[1].eq('M'), 1] = 0

In [5]:
data.loc[data[1]=='B',1] = 1 # mapping Benign to 1
# The label column was read as strings, so after the in-place replacements it
# can still carry object dtype. Cast it explicitly so the arithmetic in the
# training loop works on real integers; this also fails loudly if any
# unmapped label is left in the column.
data[1] = data[1].astype(int)

In [6]:
# Class counts after encoding. The recorded run shows 357 ones (benign) against
# 212 zeros (malignant) -- roughly a 63/37 split, so mildly imbalanced rather
# than balanced.
data.loc[:, 1].value_counts()

1    357
0    212
Name: 1, dtype: int64

# Test Train Split

In [7]:
# Hold out 20% of the rows for testing. A fixed seed keeps the split -- and
# therefore the accuracies computed from it -- identical across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    data.loc[:, 2:], data[1], test_size=0.2, random_state=42)

# Defining the Activation, Score and Log-Likelihood Functions

In [8]:
def activation(logistic_score):
    """Logistic sigmoid, 1 / (1 + exp(-score)), evaluated without overflow.

    Parameters
    ----------
    logistic_score : scalar or array-like supporting NumPy ufuncs
        Linear score(s) to squash.

    Returns
    -------
    Same kind of object as the input (scalar in, scalar out; array in,
    array out), with every value in [0, 1].
    """
    # Both exponentials take a non-positive argument, so neither can overflow:
    #   score >= 0 -> exp(0)     / (1 + exp(-score)) = 1 / (1 + exp(-score))
    #   score <  0 -> exp(score) / (1 + exp(score))
    # The naive form calls exp(-score), which overflows for very negative scores.
    numerator = np.exp(np.minimum(logistic_score, 0))
    denominator = 1 + np.exp(-np.abs(logistic_score))
    return numerator / denominator

In [9]:
def logistic_score(ind_variables, coefficients):
    """Return the linear predictor: the dot product of the inputs with the weights.

    Parameters
    ----------
    ind_variables : array-like
        Independent variables, passed as the first argument to ``np.dot``.
    coefficients : array-like
        Weights, passed as the second argument to ``np.dot``.
    """
    return np.dot(ind_variables, coefficients)

In [10]:
def log_likelihood(ind_variables, coefficients, dep_variable):
    """Log-likelihood of the labels under the logistic model.

    Computes ``sum(y * s - log(1 + exp(s)))`` where ``s`` is the linear score
    returned by ``logistic_score(ind_variables, coefficients)``.

    Parameters
    ----------
    ind_variables : array-like
        Independent variables, one row per observation.
    coefficients : array-like
        Current weights.
    dep_variable : array-like
        Labels ``y``, one per observation.

    Returns
    -------
    The summed log-likelihood as a scalar.
    """
    # Compute the score once; the original evaluated it twice per call.
    scores = logistic_score(ind_variables, coefficients)
    # logaddexp(0, s) == log(exp(0) + exp(s)) == log(1 + exp(s)), but it does
    # not overflow to inf when s is large.
    ll = np.sum(dep_variable * scores - np.logaddexp(0, scores))
    return ll

# Logistic Regression Function Definition

In [11]:
def logistic_regression(ind_variables, dep_variable, num_steps, learning_rate,
                        log_every=10000):
    """Fit logistic-regression weights by batch gradient ascent.

    Parameters
    ----------
    ind_variables : array-like, 2-D
        Independent variables, one row per observation. No intercept column is
        added here; include one in the input if a bias term is wanted.
    dep_variable : array-like, 1-D
        Numeric labels, one per row of ``ind_variables``.
    num_steps : int
        Number of gradient steps to take.
    learning_rate : float
        Step size applied to the summed (not averaged) gradient.
    log_every : int or None, default 10000
        Print the log-likelihood every ``log_every`` steps (including step 0).
        Pass ``None`` or ``0`` to silence the progress output.

    Returns
    -------
    numpy.ndarray
        Fitted coefficients, one per column of ``ind_variables``.
    """
    # Convert once, outside the loop. This keeps every iteration in plain NumPy
    # (no DataFrame transpose per step) and turns object-dtype labels into
    # floats, which the in-place update of the float coefficients requires.
    features = np.asarray(ind_variables, dtype=float)
    targets = np.asarray(dep_variable, dtype=float)

    coefficients = np.zeros(features.shape[1])

    for step in range(num_steps):
        logistic_scores = logistic_score(features, coefficients)
        predictions = activation(logistic_scores)

        # X^T (y - p) is the gradient of the log-likelihood with respect to the
        # coefficients, so adding it moves the weights uphill.
        output_error_signal = targets - predictions
        gradient = np.dot(features.T, output_error_signal)
        coefficients += learning_rate * gradient

        # Periodic progress report (evaluated after this step's update).
        if log_every and step % log_every == 0:
            print(log_likelihood(features, coefficients, targets))

    return coefficients

# Training and accuracy report

In [12]:
# Fit the model on the training split. The call prints the log-likelihood
# periodically so convergence can be followed in the cell output.
coefficients = logistic_regression(
    X_train,
    y_train,
    num_steps=300000,
    learning_rate=5e-5,
)

-315.21755214582754
-133.67893353454517
-112.50483197823763
-100.9645695145076
-93.21512620110084
-87.51998199103748
-83.10457954686657
-79.55338598829337
-76.61811493960617
-74.1394517812425
-72.00982399572293
-70.153724154381
-68.5164840253731
-67.05749335590832
-65.74591830771033
-64.55790046108571
-63.4746695957078
-62.48123973507218
-61.565488191129774
-60.717492336210285
-59.92904357992873
-59.19328555282884
-58.50444087416544
-57.85760210430113
-57.24856988090205
-56.67372620644057
-56.12993424732945
-55.61445835791336
-55.12489969761522
-54.6591439891868


In [13]:
# Score the training rows with the fitted weights, squash the scores through the
# sigmoid and round to 0/1 to obtain hard class predictions.
final_scores = logistic_score(X_train, coefficients)
preds = activation(final_scores).round()

# Accuracy = share of predictions that match the true labels.
print('Final Train Accuracy: {0}'.format(
    (preds == y_train).sum().astype(float) / len(preds)))

Final Train Accuracy: 0.9604395604395605


In [14]:
# Same evaluation on the held-out rows: score, squash, round to 0/1, compare.
final_scores = logistic_score(X_test, coefficients)
preds = activation(final_scores).round()

# Accuracy = share of predictions that match the true labels.
print('Final test Accuracy: {0}'.format(
    (preds == y_test).sum().astype(float) / len(preds)))

Final test Accuracy: 0.9736842105263158
