In [111]:
import numpy as np
import matplotlib.pyplot as plt
import scipy
import sklearn.datasets
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import train_test_split

%matplotlib inline

In [112]:
cancer_data = sklearn.datasets.load_breast_cancer()

In [113]:
cancer_data.data[:1]

array([[  1.79900000e+01,   1.03800000e+01,   1.22800000e+02,
          1.00100000e+03,   1.18400000e-01,   2.77600000e-01,
          3.00100000e-01,   1.47100000e-01,   2.41900000e-01,
          7.87100000e-02,   1.09500000e+00,   9.05300000e-01,
          8.58900000e+00,   1.53400000e+02,   6.39900000e-03,
          4.90400000e-02,   5.37300000e-02,   1.58700000e-02,
          3.00300000e-02,   6.19300000e-03,   2.53800000e+01,
          1.73300000e+01,   1.84600000e+02,   2.01900000e+03,
          1.62200000e-01,   6.65600000e-01,   7.11900000e-01,
          2.65400000e-01,   4.60100000e-01,   1.18900000e-01]])

In [114]:
cancer_data.target[:50]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 1])

In [115]:
def load_data(data, split_ratio = 0.2):
    x = data['data']
    y = data['target']
    return train_test_split(x,y,test_size=split_ratio)

In [116]:
np.random.seed(0)
train_x,test_x,train_y,test_y = load_data(cancer_data, 0.2)

In [117]:
train_x.shape

(455L, 30L)

In [118]:
def initialize_weights(fs):
    #weights = np.zeros(fs)
    return np.zeros(fs)

In [119]:
def log_likelihood(features, weights, target):
    scores = np.dot(features, weights)
    ll = np.sum( target*scores - np.log(1 + np.exp(scores)) )
    return ll

In [120]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

In [121]:
def logistic_regression(features, target, steps=100000, learning_rate = 0.0005):
    
    weights = initialize_weights(features.shape[1])
    
    for i in xrange(steps):
        scores = np.dot(features, weights)
        prediction = sigmoid(scores)
        error = target - prediction
        
        gradient = np.dot(np.transpose(features), error)
        weights += learning_rate * gradient
        
        if(i % 10000 == 0):
            print(log_likelihood(features,weights,target))
            print(gradient)
            
    weights = weights.reshape(weights.shape[0],1)
    return weights

In [122]:
def predict(weights, x = test_x):
    pred = []
    py = np.dot(x,w)
    print(py.shape)
    for p in py:
        if p > 0:
            pred.append(1)
        else:
            pred.append(0)
    print(len(pred))
    return np.array(pred)

In [123]:
w = logistic_regression(train_x,train_y,1000000,0.0005)
pred_test_y = predict(w)
pred_train_y = np.array(predict(w,train_x))

-3975676.05749
[  3.22493500e+02   8.25110000e+02   1.82598000e+03  -1.38818500e+04
   4.93851500e+00  -2.97585000e-01  -6.27954815e+00  -3.49949900e+00
   9.32765000e+00   3.95440500e+00  -9.81310000e+00   7.77054500e+01
  -6.88573500e+01  -3.06708200e+03   4.94762000e-01   5.13441500e-01
   5.84828800e-01   2.35450500e-01   1.30863000e+00   1.96678150e-01
   1.93002000e+02   9.93565000e+02   9.47735000e+02  -3.77194000e+04
   6.20018500e+00  -4.28508000e+00  -1.18901015e+01  -4.17178450e+00
   1.24028500e+01   3.98214000e+00]


  from ipykernel import kernelapp as app
  app.launch_new_instance()


-inf
[  4.37316000e+02   5.66080000e+02   2.79228000e+03   1.93143000e+04
   2.50858000e+00   1.56616000e+00   4.44849000e-01   5.38106000e-01
   4.69830000e+00   1.70885000e+00   1.14647000e+01   3.51220000e+01
   7.20140000e+01   1.00249000e+03   1.50112000e-01   4.38300000e-01
   4.53162000e-01   2.33957000e-01   4.74929000e-01   8.73908000e-02
   4.97890000e+02   7.43390000e+02   3.17601000e+03   2.48231000e+04
   3.24219000e+00   2.97642000e+00   2.24250000e+00   1.77283000e+00
   7.12940000e+00   2.01503000e+00]
-inf
[  4.23876000e+02   5.58380000e+02   2.70811000e+03   1.87481000e+04
   2.44232000e+00   1.55748000e+00   4.67350000e-01   5.54610000e-01
   4.57320000e+00   1.65611000e+00   1.10469000e+01   3.50247000e+01
   6.94730000e+01   9.66560000e+02   1.46691000e-01   4.33259000e-01
   4.51401000e-01   2.33960000e-01   4.60649000e-01   8.51038000e-02
   4.82800000e+02   7.37840000e+02   3.08139000e+03   2.41042000e+04
   3.17195000e+00   2.96575000e+00   2.34336000e+00   1.8

In [126]:
print("Test Accuracy ", accuracy_score(pred_test_y,test_y) * 100 , '%')
print("Train Accuracy ", accuracy_score(train_y, pred_train_y) * 100 , '%')

('Test Accuracy ', 95.614035087719301, '%')
('Train Accuracy ', 94.285714285714278, '%')
