In [91]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [92]:
!ls

breast-cancer-wisconsin.names.txt cancer_v1.ipynb
cancer.txt


In [93]:
#read file
data = pd.read_csv('cancer.txt', sep=",", header=None)

In [94]:
data.shape

(699, 11)

In [95]:
#remove first column containing ids
data = data.drop(data.columns[0], axis=1)

In [96]:
data.shape

(699, 10)

In [97]:
#remove non-neumerical data
from numpy import NaN
for i in range(data.shape[1]):
    for j in range(data.shape[0]):
        if data.iloc[j,i] == '?':
            data.iloc[j,i] = data[i].mean()

In [98]:
#data = data.fillna(data.mode().iloc[0])
ndata =data.values

In [99]:
ndata.shape
ndata.astype('float32')

array([[ 5.,  1.,  1., ...,  1.,  1.,  2.],
       [ 5.,  4.,  4., ...,  2.,  1.,  2.],
       [ 3.,  1.,  1., ...,  1.,  1.,  2.],
       ...,
       [ 5., 10., 10., ..., 10.,  2.,  4.],
       [ 4.,  8.,  6., ...,  6.,  1.,  4.],
       [ 4.,  8.,  8., ...,  4.,  1.,  4.]], dtype=float32)

In [100]:
x = ndata[:,:9]
y = ndata[:,9]//4
print(np.max(y))
print(np.min(y))
print(x.shape)

1
0
(699, 9)


In [101]:
num_feats = x.shape[1]
x.astype(float)
for i in range(num_feats):
    t = x[:,i]
    t = t.astype('float32')
    print(type(t))
    print(t.shape)
    x[:,i] = (t - np.min(t))/(np.max(t) - np.min(t))
    

<class 'numpy.ndarray'>
(699,)
<class 'numpy.ndarray'>
(699,)
<class 'numpy.ndarray'>
(699,)
<class 'numpy.ndarray'>
(699,)
<class 'numpy.ndarray'>
(699,)
<class 'numpy.ndarray'>
(699,)
<class 'numpy.ndarray'>
(699,)
<class 'numpy.ndarray'>
(699,)
<class 'numpy.ndarray'>
(699,)


In [102]:
#add bias term
b = np.ones((x.shape[0],1))
x = np.hstack((x,b))
print(x.shape)

(699, 10)


In [103]:
xtr,x_test,ytr,y_test = train_test_split(x,y,test_size=0.3)

In [104]:
print(ytr.shape)
print(y_test.shape)
print(xtr.shape)
print(x_test.shape)

(489,)
(210,)
(489, 10)
(210, 10)


In [105]:
#uniform random initialization of weights
w = np.random.uniform(0.0,1.0,x.shape[1]).reshape(x.shape[1],1)
print(w)
print(w.shape)

[[0.4536404 ]
 [0.99889206]
 [0.55295653]
 [0.22606621]
 [0.65013754]
 [0.08335878]
 [0.65023691]
 [0.59916851]
 [0.12351377]
 [0.22036178]]
(10, 1)


In [106]:
#define hyper-parameters
epochs = 20000
a = 0.02

In [107]:
#logistic regression with constant learning rate 
train_acc = []
test_acc = []

m = xtr.shape[0]
n = xtr.shape[1]

mte = x_test.shape[0]

for i in range(epochs):
    z = np.dot(xtr,w)
    z = z.astype('float')
    ybar = 1/(1+np.exp(-z))
    ybar = ybar.flatten()
    
    #cross-entropy loss in training set
    err = -np.sum(((1-ytr)*np.log(1-ybar)+ytr*np.log(ybar)))/m
    
    train_acc.append(err)
    
    #cross-entropy loss in test set
    tmp1 = np.dot(x_test,w)
    tmp1 = tmp1.astype('float')
    tmp = 1/(1+np.exp(-tmp1))
    tmp = tmp.flatten()
    
    term1 = (1.0 - y_test)
    term2 = np.log(1.0 - tmp)
    term3 = y_test
    term4 = np.log(tmp)    
    tmp_err = -np.sum(term1*term2 - term3*term4)/mte
    test_acc.append(tmp_err)
    
    #calculate gradient
    one_y = 1 - ytr
    one_yhat = (1 - ybar)
    
    for j in range(n):
        xcurr = xtr[:,j]
        
        ntmp = (one_y*ybar - one_yhat*ytr)*xcurr
        del_w = np.sum(ntmp)/m 
        w[j] = w[j] - a*del_w

    if(i%500 == 0):
        print("Train Error", err)
        print("Test Error", tmp_err)

Train Error 0.6961533167794901
Test Error 0.6769542365882532
Train Error 0.2878749017378847
Test Error 0.11670476984881625
Train Error 0.21700252684643712
Test Error 0.07063377083721245
Train Error 0.18210925049251814
Test Error 0.05521420929122866
Train Error 0.1615591198056721
Test Error 0.047170050612359356
Train Error 0.14810746096170466
Test Error 0.042270725144645326
Train Error 0.13865437840601127
Test Error 0.039020581601927354
Train Error 0.1316633277763958
Test Error 0.03673585445352488
Train Error 0.12629000615478603
Test Error 0.03505904498947315
Train Error 0.12203367795734385
Test Error 0.0337860621087241
Train Error 0.11857936750161165
Test Error 0.03279265528336349
Train Error 0.11571946052755779
Test Error 0.03199932090016518
Train Error 0.11331179754467464
Test Error 0.0313531179330869
Train Error 0.11125590996594682
Test Error 0.03081762044889922
Train Error 0.10947887177854085
Test Error 0.03036707036758419
Train Error 0.10792652469234457
Test Error 0.02998282507785

In [108]:
x_axis = np.arange(len(test_acc))
import matplotlib.pyplot as plt
%matplotlib notebook
plt.plot(x_axis,train_acc,'g')
plt.plot(x_axis,test_acc,'r')
plt.show()

<IPython.core.display.Javascript object>

In [118]:
tmp = tmp//0.51
tmp = tmp.astype('int')

array([1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1])

In [130]:
d = tmp - y_test
dacc = np.abs(dacc)
acc = np.sum(dacc)/tmp.shape[0]
print(acc)

0.02857142857142857


In [131]:
y_true = list(y_test)
y_pred = list(tmp)
score = f1_score(y_true, y_pred, average='macro')
print(score)

0.967622571692877
