In [1]:
import numpy as np  # for handling multi-dimensional array operation
import pandas as pd  # for reading data from csv
#import statsmodels.api as sm  # for finding the p-value
from sklearn.preprocessing import MinMaxScaler  # for normalization
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle

In [2]:
# Pre-processing: read data/data.csv, encode the label column numerically,
# rescale the feature columns, append a constant column and split the rows
# into train and test sets.
data = pd.read_csv('data/data.csv')

# Encode the 'diagnosis' labels: 'M' -> 1, 'B' -> -1 (any other value maps to NaN).
diagnosis_map = {'M': 1, 'B': -1}
data = data.assign(diagnosis=data['diagnosis'].map(diagnosis_map))

# Drop the last and the first column (positions -1 and 0).
# presumably a trailing empty column and a row-id column -- confirm against the CSV
data = data.drop(columns=data.columns[[-1, 0]])

# After the drop, 'diagnosis' is expected at position 0 and everything to its
# right is treated as a feature.
Y = data['diagnosis']

# NOTE(review): the scaler is fit on every row *before* the train/test split,
# so test-set min/max values influence the scaling of the training data.
X_normalized = MinMaxScaler().fit_transform(data.iloc[:, 1:].to_numpy())
X = pd.DataFrame(X_normalized)

# Constant column of ones appended as the last column so the bias term is
# learned as an ordinary weight.
X['intercept'] = 1

print("splitting dataset into train and test sets...")
X_train, X_test, y_train, y_test = tts(X, Y, test_size=0.2, random_state=42)

splitting dataset into train and test sets...


In [58]:
def compute_cost(W, X, Y, reg_strength=10000):
    """Return the regularised hinge-loss cost of a linear SVM.

    The value computed is

        1/2 * dot(W, W) + reg_strength * mean(max(0, 1 - Y * dot(X, W)))

    Parameters
    ----------
    W : array-like, shape (n_features,)
        Weight vector.
    X : array-like, shape (n_samples, n_features)
        Feature matrix, one sample per row.
    Y : array-like, shape (n_samples,)
        Labels, one per row of ``X`` (the training code uses +1 / -1).
    reg_strength : float, default 10000
        Multiplier applied to the mean hinge loss.

    Returns
    -------
    float
        The scalar cost.
    """
    # Work on plain float arrays so DataFrame/Series inputs are handled
    # positionally instead of by index alignment.
    W = np.asarray(W, dtype=float)
    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float)

    n_samples = X.shape[0]

    # Per-sample hinge term; samples whose margin is already satisfied
    # (1 - y * score <= 0) contribute nothing.
    hinge_terms = np.maximum(0, 1 - Y * np.dot(X, W))
    hinge_loss = reg_strength * (np.sum(hinge_terms) / n_samples)

    return 1 / 2 * np.dot(W, W) + hinge_loss

def calculate_grad_descent(x, y, W, reg_strength=10000):
    """Return the (sub)gradient of the regularised hinge-loss cost.

    For every sample the contribution is ``W`` when its margin is satisfied
    (``1 - y_i * dot(x_i, W) <= 0``) and ``W - reg_strength * y_i * x_i``
    otherwise; the contributions are averaged over the samples.

    Parameters
    ----------
    x : array-like, shape (n_features,) or (n_samples, n_features)
        A single sample or a batch of samples (one per row).
    y : scalar or array-like, shape (n_samples,)
        Label(s) matching ``x``. Any scalar numeric type is accepted.
    W : array-like, shape (n_features,)
        Current weight vector.
    reg_strength : float, default 10000
        Multiplier applied to the hinge-loss part of the gradient.

    Returns
    -------
    numpy.ndarray, shape (n_features,)
        The averaged gradient.
    """
    # Normalise to batch form: a single sample becomes a batch of one.
    x = np.atleast_2d(np.asarray(x, dtype=float))
    y = np.atleast_1d(np.asarray(y, dtype=float))
    W = np.asarray(W, dtype=float)

    distance = 1 - y * np.dot(x, W)

    # Only samples that violate the margin add a hinge term; each one uses
    # its own row of x and its own label.
    violating = distance > 0
    hinge_part = np.dot(y[violating], x[violating])

    # Summing W over all samples and dividing by the sample count leaves W.
    return W - reg_strength * hinge_part / len(y)

def sgd(features, outputs, max_iters=5000, learning_rate=0.000001, random_state=None):
    """Fit linear-SVM weights with per-sample stochastic gradient descent.

    Each epoch reshuffles the samples, then updates the weights once per
    sample using ``calculate_grad_descent``.

    Parameters
    ----------
    features : numpy.ndarray, shape (n_samples, n_features)
        Training samples, one per row.
    outputs : numpy.ndarray, shape (n_samples,)
        Labels, indexed positionally alongside ``features``.
    max_iters : int, default 5000
        Number of epochs (full passes over the data) to run.
    learning_rate : float, default 0.000001
        Step size applied to every gradient.
    random_state : int or None, default None
        Seed for the per-epoch shuffling. ``None`` keeps the shuffling
        driven by numpy's global random state.

    Returns
    -------
    numpy.ndarray, shape (n_features,)
        The learned weight vector.
    """
    # One generator shared by all epochs, so a fixed seed still yields a
    # different permutation each epoch.
    rng = None if random_state is None else np.random.RandomState(random_state)

    weights = np.zeros(features.shape[1])
    # range(max_iters) runs exactly max_iters epochs.
    for _ in range(max_iters):
        X, Y = shuffle(features, outputs, random_state=rng)
        for i, x in enumerate(X):
            gradient = calculate_grad_descent(x, Y[i], weights)
            weights = weights - learning_rate * gradient

    return weights
# Train on the training split (converted to plain numpy arrays) and show the
# learned weight vector.
print("training started...")
W = sgd(features=X_train.to_numpy(), outputs=y_train.to_numpy())
print("training finished.")
print(f"weights are: {W}")



training started...
training finished.
weights are: [ 1.32675592  0.85567382  1.1264457   2.16314214 -1.25509301 -3.24777489
  3.29595742  6.82810493 -0.44666315  0.10182917  5.68928993 -1.91664286
  3.27402454  3.77409124  1.67948736 -2.44641706 -1.75093224  0.8523407
 -1.97762148 -1.84565566  2.70150103  5.34595067  1.03982663  3.08100613
  2.21475458 -0.64032647  2.67626187  0.02001338  4.66654438  2.164874
 -9.26857721]
