In [2]:
import numpy as np  # for handling multi-dimensional array operation
import pandas as pd  # for reading data from csv
#import statsmodels.api as sm  # for finding the p-value
from sklearn.preprocessing import MinMaxScaler  # for normalization
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle

In [3]:
# normalizing and pre processing
data = pd.read_csv('data/data.csv')
diagnosis_map = {'M':1, 'B':-1}
data['diagnosis'] = data['diagnosis'].map(diagnosis_map)
data.drop(data.columns[[-1, 0]], axis=1, inplace=True)

Y = data.loc[:, 'diagnosis']  # all rows of 'diagnosis'
X = data.iloc[:, 1:]  # all rows of column 1 and ahead (features)

X_normalized = MinMaxScaler().fit_transform(X.values)
X = pd.DataFrame(X_normalized)

X.insert(loc=len(X.columns), column='intercept', value=1)

print("splitting dataset into train and test sets...")
X_train, X_test, y_train, y_test = tts(X, Y, test_size=0.2, random_state=42)

splitting dataset into train and test sets...


In [4]:
# trainnnnnnnnnnnnnnnnn
def compute_cost(W,X,Y):
    N=X.shape(0)
    reg_strength = 10000
    distances=1-(Y*(np.dot(W,X)))
    print(distances[distances<0])
    distances[distances < 0] = 0
    hinge_loss = reg_strength * (np.sum(distances) / N)
    cost = 1 / 2 * np.dot(W, W) + hinge_loss
    return cost

def calculate_grad_descent(x,y,W):

    reg_strength = 10000
    if (type(y) == np.int64 or type(y)==np.float64) :
        y = np.array([y])
        x = np.array([x])

    distance=1-(y*np.dot(x,W))
    derivative=np.zeros(len(W))

    for index,dist in enumerate(distance):
        if max(0,dist)==0:
            der=W
        else:
            der=W-(reg_strength*x*y)
        # print(np.shape(derivative))
        # print(np.shape(der))

        derivative+=np.squeeze(der)

    derivative=derivative/len(y)
    return derivative

def sgd(features,outputs):
    max_iters=5000

    weights = np.zeros(features.shape[1])
    for iters in range(1,max_iters):
        X,Y=shuffle(features,outputs)
        for i,x in enumerate(X):
            ascent=calculate_grad_descent(x,Y[i],weights)
            weights=weights-(0.000001*ascent)

    return weights
print("training started...")
W = sgd(X_train.to_numpy(), y_train.to_numpy())
print("training finished.")
print("weights are: {}".format(W))


training started...
training finished.
weights are: [ 1.33185938  0.82783102  1.13078309  2.16265842 -1.22177303 -3.24353713
  3.27245874  6.8289059  -0.46963669  0.0950705   5.67581027 -1.90599812
  3.27032199  3.7674242   1.6760621  -2.4323425  -1.76603537  0.83952671
 -1.95902488 -1.85011806  2.70002631  5.33487556  1.04062417  3.07882795
  2.22980385 -0.61909835  2.6648575   0.01463418  4.66134113  2.17935934
 -9.28848196]


In [19]:
# modell results

y_test_predicted=np.array([])
for i,d in enumerate(y_test):
    yp=np.sign(np.dot(W,X_test.to_numpy()[i]))
    y_test_predicted=np.append(y_test_predicted,yp)

print("accuracy on test dataset: {}".format(accuracy_score(y_test.to_numpy(), y_test_predicted)))
#print("recall on test dataset: {}".format(recall_score(y_test.to_numpy(), y_test_predicted)))
#print("precision on test dataset: {}".format(recall_score(y_test.to_numpy(), y_test_predicted)))

accuracy on test dataset: 0.9736842105263158
