In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [2]:
# data source link:
# https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data?resource=download
data = pd.read_csv("data/archive/data.csv")

In [3]:
data.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [4]:
diagnosis_map = {'M':1, 'B':-1}

In [5]:
data['diagnosis'] = data['diagnosis'].map(diagnosis_map)

In [6]:
data.drop(data.columns[[-1, 0]], axis=1, inplace=True)

In [7]:
data.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [8]:
Y = data.loc[:, 'diagnosis']

In [9]:
X = data.iloc[:, 1:] 

In [10]:
X_normalized = MinMaxScaler().fit_transform(X.values)

In [11]:
X = pd.DataFrame(X_normalized)

In [12]:
from sklearn.model_selection import train_test_split as tts

In [13]:
X_train, X_test, y_train, y_test = tts(X, Y, test_size=0.2, random_state=42)

In [14]:
X_train.shape

(455, 30)

In [15]:
def compute_cost(reg_strength, yi,z, w):
    N = z.shape[0]
    
    hinge_loss = reg_strength * np.maximum(0, 1 - yi*z).sum() / N
    cost = 0.5*np.dot(w,w) + hinge_loss
    return cost

In [16]:
def svm(x,w,b):
    z = np.dot(x,w) + b
    return z

In [17]:
def compute_gradient(x, yi, z, w, b, reg_strength):
    
    dw = np.zeros(x.shape)
    
    db = np.zeros(x.shape[0])
    
    loss_zero_indices = np.where(np.maximum(0, 1 - yi*z)==0)[0]
    loss_non_zero_indices = np.where(np.maximum(0, 1 - yi*z)!=0)[0]
    
    dw[loss_zero_indices] = w
    
    dw[loss_non_zero_indices] = w - x[loss_non_zero_indices]*yi[loss_non_zero_indices].reshape(-1,1)*reg_strength
    
    dw = dw.mean(0)
    
    db[loss_non_zero_indices] = -reg_strength*yi[loss_non_zero_indices]
    
    db = db.mean(0)
    
    return dw,db

In [18]:
def train_loop(X_train,y_train,reg_strength=10000,lr=0.000001,epochs=5000):
    weights = np.zeros(X_train.shape[1])
    bias = np.ones(1)
        
    for epoch in range(epochs):
        
        for i in range(0,len(X_train),32):
            x_batch = X_train[i:i+32]
            y_batch = y_train[i:i+32]
            z = svm(x_batch,weights,bias)
            cost = compute_cost(reg_strength,y_batch, z, weights)
            dw,db = compute_gradient(x_batch,y_batch,z,weights, bias, reg_strength)
        
        if (epoch+1)%500==0:
            
            print(f"epoch = {epoch+1}, cost = {cost}")
        weights = weights - lr*dw
        bias = bias - lr*db
        
    return weights,bias

In [19]:
weights,bias = train_loop(X_train.to_numpy(),y_train.to_numpy())

epoch = 500, cost = 5820.993581594298
epoch = 1000, cost = 4125.597237467136
epoch = 1500, cost = 2752.177066261198
epoch = 2000, cost = 1445.5004584346518
epoch = 2500, cost = 1174.9147426783366
epoch = 3000, cost = 940.0713880113185
epoch = 3500, cost = 705.5165149262159
epoch = 4000, cost = 471.49734362353
epoch = 4500, cost = 237.33959738781363
epoch = 5000, cost = 10.484544130960854


In [21]:
weights

array([ 0.62507405, -0.31629701,  0.64196856,  0.69212837, -0.75902792,
        0.09291731,  1.33596283,  1.77818117, -1.14630078, -0.61472096,
        0.288543  , -0.86811997,  0.27218941,  0.35851856, -0.64145014,
       -0.35831545, -0.06079913, -0.28273895, -0.51663309, -0.22248662,
        1.12345877, -0.16117158,  1.14198226,  0.9324921 , -0.12263141,
        0.50677264,  0.96349698,  2.1439409 , -0.10614843,  0.62026399])

### Training data metrics

In [22]:
z = svm(X_train.to_numpy(),weights,bias)

In [23]:
z[z>=0] = 1

In [24]:
z[z<0] = -1

In [25]:
diagnosis_reverse_map = { 1:'M',  -1:'B'}

In [26]:
predictions = pd.DataFrame(z)[0].map(diagnosis_reverse_map).to_numpy()

In [27]:
y = y_train.map(diagnosis_reverse_map).to_numpy()

In [19]:
from collections import defaultdict
def metrics_calculation(preds, y, labels):
    
    metrics = {}
    for label in labels:
        metrics[label] = defaultdict(int)
        for p,a in zip(preds,y):
            if p==label and a==label:
                metrics[label]["tp"]+=1
            if p==label and a!=label:
                metrics[label]["fp"]+=1
            if p!=label and a==label:
                metrics[label]["fn"]+=1
            if p!=label and a!=label:
                metrics[label]["tn"]+=1
                
        
        try:
            metrics[label]["precision"] = metrics[label]["tp"]/(metrics[label]["tp"]+metrics[label]["fp"])
        except:
            metrics[label]["precision"] = 0
        try:
            metrics[label]["recall"] = metrics[label]["tp"]/(metrics[label]["tp"]+metrics[label]["fn"])
        except:
            metrics[label]["recall"] = 0
        try:
            metrics[label]["f1-score"] = 2*metrics[label]["precision"]*metrics[label]["recall"]/(metrics[label]["precision"]+metrics[label]["recall"])
        except:
            metrics[label]["f1-score"] = 0
    
    return metrics
    

In [29]:
metrics_calculation(predictions,y,["M","B"])

{'M': defaultdict(int,
             {'tn': 258,
              'tp': 158,
              'fp': 28,
              'fn': 11,
              'precision': 0.8494623655913979,
              'recall': 0.9349112426035503,
              'f1-score': 0.8901408450704226}),
 'B': defaultdict(int,
             {'tp': 258,
              'tn': 158,
              'fn': 28,
              'fp': 11,
              'precision': 0.9591078066914498,
              'recall': 0.9020979020979021,
              'f1-score': 0.9297297297297297})}

### Test data metrics

In [30]:
z = svm(X_test.to_numpy(),weights,bias)

In [31]:
z[z>=0] = 1

In [32]:
z[z<0] = -1

In [33]:
predictions = pd.DataFrame(z)[0].map(diagnosis_reverse_map).to_numpy()

In [34]:
yt = y_test.map(diagnosis_reverse_map).to_numpy()

In [35]:
metrics_calculation(predictions,yt,["M","B"])

{'M': defaultdict(int,
             {'tn': 68,
              'tp': 42,
              'fp': 3,
              'fn': 1,
              'precision': 0.9333333333333333,
              'recall': 0.9767441860465116,
              'f1-score': 0.9545454545454545}),
 'B': defaultdict(int,
             {'tp': 68,
              'tn': 42,
              'fn': 3,
              'fp': 1,
              'precision': 0.9855072463768116,
              'recall': 0.9577464788732394,
              'f1-score': 0.9714285714285714})}

### Guassian Kernel

In [20]:
def euclidean_distance(x,y):
    return np.sqrt(np.sum(np.square(x-y)))

In [21]:
def guassian_kernel(x_batch,landmark,sigmoid=0.5):
    
    a = np.repeat(x_batch,landmark.shape[0],0).reshape(x_batch.shape[0],landmark.shape[0],landmark.shape[1])
    b = x.reshape(1,landmark.shape[0],landmark.shape[1])

    distance = np.sqrt(np.sum(np.square(a - b),-1))
    
    return np.exp(-np.square(distance)/(2*np.square(sigmoid)))
    


In [22]:
x = X_train.to_numpy()

In [23]:
transformed_x = guassian_kernel(x,x)

In [24]:
# weights,bias = train_loop(transformed_x,y_train.to_numpy(),reg_strength=10000,lr=1e-3,epochs=1000)
weights,bias = train_loop(transformed_x,y_train.to_numpy())

epoch = 500, cost = 0.6592643164843165
epoch = 1000, cost = 0.6586053813608376
epoch = 1500, cost = 0.6579471048434532
epoch = 2000, cost = 0.6572894862738865
epoch = 2500, cost = 0.6566325249945182
epoch = 3000, cost = 0.6559762203483858
epoch = 3500, cost = 0.6573703596518499
epoch = 4000, cost = 0.6567133175394865
epoch = 4500, cost = 0.6560569321411517
epoch = 5000, cost = 0.6554012028004609


In [25]:
weights.shape

(455,)

### train data metrics

In [26]:
z = svm(transformed_x,weights,bias)

In [27]:
z[z>=0] = 1

In [28]:
z[z<0] = -1

In [31]:
diagnosis_reverse_map = { 1:'M',  -1:'B'}

In [32]:
predictions = pd.DataFrame(z)[0].map(diagnosis_reverse_map).to_numpy()

In [33]:
y = y_train.map(diagnosis_reverse_map).to_numpy()

In [34]:
metrics_calculation(predictions,y,["M","B"])

{'M': defaultdict(int,
             {'fp': 6,
              'tp': 149,
              'tn': 280,
              'fn': 20,
              'precision': 0.9612903225806452,
              'recall': 0.8816568047337278,
              'f1-score': 0.9197530864197531}),
 'B': defaultdict(int,
             {'fn': 6,
              'tn': 149,
              'tp': 280,
              'fp': 20,
              'precision': 0.9333333333333333,
              'recall': 0.9790209790209791,
              'f1-score': 0.9556313993174061})}

### test data metrics

In [35]:
transformed_xtest = guassian_kernel(X_test.to_numpy(),x)

In [37]:
z = svm(transformed_xtest,weights,bias)

In [38]:
z[z>=0] = 1

In [39]:
z[z<0] = -1

In [40]:
diagnosis_reverse_map = { 1:'M',  -1:'B'}

In [41]:
predictions = pd.DataFrame(z)[0].map(diagnosis_reverse_map).to_numpy()

In [42]:
yt = y_test.map(diagnosis_reverse_map).to_numpy()

In [43]:
metrics_calculation(predictions,yt,["M","B"])

{'M': defaultdict(int,
             {'tn': 70,
              'tp': 39,
              'fp': 1,
              'fn': 4,
              'precision': 0.975,
              'recall': 0.9069767441860465,
              'f1-score': 0.9397590361445783}),
 'B': defaultdict(int,
             {'tp': 70,
              'tn': 39,
              'fn': 1,
              'fp': 4,
              'precision': 0.9459459459459459,
              'recall': 0.9859154929577465,
              'f1-score': 0.9655172413793103})}