In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import sklearn
import random
import math
import sys

## Data

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.utils import shuffle
import statsmodels.api as sm


def remove_correlated_features(X, corr_threshold=0.9):
    """Drop, in place, every feature that is highly correlated with an earlier one.

    For each pair of columns ``(i, j)`` with ``i < j``, column ``j`` is dropped
    when the pairwise correlation reported by ``X.corr()`` is at least
    ``corr_threshold``. The comparison is signed, so strongly *negative*
    correlations are not treated as redundant.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. It is modified in place: redundant columns are removed.
    corr_threshold : float, default 0.9
        Minimum correlation at which the later column of a pair is dropped.

    Returns
    -------
    pandas.Index
        Labels of the columns that were removed from ``X``.
    """
    corr = X.corr().to_numpy()
    # Keep only the strict upper triangle (row i < column j) so that a column is
    # compared with the columns before it and never with itself.
    redundant_pairs = np.triu(corr >= corr_threshold, k=1)
    # Column j is redundant if any earlier column i reaches the threshold.
    drop_mask = redundant_pairs.any(axis=0)
    columns_dropped = X.columns[drop_mask]
    X.drop(columns=columns_dropped, inplace=True)
    return columns_dropped


def remove_less_significant_features(X, Y, sl=0.05):
    """Backward-eliminate, in place, features whose OLS p-value exceeds ``sl``.

    On each pass an ordinary-least-squares model ``sm.OLS(Y, X)`` is fitted
    (no constant column is added here) and the column with the largest p-value
    is removed from ``X`` if that p-value is greater than ``sl``. Elimination
    stops as soon as the largest p-value is not greater than ``sl``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. It is modified in place: eliminated columns are removed.
    Y : array-like
        Target values passed to ``sm.OLS`` as the endogenous variable.
    sl : float, default 0.05
        Significance level; a column is kept once its p-value is <= ``sl``.

    Returns
    -------
    numpy.ndarray
        Labels of the eliminated columns, in elimination order. Empty when
        nothing was removed (including when ``X`` has no columns).
    """
    print(X.shape)
    dropped = []
    # At most one column is removed per pass, so the initial column count
    # bounds the number of passes. With zero columns the loop never runs.
    for _ in range(len(X.columns)):
        regression_ols = sm.OLS(Y, X).fit()
        max_col = regression_ols.pvalues.idxmax()
        max_val = regression_ols.pvalues.max()
        if max_val > sl:
            X.drop(max_col, axis='columns', inplace=True)
            dropped.append(max_col)
        else:
            break
    # Build the array once so the labels keep their own type instead of being
    # coerced by repeated appends onto an empty float array.
    return np.array(dropped)

def preprocess_data(df):
    """Turn the raw diagnosis table into scaled, feature-selected train/test splits.

    Steps: encode ``diagnosis`` ('M' -> 1.0, 'B' -> -1.0), drop the ``id`` and
    ``Unnamed: 32`` columns when present, min-max scale the features, remove
    highly correlated and statistically insignificant features, then split
    80/20 with a fixed ``random_state``.

    The caller's ``df`` is not modified, so the function can be called again
    on the same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing a ``diagnosis`` column with values 'M' / 'B' and
        the numeric feature columns.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as returned by
        ``train_test_split``.
    """
    # Work on a copy: mutating the argument made a second call fail because the
    # labels were already mapped and the columns already dropped.
    data = df.copy()

    # Applying label values
    label_map = {'M': 1.0, 'B': -1.0}
    data['diagnosis'] = data['diagnosis'].map(label_map)

    # Remove unnecessary columns (ignored when a column is absent)
    data = data.drop(columns=['id', 'Unnamed: 32'], errors='ignore')

    # Separate label and training data by name rather than by column position
    Y = data['diagnosis']
    X = data.drop(columns='diagnosis')

    # Normalize data, keeping the column names and the row index so that X
    # stays aligned with Y.
    # NOTE(review): the scaler and both feature-selection steps are fitted on
    # the full data set before the split, so the test partition influences
    # them — consider fitting on the training partition only.
    sc = MinMaxScaler()
    X = pd.DataFrame(sc.fit_transform(X), columns=X.columns, index=X.index)

    # Both helpers drop columns from X in place.
    remove_correlated_features(X)
    remove_less_significant_features(X, Y)

    # Do train test split
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test

In [None]:
# Load the CSV and build the train/test partitions.
path = '/kaggle/input/breast-cancer-wisconsin-data/data.csv'
df = pd.read_csv(path)
X_train, X_test, y_train, y_test = preprocess_data(df)

# Number of rows in each partition, shown as the cell output.
[len(part) for part in (X_train, X_test, y_train, y_test)]

In [None]:
# Inspect the dtype of every column of `df`.
df.dtypes

# SVM Model Implementation
- https://towardsdatascience.com/svm-implementation-from-scratch-python-2db2fc52e5c2#d7d8

#### Cost Function

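SVM cost = regularization term + weighted mean hinge loss:

$$J(W) = \tfrac{1}{2}\lVert W \rVert^2 + \lambda \cdot \frac{1}{N}\sum_{i=1}^{N} \max\bigl(0,\ 1 - y_i\,(x_i \cdot W)\bigr)$$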

Margin constraint for a correctly classified sample: $y_i\,(x_i \cdot W) \ge 1 \iff 1 - y_i\,(x_i \cdot W) \le 0$


#### Cost Gradient

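Per-sample (sub)gradient of the cost with respect to $W$:

$$\nabla_W J_i = \begin{cases} W & \text{if } \max\bigl(0,\ 1 - y_i\,(W \cdot x_i)\bigr) = 0 \\ W - \lambda\, y_i\, x_i & \text{otherwise} \end{cases}$$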

#### SGD
Update rule: $W \leftarrow W - \eta \cdot \nabla_W J_i$, where $\eta$ is the learning rate (`learning_rate`).

In [None]:
# Step size applied to every weight update in `sgd`.
learning_rate = 0.000001
# Multiplier of the hinge-loss term (relative to the 0.5 * ||W||^2 term) used
# by `compute_cost` and `cost_gradient`.
lmda = 10000

def compute_cost(W, X, Y, reg_strength=None):
    """Return the soft-margin SVM cost of weights ``W`` on samples ``(X, Y)``.

    ``cost = 0.5 * W.W + reg_strength * sum(max(0, 1 - Y * (X.W))) / N``

    Parameters
    ----------
    W : numpy.ndarray
        Weight vector with one entry per column of ``X``.
    X : numpy.ndarray
        Sample matrix; ``X.shape[0]`` is the number of samples ``N``.
    Y : numpy.ndarray
        Labels, one per row of ``X``.
    reg_strength : float, optional
        Multiplier of the hinge-loss term. Defaults to the module-level
        ``lmda`` when omitted.

    Returns
    -------
    float
        The cost value.
    """
    if reg_strength is None:
        reg_strength = lmda

    # Hinge loss: only samples inside or on the wrong side of the margin
    # (1 - y * (x.W) > 0) contribute.
    N = X.shape[0]
    margins = 1 - Y * np.dot(X, W)
    hinge_loss = reg_strength * np.sum(np.maximum(0.0, margins)) / N

    # Cost = regularization term + weighted hinge loss
    return 0.5 * np.dot(W, W) + hinge_loss

def cost_gradient(W, X_batch, Y_batch, reg_strength=None):
    """Return the mean (sub)gradient of the SVM cost over a batch.

    For each sample the contribution is ``W`` when the hinge term
    ``max(0, 1 - y * (x.W))`` is zero, and ``W - reg_strength * y * x``
    otherwise; the contributions are averaged over the batch.

    Parameters
    ----------
    W : array-like
        Weight vector.
    X_batch : array-like
        Either a single sample (1-D) or a batch of samples (2-D, one per row).
    Y_batch : scalar or array-like
        The label of the single sample, or one label per row of ``X_batch``.
        Any real scalar type is accepted, not only ``numpy.float64``.
    reg_strength : float, optional
        Multiplier of the hinge-loss term. Defaults to the module-level
        ``lmda`` when omitted.

    Returns
    -------
    numpy.ndarray
        Gradient with the same length as ``W``.
    """
    if reg_strength is None:
        reg_strength = lmda

    W = np.asarray(W, dtype=float)
    # Promote a lone sample to a batch of one, whatever scalar type the label
    # has. An exact type check against np.float64 missed Python floats, ints
    # and other NumPy scalar types.
    Y = np.atleast_1d(np.asarray(Y_batch, dtype=float))
    X = np.atleast_2d(np.asarray(X_batch, dtype=float))

    dist = 1 - Y * np.dot(X, W)
    # Samples with a positive hinge term are the only ones that add the
    # data-dependent part of the gradient.
    violating = dist > 0
    penalty = ((reg_strength * Y[violating])[:, None] * X[violating]).sum(axis=0)

    # Every sample contributes W, so the batch mean is W minus the mean penalty.
    return W - penalty / len(Y)

def sgd(features,
        output,
        num_epoch,
        lr=None,
        cost_threshold=0.01,
        random_state=None):
    """Fit SVM weights with per-sample stochastic gradient descent.

    Each epoch reshuffles the samples and applies one weight update per
    sample. The cost is evaluated at epochs 1, 2, 4, 8, ... and at the final
    epoch; training stops early when the change in cost since the previous
    evaluation is smaller than ``cost_threshold`` times the previous cost.
    No intercept is fitted: the weight vector has one entry per feature column.

    Parameters
    ----------
    features : array-like
        2-D sample matrix, one row per sample.
    output : array-like
        Labels, one per row of ``features``.
    num_epoch : int
        Maximum number of passes over the data.
    lr : float, optional
        Step size. Defaults to the module-level ``learning_rate``.
    cost_threshold : float, default 0.01
        Relative cost change below which training stops.
    random_state : int or numpy.random.RandomState, optional
        Seed or generator for the per-epoch shuffle. ``None`` keeps the
        unseeded behaviour.

    Returns
    -------
    numpy.ndarray
        The learned weight vector.
    """
    if lr is None:
        lr = learning_rate

    features = np.asarray(features, dtype=float)
    output = np.asarray(output, dtype=float)

    # One generator shared by all epochs: a seed makes the whole run
    # reproducible while every epoch still gets a different ordering.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    weights = np.zeros(features.shape[1])

    n = 0
    prev_cost = float("inf")
    for epoch in range(num_epoch):
        X, Y = shuffle(features, output, random_state=rng)
        for x, y in zip(X, Y):
            grad = cost_gradient(weights, x, y)
            weights = weights - (lr * grad)

        # stopping condition, checked on an exponentially spaced schedule
        if epoch == 2 ** n or epoch == num_epoch - 1:
            cost = compute_cost(weights, X, Y)
            cost_diff = abs(prev_cost - cost)
            print(f'epoch: {epoch}, prev_cost: {prev_cost}, new_cost: {cost}, diff: {cost_diff}')

            if cost_diff < cost_threshold * prev_cost:
                return weights

            prev_cost = cost
            n += 1

    return weights

def test_model(X_test, y_test, W):
    """Print accuracy, recall and precision of weights ``W`` on the test set.

    Parameters
    ----------
    X_test : pandas.DataFrame
        Test features, one row per sample.
    y_test : pandas.Series
        True labels for the rows of ``X_test``.
    W : numpy.ndarray
        Weight vector with one entry per column of ``X_test``.

    Returns
    -------
    None
        The three metrics are printed.
    """
    # Score all samples in one product instead of converting the frame to an
    # array and growing the prediction array once per row.
    scores = np.dot(X_test.to_numpy(), W)
    # A score of exactly 0 would become a third label 0 under np.sign, which
    # is not a valid class for the binary metrics below; ties go to +1.
    y_test_predicted = np.where(scores >= 0, 1.0, -1.0)
    y_true = y_test.to_numpy()
    print("accuracy on test dataset: {}".format(accuracy_score(y_true, y_test_predicted)))
    print("recall on test dataset: {}".format(recall_score(y_true, y_test_predicted)))
    print("precision on test dataset: {}".format(precision_score(y_true, y_test_predicted)))

### Running Model

In [None]:
# Train on the training partition for at most NUM_EPOCHS passes.
NUM_EPOCHS = 5000
train_features = X_train.to_numpy()
train_labels = y_train.to_numpy()
weights = sgd(train_features, train_labels, NUM_EPOCHS)

In [None]:
# Print accuracy, recall and precision of the learned weights on the test split.
test_model(X_test, y_test, weights)