In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import sklearn
sklearn.__version__

In [None]:
import gc
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score

Load the data as usual. Special thanks to @criskiev for providing the training labels from before the infamous "flip" (loaded below as `y0`).

In [None]:
# Competition files: training set (features + `target`) and test set (features only).
train_data =  pd.read_csv('../input/tabular-playground-series-nov-2021/train.csv')
# A second copy of the training set from a separate dataset; only its `target` column is used (as y0).
train_data_0 = pd.read_csv('../input/november21/train.csv')
test_data =  pd.read_csv('../input/tabular-playground-series-nov-2021/test.csv')
# Feature frames are indexed by `id`; the label Series keep their default positional index.
X = train_data.drop('target',axis=1).set_index('id')
y = train_data.target
# NOTE(review): y0 is paired with X purely by row position in later cells -- this assumes both
# train.csv files list the same rows in the same order; confirm.
y0 = train_data_0.target
X_test = test_data.set_index('id')

In [None]:
# Drop the names of the raw frames (X, y, y0 and X_test were derived from them in the
# previous cell) and trigger a garbage-collection pass.
del train_data, train_data_0, test_data
gc.collect()

In [None]:
# Fit the scaler on the training features only, then apply the same transform to both
# the training and the test features. The results are re-wrapped in DataFrames so the
# `id` index and the column names are carried over explicitly.
ss = StandardScaler().fit(X)
X = pd.DataFrame(ss.transform(X),index=X.index,columns=X.columns)
X_test = pd.DataFrame(ss.transform(X_test),index=X_test.index,columns=X_test.columns)

If we fitted a hard margin SVM on all the samples, there would be too many constraints (600000, one per sample) for the cvxopt solver. Instead, we first build a soft margin SVM as an approximation of the decision boundary, and then keep only the samples closest to this approximate boundary. With `pc = 3.0` below, that is the 1.5% of each predicted class nearest the boundary, i.e. about 1.5% of all samples.

In [None]:
# Soft-margin linear SVM fitted on the labels y0, used only as a first approximation of
# the separating hyperplane. C=1e6 puts a very large penalty on margin violations;
# dual=False solves the primal formulation; random_state is fixed for reproducibility.
clf = LinearSVC(C=1e6, dual=False, tol=1e-6, max_iter=100000,random_state=42).fit(X,y0)
# Training accuracy of the approximate separator against y0 (shown as the cell output).
accuracy_score(y0,clf.predict(X))

In [None]:
# Signed decision scores of every training sample under the soft-margin model.
scores = clf.decision_function(X)
pc = 3.0  # band-width parameter: pc/2 percent of each side of the boundary is kept
# lo: the (100 - pc/2)-th percentile of the negative scores.
lo=np.percentile(scores[scores<0],100-pc/2)
# hi: the (pc/2)-th percentile of the non-negative scores.
hi=np.percentile(scores[scores>=0],pc/2)
# Keep only the samples whose score lies strictly between lo and hi,
# i.e. the ones closest to the approximate boundary on either side.
mask = np.logical_and(scores>lo,scores<hi)
X_b, y0_b = X[mask], y0[mask]

In [None]:
pip install cvxopt

In [None]:
import cvxopt

Now we solve the dual form of the hard margin SVM optimization problem. The following function is heavily based on the implementation of Xavier Bourret Sicotte.

[https://xavierbourretsicotte.github.io/SVM_implementation.html](https://xavierbourretsicotte.github.io/SVM_implementation.html)

In [None]:
def hard_margin_svm(X, y, sv_tol=1e-4):
    """Fit a hard-margin linear SVM by solving its dual QP with cvxopt.

    With +/-1 labels ``t = 2*y - 1`` and ``H[i, j] = t_i * t_j * <x_i, x_j>``,
    the problem handed to ``cvxopt.solvers.qp`` is::

        minimize    (1/2) a^T H a - 1^T a
        subject to  a >= 0,   t^T a = 0

    Parameters
    ----------
    X : pandas.DataFrame or array-like of shape (m, n)
        Feature matrix, one sample per row.
    y : pandas.Series or array-like of length m
        Binary labels encoded as 0/1 (they are mapped to -1/+1 internally).
    sv_tol : float, default 1e-4
        Multipliers strictly greater than this value are treated as support
        vectors when the intercept is estimated.

    Returns
    -------
    alpha : numpy.ndarray of shape (m, 1)
        Dual variables returned by the solver.
    w : numpy.ndarray of shape (n, 1)
        Weight vector, ``sum_i alpha_i * t_i * x_i``.
    b : float
        Intercept, averaged over the support vectors.

    Raises
    ------
    ValueError
        If ``X`` is not 2-D, the inputs are empty or of different lengths,
        the labels are not all 0/1, or no multiplier exceeds ``sv_tol``
        (in which case the intercept is undefined).

    Warns
    -----
    RuntimeWarning
        If the solver does not report status ``'optimal'``; this is what
        happens, for instance, when the data are not linearly separable and
        the hard-margin dual is unbounded.

    Notes
    -----
    ``H`` and the inequality matrix are dense ``m x m`` arrays, so memory
    grows quadratically with the number of samples. The function also sets
    entries of the global ``cvxopt.solvers.options`` dictionary.
    """
    import warnings

    X = np.asarray(X, dtype=np.float64)
    y = np.asarray(y, dtype=np.float64).reshape(-1)

    if X.ndim != 2:
        raise ValueError("X must be 2-dimensional, got %d dimension(s)" % X.ndim)
    m, n = X.shape
    if m == 0:
        raise ValueError("X must contain at least one sample")
    if y.shape[0] != m:
        raise ValueError("X has %d rows but y has %d labels" % (m, y.shape[0]))
    if not np.isin(y, (0.0, 1.0)).all():
        raise ValueError("y must contain only the labels 0 and 1")

    labels = (2.0 * y - 1.0).reshape(-1, 1)  # convert to +/- 1 target representation

    signed_X = labels * X
    H = signed_X @ signed_X.T

    # QP data in cvxopt's standard form: min 1/2 a'Pa + q'a  s.t.  Ga <= h, Aa = b_eq.
    P = cvxopt.matrix(H)
    q = cvxopt.matrix(-np.ones((m, 1)))
    G = cvxopt.matrix(-np.eye(m))           # -a <= 0  <=>  a >= 0
    h = cvxopt.matrix(np.zeros((m, 1)))
    A = cvxopt.matrix(labels.reshape(1, -1))
    b_eq = cvxopt.matrix(np.zeros(1))

    cvxopt.solvers.options['show_progress'] = True
    cvxopt.solvers.options['abstol'] = 1e-8
    cvxopt.solvers.options['reltol'] = 1e-8
    cvxopt.solvers.options['feastol'] = 1e-8

    solution = cvxopt.solvers.qp(P, q, G, h, A, b_eq)
    if solution['status'] != 'optimal':
        # Do not return a possibly meaningless separator without telling the caller.
        warnings.warn(
            "cvxopt QP solver finished with status %r; the hard-margin solution "
            "may be invalid (are the classes linearly separable?)" % (solution['status'],),
            RuntimeWarning,
        )
    alpha = np.array(solution['x']).reshape(-1, 1)

    w = ((labels * alpha).T @ X).reshape(-1, 1)

    # On a support vector t_i * (x_i . w + b) = 1, hence b = t_i - x_i . w;
    # averaging over all of them smooths out solver round-off.
    support = (alpha > sv_tol).ravel()
    if not support.any():
        raise ValueError(
            "no dual variable exceeds sv_tol=%g; cannot estimate the intercept" % sv_tol
        )
    bias = np.mean(labels[support] - X[support] @ w)

    return alpha, w, bias
    

Now just run it already!

In [None]:
alpha, w, b = hard_margin_svm(X_b,y0_b)

It converged sooner than I thought! Now we need a small function that emulates the `decision_function` method of `LinearSVC`. Its output is not a probability, but for AUC scoring purposes it suffices as a ranking function.

In [None]:
def decision_function(w,b,X):
    """Return the signed score ``x . w + b`` of every row of ``X`` as a flat 1-D array.

    ``w`` is the weight vector, ``b`` the intercept, and ``X`` a pandas object
    exposing ``to_numpy()`` with one sample per row.
    """
    features = X.to_numpy()
    raw_scores = features @ w + b
    return raw_scores.reshape(-1)

There is something we need to check before moving on. We only fitted the classifier on a small fraction of the data (about 1.5%) around the approximate decision boundary. How do we know that the hard margin constraints, viz., $$\tilde{y}_i(x_i\cdot w+b)\ge1$$ are satisfied for the rest of the samples? We need to check this manually.

In [None]:
# Smallest functional margin t_i * (x_i . w + b) over the whole training set.
# np.min reduces the array in one vectorized pass; the builtin min would iterate
# over every element in Python.
np.min(decision_function(w,b,X)*(2*y0.to_numpy()-1))

Okay, this is practically 1. We have indeed found the hard margin SVM. And the margin width is given by $\frac{2}{||w||}$.

In [None]:
2/np.linalg.norm(w)

For perspective, all the features have been normalized to stdev 1. So the two classes can be linearly separated, but barely -- the margin is tiny.

Next, what is the training accuracy on the original, unaltered labels? We know it should be 1 because the hard margin SVM separates the two classes by definition. Let's double check that.

In [None]:
accuracy_score(y0,decision_function(w,b,X)>0)

How does the training AUC score fare on the altered labels?

In [None]:
roc_auc_score(y,decision_function(w,b,X))


Finally, let's make a submission.

In [None]:
scores = decision_function(w,b,X_test)

In [None]:
# Write one row per test id with the raw decision score as `target`
# (`scores` was recomputed on X_test in the previous cell).
pd.DataFrame({'id': X_test.index, 'target': scores}).to_csv('submission.csv', index=False)
print("Submission saved!")