In [474]:
import warnings
import sys
if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [475]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from random import seed
from random import random
seed(1)

Get our data and get a preview

In [476]:
bankruptcy_df = pd.read_csv(r'bankruptcy.csv',header =0)
bankruptcy_df.head()

Unnamed: 0,Firm,SALES,ROCE,FFTL,GEAR,CLTA,CACL,QACL,WCTA,LAG,AGE,CHAUD,BIG6,FAIL
0,o1,6762,7.5364,0.1545,0.6233,0.6233,1.5489,0.7356,0.3422,96,74,0,0,0
1,o2,16149,-1.0712,0.0271,1.2218,1.2218,0.6236,0.3153,-0.4599,287,29,0,1,0
2,o3,8086,15.2024,0.6163,0.3307,0.3307,2.3553,1.7513,0.4482,64,51,0,1,0
3,o4,7646,31.2239,0.6312,0.5205,0.4829,1.6397,1.4935,0.3089,286,25,0,0,0
4,o5,36067,10.9613,0.354,0.3786,0.3786,1.5852,1.1626,0.2216,301,33,0,1,0


Looking at a description of the data

In [477]:
bankruptcy_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Firm    60 non-null     object 
 1   SALES   60 non-null     int64  
 2   ROCE    60 non-null     float64
 3   FFTL    60 non-null     float64
 4   GEAR    60 non-null     float64
 5   CLTA    60 non-null     float64
 6   CACL    60 non-null     float64
 7   QACL    60 non-null     float64
 8   WCTA    60 non-null     float64
 9   LAG     60 non-null     int64  
 10  AGE     60 non-null     int64  
 11  CHAUD   60 non-null     int64  
 12  BIG6    60 non-null     int64  
 13  FAIL    60 non-null     int64  
dtypes: float64(7), int64(6), object(1)
memory usage: 6.7+ KB


Balanced class distribution is so we don't need to resample

In [478]:
bankruptcy_df[bankruptcy_df['FAIL'] ==0] = -1
bankruptcy_df.groupby(['FAIL'])[['FAIL']].count()

Unnamed: 0_level_0,FAIL
FAIL,Unnamed: 1_level_1
-1,30
1,30


Summary of attributes

In [479]:
bankruptcy_df.describe()

Unnamed: 0,SALES,ROCE,FFTL,GEAR,CLTA,CACL,QACL,WCTA,LAG,AGE,CHAUD,BIG6,FAIL
count,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0
mean,5673.0,-1.837925,-0.465092,-0.081783,-0.148318,0.005928,-0.163737,-0.519848,134.066667,9.916667,-0.35,-0.316667,0.0
std,10402.93709,7.888941,0.547884,0.937403,0.873916,1.045462,0.874284,0.509825,144.24448,18.584021,0.732421,0.770025,1.008439
min,-1.0,-31.254,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
25%,-1.0,-1.708625,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
50%,1429.0,-1.0,-0.66415,-0.3051,-0.3507,-0.2513,-0.35765,-0.8735,42.5,0.5,-0.5,-0.5,0.0
75%,5780.0,-1.0,0.06465,0.80305,0.701225,0.925075,0.55935,-0.045675,295.0,14.0,0.0,0.0,1.0
max,48162.0,26.5006,0.4277,1.4865,1.4865,2.0674,1.8493,0.4084,393.0,90.0,1.0,1.0,1.0


Data Preparation. We split into train and test datasets, with 80% going to training and 20% going to testing

In [480]:
from sklearn.model_selection import train_test_split
X = bankruptcy_df[bankruptcy_df.columns.difference(['Firm', 'FAIL'])]
y = bankruptcy_df[['FAIL']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12346789)

Remove mean and scale to unit variance z=(x-u)/s of both train and test

In [481]:
from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

Finally, we finish our data preparation by selecting only the K (in our case k=12) best features.

In [482]:
from sklearn.feature_selection import SelectKBest
k = 10
prep = SelectKBest(k=k)
X_train = prep.fit_transform(X_train, y_train.values.flatten())
X_test = prep.fit_transform(X_test, y_test.values.flatten())

The following is a SVM model from scratch

In [483]:
import csv
import numpy as np
import math
import cvxopt as opt
from cvxopt import matrix, solvers
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [484]:
def kernel(XTest, XTrain, type=0, sigma=0):
    # Write your code below
    if type == 0: # type is linear
      K = np.matmul(XTest, XTrain.T)
    elif type == 1: # type is polynomial
      K = np.power((np.matmul(XTest, XTrain.T) + 1), sigma) # sigma is d here
    elif type == 2: # type is RBF
      # ||Xi - Xj||^2 = (Xi - Xj).T (Xi - Xj) = Xi.T X + Xj.T Xj - 2Xi.T Xj
      K = np.exp(-np.linalg.norm(XTrain[:, None] - XTest, axis=2) ** 2 / (2 * sigma ** 2))
    else:
        raise ValueError("Invalid kernel type")
    return K

def predict(XTest, XTrain, yTrain, alpha):
    # Write your code below
    pred = np.zeros((XTest.shape[0], 1))

    b = intercept(XTest, XTrain, yTrain, alpha)
    for j in range(XTest.shape[0]):
      pred[j] = np.sum(alpha * yTrain * kernel(XTest[j], XTrain, type, sigma)) + b
    pred[pred < 0] = -1
    pred[pred >= 0] = 1

    return pred

def intercept(XTest, XTrain, yTrain, alpha):
    # Write your code below
    sv = (alpha > 0).flatten()
    sv_y = yTrain[sv]
    sv_a = alpha[sv]
    sv_xi = XTrain[sv]
    b = np.mean(sv_y - np.sum(sv_a * sv_y * kernel(sv_xi, sv_xi, type, sigma)))
    return b

sigma = 2
type = 0

# Linear Kernel
TrainSize = X_train.shape[0]
K = kernel(X_train, X_train, type, sigma)
a0 = np.random.randn(TrainSize)

# Inequality that individual alpha>=0
G = matrix(np.eye(TrainSize))
h = matrix(np.zeros(TrainSize))

# Equality that sum(alpha_i*y_i)=0
A = opt.matrix(np.double(y_train))
b = matrix(0.0)

# Change from min to max optimization by multiplying with -1
# Regularization term to force H positive definite
ymat = np.diag(y_train.values.flatten())
P = 0.5 * np.dot(ymat, np.dot(K, ymat)) +  1e-5 * np.identity(TrainSize)
q = opt.matrix(-np.ones((TrainSize,1)))

opts = {'maxiters':100000}
solvers.options['show_progress'] = False
sol = solvers.qp(matrix(P), q, G, h, A.T, b, initvals = a0, options=opts) # solvers.qp to solve the optimization
print(sol['status'])
alpha = np.array(sol['x'])

pred = predict(X_test, X_train, y_train.values, alpha)
print('Accuracy: %f\n' % (np.mean(pred == y_test)*100))

     pcost       dcost       gap    pres   dres
 0:  0.0000e+00  0.0000e+00  5e+01  7e+00  0e+00
 1:  0.0000e+00 -5.5511e-17  5e-01  7e-02  1e-16
 2: -1.7528e-17  8.6736e-19  5e-03  7e-04  1e-16
 3: -3.2574e-19  0.0000e+00  5e-05  7e-06  5e-17
 4: -3.1716e-21 -1.0588e-22  5e-07  7e-08  4e-17
 5: -4.1778e-23 -8.2718e-25  5e-09  7e-10  4e-17
Optimal solution found.
optimal
Accuracy: 50.000000



Discussed with groups 2 and 3. We all seemed to get 50%. This was because the alpha coefficients were significantly smaller than the rest of the values making the intercept (b) be the only thing effecting the end prediction.