In [None]:
import numpy as np
from scipy.special import expit
from scipy.optimize import fmin_bfgs
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')


In [None]:
df = pd.read_csv('star_classification.csv')

In [None]:
df

In [None]:
# Get rid of all rows which have class as GALAXY
df = df[df['class'] != 'GALAXY']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('class', axis=1), df['class'], test_size=0.2, random_state=42)

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# encode the labels:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(y_train)
y_train = le.transform(y_train)
y_test = le.transform(y_test)

In [None]:
class LogisticRegressionBase:
    def __init__(self, eta=0.1, iterations=20, C1=0.0001, C2 = 0.0001):
        self.eta = eta
        self.iters = iterations
        self.C1 = C1
        self.C2 = C2
    def __str__(self):
        if(hasattr(self, 'w_')):
            return 'Binary Logistic Regression Object with coefficients:\n'+ str(self.w_) # is we have trained the object
        else:
            return 'Untrained Binary Logistic Regression Object'
    @staticmethod
    def _add_bias(X):   
        return np.hstack((np.ones((X.shape[0], 1)), X))

    @staticmethod
    def _sigmoid(theta):
        return expit(theta)
    def _get_gradient(self, X, y):
        ydiff = y - self.predict_proba(X, add_bias = False).ravel()
        gradient = np.mean(X * ydiff[:,np.newaxis], axis=0)
        gradient = gradient.reshape(self.w_.shape)
        return gradient
    def _get_gradient_L2(self, X, y):
        gradient = self._get_gradient(X, y)
        gradient[1:] += -2 * self.w_[1:] * self.C2
        return gradient
    def _get_gradient_L1(self, X, y):
        gradient = self._get_gradient(X, y)
        l1_der = self.w_[1:] / np.abs(self.w_[1:])
        l1_der[self.w_[1:] == 0] = 0
        gradient[1:] +=  -1 * l1_der * self.C1
        return gradient
    def _get_gradient_elastic(self, X, y):
        gradient = self._get_gradient(X, y)
        l1_der = self.w_[1:] / np.abs(self.w_[1:])
        l1_der[self.w_[1:] == 0] = 0
        gradient[1:] +=  -1 * l1_der * self.C1
        gradient[1:] += -2 * self.w_[1:] * self.C2
        return gradient
    def predict_proba(self, X, add_bias=True):
        Xb = self._add_bias(X) if add_bias else X
        return self._sigmoid(Xb @ self.w_)

    def predict(self, X):
        return self.predict_proba(X) > 0.5
    def fit(self, X, y, regularization=None):
        Xb = self._add_bias(X)
        num_samples, num_features = Xb.shape
        self.w_ = np.random.uniform(-1, 1, (num_features, 1))
        for i in range(self.iters):
            if(regularization == 'L1'):
                grad = self._get_gradient_L1(Xb, y)
            elif(regularization == 'L2'):
                grad = self._get_gradient_L2(Xb, y)
            elif(regularization == 'elastic'):
                grad = self._get_gradient_elastic(Xb, y)
            else:
                grad = self._get_gradient(Xb, y)
            self.w_ += grad*self.eta

In [None]:
model = LogisticRegressionBase(eta=0.1, iterations=1000)
model.fit(X_train, y_train, regularization='elastic')
y_pred = model.predict(X_test)
print('Accuracy: ', accuracy_score(y_test, y_pred))

In [None]:
class LogisticRegressionSGD(LogisticRegressionBase):
    def _get_gradient(self, X, y):
        sample = int(np.random.rand()  * len(y))
        ydiff = y[sample] - self.predict_proba(X[sample],add_bias=False)
        gradient = X[sample] * ydiff[:, np.newaxis]
        gradient = gradient.reshape(self.w_.shape)

        return gradient

In [None]:
class LogisticRegressionNewtons(LogisticRegressionBase):
    def _get_gradient(self, X, y):
        g = self.predict_proba(X, add_bias=False).ravel()
        hessian = X.T @ np.diag(g * (1-g)) @ X
        ydiff = y - g
        gradient = np.sum(X * ydiff[:, np.newaxis], axis=0)
        gradient = gradient.reshape(self.w_.shape)
        return np.linalg.pinv(hessian) @ gradient
    def _get_gradient_L1(self, X, y):
        g = self.predict_proba(X, add_bias=False).ravel()
        hessian = X.T @ np.diag(g * (1-g)) @ X # the second derivative of abs(x) evaluates to 0 so our hessian will simply be the one for the ordinary log likelihood
        ydiff = y - g
        gradient = np.sum(X @ ydiff[:, np.newaxis], axis=0)
        gradient = gradient.reshape(self.w_.shape)
        l1_der = self.w_[1:] / np.abs(self.w_[1:])
        l1_der[self.w_[1:] == 0] = 0
        gradient[1:] += -1 * l1_der[1:] * self.C1
        return np.linalg.pinv(hessian) @ gradient
    def _get_gradient_L2(self, X, y):
        g = self.predict_proba(X, add_bias=False).ravel()
        hessian = X.T @ np.diag(g * (1-g)) @ X - 2 * self.C2
        ydiff = y - g
        gradient = np.sum(X * ydiff[:, np.newaxis], axis=0)
        gradient = gradient.reshape(self.w_.shape)
        gradient[1:] += -2 * self.w_[1:] * self.C2
        return np.linalg.pinv(hessian) @ gradient
    def _get_gradient_elastic(self, X, y):
        g = self.predict_proba(X, add_bias=False).ravel()
        hessian = X.T @ np.diag(g * (1-g)) @ X - 2 * self.C2
        ydiff = y - g
        gradient = np.sum(X @ ydiff[:, np.newaxis], axis=0)
        gradient = gradient.reshape(self.w_)
        l1_der = self.w_ / np.abs(self.w_)
        gradient[1:] += -1 * l1_der[1:] * self.C1
        gradient[1:] += -2 * self.w_[1:] * self.C2
        return np.linalg.pinv(hessian) @ gradient


In [None]:
model = LogisticRegressionNewtons(eta=0.1, iterations=10)
model.fit(X_train, y_train, regularization='L2')
y_pred = model.predict(X_test)
print(model)
print('Accuracy: ', accuracy_score(y_test, y_pred))

In [None]:
from numpy import ma
from scipy.optimize import fmin_bfgs
class BFGSLogisticRegression(LogisticRegressionBase):
    @staticmethod
    def objective_function(w,X,y,C1,C2):
        g = expit(X @ w)
        return -np.sum(ma.log(g[y==1]))-np.sum(ma.log(1-g[y==0])) + C2*sum(w**2) + C1*sum(np.abs(w))

    @staticmethod
    def objective_gradient(w,X,y,C1, C2):
        g = expit(X @ w)
        ydiff = y-g
        gradient = np.mean(X * ydiff[:,np.newaxis], axis=0)
        gradient = gradient.reshape(w.shape)
        gradient[1:] += -2 * w[1:] * C2
        l1_der = w[1:] / np.abs(w[1:])
        l1_der[w[1:] == 0] = 0
        gradient[1:] +=  -1 * l1_der * C1

        return -gradient


    def fit(self, X, y, regularization=None):
        Xb = self._add_bias(X)
        num_samples, num_features = Xb.shape
        #modifying regularization params so we can use one objective function and gradient in all cases
        if(regularization == 'L1'):
            self.C2 = 0
        elif(regularization == 'L2'):
            self.C1 = 0
        elif(regularization == 'elastic'):
            pass
        else:
            self.C1 = 0
            self.C2 = 0

        self.w_ = fmin_bfgs(self.objective_function,
                        np.zeros((num_features,1)),
                        fprime=self.objective_gradient,
                        args=(Xb,y, self.C1, self.C2),
                        gtol=1e-03,
                        maxiter=self.iters,
                        disp=False)

        self.w_ = self.w_.reshape((num_features,1))


In [None]:
bfgslr = BFGSLogisticRegression(iterations=200, C2=0.001, C1=0.001)
bfgslr.fit(X_train, y_train, regularization='elastic')
print(bfgslr)
print(accuracy_score(y_test, bfgslr.predict(X_test)))

In [None]:
class BFGSFromScratchLogisticRegression(LogisticRegressionBase):


    def fit(self, X, y, regularization=None):
        if(regularization == 'L1'):
            self.C2 = 0
        elif(regularization == 'L2'):
            self.C1 = 0
        elif(regularization == 'elastic'):
            pass
        else:
            self.C1 = 0
            self.C2 = 0
        Xb = self._add_bias(X)
        num_samples, num_features = Xb.shape
        self.w_ = np.zeros((num_features, 1))
        self.inv_hessian = np.identity(num_features)
        self.last_grad = np.zeros((num_features,1))
        g = self.predict_proba(Xb, add_bias=False).ravel()
        ydiff = y-g
        self.last_grad = np.sum(Xb * ydiff[:, np.newaxis], axis=0)
        self.last_grad = self.last_grad.reshape((num_features, 1))
        self.last_grad[1:] += -2 * self.w_[1:] * self.C2
        l1_der = self.w_[1:] / np.abs(self.w_[1:])
        l1_der[self.w_[1:] == 0] = 0
        self.last_grad[1:] +=  -1 * l1_der * self.C1
        self.last_grad = -self.last_grad

        for i in range(self.iters):
            pk = -np.dot(self.inv_hessian, self.last_grad)
            pk = pk.reshape((num_features, 1))
            sk = self.eta * pk
            self.w_ += sk
            g = self.predict_proba(Xb, add_bias=False).ravel()
            ydiff = y-g
            curr_grad = np.sum(Xb * ydiff[:, np.newaxis], axis=0)
            curr_grad = curr_grad.reshape((num_features, 1))
            curr_grad[1:] += -2 * self.w_[1:] * self.C2
            l1_der = self.w_[1:] / np.abs(self.w_[1:])
            l1_der[self.w_[1:] == 0] = 0
            curr_grad[1:] +=  -1 * l1_der * self.C1
            curr_grad = -curr_grad
            vk = curr_grad - self.last_grad
            inv_hessian_num_1_1 = (sk.T @ vk) + self.inv_hessian
            inv_hessian_num_1_2 = sk @ sk.T
            inv_hessian_num_1 = inv_hessian_num_1_1 @ inv_hessian_num_1_2
            inv_hessian_dom_1 = (sk.T @ vk) ** 2
            inv_hessian_1 = inv_hessian_num_1 / inv_hessian_dom_1
            inv_hessian_num_2 = (self.inv_hessian @ vk @ sk.T) + (sk @ vk.T @ self.inv_hessian)
            inv_hessian_dom_2 =  sk.T @ vk
            inv_hessian_2 = inv_hessian_num_2 / inv_hessian_dom_2
            self.inv_hessian += inv_hessian_1 - inv_hessian_2
            self.last_grad = curr_grad




In [None]:
test_model = BFGSFromScratchLogisticRegression(iterations=100, eta=0.001)
test_model.fit(X_train, y_train, regularization='elastic')
print(test_model)
print(accuracy_score(y_test, test_model.predict(X_test)))

In [None]:
class LogisticRegression:
    def __init__(self, eta=.1, iters=10, C1=.001, C2=.0001, solver="default", regularization=None):
        self.eta = eta
        self.iters = iters
        self.C1 = C1
        self.C2 = C2
        self.solver = solver
        self.classifiers = []
        self.regularization = regularization

    def __str__(self):
        if(hasattr(self,'w_')):
            return 'MultiClass Logistic Regression Object with coefficients:\n'+ str(self.w_) # is we have trained the object
        else:
            return 'Untrained MultiClass Logistic Regression Object'

    def fit(self, X, y):
        # Get number of unique values of y
        unique_classes = np.unique(y)
        unique_classes.sort()
        for target in unique_classes:
            # Transform the data into binary classification, the taget class vs the rest
            y_binary = np.where(y == target, 1, 0)
            if self.solver == "default":
                model = LogisticRegressionBase(iterations=self.iters, eta=self.eta, C1=self.C1, C2=self.C2)
            elif self.solver == "sgd":
                model = LogisticRegressionSGD(iterations=self.iters, eta=self.eta, C1=self.C1, C2=self.C2)
            elif self.solver == "newton":
                model = LogisticRegressionNewtons(iterations=self.iters, eta=self.eta, C1=self.C1, C2=self.C2)
            elif self.solver == 'bfgs':
                model = BFGSLogisticRegression(iterations=self.iters, eta=self.eta, C1=self.C1, C2=self.C2)
            elif self.solver == 'bfgs_scratch':
                model = BFGSFromScratchLogisticRegression(iterations=self.iters, eta=self.eta, C1=self.C1, C2=self.C2)
            model.fit(X, y_binary, regularization=self.regularization)
            self.classifiers.append(model)
        self.w_ = np.hstack([x.w_ for x in self.classifiers]).T
    def predict_proba(self, X):
        probs = []
        for model in self.classifiers:
            probs.append(model.predict_proba(X).reshape(len(X), 1))
        return np.hstack(probs)

    def predict(self, X):
        probs = self.predict_proba(X)
        return np.argmax(probs, axis=1)


In [None]:
model = LogisticRegression(iters=150000, eta=1, solver="sgd", regularization="l2")
model.fit(X_train, y_train)
%time
y_hat = model.predict(X_test)

In [None]:

# Calculate accuracy of y_hat vs y_test with sklearn
print('Accuracy: ', accuracy_score(y_test, y_hat))

In [None]:
model = LogisticRegression(iters=10, eta=.1, solver="newton", regularization="l2")
model.fit(X_train, y_train)
%time
y_hat_new = model.predict(X_test)
print('Accuracy: ', accuracy_score(y_test, y_hat_new))

In [None]:
from sklearn.linear_model import LogisticRegression as SKLogisticRegression
SKmodel = SKLogisticRegression(solver='lbfgs', penalty='l2', max_iter=1000)
SKmodel.fit(X_train, y_train)
%time
print(accuracy_score(y_true=y_test, y_pred=SKmodel.predict(X_test)))

In [None]:
# Grid Search for best parameters
params = {"C": [.001, .01, .1], 'iters': [5, 50, 500, 5000], 'solver':['sgd', 'bfgs', "default"]}

from sklearn.model_selection import ParameterGrid
# create all combos of params:
param_combos = list(ParameterGrid(params))

accuracies = []

for params in param_combos:
    model = LogisticRegression(iters=params["iters"], eta=.1, C1=params['C'], C2=params['C'], regularization="elastic", solver=params['solver'])
    model.fit(X_train, y_train)
    y_hat_new = model.predict(X_test)
    accuracies.append(accuracy_score(y_test, y_hat_new))

for i in range(len(param_combos)):
    print("Accuracy for", param_combos[i], "is", accuracies[i])

# Print out the grid search results in a table with C as the row and iters as the column


The columns represent the number of iterations and the rows represent the regularization terms. We decided to use 5,50,500 and 5000 iterations for the regularizations terms for elasticnet regression .0001, 0.01, and .1. We could've tried using L1 or L2 regularization seperately but felt that ElasticNet Regression gave us the best bang for our buck, and for simplicity, just to keep the regularization constant C for each as the same value.  

In [None]:
param_combo_accuracies = []
for i in range(len(param_combos)):
    param_combo_accuracies.append({"C": param_combos[i]["C"], "iters": param_combos[i]["iters"], "solver": param_combos[i]["solver"], "accuracy": accuracies[i]})

param_combo_accuracies.sort(key=lambda x: x["accuracy"], reverse=True)

for i in range(5):
    print(param_combo_accuracies[i])

In [66]:
test_model = BFGSFromScratchLogisticRegression(iterations=100, eta=0.001)
test_model.fit(X_train, y_train, regularization='elastic')
print(test_model)
print(accuracy_score(y_test, test_model.predict(X_test)))

Binary Logistic Regression Object with coefficients:
[[ -8.88627572]
 [  5.27509514]
 [  0.90772017]
 [ -0.53636115]
 [  4.44592337]
 [  2.95011535]
 [ -9.16086727]
 [-17.01320358]
 [  2.19833927]
 [  5.27584254]
 [  9.46448486]
 [  2.3352807 ]
 [  1.83822263]
 [ -1.66595187]
 [-89.71738737]
 [ -1.66584169]
 [  3.93691556]
 [  1.63839714]]
0.9562322771544816


In [38]:
class LogisticRegression:
    def __init__(self, eta=.1, iters=10, C1=.001, C2=.0001, solver="default", regularization=None):
        self.eta = eta
        self.iters = iters
        self.C1 = C1
        self.C2 = C2
        self.solver = solver
        self.classifiers = []
        self.regularization = regularization

    def __str__(self):
        if(hasattr(self,'w_')):
            return 'MultiClass Logistic Regression Object with coefficients:\n'+ str(self.w_) # is we have trained the object
        else:
            return 'Untrained MultiClass Logistic Regression Object'

    def fit(self, X, y):
        # Get number of unique values of y
        unique_classes = np.unique(y)
        unique_classes.sort()
        for target in unique_classes:
            # Transform the data into binary classification, the taget class vs the rest
            y_binary = np.where(y == target, 1, 0)
            if self.solver == "default":
                model = LogisticRegressionBase(iterations=self.iters, eta=self.eta, C1=self.C1, C2=self.C2)
            elif self.solver == "sgd":
                model = LogisticRegressionSGD(iterations=self.iters, eta=self.eta, C1=self.C1, C2=self.C2)
            elif self.solver == "newton":
                model = LogisticRegressionNewtons(iterations=self.iters, eta=self.eta, C1=self.C1, C2=self.C2)
            elif self.solver == 'bfgs':
                model = BFGSLogisticRegression(iterations=self.iters, eta=self.eta, C1=self.C1, C2=self.C2)
            elif self.solver == 'bfgs_scratch':
                model = BFGSFromScratchLogisticRegression(iterations=self.iters, eta=self.eta, C1=self.C1, C2=self.C2)
            model.fit(X, y_binary, regularization=self.regularization)
            self.classifiers.append(model)
        self.w_ = np.hstack([x.w_ for x in self.classifiers]).T
    def predict_proba(self, X):
        probs = []
        for model in self.classifiers:
            probs.append(model.predict_proba(X).reshape(len(X), 1))
        return np.hstack(probs)

    def predict(self, X):
        probs = self.predict_proba(X)
        return np.argmax(probs, axis=1)


In [None]:
model = LogisticRegression(iters=150000, eta=1, solver="sgd", regularization="l2")
model.fit(X_train, y_train)
%time
y_hat = model.predict(X_test)

In [None]:

# Calculate accuracy of y_hat vs y_test with sklearn
print('Accuracy: ', accuracy_score(y_test, y_hat))

In [269]:
model = LogisticRegression(iters=10, eta=.1, solver="newton", regularization="l2")
model.fit(X_train, y_train)
%time
y_hat_new = model.predict(X_test)
print('Accuracy: ', accuracy_score(y_test, y_hat_new))

CPU times: user 3 µs, sys: 5 µs, total: 8 µs
Wall time: 4.77 µs
Accuracy:  0.814


In [245]:
from sklearn.linear_model import LogisticRegression as SKLogisticRegression
SKmodel = SKLogisticRegression(solver='lbfgs', penalty='l2', max_iter=1000)
SKmodel.fit(X_train, y_train)
%time
print(accuracy_score(y_true=y_test, y_pred=SKmodel.predict(X_test)))

CPU times: user 3 µs, sys: 4 µs, total: 7 µs
Wall time: 3.1 µs
0.95485


In [54]:
# Grid Search for best parameters
params = {"C": [.001, .01, .1], 'iters': [5, 50, 500, 5000], 'solver':['sgd', 'bfgs', "default"]}

from sklearn.model_selection import ParameterGrid
# create all combos of params:
param_combos = list(ParameterGrid(params))

accuracies = []

for params in param_combos:
    model = LogisticRegression(iters=params["iters"], eta=.1, C1=params['C'], C2=params['C'], regularization="elastic", solver=params['solver'])
    model.fit(X_train, y_train)
    y_hat_new = model.predict(X_test)
    accuracies.append(accuracy_score(y_test, y_hat_new))

for i in range(len(param_combos)):
    print("Accuracy for", param_combos[i], "is", accuracies[i])

# Print out the grid search results in a table with C as the row and iters as the column


  l1_der = w[1:] / np.abs(w[1:])
  l1_der = w[1:] / np.abs(w[1:])
  l1_der = w[1:] / np.abs(w[1:])
  l1_der = w[1:] / np.abs(w[1:])
  l1_der = w[1:] / np.abs(w[1:])
  l1_der = w[1:] / np.abs(w[1:])
  l1_der = w[1:] / np.abs(w[1:])
  l1_der = w[1:] / np.abs(w[1:])
  l1_der = w[1:] / np.abs(w[1:])
  l1_der = w[1:] / np.abs(w[1:])
  l1_der = w[1:] / np.abs(w[1:])
  l1_der = w[1:] / np.abs(w[1:])
  l1_der = w[1:] / np.abs(w[1:])
  l1_der = w[1:] / np.abs(w[1:])
  l1_der = w[1:] / np.abs(w[1:])
  l1_der = w[1:] / np.abs(w[1:])
  l1_der = w[1:] / np.abs(w[1:])
  l1_der = w[1:] / np.abs(w[1:])
  l1_der = w[1:] / np.abs(w[1:])
  l1_der = w[1:] / np.abs(w[1:])


Accuracy for {'C': 0.001, 'iters': 5, 'solver': 'sgd'} is 0.6700776722968808
Accuracy for {'C': 0.001, 'iters': 5, 'solver': 'bfgs'} is 0.9441499198619159
Accuracy for {'C': 0.001, 'iters': 5, 'solver': 'default'} is 0.29466157070644805
Accuracy for {'C': 0.001, 'iters': 50, 'solver': 'sgd'} is 0.8468746147207497
Accuracy for {'C': 0.001, 'iters': 50, 'solver': 'bfgs'} is 0.9749722598939712
Accuracy for {'C': 0.001, 'iters': 50, 'solver': 'default'} is 0.8926149673283196
Accuracy for {'C': 0.001, 'iters': 500, 'solver': 'sgd'} is 0.955369251633584
Accuracy for {'C': 0.001, 'iters': 500, 'solver': 'bfgs'} is 0.9749722598939712
Accuracy for {'C': 0.001, 'iters': 500, 'solver': 'default'} is 0.962643323881149
Accuracy for {'C': 0.001, 'iters': 5000, 'solver': 'sgd'} is 0.9779312045370484
Accuracy for {'C': 0.001, 'iters': 5000, 'solver': 'bfgs'} is 0.9749722598939712
Accuracy for {'C': 0.001, 'iters': 5000, 'solver': 'default'} is 0.9781777832573049
Accuracy for {'C': 0.01, 'iters': 5, 's

Unnamed: 0,5,50,500,5000
0.001,0.553076,0.84219,0.96252,0.978178
0.01,0.691283,0.874985,0.958451,0.96141
0.1,0.695722,0.949821,0.933177,0.933177


The columns represent the number of iterations and the rows represent the regularization terms. We decided to use 5,50,500 and 5000 iterations for the regularizations terms for elasticnet regression .0001, 0.01, and .1. We could've tried using L1 or L2 regularization seperately but felt that ElasticNet Regression gave us the best bang for our buck, and for simplicity, just to keep the regularization constant C for each as the same value.  

In [58]:
param_combo_accuracies = []
for i in range(len(param_combos)):
    param_combo_accuracies.append({"C": param_combos[i]["C"], "iters": param_combos[i]["iters"], "solver": param_combos[i]["solver"], "accuracy": accuracies[i]})

param_combo_accuracies.sort(key=lambda x: x["accuracy"], reverse=True)

for i in range(5):
    print(param_combo_accuracies[i])

{'C': 0.001, 'iters': 5000, 'solver': 'default', 'accuracy': 0.9781777832573049}
{'C': 0.001, 'iters': 5000, 'solver': 'sgd', 'accuracy': 0.9779312045370484}
{'C': 0.001, 'iters': 50, 'solver': 'bfgs', 'accuracy': 0.9749722598939712}
{'C': 0.001, 'iters': 500, 'solver': 'bfgs', 'accuracy': 0.9749722598939712}
{'C': 0.001, 'iters': 5000, 'solver': 'bfgs', 'accuracy': 0.9749722598939712}
