In [65]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split as tts
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as qda
import time
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

# Gaussian Discriminant Analysis

Here we implement Gaussian discriminant analysis (GDA) and compare its predictive accuracy against other classifiers on the breast cancer dataset from sklearn.

In [34]:
# Load the bundled breast cancer dataset (a Bunch exposing .data and .target).
cancer = load_breast_cancer()

# DataFrame views of the feature matrix and of the label vector, for inspection.
cd = pd.DataFrame(data=cancer.data)
ct = pd.DataFrame(data=cancer.target)

# 2.

In [212]:
class GDA:
    """Gaussian discriminant analysis with a separate covariance per class.

    Each class is modelled as a multivariate Gaussian whose mean and full
    covariance matrix are estimated from the training samples of that class.
    Prediction picks the class with the largest log posterior (up to an
    additive constant shared by all classes).

    Parameters
    ----------
    priors : array-like of shape (n_classes,), optional
        Class prior probabilities. When omitted they are estimated from the
        class frequencies seen in ``fit``.
    tol : float, default=1.0e-3
        Singular values of the centred class data at or below this threshold
        are treated as zero when checking for collinear features.

    Attributes (set by ``fit``)
    ---------------------------
    classes_ : ndarray, the sorted unique labels.
    priors_ : ndarray, the class priors actually used.
    means_ : ndarray of shape (n_classes, n_features), the class means.
    covariance_ : list of ndarray, one covariance matrix per class.
    scalings_ : list of ndarray, per-class variances along the principal axes.
    rotations_ : list of ndarray, per-class principal axes (as columns).
    """

    def __init__(self, priors=None, tol=1.0e-3):
        self.tol = tol
        self.priors = np.asarray(priors) if priors is not None else None

    def fit(self, X, y):
        """Estimate per-class means, covariances and priors.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,) or (n_samples, 1)

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If a class has a single sample (its covariance is undefined).
        """
        import warnings  # stdlib; only needed for the collinearity warning

        # Accept DataFrames / lists and column-vector labels alike.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y).ravel()

        # Re-encode the labels as 0..n_classes-1 so they can index arrays.
        self.classes_, y = np.unique(y, return_inverse=True)
        n_samples, n_features = X.shape
        n_classes = len(self.classes_)

        if self.priors is None:
            self.priors_ = np.bincount(y) / float(n_samples)
        else:
            self.priors_ = self.priors

        means = []
        cov = []
        rotations = []
        scalings = []
        for i in range(n_classes):
            Xg = X[y == i, :]
            if len(Xg) == 1:
                # The unbiased estimator below would divide by zero.
                raise ValueError(
                    "y has only 1 sample in class %s, covariance is "
                    "ill defined." % str(self.classes_[i])
                )
            meang = Xg.mean(0)
            means.append(meang)
            Xgc = Xg - meang
            # SVD of the centred data: Xgc = U * diag(S) * Vt, hence
            # cov = Vt.T * diag(S**2 / (n - 1)) * Vt.
            U, S, Vt = np.linalg.svd(Xgc, full_matrices=False)
            rank = np.sum(S > self.tol)
            if rank < n_features:
                # A (near-)singular covariance makes the log-determinant and
                # the whitening in `helper` numerically unreliable.
                warnings.warn("Variables are collinear", stacklevel=2)
            S2 = (S ** 2) / (len(Xg) - 1)
            cov.append(np.dot(S2 * Vt.T, Vt))
            scalings.append(S2)
            rotations.append(Vt.T)

        self.covariance_ = cov
        self.means_ = np.asarray(means)
        self.scalings_ = scalings
        self.rotations_ = rotations
        return self

    def helper(self, X):
        """Return the per-class log posterior, up to a shared additive constant.

        The result has shape (n_samples, n_classes); column k holds
        ``-0.5 * (mahalanobis_k**2 + log|cov_k|) + log(prior_k)``.
        """
        X = np.asarray(X)
        norm2 = []
        for i in range(len(self.classes_)):
            R = self.rotations_[i]
            S = self.scalings_[i]
            Xm = X - self.means_[i]
            # Rotate onto the principal axes and scale to unit variance, so
            # the squared Euclidean norm is the squared Mahalanobis distance.
            X2 = np.dot(Xm, R * (S ** (-0.5)))
            norm2.append(np.sum(X2 ** 2, 1))
        norm2 = np.array(norm2).T
        # log-determinant of each class covariance = sum of log variances.
        u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
        return (-0.5 * (norm2 + u) + np.log(self.priors_))

    def predict_proba(self, X):
        """Return posterior class probabilities, shape (n_samples, n_classes).

        Columns follow the order of ``classes_`` and every row sums to 1.
        """
        log_post = self.helper(X)
        # Subtract the row-wise maximum before exponentiating so the largest
        # term is exp(0) = 1 and nothing overflows; the shift cancels in the
        # normalisation below.
        likelihood = np.exp(log_post - log_post.max(axis=1, keepdims=True))
        return likelihood / likelihood.sum(axis=1, keepdims=True)

    def predict(self, X):
        """Return the most probable class label for each row of ``X``."""
        d = self.helper(X)
        y_pred = self.classes_.take(d.argmax(1))
        return y_pred

    def score(self, X, y, sample_weight=None):
        """Return the (optionally sample-weighted) accuracy on ``X``, ``y``."""
        return accuracy_score(y, self.predict(X), sample_weight=sample_weight)

# 3.

In [216]:
# Hold out a third of the data for testing. The fixed seed makes the split,
# and therefore every score computed from it, reproducible across re-runs.
X_train, X_test, y_train, y_test = tts(
    cancer.data, cancer.target, test_size=0.33, random_state=0
)

# Fit the hand-written GDA classifier and report its held-out accuracy.
gda = GDA().fit(X_train, y_train)
print(gda.score(X_test, y_test))

0.973404255319149


In [217]:
# Baseline: Gaussian naive Bayes trained and scored on the same split.
g = GaussianNB()
g.fit(X_train, y_train)
print(g.score(X_test, y_test))

0.9468085106382979


In [218]:
# Baseline: logistic regression (liblinear solver) trained and scored on the same split.
clf = LogisticRegression(solver='liblinear')
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

0.9627659574468085


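Logistic regression and my GDA implementation both score higher than Gaussian naive Bayes on this split. This suggests that the classes are close to linearly separable, or that naive Bayes's assumption of independent features (a diagonal covariance for each class) does not hold for this data.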

# 4.

In [219]:
# Fixed seed so both models are compared on a reproducible split.
X_train, X_test, y_train, y_test = tts(
    cancer.data, cancer.target, test_size=0.33, random_state=0
)

# time.perf_counter() is monotonic and high-resolution, which suits timing
# intervals of a few milliseconds better than the wall clock (time.time()).
start = time.perf_counter()
q = qda().fit(X_train, y_train)
qscore = q.score(X_test, y_test)
print("sklearn time: {}".format(time.perf_counter() - start))

start = time.perf_counter()
g = GDA().fit(X_train, y_train)
gscore = g.score(X_test, y_test)
print("My time: {}".format(time.perf_counter() - start))

# The two implementations should reach the same held-out accuracy.
print("My score is np allclose to sklearn? {}".format(np.allclose(qscore, gscore)))

sklearn time: 0.008606672286987305
My time: 0.0077512264251708984
My score is np allclose to sklearn? True
