 # **Gaussian Discriminant Analysis**

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
np.random.seed(100)

In [2]:
class GDA():
    """Gaussian Discriminant Analysis with one covariance matrix shared by all classes.

    Each class is modelled as a multivariate Gaussian with its own mean and
    a common covariance; prediction picks the class with the largest
    ``p(x | class) * p(class)``.

    Parameter rows are ordered like ``self.labels`` (the sorted unique values
    of ``y``): ``mu[i]`` and ``phi[i]`` both belong to ``labels[i]``.
    """

    def __init__(self, x, y):
        """Store the training data and allocate the parameter arrays.

        Args:
            x: 2-D array of training features, one row per sample.
            y: Class label per sample; a 1-D array or a single-column 2-D
                array (both layouts are flattened before comparisons).
        """
        self.x = x
        self.y = y
        self.labels = np.unique(y)
        self.phi = np.zeros((len(self.labels), 1))
        self.mu = np.zeros((len(self.labels), x.shape[1]))
        self.sigma = 0

    def get_mu(self):
        """Estimate the per-class feature means.

        Returns:
            Array of shape ``(n_classes, n_features)``; also stored in
            ``self.mu``.
        """
        flat_y = np.ravel(self.y)
        for row, label in enumerate(self.labels):
            self.mu[row] = self.x[flat_y == label].mean(axis=0)
        return self.mu

    def get_sigma(self):
        """Estimate the covariance matrix shared by all classes.

        Every sample is centred on the mean of its own class and the outer
        products are averaged over all samples. Works for any label values
        (not just 0/1) and any number of classes.

        Returns:
            Array of shape ``(n_features, n_features)``; also stored in
            ``self.sigma``.
        """
        # Refresh the means so the result does not depend on whether the
        # caller already ran get_mu().
        self.get_mu()

        flat_y = np.ravel(self.y)
        n_features = self.x.shape[1]
        # Start from a fresh accumulator: repeated calls must not add up.
        scatter = np.zeros((n_features, n_features))
        for row, label in enumerate(self.labels):
            centered = self.x[flat_y == label] - self.mu[row]
            scatter += centered.T @ centered

        self.sigma = scatter / len(flat_y)
        return self.sigma

    def get_phi(self):
        """Estimate the class priors as the fraction of samples per class.

        Returns:
            1-D array with one prior per entry of ``self.labels``.
        """
        flat_y = np.ravel(self.y)
        return np.array([(flat_y == label).mean() for label in self.labels])

    def get_px(self, x, mu, sigma):
        """Evaluate the multivariate Gaussian density at a single point.

        Args:
            x: Feature vector of one sample.
            mu: Mean vector of the Gaussian.
            sigma: Covariance matrix of the Gaussian.

        Returns:
            The density as a ``(1, 1)`` array.

        Raises:
            numpy.linalg.LinAlgError: If ``sigma`` is singular.
        """
        dim = len(mu)
        diff = (np.asarray(x) - np.asarray(mu)).reshape(-1, 1)
        norm = 1. / np.sqrt(abs((2 * np.pi) ** dim * np.linalg.det(sigma)))
        # Solve sigma @ z = diff instead of forming the explicit inverse.
        mahalanobis = diff.T @ np.linalg.solve(sigma, diff)
        return norm * np.exp(-0.5 * mahalanobis)

    def predict(self, x, mu, sigma, phi):
        """Classify each row of ``x`` by its largest prior-weighted density.

        Args:
            x: 2-D array of samples to classify, one row per sample.
            mu: Per-class means, ordered like ``self.labels``.
            sigma: Shared covariance matrix.
            phi: Per-class priors, ordered like ``self.labels``.

        Returns:
            1-D array with the predicted label of every sample. Numeric
            labels are returned as floats; other label types keep their
            own dtype.
        """
        best = np.zeros(len(x), dtype=int)
        for i, sample in enumerate(x):
            joint = [self.get_px(sample, mu[c], sigma) * phi[c]
                     for c in range(len(phi))]
            best[i] = np.argmax(joint)

        # argmax yields a position in ``labels``; translate it back so label
        # encodings other than 0..k-1 are reported correctly.
        y_hat = self.labels[best]
        if np.issubdtype(y_hat.dtype, np.number):
            return y_hat.astype(float)
        return y_hat

In [3]:
# Load the data set. header=None tells pandas the file has no header row,
# so the three column names are supplied explicitly.
df = pd.read_csv('microchip-data.csv', header=None, names=['test1','test2','result'])
# Preview the first rows.
df.head()

Unnamed: 0,test1,test2,result
0,0.051267,0.69956,1
1,-0.092742,0.68494,1
2,-0.21371,0.69225,1
3,-0.375,0.50219,1
4,-0.51325,0.46564,1


In [4]:
# Every column except the last is a feature.
X = df.iloc[:, :-1].to_numpy()
# The last column is the label; the -1: slice keeps it as a 2-D column array.
Y = df.iloc[:, -1:].to_numpy()

In [5]:
# Hold out 30% of the samples for testing. No random_state is passed here.
# NOTE(review): the split presumably relies on NumPy's global random state
# (seeded at the top of the notebook) for reproducibility; confirm.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3)

In [6]:
# Create the model on the training split; parameters are estimated by the
# get_* calls that follow.
model = GDA(X_train, y_train)

In [7]:
# Estimate the class means first: get_sigma() reads the means stored on the
# model, so get_mu() must run before it.
mu = model.get_mu()
sigma = model.get_sigma()
phi = model.get_phi()
# Classify the held-out samples with the estimated parameters.
y_pred = model.predict(X_test, mu, sigma, phi)

In [8]:
# Display the predictions for the test set.
y_pred

array([1., 1., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 1., 0., 1.,
       0., 1., 0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 1., 1., 1., 1.,
       1., 1.])

In [9]:
from sklearn.metrics import accuracy_score
# Compare the predictions with the true test labels.
accuracy = accuracy_score(y_test, y_pred)
# Report the accuracy as a percentage.
print("Accuracy: ", accuracy*100)

Accuracy:  50.0


## GDA With Box-Muller Transformation

In [10]:
def BoxMullerTransformation(r1, r2):
    """Map a pair of inputs through the Box-Muller formulas.

    Args:
        r1: Value(s) used for the radial part; ``np.log`` is applied to it,
            so it must be strictly positive to give a finite result.
        r2: Value(s) used for the angular part.

    Returns:
        Tuple ``(z1, z2)`` of the cosine and sine components.
    """
    radius = np.sqrt(-2 * np.log(r1))
    angle = 2 * np.pi * r2
    return radius * np.cos(angle), radius * np.sin(angle)

In [11]:
# Take an independent copy of the feature columns: the cells below assign
# into X, and writing into a slice of df triggers pandas' chained-assignment
# (SettingWithCopy) warning.
X = df.iloc[:, :-1].copy()
# The last column is the label; the -1: slice keeps it as a 2-D column array.
Y = df.iloc[:, -1:].values

In [12]:
from sklearn.preprocessing import MinMaxScaler

# Rescale both feature columns with a default-configured MinMaxScaler.
X[['test1', 'test2']] = MinMaxScaler().fit_transform(X[['test1', 'test2']])

# Replace exact zeros with the column median -- presumably so that np.log in
# BoxMullerTransformation never receives 0.
for column in ('test1', 'test2'):
    X[column] = X[column].replace(0, X[column].median())
X = X.to_numpy()

In [13]:
# Feed the two scaled feature columns through the transformation.
r1, r2 = BoxMullerTransformation(X[:, 0], X[:, 1])

# Put the two transformed columns side by side as an (n_samples, 2) matrix.
X_new = np.column_stack((r1, r2))

In [14]:
# Hold out 30% of the transformed samples for testing. No random_state is
# passed, so this split is drawn independently of the earlier one.
X_train, X_test, y_train, y_test = train_test_split(X_new, Y, test_size = 0.3)

In [15]:
# Create a fresh model on the transformed training split.
model = GDA(X_train, y_train)

In [16]:
# Estimate the class means first: get_sigma() reads the means stored on the
# model, so get_mu() must run before it.
mu = model.get_mu()
sigma = model.get_sigma()
phi = model.get_phi()
# Classify the held-out transformed samples.
y_pred = model.predict(X_test, mu, sigma, phi)

In [17]:
# Display the predictions for the transformed test set.
y_pred

array([0., 1., 0., 0., 1., 0., 1., 1., 0., 1., 0., 0., 0., 1., 0., 1., 1.,
       1., 1., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 0., 1., 1., 1., 1.,
       0., 1.])

In [18]:
from sklearn.metrics import accuracy_score
# Compare the predictions with the true test labels.
accuracy = accuracy_score(y_test, y_pred)
# Report the accuracy as a percentage.
print("Accuracy: ", accuracy*100)

Accuracy:  69.44444444444444
