In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [185]:
class GDA():
    def __init__(self):
        
        
        self.X = None
        self.X0 = None #matrix of objects which are 0 class
        self.X1 = None #matrix of objects which are 1 class
        
        
        self.m = None
        
        #prior probs
        self.p0 = None
        self.p1 = None 
    
    def _get_vals(self, X, y):
        
        stack_data = np.column_stack((X,y))
        
        self.m = X.shape[0]
        
        #assigning correct data
        self.X = X
        self.X0 = stack_data[stack_data[: , -1] == 0, :-1]
        self.X1 = stack_data[stack_data[: , -1] == 1, :-1]
        
        #compute agregatted mean  vectors
        self.mu0_vec = np.mean(self.X0, axis=0)
        self.mu1_vec = np.mean(self.X1, axis=0)
        
        #compute prior probs
        self.p0 = self.X0.shape[0] / self.m
        self.p1 = self.X1.shape[0] / self.m
        
        
        
    def _covariance(self, arr1, arr2):
        
        mu1 = np.mean(arr1)
        mu2 = np.mean(arr2)
        
        covariance  = np.sum( np.multiply((arr1 - mu1), (arr2 - mu2)) ) / (len(arr1) - 1)
        
        return covariance
    
    def compute_cov_matrix(self):
        
        n = self.X.shape[1]
        
        #intializing empty matrix
        self.cov_matrix = np.zeros((n, n))
        
        #filling the matrix
        for i in range(n):
            for j in range(n):
                
                arr1 = self.X[:, i]
                arr2 = self.X[:, j]
                
                self.cov_matrix[i, j] = self._covariance(arr1, arr2)
                
    
    def multivarNorm_pdf(self, x_vec, mu_vec):

        n = self.X.shape[1]
        covar_inv = np.linalg.inv(self.cov_matrix)

        # Compute the square root of the determinant of the covariance matrix
        sqrt_det_cov = np.sqrt(np.linalg.det(self.cov_matrix))

        # Compute the normalization term (1 / (2*pi)^(n/2) / sqrt_det_cov)
        norm_term = 1.0 / (((2 * np.pi) ** (n / 2.0)) * sqrt_det_cov)

        # Compute the exponent part of the formula
        x_minus_mu = x_vec - mu_vec
        exponent_term = np.exp(-0.5 * np.dot(x_minus_mu.T, np.dot(covar_inv, x_minus_mu)))

        # Return the full multivariate normal probability density
        return norm_term * exponent_term

    
    def predict(self, X):
        
        n = X.shape[0]
        
        predictions = []
        
        for i in range(n):
            
            #using Bayesian rule to compute probs
            
            p_belong0 = self.p0 * self.multivarNorm_pdf(X[i], self.mu0_vec)
            p_belong1 = self.p1 * self.multivarNorm_pdf(X[i], self.mu1_vec)
            
            predictions.append(int(p_belong1 >= p_belong0))
        
        return np.array(predictions)
    
    def fit(self, X, y):
        
        self._get_vals(X, y)
        self.compute_cov_matrix()

## Trying implementation on  simple dataset

In [203]:
df = pd.read_csv("synthetic_fraud_data.csv")

df = df.sample(frac=1, random_state=2).reset_index(drop=True)

train_df = df[:130]
test_df = df[130:]

X_train = train_df.drop("Label", axis=1).values
X_test = test_df.drop("Label", axis=1).values

y_train = train_df["Label"].values
y_test = test_df["Label"].values


In [204]:
gda = GDA()
gda.fit(X_train, y_train)

y_est = gda.predict(X_test)

In [202]:
from sklearn.metrics import accuracy_score


accuracy_score(y_test, y_est)

1.0

## Trying GDA on real data

In [213]:
heart_df = pd.read_csv("heart.csv")

X = heart_df.drop("target", axis=1).values
y = heart_df["target"].values



In [214]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=1, train_size=0.6)

In [222]:

gda_real = GDA()
gda_real.fit(X_train, y_train)

y_hear_est = gda_real.predict(X_test)

accuracy_score(y_hear_est, y_test)

0.7786885245901639

In [223]:
#So, it indeed works