In [114]:
# Imports
import pandas as pd
import numpy as np

from warnings import filterwarnings
from sklearn.metrics import accuracy_score
from math import log

# Disable warnings from being printed
filterwarnings('ignore')

In [None]:
fileN = 800
fileM = 100000

def read_data(filename):
    data = pd.DataFrame(columns=range(fileM))
    with open(filename, 'r') as datafile:
        lines = datafile.readlines()
        for i in range(len(lines)):
            record = np.fromstring(lines[i], dtype=int, sep=' ')
            record_bool = [0 for i in range(fileM)]
            for col in record:
                record_bool[col-1] = 1
            data.loc[i] = record_bool
    return data

def read_labels(filename):
    labels = []
    with open(filename, 'r') as datafile:
        lines = datafile.readlines()
        for line in lines:
            labels.append(np.fromstring(line, dtype=int, sep=' ')[0])
    return labels

# Read the data into dataframe
train_data = read_data("dorothea/dorothea_train.data")
valid_data = read_data("dorothea/dorothea_valid.data")

# Get the labels of the train data
train_data_labels = read_labels("dorothea/dorothea_train.labels")
valid_data_labels = read_labels("dorothea/dorothea_valid.labels")

In [9]:
# Compute data which is constant in different runs of pca, i.e. eigenvectors

def compute_eigenvectors(data):
    
    # Center the data around mean
    data_centered = data - np.mean(data, axis=0)

    # Compute the covariance matrix (xx' i.e nXn), and find eigenvalues and eigenvectors
    cov_matrix = data_centered.transpose().cov()
    eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)

    # Now eigenvectors of x'x matrix can be obtained from these by multiplying by x', eigenvalues remain same
    eigenvectors = np.dot(np.transpose(data_centered), eigenvectors)

    # Sort the eigenvectors in decreasing order of eigenvalues
    sort_order = np.argsort(eigenvalues)[::-1]
    new_eigenvectors = np.zeros(eigenvectors.shape)
    for i in range(eigenvalues.shape[0]):
        new_eigenvectors[:, i] = eigenvectors[:, sort_order[i]]
        
    return new_eigenvectors

In [10]:
# Get data in the new feature space of reduced dimensionality.
def pca_(data, new_eigenvectors, k):
    
    # Get first K eigenvectors
    eigenpairs_firstK = new_eigenvectors[:, :k]
    
    # Get data in reduced dimension space
    projected_data = np.dot(data, eigenpairs_firstK)
    
    return pd.DataFrame(projected_data)

In [16]:
def GNBC_pca(train, valid):
    
    # Separate the classes
    class_m = train[train["labels"] == -1]
    class_p = train[train["labels"] == 1]
    
    # Calculate prior probabilities for both classes
    prior_m = class_m.shape[0]/train.shape[0]
    prior_p = class_p.shape[0]/train.shape[0]
    
    # Calculate variances for all features
    var_m = np.var(class_m, axis=0)
    var_p = np.var(class_p, axis=0)
    
    # Calculate mean for all features
    mean_m = np.mean(class_m, axis=0)
    mean_p = np.mean(class_p, axis=0)

    # Predict
    results = []

    for i in range(valid.shape[0]):

        posterior_m = log(prior_m)
        posterior_p = log(prior_p)
        
        for j in range(valid.shape[1]-1):
            cur_x = valid.loc[i, j]
            posterior_m = posterior_m + (-0.5 * (((cur_x - mean_m[j])**2) / var_m[j])) - 0.5*log(var_m[j])
            posterior_p = posterior_p + (-0.5 * (((cur_x - mean_p[j])**2) / var_p[j])) - 0.5*log(var_p[j])

        if posterior_m >= posterior_p:
            cur_class = -1
        else:
            cur_class = 1
    
        results.append(cur_class)
        
    # Calculate accuracy
    return accuracy_score(valid["labels"], results)    

In [17]:
def iterate():
    
    accuracies = []
    
    kl = [100, 500, 800]
    
    new_eigenvectors_train = compute_eigenvectors(train_data)
    new_eigenvectors_valid = compute_eigenvectors(valid_data)
    
    for k in kl:
        projected_train = pca_(train_data, new_eigenvectors_train, k)
        projected_valid = pca_(valid_data, new_eigenvectors_valid, k)
    
        projected_train["labels"] = train_data_labels
        projected_valid["labels"] = valid_data_labels

        cur_accuracy = GNBC_pca(projected_train, projected_valid)
        accuracies.append(cur_accuracy)
        
    print("Statistics")
    print(100, accuracies[0])
    print(500, accuracies[1])
    print(800, accuracies[2])
    
iterate()

Statistics
100 0.902857142857
500 0.902857142857
800 0.902857142857


In [95]:
# Get projected data as input for LDA
new_eigenvectors_train = compute_eigenvectors(train_data)
new_eigenvectors_valid = compute_eigenvectors(valid_data)
projected_train = pca_(train_data, new_eigenvectors_train, 800)
projected_valid = pca_(valid_data, new_eigenvectors_valid, 800)

projected_train["labels"] = train_data_labels
projected_valid["labels"] = valid_data_labels

In [105]:
def lda_(data):
    # Separate the train data classwise.
    class_m = data[data["labels"] == -1]
    class_p = data[data["labels"] == 1]

    # Drop the last labels column for matrix calculations
    class_m = class_m.drop("labels", axis=1)
    class_p = class_p.drop("labels", axis=1)

    # Get scatter matrices for each class separately
    scatter_m = np.cov(np.transpose(class_m))
    scatter_p = np.cov(np.transpose(class_p))

    # Compute means for each feature.
    mean_m = np.mean(class_m, axis=0)
    mean_p = np.mean(class_p, axis=0)
    mean_t = np.mean(data, axis=0)
    mean_t = mean_t.drop("labels")

    # Compute with class and between class scatter matrices
    sw = scatter_m + scatter_p
    swin = np.linalg.inv(sw)
    wstar = np.dot(swin, (mean_m - mean_p))

    # Find new projected data
    new_projected_data = data.drop("labels", axis=1)
    new_projected_data = np.dot(np.transpose(wstar), new_projected_data)
    return pd.DataFrame(new_projected_data)

new_projected_train = lda_(projected_train)
new_projected_valid = lda_(projected_valid)
new_projected_train["labels"] = train_data_labels
new_projected_valid["labels"] = valid_data_labels

In [113]:
def GNBC_lda(train, valid):
    
    # Separate the classes
    class_m = train[train["labels"] == -1]
    class_p = train[train["labels"] == 1]
    
    # Calculate prior probabilities for both classes
    prior_m = class_m.shape[0]/train.shape[0]
    prior_p = class_p.shape[0]/train.shape[0]
    
    # Calculate variances for all features
    var_m = np.var(class_m, axis=0)
    var_p = np.var(class_p, axis=0)
    
    # Calculate mean for all features
    mean_m = np.mean(class_m, axis=0)
    mean_p = np.mean(class_p, axis=0)
    
    print(var_p)

    # Predict
    results = []

    for i in range(valid.shape[0]):

        posterior_m = log(prior_m)
        posterior_p = log(prior_p)
        
        for j in range(valid.shape[1]-1):
            cur_x = valid.loc[i, j]
            print(cur_x, mean_m[j])
            posterior_m = posterior_m + (-0.5 * (((cur_x - mean_m[j])**2) / var_m[j])) - 0.5*log(var_m[j])
            posterior_p = posterior_p + (-0.5 * (((cur_x - mean_p[j])**2) / var_p[j])) - 0.5*log(var_p[j])

        if posterior_m >= posterior_p:
            cur_class = -1
        else:
            cur_class = 1
    
        results.append(cur_class)
        
    # Calculate accuracy
    return accuracy_score(valid["labels"], results)

accuracy = GNBC_lda(new_projected_train, new_projected_valid)
print("Accuracy: ", accuracy)

0         2.253114e+59
labels    0.000000e+00
dtype: float64
5.35041890907e+29 1.05847006076e+28
2.77438949099e+29 1.05847006076e+28
2.14738821798e+29 1.05847006076e+28
5.44390150931e+29 1.05847006076e+28
1.42895198621e+29 1.05847006076e+28
-5.95505728653e+29 1.05847006076e+28
-2.44407115433e+29 1.05847006076e+28
4.10647349314e+29 1.05847006076e+28
2.07377563276e+29 1.05847006076e+28
3.48634019013e+29 1.05847006076e+28
1.94789674073e+29 1.05847006076e+28
-1.21533773718e+29 1.05847006076e+28
3.10306847805e+29 1.05847006076e+28
-4.22370542486e+29 1.05847006076e+28
5.4686218152e+29 1.05847006076e+28
5.68087476221e+28 1.05847006076e+28
-3.35393290736e+29 1.05847006076e+28
-6.48816231497e+29 1.05847006076e+28
-6.33127908429e+29 1.05847006076e+28
4.98166256151e+29 1.05847006076e+28
-7.84172191623e+27 1.05847006076e+28
2.31096538985e+29 1.05847006076e+28
-7.84088843057e+29 1.05847006076e+28
1.45201579455e+29 1.05847006076e+28
5.43792512116e+29 1.05847006076e+28
-7.12122974438e+29 1.0584700607