In [1]:
import pandas as pd

import numpy as np

import scipy.stats as s

import matplotlib.pyplot as plt 

import seaborn as sns

In [2]:
data = pd.read_csv("mushrooms.csv")

In [3]:
for column in data.columns[1:]:
    
    data[column].replace(to_replace = list(data[column].unique()), value = list(range(0,len(data[column].unique()))), inplace = True)

In [4]:
data_array = list()

for column in data.columns[1:]:
    
    data_array.append(np.eye(len(data[column].unique()), len(data[column].unique()))[data[column]])

In [5]:
X = np.concatenate(data_array, axis=1)

X.shape

(8124, 117)

In [6]:
X_dash = X - np.mean(X, axis=0)

X_cov = (1/data.shape[0])*np.matmul(X_dash.T, X_dash)

factored_matrix = np.linalg.svd(X_cov)

Q = factored_matrix[0]

Q_tilda = Q[:,0:3]

X_new = np.matmul(X_dash, Q_tilda)

In [7]:
labels = np.array(data['class']).reshape(data['class'].shape[0],1)

new_df = pd.DataFrame(data = X_new)

new_df['class'] = labels

In [8]:
training_data_len = int(0.7*new_df.shape[0])

cv_data_len = int(0.2*new_df.shape[0])

In [9]:
poissonous_train = new_df[new_df['class'] == 'p'].iloc[0:training_data_len//2]

non_poissonous_train = new_df[new_df['class'] == 'e'].iloc[0:training_data_len//2]

training_data = pd.concat([poissonous_train, non_poissonous_train])

poissonous_remain = new_df[new_df['class'] == 'p'].iloc[training_data_len//2:]

non_poissonous_remain = new_df[new_df['class'] == 'e'].iloc[training_data_len//2:]

remaining_data = pd.concat([poissonous_remain, non_poissonous_remain])

In [10]:
remaining_data = np.array(remaining_data)

random_indices = np.random.choice(np.arange(0,2438),size=(2438,),replace=False)

remaining_data = remaining_data[random_indices]

remaining_data_labels = remaining_data[:,3].reshape(remaining_data.shape[0],1)

remaining_data = pd.DataFrame(data = remaining_data[:,0:3])

remaining_data['class'] = remaining_data_labels

In [11]:
cv_data = remaining_data[0:cv_data_len]

testing_data = remaining_data[cv_data_len:]

In [12]:
mu_hat_p = np.array(training_data[training_data['class'] == 'p'].iloc[:,0:3].mean())

sigma_hat_p = np.array(training_data[training_data['class'] == 'p'].iloc[:,0:3].cov())

np.linalg.det(sigma_hat_p)

0.37377824362795586

In [13]:
mu_hat_e = np.array(training_data[training_data['class'] == 'e'].iloc[:,0:3].mean())

sigma_hat_e = np.array(training_data[training_data['class'] == 'e'].iloc[:,0:3].cov())

np.linalg.det(sigma_hat_e)

0.012859141870228722

In [14]:
poissonous_prior = training_data[training_data['class'] == 'p'].shape[0]/training_data.shape[0]

non_poissonous_prior = training_data[training_data['class'] == 'e'].shape[0]/training_data.shape[0]

In [15]:
def cv_testing(data):
    
    inputs = np.array(data.iloc[:,0:3])
    
    posterior_p = s.multivariate_normal.pdf(inputs, mu_hat_p, sigma_hat_p) * poissonous_prior
    
    posterior_e = s.multivariate_normal.pdf(inputs, mu_hat_e, sigma_hat_e) * non_poissonous_prior
    
    boolean_mask = posterior_p > posterior_e
    
    predicted_category = pd.Series(boolean_mask)
    
    predicted_category.replace(to_replace=[False,True], value = ['e','p'], inplace =True)
    
    return np.array(predicted_category)

In [16]:
cv_results = cv_testing(cv_data)

actual_results = np.array(cv_data['class'])

cv_accuracy = np.count_nonzero(cv_results == actual_results)/actual_results.shape[0]

cv_accuracy

0.8201970443349754

In [17]:
testing_results = cv_testing(testing_data)

actual_results = np.array(testing_data['class'])

testing_accuracy = np.count_nonzero(testing_results == actual_results)/actual_results.shape[0]

testing_accuracy

0.8157248157248157