In [None]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
import random

from sklearn.datasets import load_digits
from sklearn.datasets import load_boston

In [None]:
def to_label(data, target, percentile):
    frac = percentile / 100.0
    part_val = data[target].quantile(frac)
    data[target] = [1 if d > part_val else 0 for d in data[target]]
    return data

In [None]:
boston = load_boston()
data = pd.DataFrame(boston.data, columns=boston.feature_names)
label = 'HomeVal50'
data[label] = boston.target
to_label(data, label, 50)
data.head(5)

In [None]:
def train_test_split(data, label, test_ratio=0.2):

    test_idx = []
    indices = [i for i in range(data.shape[0])]

    test_size = test_ratio * len(data)
    while len(test_idx) < test_size:
        test_idx.append(random.randrange(len(indices)))

    train_idx = [i for i in indices if i not in test_idx]

    test = data.iloc[test_idx]
    train = data.iloc[train_idx]
    
    y_train = train[label]
    X_train = train.drop(label,axis=1)

    y_test = test[label]
    X_test = test.drop(label,axis=1)

    return X_train, y_train, X_test, y_test

X_train, y_train, X_test, y_test = train_test_split(data, label=label)

In [None]:
N = X_train.shape[0]
D = X_train.shape[1]
print(N,D)

In [None]:
len(y_train)

In [None]:
classes = np.unique(y_train)
classes

In [None]:
t_one_hot = np.zeros((N,len(classes)))
t_one_hot[np.arange(N), y_train] = 1
t_one_hot.shape

In [None]:
prior = [np.sum(y_train == c) / len(y_train) for c in classes]

In [None]:
prior

In [None]:
def to_dict( X, y):
    return {k: X_train.iloc[np.where(y_train==k)[0]] for k in classes}

In [None]:
X = to_dict(X_train, y_train)
X[0].shape

In [None]:
def gaussian_params(X_train, y_train):
    X_dict = to_dict(X_train, y_train)
    g_prior = {k: X_dict[k].shape[0]/X_train.shape[0] for k in classes}
    g_mean = {k: np.mean(X_dict[k], axis=0) for k in classes}
    g_sd = {k: np.std(X_dict[k],axis=0, ddof=1) for k in classes}
    return g_prior, g_mean, g_sd

In [None]:
g_prior, g_mean, g_sd = gaussian_params(X_train, y_train)

In [None]:
g_prior

In [None]:
{k: np.var(X[k],axis=0) for k in classes}[0]

In [None]:
from scipy.stats import norm
norm.logpdf(x, g_mean[0][1],g_cov[0][1])

In [None]:
def gaussian_probability(x, mean, std):
    exp = np.exp(- ((x-mean)**2) / (2*(std**2)) )
    return (1 / (np.sqrt(2*np.pi) * std)) * exp

In [None]:
p1 = gaussian_probability(X[0].iloc[1], g_mean[0], g_sd[0])
p2 = gaussian_probability(X[0].iloc[2], g_mean[0], g_sd[0])

In [None]:
p1

In [None]:
def gaussian_pdf(x, mean, std):
    A = 1/((2*np.pi)**(len(x)/2))
    B = 1/(np.prod(std))
    C = - np.sum(((x - mean)**2) / (2 * (std**2)))
    return A*B*np.exp(C)

In [None]:
x = X_train.iloc[1]
mean = g_mean[0]
std = g_sd[0]
gaussian_pdf(x, mean, std)

In [None]:
g_mean[0]

In [None]:
g_mean[1]

In [137]:
def predict(X):
    y_pred = []
    # for each data point
    for idx, x in X.iterrows():
        likelihood = []
        # likelihood for each class
        for k in classes:
            p = g_prior[k] * gaussian_pdf(x, g_mean[k], g_sd[k])
            likelihood.append(np.log(p+1e-6))
        # predict label (one with highest likelihood)
        y_pred.append(np.argmax(likelihood))

    return y_pred

In [141]:
y_pred = predict(X_train)

In [143]:
np.sum(y_pred != y_train) / float(len(y_train))

0.22115384615384615