In [1]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.datasets import load_digits
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [4]:
digits = load_digits()
data = pd.DataFrame(digits.data)
label = 'class'
data[label] = digits.target

X = data.drop(label,axis=1)
y = data[label]

In [6]:
y.head(5)

0    0
1    1
2    2
3    3
4    4
Name: class, dtype: int32

In [None]:
def fit_transform(data):
    label = 'class'
    num_dims = 2
    
    # get the groups
    grouped = data.groupby(data.loc[:,label])
    classes = [k for k in grouped.groups.keys()]

    # mean for each class:
    data_class = {}
    X = {}
    means = {}
    for k in classes:
        data_class[k] = grouped.get_group(k)
        X[k] = data_class[k].drop(label,axis=1)
        means[k] = np.array(np.mean(X[k]))

    # mean of the total data
    mean_total = np.array(np.mean(data.drop(label,axis=1)))

    # between covariance matrix
    S_B = np.zeros((X[0].shape[1], X[0].shape[1]))
    for k in X.keys():
        S_B += np.multiply(len(X[k]),
                            np.outer((means[k] - mean_total),
                            np.transpose(means[k] - mean_total)))

    # within covariance matrix
    S_W = np.zeros((X[0].shape[1], X[0].shape[1]))
    for k in classes:
        S_k = X[k] - means[k]
        S_W += np.dot(S_k.T, S_k)

    # eigendecomposition
    S = np.dot(np.linalg.pinv(S_W), S_B)
    eigval, eigvec = np.linalg.eig(S)

    # sort eigenvalues in decreasing order
    eig = [(eigval[i], eigvec[:,i]) for i in range(len(eigval))]
    eig = sorted(eig, key=lambda x: x[0], reverse=True)

    # only take the top (number of dimensions projected into) vectors
    w = np.array([eig[i][1] for i in range(num_dims)])
    
    new_data = {}
    for k in X.keys():
        new_data[k] = np.dot(X[k], w.T)
    
    return new_data

In [None]:
new_data=fit_transform(data)

In [None]:
new_data

In [None]:
new_data.keys()

### Cross-validation

In [9]:
# Split the data into k folds
def cross_val_split(data, folds, label, index):
    data_idx = []
    indices = [i for i in range(data.shape[0])]
    
    fold_size = int(len(data)/folds)
    for i in range(folds):
        fold_idx = []
        while len(fold_idx) < fold_size:
            idx = random.randrange(len(indices))
            fold_idx.append(indices.pop(idx))
        data_idx.append(fold_idx)
    
    test_idx = data_idx[index]
    del data_idx[index]
    train_idx = [item for sublist in data_idx for item in sublist]
    
    test = data.iloc[test_idx]
    train = data.iloc[train_idx]
    
    return train, test

train, test = cross_val_split(data, folds=10, label=label, index=0)

In [10]:
train.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,class
1238,0.0,1.0,13.0,16.0,16.0,12.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,15.0,3.0,0.0,0.0,0.0,0.0,7
1334,0.0,0.0,0.0,14.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,14.0,16.0,4.0,0.0,0.0,1
1742,0.0,4.0,15.0,15.0,8.0,0.0,0.0,0.0,0.0,8.0,...,0.0,0.0,7.0,14.0,11.0,0.0,0.0,0.0,0.0,2
1084,0.0,0.0,9.0,16.0,14.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,12.0,14.0,13.0,16.0,16.0,5.0,2
597,0.0,0.0,6.0,14.0,16.0,13.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,10.0,13.0,0.0,0.0,0.0,0.0,7


In [12]:
X_train = train.drop(label,axis=1)
y_train = train[label]
X_train.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
1238,0.0,1.0,13.0,16.0,16.0,12.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,15.0,3.0,0.0,0.0,0.0,0.0
1334,0.0,0.0,0.0,14.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,14.0,16.0,4.0,0.0,0.0
1742,0.0,4.0,15.0,15.0,8.0,0.0,0.0,0.0,0.0,8.0,...,3.0,0.0,0.0,7.0,14.0,11.0,0.0,0.0,0.0,0.0
1084,0.0,0.0,9.0,16.0,14.0,0.0,0.0,0.0,0.0,0.0,...,9.0,1.0,0.0,0.0,12.0,14.0,13.0,16.0,16.0,5.0
597,0.0,0.0,6.0,14.0,16.0,13.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,10.0,13.0,0.0,0.0,0.0,0.0


In [15]:
X_test = test.drop(label,axis=1)
y_test = test[label]

In [16]:
lda = LinearDiscriminantAnalysis(n_components=2)
lda.fit(X_train,y_train)
acc = lda.score(X_test, y_test)
acc



0.9776536312849162

In [21]:
sample = data.head(15)
y = sample[label]
X = sample.drop(label, axis=1)
lda.fit_transform(X,y)

ZeroDivisionError: float division by zero

### Gaussian modeling

In [None]:
priors = g_means = g_cov = {}
for k in X.keys():
    priors[k] = X[k].shape[0] / sample.shape[0]
    g_means[k] = np.mean(X[k])
    g_cov[k] = np.cov(X[k],rowvar=False)
print(g_cov[0].shape)