In [114]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.decomposition import PCA

In [115]:
# Load data: read the comma-separated training and test files, use the parsed
# 'date' column as a datetime index, then drop the raw 'date' column.

train = pd.read_table("data/datatraining.txt", sep=',')
train.index = pd.to_datetime(train['date'])
train = train.drop(columns=['date'])

test = pd.read_table("data/datatest.txt", sep=",")
test.index = pd.to_datetime(test['date'])
test = test.drop(columns=['date'])

In [116]:
# Remove known anomalies from training data: keep only the rows whose
# Occupancy is 0, then drop the Occupancy column from both sets.

unoccupied = train['Occupancy'] == 0
train = train.loc[unoccupied].drop(columns=['Occupancy'])
del unoccupied  # temporary mask only; keep the notebook namespace unchanged
test = test.drop(columns=['Occupancy'])

In [117]:
# Normalize data: standardize both sets with the TRAINING mean and standard
# deviation, so the test set is scaled by the same statistics as the training set.

norm_train = train.sub(train.mean()).div(train.std())
norm_test = test.sub(train.mean()).div(train.std())

In [118]:
# PCA: fit a 2-component projection on the normalized training data, then
# apply the same fitted projection to the normalized test data. Each result
# keeps the index of the frame it was computed from.

pca = PCA(n_components=2, svd_solver='full')

pca_train = pd.DataFrame(pca.fit_transform(norm_train), index=norm_train.index)

pca_test = pd.DataFrame(pca.transform(norm_test), index=norm_test.index)

In [126]:
# Functions to calculate Mahalanobis Distance

def inv_cov_matrix(x):
    """Return the inverse covariance matrix of ``x``.

    Parameters
    ----------
    x : array-like of shape (n_samples, n_features)
        One observation per row, one variable per column.

    Returns
    -------
    numpy.ndarray of shape (n_features, n_features), or None
        The inverse of the feature covariance matrix, or ``None`` when that
        covariance matrix is not positive definite (and so cannot be safely
        inverted).
    """
    # rowvar=False makes the COLUMNS the variables. np.cov's default treats
    # each row as a variable, which for (n_samples, n_features) input builds a
    # singular n_samples x n_samples matrix instead of the feature covariance.
    # atleast_2d keeps the single-feature case (0-d result) a 1x1 matrix.
    cov = np.atleast_2d(np.cov(x, rowvar=False))

    # A covariance matrix is symmetric, so eigvalsh applies and returns real
    # eigenvalues; all of them must be strictly positive.
    if np.all(np.linalg.eigvalsh(cov) > 0):
        return np.linalg.inv(cov)
    return None

def calculate_M_distance(inv_cov_matrix, means, data, verbose=False):
    """Compute one Mahalanobis-style distance per entry of ``data``.

    ``means`` is subtracted from ``data``; for each entry ``d`` of the result
    (indexed by position 0..len-1) the value ``sqrt(d . inv_cov_matrix . d)``
    is computed.

    Parameters
    ----------
    inv_cov_matrix : matrix applied between the two dot products.
    means : value(s) subtracted from ``data``.
    data : indexable collection of observations supporting ``data - means``.
    verbose : accepted for interface compatibility; not used.

    Returns
    -------
    list
        The distances, in the same order as the entries of ``data``.
    """
    centered = data - means
    return [
        np.sqrt(centered[row].dot(inv_cov_matrix).dot(centered[row]))
        for row in range(len(centered))
    ]

def detect_outliers(dist, extreme=False, verbose=False):
    """Return the positions of distances at or above the outlier threshold.

    The threshold is ``k`` times the mean of ``dist``, with ``k = 3`` when
    ``extreme`` is true and ``k = 2`` otherwise.

    Parameters
    ----------
    dist : sequence of distances.
    extreme : use the stricter multiplier (3 instead of 2).
    verbose : accepted for interface compatibility; not used.

    Returns
    -------
    numpy.ndarray
        Integer positions into ``dist``. The array has an integer dtype even
        when it is empty, so it can always be used directly as an index array.
    """
    k = 3. if extreme else 2.
    values = np.asarray(dist)
    if values.size == 0:
        # Nothing to test; avoid taking the mean of an empty array.
        return np.array([], dtype=int)
    threshold = np.mean(values) * k
    # flatnonzero yields integer positions and an integer-typed empty array
    # when no distance reaches the threshold.
    return np.flatnonzero(values >= threshold)

def calculate_threshold(dist, extreme=False, verbose=False):
    """Return the outlier cut-off for ``dist``: a multiple of its mean.

    The multiplier is 3 when ``extreme`` is true and 2 otherwise.
    ``verbose`` is accepted for interface compatibility and is not used.
    """
    if extreme:
        multiplier = 3.
    else:
        multiplier = 2.
    return np.mean(dist) * multiplier

In [127]:
# Find Mahalanobis Distance: start from the inverse covariance matrix of the
# PCA-projected training data.

inv_cov_train = inv_cov_matrix(pca_train.to_numpy())

In [125]:
# Sanity check: dimensions of the PCA-projected training data.
pca_train.shape

(6414, 2)