In [49]:
import math
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import KFold
from sklearn import cluster
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import normalize

## Read data

In [2]:
data_dir = 'data/hw5/HMP_Dataset'
report_dir = 'reports/hw5'

pathlist = Path(data_dir).glob('*/')
acc = np.empty((0,3))
labels = np.empty((0,1))
for index, path in enumerate(pathlist):
    if(path.name.startswith('.') or not path.is_dir()):
        continue
    files = path.glob('*.txt')
    for j, file in enumerate(files):
        part = np.loadtxt(file)
        acc = np.vstack((acc, part))
        labels = np.vstack((labels, np.full((part.shape[0],1), path.name)))
print(acc.shape)
print(labels.shape)
assert labels.shape[0] == acc.shape[0]

(446529, 3)
(446529, 1)


In [66]:
# data = np.hstack((acc, labels))
# data[0]

np.unique(labels)

array(['Brush_teeth', 'Climb_stairs', 'Comb_hair', 'Descend_stairs',
       'Drink_glass', 'Eat_meat', 'Eat_soup', 'Getup_bed', 'Liedown_bed',
       'Pour_water', 'Sitdown_chair', 'Standup_chair', 'Use_telephone',
       'Walk'], dtype='<U32')

## Vector Quantization

In [4]:
def transform_to_samples(data, sample_length):
    n_slices = math.floor((data.shape[0])/sample_length)
    total_size = n_slices * sample_length
    slices = np.vsplit(np.vsplit(data, [total_size])[0], n_slices)
    reshaped_slices = map(lambda x: x.reshape(1, sample_length*3).flatten(), np.array(slices))
    transformed = np.array(list(reshaped_slices))
    assert transformed.shape == (len(slices), len(slices[0])*len(slices[0][1]))
    return transformed

In [5]:
def get_histograms_by_group(group):
    histograms = np.array(group.histogram)
    new_histograms = np.zeros((histograms.shape[0],histograms[0].shape[0]))
    
    for i, histogram in enumerate(histograms):
        for j, count in enumerate(histogram):
            new_histograms[i][j] = count
    
    return new_histograms

In [6]:
def get_mean(group):
    return get_histograms_by_group(group).mean(axis=0)

In [34]:
def get_histogram(samples, n_K, k_means):
    labels = k_means.predict(samples)
    histogram = np.histogram(labels, bins=range(n_K+1))
    return histogram

def get_histograms_and_labels(k_means, sample_length, n_clusters):
    pathlist = Path(data_dir).glob('*/')
    histograms = []
    labels = []

    for path in pathlist:
        if(path.name.startswith('.') or not path.is_dir()):
            continue
        label = path.name
        files = path.glob('*.txt')
        
        for file in files:
            raw = np.loadtxt(file)
            transformed = transform_to_samples(np.array(raw), sample_length=sample_length)
            histogram = get_histogram(transformed, n_clusters, k_means)
            histograms.append(histogram)
            labels.append(label)
    return np.array(histograms), np.array(labels)
        

In [32]:
def get_quantized_vector(sample_length, overlap, n_clusters):
    transformed = transform_to_samples(acc, sample_length=sample_length)
    k_means = cluster.KMeans(n_clusters=n_clusters)
    k_means.fit(transformed)
    historams, labels = get_histograms_and_labels(k_means, sample_length, n_clusters)
    historams.shape, labels.shape
    assert historams[:, 0][0].shape == (n_clusters,)
    df = pd.DataFrame(np.hstack((historams[:, 0].reshape(839,1), labels.reshape(839, 1))), columns=['histogram', 'label'])
    h_groups = df.groupby(['label'])
    mean_histograms = h_groups.apply(get_mean)
    histograms_by_group = h_groups.apply(get_histograms_by_group)
    return mean_histograms, histograms_by_group

In [33]:
# transformed.shape

In [35]:
def plot_mean_histogram(mean_histograms_df, n_K):
    indexes = mean_histograms_df.index
    for i, histogram in enumerate(mean_histograms_df):
        fig, ax = plt.subplots()
        ax.set_title(f'mean_histograms {indexes[i]}')
        plt.bar(range(n_K), histogram)
        plt.savefig(f'{report_dir}/{indexes[i]}.png')

# Split data

In [36]:
def split_all_histograms(histograms_by_group_df, kf, n_K):
    sets = dict.fromkeys(range(3))
    for key in list(sets.keys()):
        sets[key] = [np.empty((0, n_K)),np.empty((0, n_K)),np.empty((0, 1)), np.empty((0, 1))]

    for label, histograms in histograms_by_group_df.items():
        X = histograms
        Y = np.full((histograms.shape[0],1), label)
        split = kf.split(X)
        for index, (train_index, test_index) in enumerate(split):
            X_each_train, X_each_test = X[train_index], X[test_index]
            Y_each_train, Y_each_test = Y[train_index], Y[test_index]

            sets[index][0] = np.vstack((sets[index][0], X_each_train))
            sets[index][1] = np.vstack((sets[index][1], X_each_test))
            sets[index][2] = np.vstack((sets[index][2], Y_each_train))
            sets[index][3] = np.vstack((sets[index][3], Y_each_test))
        
    return sets

# Train Random Forest

In [67]:
def print_confustion_matrix(n_estimators,max_depth, sets):
     for i in range(3):
        [X_train, X_test, Y_train, Y_test] = sets[i]
        clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, n_jobs=10)
        clf.fit(X_train, Y_train.squeeze())
        Y_predicts = clf.predict(X_test)
        print(np.around(normalize(confusion_matrix(Y_test, Y_predicts,labels=['Brush_teeth', 'Climb_stairs', 'Comb_hair', 'Descend_stairs',
       'Drink_glass', 'Eat_meat', 'Eat_soup', 'Getup_bed', 'Liedown_bed',
       'Pour_water', 'Sitdown_chair', 'Standup_chair', 'Use_telephone',
       'Walk'])), decimals=2))
             

In [56]:
def evaluate(sets, get_confustion_matrix, n_estimators=200, max_depth=100):
    if(get_confustion_matrix):
        print_confustion_matrix(n_estimators, max_depth,sets)
        return
    scores = []
    for i in range(3):
        [X_train, X_test, Y_train, Y_test] = sets[i]
        clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, n_jobs=10)
        clf.fit(X_train, Y_train.squeeze())
        score = clf.score(X_test, Y_test.squeeze())
        scores.append(score)
        print(f'itr: {i}, score: {score}')
    mean_accuracy = np.array(scores).mean()
    print(f'average score: {mean_accuracy}')
    return mean_accuracy

# Experiment entry point

In [53]:
def experiment(sample_length, overlap, n_clusters, plot_fig=False, get_confustion_matrix=False):
    # Vector quantization
    mean_histograms, histograms_by_group = get_quantized_vector(sample_length, overlap, n_clusters)
    # Plot mean histogram
    if(plot_fig):
        plot_mean_histogram(mean_histograms, n_clusters)
    kf = KFold(n_splits=3, shuffle=True)
    # Split data
    sets = split_all_histograms(histograms_by_group, kf, n_clusters)
    # assertion
    [X_train, X_test, Y_train, Y_test] = sets[0]
    assert X_train.shape[0] + X_test.shape[0] == 839
    # Evaluate
    accracy = evaluate(sets, get_confustion_matrix)
    print(f'sample_length: {sample_length}, overlap: {overlap}, n_clusters: {n_clusters} =====> accracy: {accracy}')
    return accracy

In [43]:
%%time
accracy = experiment(sample_length=32, overlap=0, n_clusters=480)

itr: 0, score: 0.7263157894736842
itr: 1, score: 0.7266187050359713
itr: 2, score: 0.7681159420289855
average score: 0.7403501455128803
sample_length: 32, overlap: 0, n_clusters: 480 =====> accracy: 0.7403501455128803
CPU times: user 1min 18s, sys: 8.82 s, total: 1min 27s
Wall time: 46.1 s


In [44]:
%%time
accracy = experiment(sample_length=32, overlap=0, n_clusters=240)

itr: 0, score: 0.7157894736842105
itr: 1, score: 0.7805755395683454
itr: 2, score: 0.7282608695652174
average score: 0.7415419609392577
sample_length: 32, overlap: 0, n_clusters: 240 =====> accracy: 0.7415419609392577
CPU times: user 50.2 s, sys: 6.08 s, total: 56.3 s
Wall time: 30.4 s


In [45]:
%%time
accracy = experiment(sample_length=16, overlap=0, n_clusters=480)

itr: 0, score: 0.7578947368421053
itr: 1, score: 0.7230215827338129
itr: 2, score: 0.8043478260869565
average score: 0.7617547152209583
sample_length: 16, overlap: 0, n_clusters: 480 =====> accracy: 0.7617547152209583
CPU times: user 2min 36s, sys: 20.9 s, total: 2min 57s
Wall time: 1min 38s


In [68]:
%%time
accracy = experiment(sample_length=16, overlap=0, n_clusters=240, get_confustion_matrix=True)

[[0.95 0.   0.   0.   0.   0.   0.   0.   0.   0.32 0.   0.   0.   0.  ]
 [0.   1.   0.   0.   0.06 0.   0.   0.   0.   0.   0.   0.   0.   0.  ]
 [0.   0.   1.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.  ]
 [0.   0.37 0.   0.93 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   1.   0.   0.   0.   0.   0.1  0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.   1.   0.   0.   0.   0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.   1.   0.   0.   0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.   0.96 0.   0.   0.08 0.27 0.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.   0.57 0.   0.19 0.57 0.57 0.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.   0.   0.   1.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.   0.   0.   0.04 0.98 0.18 0.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.   0.18 0.   0.04 0.35 0.92 0.   0.  ]
 [0.   0.   0.   0.   0.83 0.   0.   0.   0.   0.55 0.   0.   0.   0.  ]
 [0.   0.18 0.   0.04 0.   0.   0.   0.   0.   0.  