In [1]:
from os import listdir
from os.path import isfile, join
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

#### Read the data into a dictionary keyed by activity label. Each value is a NumPy array of recordings (one per file), and each recording is a NumPy array of shape (n, 3).

In [2]:
DATASET_DIR = "HMP_Dataset"
TRAINING_SETS = np.array(['Use_telephone', 'Standup_chair', 'Walk', 'Climb_stairs', 'Sitdown_chair', 'Brush_teeth', 'Comb_hair', 'Eat_soup', 'Pour_water', 'Descend_stairs', 'Eat_meat', 'Drink_glass', 'Getup_bed', 'Liedown_bed'])

# Filled in place by `initial_setup`: label -> object array of (n, 3) recordings.
all_data = {}

# NOTE: this message and the "All done!" below are printed when this cell is
# executed, i.e. when `initial_setup` is defined. No file is read until
# `initial_setup()` is actually called.
print("Reading data from files...")

def initial_setup(dataset_dir=None, labels=None):
    """Load every recording of every label into the module-level ``all_data``.

    Parameters
    ----------
    dataset_dir : str, optional
        Directory holding one sub-directory per label. Defaults to
        ``DATASET_DIR``.
    labels : iterable of str, optional
        Labels (sub-directory names) to load. Defaults to ``TRAINING_SETS``.

    Returns
    -------
    None
        ``all_data[label]`` is replaced by a 1-D object array with one
        ``(n, 3)`` float array per file, taken from the first three columns.
    """
    if dataset_dir is None:
        dataset_dir = DATASET_DIR
    if labels is None:
        labels = TRAINING_SETS

    for dataset in labels:
        label_dir = join(dataset_dir, dataset)
        # Sorted so the file order (and everything derived from it) does not
        # depend on the arbitrary order returned by the file system.
        candidates = [join(label_dir, name) for name in sorted(listdir(label_dir))]
        recordings = [
            # atleast_2d keeps single-row files as (1, 3) instead of (3,).
            np.atleast_2d(np.genfromtxt(path, usecols=(0, 1, 2)))
            for path in candidates
            if isfile(path)  # skip sub-directories and other non-files
        ]
        # Recordings have different lengths, so store them in an explicit
        # object array; np.array(recordings) is ragged and either raises or,
        # when the lengths happen to match, silently builds a 3-D array.
        files = np.empty(len(recordings), dtype=object)
        for index, recording in enumerate(recordings):
            files[index] = recording
        all_data[dataset] = files

print("All done!")

Reading data from files...
All done!


## Makes data into chunks

In [3]:
def blockshaped(arr, nrows, ncols):
    """Cut a 2-D array into tiles of ``nrows`` x ``ncols``.

    Tiles are returned in row-major order (left to right, then top to
    bottom) as an array of shape ``(n_tiles, nrows, ncols)``. The height of
    ``arr`` must be divisible by ``nrows`` and its width by ``ncols``.
    """
    height, _width = arr.shape  # also rejects anything that is not 2-D
    # Expose the tile grid as separate axes: (tile_row, row, tile_col, col).
    grid = arr.reshape(height // nrows, nrows, -1, ncols)
    # Bring the two tile axes next to each other so they can be merged.
    tiles_first = grid.swapaxes(1, 2)
    return tiles_first.reshape(-1, nrows, ncols)

## Uses 'blockshaped' to create the chunks

In [4]:
def chunkify(chunk_me, chunk_size=32, ncols=3):
    """Split a 2-D recording into consecutive chunks of ``chunk_size`` rows.

    Parameters
    ----------
    chunk_me : numpy.ndarray
        2-D array with one sample per row.
    chunk_size : int, optional
        Rows per chunk (default 32).
    ncols : int, optional
        Columns per chunk (default 3).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_chunks, chunk_size, ncols)``. The first
        ``len(chunk_me) % chunk_size`` rows are dropped so that the remainder
        divides evenly. A recording shorter than one chunk yields an empty
        ``(0, chunk_size, ncols)`` array.
    """
    n_rows = chunk_me.shape[0]
    if n_rows < chunk_size:
        # Nothing would be left after trimming, and `blockshaped` cannot
        # reshape a zero-size array with an inferred (-1) dimension.
        return np.empty((0, chunk_size, ncols), dtype=chunk_me.dtype)
    dif = n_rows % chunk_size
    trimmed = chunk_me[dif:, :]
    return blockshaped(trimmed, chunk_size, ncols)

In [5]:
def chunkify_the_data(all_data):
    """Return a new dict in which every recording is replaced by its chunks.

    Parameters
    ----------
    all_data : dict
        Maps each label in ``TRAINING_SETS`` to a sequence of 2-D recordings.

    Returns
    -------
    dict
        Same keys as ``all_data``; for every label in ``TRAINING_SETS`` the
        value is a 1-D object array holding ``chunkify(recording)`` for each
        recording, in the original order.

    Notes
    -----
    ``all_data`` itself is left untouched, so the function can be called
    again on the same input (e.g. when the notebook cell is re-run) without
    trying to chunk data that has already been chunked.
    """
    # Shallow copy: keeps any extra keys without aliasing the caller's dict.
    chunkified_data = dict(all_data)
    for k in TRAINING_SETS:
        recordings = all_data[k]
        # Files have different numbers of chunks, hence an object array.
        chunked = np.empty(len(recordings), dtype=object)
        for i in range(len(recordings)):
            chunked[i] = chunkify(recordings[i])
        chunkified_data[k] = chunked
    return chunkified_data

In [6]:
def flatten_the_chunks(chunkified_data):
    """Flatten every chunk to a 96-element vector.

    Returns a pair ``(huge_matrix, flat_stuff)``:

    * ``huge_matrix`` - array with one row per chunk, collected over all
      labels in ``TRAINING_SETS`` order and, within a label, file by file.
    * ``flat_stuff`` - dict mapping each label to the array of that label's
      rows only.
    """
    per_label = {}
    every_row = []
    for label in TRAINING_SETS:
        label_rows = [
            chunk.reshape(96)
            for recording in chunkified_data[label]
            for chunk in recording
        ]
        every_row.extend(label_rows)
        per_label[label] = np.asarray(label_rows)
    return np.asarray(every_row), per_label

In [7]:
def flatten_for_features(chunkified_data, labels=None):
    """Flatten the chunks of every file while keeping the per-file grouping.

    Parameters
    ----------
    chunkified_data : dict
        Maps each label to a sequence of per-file chunk arrays, each of shape
        ``(n_chunks, rows, cols)``.
    labels : iterable of str, optional
        Labels to process. Defaults to ``TRAINING_SETS``.

    Returns
    -------
    dict
        Maps each label to a 1-D object array with one 2-D array per file of
        shape ``(n_chunks, rows * cols)`` (``(n_chunks, 96)`` for 32 x 3
        chunks).
    """
    if labels is None:
        labels = TRAINING_SETS

    flat_by_label = {}
    for label in labels:
        recordings = chunkified_data[label]
        # Files contain different numbers of chunks, so the per-label
        # container must be an object array; np.asarray on the ragged list
        # is rejected by current NumPy releases.
        flat_files = np.empty(len(recordings), dtype=object)
        for index, chunks in enumerate(recordings):
            chunks = np.asarray(chunks)
            # Explicit width (instead of -1) so files without any chunk
            # still reshape cleanly to (0, width).
            width = int(np.prod(chunks.shape[1:]))
            flat_files[index] = chunks.reshape(len(chunks), width)
        flat_by_label[label] = flat_files
    return flat_by_label

In [36]:
 def create_feature_dict(flat_dict, model=None, labels=None):
     """Build one cluster-count histogram per file.

     Parameters
     ----------
     flat_dict : dict
         Maps each label to a sequence of per-file 2-D arrays with one
         flattened chunk per row.
     model : fitted clustering model, optional
         Object exposing ``predict`` and ``n_clusters``. Defaults to the
         notebook-level ``kmeans``.
     labels : iterable of str, optional
         Labels to process. Defaults to ``TRAINING_SETS``.

     Returns
     -------
     dict
         Maps each label to a list with one array per file of shape
         ``(1, n_clusters, 1)``; entry ``[0, c, 0]`` is the number of chunks
         of that file assigned to cluster ``c``.
     """
     if model is None:
         model = kmeans
     if labels is None:
         labels = TRAINING_SETS
     n_clusters = model.n_clusters

     vector_dict = {}
     for label in labels:
         histograms = []
         for file_chunks in flat_dict[label]:
             file_chunks = np.asarray(file_chunks)
             counts = np.zeros((n_clusters, 1))
             if file_chunks.size:
                 # Assign all chunks of the file in one call instead of one
                 # predict() per chunk, then count the hits per cluster.
                 assigned = model.predict(file_chunks.reshape(len(file_chunks), -1))
                 counts[:, 0] = np.bincount(assigned, minlength=n_clusters)
             # Every file gets the same (1, n_clusters, 1) layout; previously
             # only the last file of a label was converted to an array.
             histograms.append(counts[np.newaxis])
         vector_dict[label] = histograms
     return vector_dict

In [9]:
def check_correctness(flat_dict, vector_dict, labels=None):
    """Check that every histogram sums to the number of chunks in its file.

    Parameters
    ----------
    flat_dict : dict
        Maps each label to per-file arrays with one chunk per row.
    vector_dict : dict
        Maps each label to per-file cluster-count histograms.
    labels : iterable of str, optional
        Labels to check. Defaults to ``TRAINING_SETS``.

    Prints ``"All good"`` when every file of every label matches and
    ``"Something bad happened"`` otherwise. Returns ``None``.
    """
    if labels is None:
        labels = TRAINING_SETS

    # One list for *all* labels: resetting it per label would mean that only
    # the last label is actually verified.
    correctness = []
    for label in labels:
        for file in range(len(flat_dict[label])):
            n_chunks = float(flat_dict[label][file].shape[0])
            histogram_total = float(np.sum(vector_dict[label][file]))
            correctness.append(n_chunks == histogram_total)

    if all(correctness):
        print("All good")
    else:
        print("Something bad happened")

In [10]:
def create_label_feature_array(file_vector, vector_dict, labels=None):
    """Pair every per-file feature vector with its label.

    Parameters
    ----------
    file_vector : object
        Unused; kept only so existing callers keep working.
    vector_dict : dict
        Maps each label to a sequence of per-file feature vectors.
    labels : iterable of str, optional
        Labels to include, in output order. Defaults to ``TRAINING_SETS``.

    Returns
    -------
    tuple of (list, list)
        ``label_array[i]`` is the label of ``feature_array[i]``; each
        feature is a NumPy copy of the corresponding entry of
        ``vector_dict``.
    """
    if labels is None:
        labels = TRAINING_SETS

    label_array = []
    feature_array = []
    for label in labels:
        # A distinct loop name: the original reused `file_vector` and thereby
        # silently discarded the argument.
        for vector in vector_dict[label]:
            label_array.append(label)
            feature_array.append(np.array(vector))
    return label_array, feature_array

#### Reading in the files and storing the data

In [11]:
# Read the files below DATASET_DIR into the module-level `all_data` dict.
initial_setup()

#### Using chunkify to create the chunks of data

In [12]:
# Cut every recording into fixed-size chunks.
chunkified_data = chunkify_the_data(all_data)
# NOTE: plain assignment - `chunks_for_features` is a second name for the
# same dict object, not a copy.
chunks_for_features = chunkified_data

#### Flattening the chunks and storing them both in a dictionary keyed by label and in a single matrix that contains ALL of the data

In [13]:
# `huge_matrix`: one row per flattened chunk across all labels (the input
# for k-means); `flat_stuff`: the same rows grouped by label.
huge_matrix, flat_stuff = flatten_the_chunks(chunkified_data)
# `flat_dict`: flattened chunks grouped by label and, within a label, by file.
flat_dict = flatten_for_features(chunkified_data)

#### Running kmeans for the first time to get cluster centers

In [14]:
# Cluster all flattened chunks into 150 groups. The result is bound to the
# notebook-level name `kmeans`, which `create_feature_dict` uses by default.
kmeans = KMeans(n_clusters=150, random_state=0).fit(huge_matrix)

#### Creating a dict of feature vectors for each label

In [37]:
# One cluster-count histogram per file, grouped by label.
vector_dict = create_feature_dict(flat_dict)

#### Checking that the sum of each feature vector equals the number of chunks in the corresponding file

In [16]:
# Sanity check: prints "All good" or "Something bad happened".
check_correctness(flat_dict, vector_dict)

All good


#### Creating an array for labels and an array for each feature vector

In [18]:
# The first argument is not used by `create_label_feature_array`; both
# returned lists are built from `vector_dict`.
labels, features = create_label_feature_array(flat_dict, vector_dict) #labels dependent, features independent
# NumPy versions of the two parallel lists (row i of the features belongs to labels[i]).
labels_numpy = np.asarray(labels)
features_numpy = np.asarray(features)

In [19]:
# Shallow random forest with a fixed seed.
clf = RandomForestClassifier(n_estimators=150, max_depth=2, random_state=0)
# One row per file with its histogram flattened into the columns. The row
# count is taken from the labels instead of hard-coding 839, so the cell
# keeps working when the number of files changes.
features_numpy = features_numpy.reshape(len(labels_numpy), -1)
clf.fit(features_numpy, labels_numpy)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

#### Creating a dataframe that holds every feature vector, indexed by its corresponding label

In [23]:
# Rows are indexed by label; columns f_0 ... f_<k-1> hold the cluster counts.
df = pd.DataFrame(features_numpy, index=labels).add_prefix('f_')
# Preview only - displaying the whole frame floods the notebook output.
df.head(10)

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_140,f_141,f_142,f_143,f_144,f_145,f_146,f_147,f_148,f_149
Use_telephone,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,18.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Use_telephone,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Use_telephone,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0
Use_telephone,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,21.0,0.0,0.0,0.0,0.0
Use_telephone,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Use_telephone,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Use_telephone,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0
Use_telephone,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Use_telephone,0.0,0.0,52.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Use_telephone,1.0,0.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# 80/20 shuffled split; the seed makes the split identical on every run.
train, test = train_test_split(df, test_size=.2, train_size=.8, shuffle=True, random_state=0)

In [22]:
# NOTE(review): `clf` was fitted on every row of `features_numpy`, and `df`
# (and therefore `test`) is built from those same rows, so the predictions
# below are for samples the model has already seen during training. Fit on
# `train` only before using them to judge accuracy.
# `test1` is the test frame as a plain array; the prediction does not use it.
test1 = np.array(test)
# Predicted label for every row of the test frame.
test2 = clf.predict(test)