In [267]:
# Loading all of the data files, all at 100mhz or 100 samples per second
# Thanks to Katie for her files, I needed the sitting, jogging and stairs ones,
# the others are my own

import numpy as np
import csv
import itertools

clsTrain = [] # this is store my array of classifiers (0-4) - kinda hacky but it works
clsTest = []

#This takes in the csv files and returns an array that can be used for machine learning
#also returns the header row which is the same for all of my files
def load_files(input_file, cls_num):
    output_array = []
    input_headers = []
    with open(input_file, 'rb') as f:
        reader = csv.reader(f)
        input_headers = reader.next() #saves the headers
        for row in reader:
            output_array.append(row)
            
    # Prior testing gave me the size of the files so I know I can skip the first 10 seconds (1000 rows)
    # and then take the next 30,000 records to get 30 - 10 second chunks
    # never forget that it's all zero indexed
    output_array = output_array[999:30999]
    
    # Now let's reduce down to 10 second chunks aka 1000 rows
    compressed = []
    n = 0
    temp = []
    
    for row in output_array:
        temp.append(row)
        n +=1
        if n == 1000: 
            temp = np.array(temp).astype(float) #convert to a numpy float array so I can do the next line
            compressed.append(temp.mean(axis=0).tolist()) #this averages by columns and makes it a list of lists
            temp = []
            n = 0
            
    # I use this to build the 'y' array for use later
    for i in range(30):
        if i < 24:
            clsTrain.append(cls_num)
        else:
            clsTest.append(cls_num)
            
#     compressed = list(itertools.chain.from_iterable(compressed)) # this flatens the list - which turns to not be wanted

    return compressed, input_headers
    
rawWalkValues, Labels = load_files("walking2.csv",0)
rawSittingValues, Labels = load_files("sitting.csv",1)
rawCarValues, Labels = load_files("car.csv",2)
rawJogValues, Labels = load_files("Jogging.csv",3)
rawStairsValues, Labels = load_files("Steps.csv",4)

# put everything together into one merged training and then seperate test sets
# first 24 records from each  train, last 6 = testing
allTrain = rawWalkValues[:24] + rawSittingValues[:24] + rawCarValues[:24] + rawJogValues[:24] + rawStairsValues[:24]
walkTest = rawWalkValues[24:] 
sitTest = rawSittingValues[24:]
carTest = rawCarValues[24:]
jogTest = rawJogValues[24:]
stairsTest = rawStairsValues[24:]
# allTest = rawWalkValues[24:] + rawSittingValues[24:] + rawCarValues[24:] + rawJogValues[24:] + rawStairsValues[24:]


In [249]:
# This block is just to test my data loads and various setup stuff from the block above
# print Labels
# print rawWalkValues[0]
# print clsTrain
# print len(rawWalkValues[:24])

In [185]:
#Testing the length of the arrays, I should have 30 records after chunking into 10 second sections by / 1000
# print "walk -",len(rawWalkValues)
# print "sit -",len(rawSittingValues)
# print "car -",len(rawCarValues)
# print "jog -",len(rawJogValues)
# print "stairs -",len(rawStairsValues)


In [213]:
# trying some feature selection, not a fan of this method, see comments below
from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=(.8*(1-.8)))
# print sel.fit_transform(rawWalkValues)[0]

# So this strips out the low variance stuff but doesn't help me identify the right features, just removes certain columns
# I can do (and did) a manual compare but that just seems wrong for some reason

In [126]:
# another attempt at feature selection - which doesn't work for negative values, boo
# from sklearn.datasets import load_iris
# from sklearn.feature_selection import SelectKBest
# from sklearn.feature_selection import chi2
# iris = load_iris()
# X, y = rawWalkValues, rawLabels

# X = np.array(X)
# X.shape

# X_new = SelectKBest(chi2, k=2).fit_transform(X, rawLabels)
# X_new.shape

# This doesn't work for negative values which is what I have - so this selection is out

In [259]:
# Yet another attempt at feature selection
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html for details

from sklearn.feature_selection import RFE
from sklearn.svm import SVR

estimator = SVR(kernel="linear")
selector = RFE(estimator,2,step=1) # select the n most informative features
selector = selector.fit(allTrain, clsTrain)

print(selector.support_)
print(selector.ranking_)
print "Two Most Important Features are:"
rank = selector.ranking_
for i in range(len(rank)):
    if rank[i] == 1:
        print "   -- ",Labels[i]
        

selector = RFE(estimator,1,step=1) # select the n most informative features
selector = selector.fit(allTrain, clsTrain)
print "The Most Important Feature is:"
rank = selector.ranking_
for i in range(len(rank)):
    if rank[i] == 1:
        print "   -- ",Labels[i]


[False False False False False False False False False False  True False
  True False False False]
[15  5  8 11  9 10  7  4  6  2  1  3  1 14 13 12]
Two Most Important Features are:
   --  user_acc_x
   --  user_acc_z
The Most Important Feature is:
   --  user_acc_x


In [256]:
rank = selector.ranking_

for i in range(len(rank)):
    if rank[i] == 1:
        print Labels[i]

user_acc_x
user_acc_z


In [252]:
#Let's do some feature selection - which isn't the same as extraction
#http://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html#example-ensemble-plot-forest-importances-py

import matplotlib.pyplot as plt

from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier

# Build a classification task using 3 informative features
X, y = make_classification(n_samples=1000,
                           n_features=10,
                           n_informative=3,
                           n_redundant=0,
                           n_repeated=0,
                           n_classes=2,
                           random_state=0,
                           shuffle=False)

# Build a forest and compute the feature importances
forest = ExtraTreesClassifier(n_estimators=250,
                              random_state=0)

# forest.fit(X, y)
forest.fit(allTrain, clsTrain)

importances = forest.feature_importances_ #array with importances of each feature

idx = np.arange(0, X.shape[1]) #create an index array, with the number of features

features_to_keep = idx[importances > np.mean(importances)] #only keep features whose importance is greater than the mean importance
#should be about an array of size 3 (about)
print features_to_keep.shape

x_feature_selected = X[:,features_to_keep] #pull X values corresponding to the most important features

# print x_feature_selected
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]
for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
       color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()

IndexError: index 14 is out of bounds for axis 1 with size 10

In [None]:
# turn the raw data into a more compressed format and split into training and testing arrays
# based on all my files being a little longer than needed I'm going to drop the first 10 seconds (1000 rows)
# # take the next 24 (10 second chunks or 24000 rows) as training, and then the next 6 (6000 rows) as testing
# compressed = []

# n = 0
# temp = 0

# for row in rawWalkValues:
#     temp += float(row[7]) # TODO:  What column should I use? and would it be different for different files
#     n +=1
#     if n == 1000:
#         compressed.append(temp/n)
#         temp = 0
#         n = 0
    
# #splitting it up into two sets
# train = []
# test = []

# i = 0
# for i in range(len(compressed)):
#     if i < 24:
#         train.append(compressed[1])
#     elif i < 30:
#         test.append(compressed[1])
    



In [276]:
# Where the Decision Tree Learning Happens
from sklearn import tree
from sklearn.svm import LinearSVC
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression

clf = tree.DecisionTreeClassifier()
  
clf = tree.DecisionTreeClassifier()
clf = clf.fit(allTrain, clsTrain)

clf.predict(stairsTest)
# clf.predict_proba(walkTest)
# clf.predict_proba(sitTest)
# clf.predict_proba(carTest)
# clf.predict_proba(jogTest)
# clf.predict_proba(stairsTest)


array([4, 4, 4, 4, 4, 4])