# Clean up nn.ipynb code

## Reader class

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import label_binarize
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from keras.optimizers import SGD
from keras.optimizers import Adam
%matplotlib inline
import time

class Reader:
    """
    Class that serves to read arff file(s) in the format given in the WISDM dataset.

    The result of the read is stored in the ``df`` attribute:
      * mode='f' -> a ``(dataframe, attributes)`` tuple; the dataframe's columns are
        NOT labelled yet, ``attributes`` is a pandas Series of column names.
      * mode='d' -> one dataframe holding the rows of every ``.arff`` file in the
        directory, with columns already labelled.
      * a failed read -> None (a hint and the underlying error are printed).
      * an unknown mode -> 0 (a hint is printed).
    """

    def __init__(self, path, mode='f'):
        """
        Upon instantiating the class, the file or directory is read and the result is
        stored in the df attribute.
        If mode='f', the reader will read a single arff file given a file path.
        If mode='d', the reader will read all of the arff files in the given directory path.
        """
        self.df = self.wrapper(path, mode)

    @staticmethod
    def _rows_to_frame(rows):
        """
        Build a dataframe from a list of rows (lists of strings), casting every column
        that is entirely numeric to float and leaving the others (e.g. class labels)
        untouched.
        """
        frame = pd.DataFrame(rows)
        for column in frame.columns:
            try:
                frame[column] = pd.to_numeric(frame[column]).astype(float)
            except (ValueError, TypeError):
                # Non-numeric column (e.g. activity letters): keep the raw values.
                # Passing dtype=float to the DataFrame constructor instead would
                # raise for such columns on pandas >= 2.0.
                pass
        return frame

    def readarff(self, filename, collect=True):  # collect if you need to collect attribute names
        """
        This method reads an arff file line-by-line and puts that file's data into a dataframe.

        Returns the dataframe alone if collect=False. If collect=True, returns a tuple
        ``(dataframe, attributes)`` where ``attributes`` is a Series (named "SAMPLE") of the
        column ("attribute") names parsed from the header.
        """
        with open(filename) as f:
            content = f.read().splitlines()

        in_data = False
        metalist = []  # stores metadata as list of rows
        datalist = []  # stores data as list of rows

        # read lines before "@data" as metadata and lines after it as data
        for line in content:
            if in_data:
                stripped = line.strip()
                # blank lines and '%' comment lines carry no sample data
                if not stripped or stripped.startswith("%"):
                    continue
                datalist.append(line.split(","))
            elif line.strip().lower() == "@data":
                # ARFF keywords are case-insensitive; tolerate stray whitespace too
                in_data = True
            elif collect:
                # clean up metadata header: turn '@attribute "NAME" type' into
                # '@attribute.NAME.type' so that it splits into exactly three parts
                line = line.replace(' "', ".")
                line = line.replace('" ', ".")
                line = line.replace(" ", "")
                line = line.split(".")
                if len(line) == 3:  # ignores lines that are not attribute declarations
                    metalist.append(line[1:3])  # drop the repetitive "@attribute"

        # create dataframes from lists of rows
        dataframe = self._rows_to_frame(datalist)
        if not collect:
            return dataframe
        metaframe = pd.DataFrame(metalist, columns=["attribute", "description"])
        attributes = metaframe["attribute"].rename("SAMPLE")
        return dataframe, attributes

    def readdirectory(self, path, quiet=False):
        """
        Given a directory path, this method will call readarff() on all arff files in that directory.
        It will then combine the data from all of those files into one dataframe, label its
        columns with the attribute names of the first file read, and return that dataframe.

        The path may be given with or without a trailing slash. Files are processed in
        sorted name order so that the row order is the same on every run.
        Raises ValueError if the directory contains no ".arff" file.
        If quiet=False, progress messages are printed.
        """
        alldata = []
        attributes = None
        for filename in sorted(os.listdir(path)):
            if not filename.endswith(".arff"):
                continue
            filepath = os.path.join(path, filename)
            if attributes is None:  # only collect attributes once
                if not quiet:
                    print("processing "+filename+"; collecting attribute names")
                dataframe, attributes = self.readarff(filepath)
            else:
                if not quiet:
                    print("processing "+filename)
                dataframe = self.readarff(filepath, collect=False)
            alldata.append(dataframe)

        if not alldata:
            raise ValueError("no .arff files found in directory: " + str(path))

        if not quiet:
            print("Concatenating data")
        alldata = pd.concat(alldata).reset_index(drop=True)  # reset indices so it is continuous
        alldata.columns = attributes  # assign column names
        return alldata

    def wrapper(self, path, mode='f'):
        """
        This method decides whether to call readarff() or readdirectory() based on the value of 'mode'.

        Errors raised while reading are reported with a hint plus the underlying error, and
        None is returned. An unrecognised mode prints a hint and returns 0.
        """
        if mode == 'f':
            try:
                return self.readarff(path, collect=True)
            except Exception as err:  # not a bare except: let KeyboardInterrupt etc. propagate
                print("make sure you inputted the correct arff FILE path")
                print("(%s: %s)" % (type(err).__name__, err))
                return None
        elif mode == 'd':
            try:
                return self.readdirectory(path, quiet=True)
            except Exception as err:
                print("make sure you inputted the correct arff DIRECTORY path, ending with a slash")
                print("(%s: %s)" % (type(err).__name__, err))
                return None
        else:
            print("mode must either be 'f' or 'd'")
            return 0

## Model class

In [2]:
class Model():
    """
    Wraps a Keras classifier around a dataframe of attribute data: preprocessing,
    model definition, training (k-fold or single train/test split), per-class
    evaluation and plotting of the training history.
    """

    # Human-readable names for the activity letter codes used as class labels.
    ACTIVITY_NAMES = {
        "A": "Walking", "B": "Jogging", "C": "Stairs", "D": "Sitting",
        "E": "Standing", "F": "Typing", "G": "Brushing teeth", "H": "Eating soup",
        "I": "Eating chips", "J": "Eating pasta", "K": "Drinking from a cup",
        "L": "Eating sandwich", "M": "Kicking (soccer ball)",
        "O": "Playing catch (with tennis ball)", "P": "Dribbling (basketball)",
        "Q": "Writing", "R": "Clapping", "S": "Folding clothes",
    }

    def __init__(self, df):
        """
        Stores the input dataframe. The other attributes are filled in by later calls.
        Model contains the following attributes:
        .df = pandas dataframe of attribute data
        .X = independent data
        .Y = dependent data, one hot encoded
        .X_test, .X_train = split independent data
        .Y_test, .Y_train = split dependent data
        .encoder = LabelEncoder mapping class values to integer identifiers
        .model = Keras model format, outlining the layers of the model. To be used with KerasClassifier.
        .estimator = Instance of KerasClassifier using .model (500 epochs, 200 batch size by default)
        .results = List of k accuracy results from k-fold validation, if applicable
        .history = estimator.fit() history from train/test validation, if applicable
        .kfold = boolean for whether or not kfold is used, as opposed to a single train/test split
                 (None until train() has been called)
        """
        self.df = df
        self.kfold = None

    def preprocess(self, attributes, class_label, testsize):
        """
        Builds X / Y from the dataframe and splits them into train and test sets.

        attributes = list of desired attribute names
        class_label = string of the desired class label
        testsize = passed to train_test_split as test_size
        """
        # define self.X
        self.X = np.array(self.df[attributes])

        # define self.Y
        Y = self.df[class_label]
        self.encoder = LabelEncoder()  # encoder stores conversion between class values (str) and identifiers (int)
        self.encoder.fit(Y)
        Y = self.encoder.transform(Y)
        # convert to one hot encoded form; pandas >= 2.0 would default to bool
        # dummies, so pin the numeric dtype that older pandas versions produced
        self.Y = pd.get_dummies(Y, dtype=np.uint8)

        # split into train and test sets (stratified, fixed seed)
        self.X_train, self.X_test, self.Y_train, self.Y_test = train_test_split(
            self.X, self.Y, test_size=testsize, stratify=self.Y, random_state=1)

    def baseline_model(self):
        """
        Defines and compiles the Keras model, stores it in .model and returns it.
        Requires preprocess() to have been called (reads the shapes of .X and .Y).
        """
        model = Sequential()

        # define hyperparameter values
        p_input = 0  # fraction of the inputs to dropout
        p_hidden = 0.1  # fraction of the hidden layer nodes to dropout

        # input layer with dim = number of input attributes. Dropout applied.
        n_inputs = self.X.shape[1]
        model.add(Dense(n_inputs, input_dim=n_inputs, activation='relu'))
        model.add(Dropout(p_input))

        # hidden layers with dim = 256, 128, and 64 (arbitrary). Each layer has dropout applied.
        for width in (256, 128, 64):
            model.add(Dense(width, activation='relu'))
            model.add(Dropout(p_hidden))

        # output layer with dim = number of output classes, using softmax activation.
        model.add(Dense(self.Y.shape[1], activation='softmax'))

        # compile model with categorical_crossentropy loss and adam optimizer.
        model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['accuracy'])
        self.model = model
        return model

    def train(self, kfold=True, epochs=500, batch_size=200, n_splits=4):
        """
        Method to train the model and analyze results.

        kfold = if True, score with k-fold cross validation on all of X / Y (slow);
                if False, fit on the training split, plot the history and score on
                the test split (fast)
        epochs, batch_size = passed to KerasClassifier
        n_splits = number of folds, only used when kfold=True

        TODO: save model attributes somehow (original note was left unfinished)
        """
        self.kfold = kfold
        start_time = time.time()
        self.estimator = KerasClassifier(build_fn=self.baseline_model, epochs=epochs,
                                         batch_size=batch_size, verbose=0)

        if kfold:
            # run k-fold validation, which is slow
            # TODO use kfold.split() to get seperate test/train sets, then you can run estimator.fit()
            # on those in the same way we use it below. That would allow us to plot history.
            # Example: https://www.machinecurve.com/index.php/2020/02/18/how-to-use-k-fold-cross-validation-with-keras/
            print("Using K-fold validation method to train model and calculate score")
            folds = KFold(n_splits=n_splits, shuffle=True)
            results = cross_val_score(self.estimator, self.X, self.Y, cv=folds)
            print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))
            print("%s seconds" % (time.time() - start_time))
            self.results = results

        else:
            # run train/test split, which is fast
            print("Using training data to train model and testing data to calculate score")
            self.history = self.estimator.fit(self.X_train, self.Y_train, verbose=0)
            self.plot_history(self.history)
            print("Test score: "+str(self.estimator.score(self.X_test, self.Y_test)))
            print("%s seconds" % (time.time() - start_time))

    def evaluate(self):
        """
        Method to look at accuracies on a class-basis using the estimator fitted by
        train(kfold=False) and the test split.

        Prints one score line per class known to the encoder. Labels without an entry
        in ACTIVITY_NAMES are printed as-is; classes with no test samples are reported
        as "n/a". Raises AttributeError if train() has not been called yet.
        """
        kfold = getattr(self, "kfold", None)
        if kfold is None:
            raise AttributeError(
                "Model has not been trained yet; call .train(kfold=False) before .evaluate()")
        if kfold:
            print("Model evaluation is not supported for kfold-trained data. Try using .train(kfold=False) first.")
            return

        print("Model prediction accuracies based on phone acceleration data:")
        print("="*61)
        for code in self.encoder.classes_:  # iterate through classes (string letters)
            # .transform returns a sequence, so take its single element
            column = self.encoder.transform([code])[0]
            # boolean mask selecting the test samples that belong to this class
            in_class = np.asarray(self.Y_test[column] == 1)
            name = self.ACTIVITY_NAMES.get(code, str(code))
            padding = "-"*(40-len(name))
            if not in_class.any():
                print(name, "test score:", padding, "n/a (no test samples)")
                continue
            X1 = self.X_test[in_class]
            Y1 = self.Y_test[in_class]
            print(name, "test score:", padding, str(self.estimator.score(X1, Y1)))

    def plot_history(self, history):
        """
        Method adapted from https://www.kaggle.com/danbrice/keras-plot-history-full-report-and-grid-search
        Displays evolution of loss and accuracy during training, given the history.
        Training curves are drawn in blue and validation curves in green; each legend
        entry shows the final value of its curve.
        """
        recorded = history.history

        def select(metric, validation):
            # keys containing `metric`, split by whether they are validation keys
            return [key for key in recorded if metric in key and ('val' in key) == validation]

        def draw(keys, colour, caption):
            for key in keys:
                final = format(recorded[key][-1], '.5f')
                plt.plot(epochs, recorded[key], colour, label=caption + ' (' + final + ')')

        loss_list = select('loss', False)
        val_loss_list = select('loss', True)
        acc_list = select('acc', False)
        val_acc_list = select('acc', True)

        epochs = range(1, len(recorded[loss_list[0]]) + 1)  # x values on graph

        ## Loss
        plt.figure(1)
        draw(loss_list, 'b', 'Training loss')
        draw(val_loss_list, 'g', 'Validation loss')
        plt.title('Loss')
        plt.xlabel('Epochs')
        plt.ylabel('Loss')
        plt.legend()

        ## Accuracy
        plt.figure(2)
        draw(acc_list, 'b', 'Training accuracy')
        draw(val_acc_list, 'g', 'Validation accuracy')
        plt.title('Accuracy')
        plt.xlabel('Epochs')
        plt.ylabel('Accuracy')
        plt.legend()
        plt.show()

## Example using phone_accel data

### Apply the Reader class:

In [113]:
# Directory mode: combine every .arff file under phone_accel/ into one dataframe.
df = Reader(path="phone_accel/", mode="d").df

### Apply the Model class:
#### Evaluate model with 70/30 train/test split

In [5]:
# Input attributes: ten histogram bins per axis, then five per-axis summary
# statistics, then the resultant -- in the same order as the hand-written list.
attributes = (
    [axis + str(bin_no) for axis in "XYZ" for bin_no in range(10)]
    + [axis + stat
       for stat in ("AVG", "PEAK", "ABSOLDEV", "STANDDEV", "VAR")
       for axis in "XYZ"]
    + ["RESULTANT"]
)
label = 'ACTIVITY'  # column holding the class to predict
testsize = 0.3  # use a 70/30 train/test split

# Build the model wrapper and prepare X / Y plus the train/test split.
model = Model(df)
model.preprocess(attributes, label, testsize)

# Train on the single train/test split (not k-fold), then print per-class scores.
model.train(kfold=False)
model.evaluate()
estimator1 = model.estimator  # keep a handle on the fitted estimator for later cells

In [106]:
# Predict the activity for one block of consecutive samples and report the
# most frequently predicted class as a readable name.
activity_num = 6
block_size = 18  # number of consecutive rows taken per activity block
left = activity_num * block_size
right = left + block_size
arr = estimator1.predict(model.X[left:right])
pd.options.display.max_rows = 18
# df[left:right]

# Majority vote over the predicted class ids, mapped back to the letter code.
letter = model.encoder.inverse_transform([np.argmax(np.bincount(arr))])[0]

classes = {"A":"Walking","B":"Jogging","C":"Stairs","D":"Sitting",
               "E":"Standing","F":"Typing","G":"Brushing teeth","H":"Eating soup",
               "I":"Eating chips","J":"Eating pasta","K":"Drinking from a cup",
               "L":"Eating sandwich","M":"Kicking (soccer ball)","O":"Playing catch (with tennis ball)",
               "P":"Dribbling (basketball)", "Q":"Writing","R":"Clapping","S":"Folding clothes"}

classes[letter]

'Brushing teeth'

In [123]:
# Apply the Reader class:
# File mode: .df holds what readarff(collect=True) returned, i.e. an
# (unlabelled dataframe, attribute names) pair.
dff = Reader("data.arff", mode='f').df
data_frame, column_names = dff
data_frame.columns = column_names  # assign column names
data_frame
# model2 = Model(dff[0])
# model.preprocess(attributes, label, testsize)

SAMPLE,ACTIVITY,X0,X1,X2,X3,X4,X5,X6,X7,X8,...,ZMFCC11,ZMFCC12,XYCOS,XZCOS,YZCOS,XYCOR,XZCOR,YZCOR,RESULTANT,class
0,A,0.235,0.47,0.275,0.02,0.0,0.0,0.0,0.0,0.0,...,0.486106,0.479859,-0.550668,0.049864,0.121354,-0.251024,0.164468,-0.110722,10.0518,1600.0
1,A,0.275,0.44,0.27,0.015,0.0,0.0,0.0,0.0,0.0,...,0.479571,0.473409,-0.633171,0.072129,0.161492,-0.386416,0.21568,-0.034375,10.1171,1600.0
2,A,0.32,0.43,0.245,0.0,0.005,0.0,0.0,0.0,0.0,...,0.483005,0.476798,-0.659493,0.087043,0.162157,-0.325151,0.27238,-0.077274,9.98384,1600.0
3,A,0.315,0.495,0.185,0.005,0.0,0.0,0.0,0.0,0.0,...,0.480711,0.474534,-0.712081,0.00381,0.210015,-0.364285,0.203131,0.015328,10.106,1600.0
4,A,0.215,0.455,0.325,0.005,0.0,0.0,0.0,0.0,0.0,...,0.468836,0.462811,-0.534933,0.047553,0.275833,-0.216423,0.2385,-0.00987,10.0521,1600.0
5,A,0.3,0.48,0.21,0.01,0.0,0.0,0.0,0.0,0.0,...,0.478809,0.472657,-0.639058,0.110447,0.079768,-0.301353,0.236295,-0.115868,10.1221,1600.0
6,A,0.34,0.48,0.165,0.015,0.0,0.0,0.0,0.0,0.0,...,0.467979,0.461966,-0.694697,0.116872,0.121403,-0.287187,0.276281,-0.004214,10.1414,1600.0
7,A,0.305,0.555,0.14,0.0,0.0,0.0,0.0,0.0,0.0,...,0.472277,0.466208,-0.724125,-0.018444,0.155935,-0.30075,0.132754,0.009092,10.1264,1600.0
8,A,0.37,0.495,0.13,0.005,0.0,0.0,0.0,0.0,0.0,...,0.466374,0.460381,-0.756322,0.066611,0.193208,-0.344464,0.290602,0.061446,10.0887,1600.0
9,A,0.345,0.52,0.12,0.015,0.0,0.0,0.0,0.0,0.0,...,0.453245,0.447421,-0.7009,-0.060815,0.23277,-0.283501,0.121288,0.071633,10.1591,1600.0


In [114]:
# Display the combined dataframe that was read from the phone_accel/ directory.
df

SAMPLE,ACTIVITY,X0,X1,X2,X3,X4,X5,X6,X7,X8,...,ZMFCC11,ZMFCC12,XYCOS,XZCOS,YZCOS,XYCOR,XZCOR,YZCOR,RESULTANT,class
0,A,0.235,0.470,0.275,0.020,0.000,0.0,0.0,0.0,0.0,...,0.486106,0.479859,-0.550668,0.049864,0.121354,-0.251024,0.164468,-0.110722,10.05180,1600.0
1,A,0.275,0.440,0.270,0.015,0.000,0.0,0.0,0.0,0.0,...,0.479571,0.473409,-0.633171,0.072129,0.161492,-0.386416,0.215680,-0.034375,10.11710,1600.0
2,A,0.320,0.430,0.245,0.000,0.005,0.0,0.0,0.0,0.0,...,0.483005,0.476798,-0.659493,0.087043,0.162157,-0.325151,0.272380,-0.077274,9.98384,1600.0
3,A,0.315,0.495,0.185,0.005,0.000,0.0,0.0,0.0,0.0,...,0.480711,0.474534,-0.712081,0.003810,0.210015,-0.364285,0.203131,0.015328,10.10600,1600.0
4,A,0.215,0.455,0.325,0.005,0.000,0.0,0.0,0.0,0.0,...,0.468836,0.462811,-0.534933,0.047553,0.275833,-0.216423,0.238500,-0.009870,10.05210,1600.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23069,S,1.000,0.000,0.000,0.000,0.000,0.0,0.0,0.0,0.0,...,0.208123,0.205448,0.965885,-0.006364,-0.192099,-0.708716,0.720199,-0.651551,9.40653,1650.0
23070,S,1.000,0.000,0.000,0.000,0.000,0.0,0.0,0.0,0.0,...,0.247499,0.244319,0.965439,0.861812,0.919008,-0.792718,-0.641690,0.547467,9.41056,1650.0
23071,S,1.000,0.000,0.000,0.000,0.000,0.0,0.0,0.0,0.0,...,0.320125,0.316012,0.942726,0.542231,0.357909,-0.542299,0.309224,-0.571912,9.44613,1650.0
23072,S,1.000,0.000,0.000,0.000,0.000,0.0,0.0,0.0,0.0,...,0.276562,0.273008,0.932298,-0.270526,-0.518026,-0.797365,0.813487,-0.677182,9.44593,1650.0
