In [None]:
# Code running experiments with different models and parameters
import pandas as pd
from pandas import read_csv
from pandas import DataFrame
from sklearn.metrics import balanced_accuracy_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder
from numpy import argmax
import numpy as np

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [None]:
## Load the training split. A separate test set is kept aside so that models
## (e.g. a semi-supervised counterpart) can be compared on the same data.

train_df = pd.read_csv('../data/processed/train.csv')
# test_df = pd.read_csv('../data/processed/test.csv')

# Features are every column except the label ('state') and 'name', which is
# used as the grouping key further down in this notebook.
X_train = train_df.drop(columns=['state', 'name'])
# Labels are kept as a single-column DataFrame rather than a Series.
y_train = train_df['state'].to_frame()

# X_test = test_df.drop(columns=['state', 'name'])
# y_test = test_df['state'].to_frame()

In [None]:
def define_models(models=None):
    """Register the classifiers to evaluate and return them keyed by short name.

    Parameters
    ----------
    models : dict, optional
        Existing registry to extend in place. When omitted a fresh dict is
        created on every call; a ``dict()`` default would be created once and
        then shared (and mutated) by all callers.

    Returns
    -------
    dict
        The registry, mapping a short name to an unfitted estimator built with
        its default hyperparameters.
    """
    if models is None:
        models = {}

    # nonlinear models
    models['svm'] = SVC()
    # ensemble models
    models['gbm'] = GradientBoostingClassifier()
    # neural network
    models['mlp'] = MLPClassifier()

    # Candidates that are imported but currently disabled:
    #     models['knn'] = KNeighborsClassifier()
    #     models['cart'] = DecisionTreeClassifier()
    #     models['bayes'] = GaussianNB()
    #     models['mnb'] = MultinomialNB()
    #     models['cnb'] = ComplementNB()
    #     models['rf'] = RandomForestClassifier()
    #     models['et'] = ExtraTreesClassifier()
    #     models['bag'] = BaggingClassifier()

    print('Defined %d models' % len(models))
    return models

# print and plot the results
def summarize_results(results, maximize=True):
    """Print one ``Name=..., Score=...`` line per entry of ``results``, ranked by score.

    Parameters
    ----------
    results : dict
        Maps a model name to its (numeric) score.
    maximize : bool, default True
        When true the highest score is printed first, otherwise the lowest.
    """
    # Rank ascending first; flipping the ascending list (rather than sorting
    # with reverse=True) keeps the original ordering of tied scores.
    ranking = sorted(results.items(), key=lambda pair: pair[1])
    if maximize:
        ranking = ranking[::-1]

    print()
    for entry in ranking:
        print('Name=%s, Score=%.3f' % (entry[0], entry[1]))

In [None]:
# Build the model registry once at module level; the cell that loops over the
# training files passes this dict to evaluate_models().
models = define_models()

In [None]:
from sklearn.model_selection import LeaveOneGroupOut

def evaluate_model(trainX, trainy, testX, testy, model, labels=None):
    """Fit ``model`` on one training fold and compute F1 scores on the test fold.

    Parameters
    ----------
    trainX, testX : array-like
        Feature matrices for the training and test folds.
    trainy, testy : numpy.ndarray
        Targets as 2-D arrays; only the first column is used.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in place.
    labels : array-like, optional
        Classes to report in the per-class F1 vector, in this order. Defaults to
        the sorted union of the labels found in ``trainy`` and ``testy``.

    Returns
    -------
    tuple
        ``(f1, f1_macro, f1_micro)``: the per-class F1 array (one entry per
        element of ``labels``), the macro-averaged F1 and the micro-averaged F1.
    """
    # The targets come in as single-column 2-D arrays; flatten them to 1-D.
    trainy, testy = trainy[:, 0], testy[:, 0]

    if labels is None:
        # Without an explicit label list, f1_score(average=None) only reports
        # the classes present in this fold's y_true/y_pred, so folds that miss
        # a class would return shorter, misaligned vectors that cannot be
        # averaged position-wise. When train and test together cover the whole
        # dataset (as in a cross-validation split) this union is the same for
        # every fold.
        labels = np.unique(np.concatenate([trainy, testy]))

    # fit the model
    model.fit(trainX, trainy)
    # make predictions
    yhat = model.predict(testX)

    # evaluate predictions
    # The macro/micro averages are left over the classes present in the fold,
    # exactly as before; only the per-class vector is pinned to ``labels``.
    f1_macro = f1_score(testy, yhat, average='macro')
    f1_micro = f1_score(testy, yhat, average='micro')
    # NOTE: a class absent from both testy and yhat gets scikit-learn's
    # zero-division value (0.0 with a warning by default).
    f1 = f1_score(testy, yhat, labels=labels, average=None)

    return f1, f1_macro, f1_micro

def run_logo(clf, X_all, y_all, groups):
    """Score ``clf`` with leave-one-group-out cross-validation.

    Parameters
    ----------
    clf : estimator
        Classifier handed to ``evaluate_model`` for every fold (refitted each time).
    X_all, y_all : pandas objects exposing ``.values``
        Full feature table and target table; rows are selected per fold by position.
    groups : array-like
        Group label of every row; each distinct group is held out once.

    Returns
    -------
    tuple
        ``(per_class_f1, [macro_f1], [micro_f1])`` averaged over the folds, all
        as plain Python lists.
    """
    features = X_all.values
    targets = y_all.values
    splitter = LeaveOneGroupOut()

    # One (f1, f1_macro, f1_micro) triple per held-out group.
    fold_scores = [
        evaluate_model(features[fit_idx], targets[fit_idx],
                       features[held_idx], targets[held_idx], clf)
        for fit_idx, held_idx in splitter.split(X_all, groups=groups)
    ]
    per_class, macro, micro = zip(*fold_scores)

    mean_per_class = np.average(per_class, axis=0).tolist()
    mean_macro = np.average(macro).tolist()
    mean_micro = np.average(micro).tolist()
    # The scalar averages are wrapped in lists so that callers can extend() a
    # flat row with each element of the returned tuple.
    return mean_per_class, [mean_macro], [mean_micro]

def evaluate_models(trainX, trainy, models, groups):
    """Run ``run_logo`` for every model and collect the scores by model name.

    Parameters
    ----------
    trainX, trainy : feature table and target table forwarded to ``run_logo``.
    models : dict
        Maps a model name to the estimator to evaluate.
    groups : array-like
        Group label per row, forwarded to ``run_logo``.

    Returns
    -------
    dict
        Maps each model name to the value returned by ``run_logo``.
    """
    results = {}
    for name in models:
        scores = run_logo(models[name], trainX, trainy, groups)
        results[name] = scores
        # show progress as each model finishes
        print(name, scores)
    return results

In [None]:
## Run LOGO CV for all models with no hyperparameter tuning and show f1-scores
## These results serve as a baseline


In [None]:
from os import listdir
from os import path
import re
import csv

from os import makedirs

source = '../data/processed/sklearn'
dest = 'results'

# Training files are named train<a>_<b>_<c>.csv. The captured group identifies
# the combination and becomes the first column of every result row. The dot is
# escaped and the whole name must match, so look-alike names are skipped.
train_file_re = re.compile(r'train([0-9]+_[0-9]+_[0-9]+)\.csv')

# One column per class followed by the averaged scores. The fourth class column
# used to be labelled 'f1-2' a second time; files created before this fix still
# carry the old header line.
header = ['comb', 'f1-0', 'f1-1', 'f1-2', 'f1-3', 'macro f1', 'micro f1']

# Sorted so that every run processes the files (and appends rows) in the same
# order; listdir() returns entries in arbitrary order.
train_files = sorted(name for name in listdir(source) if train_file_re.fullmatch(name))

# The per-model result files are written here; create the folder on first use.
makedirs(dest, exist_ok=True)

for i, name in enumerate(train_files, start=1):
    filename = path.join(source, name)
    combi = train_file_re.fullmatch(name).group(1)

    print(i, 'out of', len(train_files), 'files')
    print('Reading file', name)

    train_df = pd.read_csv(filename)

    X_train = train_df.drop(['state', 'name'], axis=1)
    y_train = pd.DataFrame(train_df['state'])
    groups = train_df['name']

    # counting the number of samples per class (at least four slots are
    # reported); bincount rejects negative labels instead of silently counting
    # them from the end of the list.
    freq = np.bincount(y_train['state'].values, minlength=4).tolist()

    print('class frequencies', freq)

    results = evaluate_models(X_train, y_train, models, groups)

    # save one row per model: combination id followed by the flattened scores
    for key in results:
        row = [combi]
        for e in results[key]:
            row.extend(e)

        outFilename = path.join(dest, key + '.csv')
        is_new_file = not path.isfile(outFilename)
        if is_new_file:
            print('creating new file', outFilename)
        else:
            print('opening existing file', outFilename)

        # newline='' lets the csv module control line endings (avoids blank
        # lines between rows on Windows).
        with open(outFilename, 'a', newline='') as outFile:
            writer = csv.writer(outFile)
            if is_new_file:
                writer.writerow(header)
            writer.writerow(row)



In [None]:
# experiments keras neural network models
# from numpy import mean
# from numpy import std
# from numpy import dstack
# import numpy as np
# from pandas import read_csv
# import tensorflow as tf
# from keras.models import Sequential
# from keras.layers import Dense
# from keras.layers import Flatten
# from keras.layers import Dropout
# from keras.layers import LSTM
# from keras.utils import to_categorical
# from keras.layers import Bidirectional

# from matplotlib import pyplot

# from sklearn.metrics import confusion_matrix
# from sklearn.metrics import classification_report

In [None]:
# load a single file as a numpy array
# def load_file(filepath):
#     dataframe = read_csv(filepath, header=None)
#     return dataframe.values

# # load a list of files and return as a 3d numpy array
# def load_group(filenames, prefix=''):
#     loaded = list()
#     for name in filenames:
#         data = load_file(prefix + name)
#         loaded.append(data)
#     # stack group so that features are the 3rd dimension
#     print('loaded', len(loaded[0]))
#     print('loaded', loaded)
#     loaded = dstack(loaded)
#     print('stacked', loaded.shape)
#     print('stacked', loaded)
#     return loaded

# # load a dataset group, such as train or test
# def load_dataset_group(group, freq, win, prefix=''):
#     filepath = prefix + group + '/'
#     # load the three total-acceleration files as a single array
#     filenames = list()
#     # total acceleration
#     filenames += ['total_acc_x_'+group+'_'+win+'_'+freq+'.csv', 'total_acc_y_'+group+'_'+win+'_'+freq+'.csv', 'total_acc_z_'+group+'_'+win+'_'+freq+'.csv']
#     # load input data
#     X = load_group(filenames, filepath)
#     # load class output
#     y = load_file(prefix + group + '/state_'+group+'_'+win+'_'+freq+'.csv')
#     print('X:', filenames)
#     print('y:', prefix + group + '/state_'+group+'_'+win+'_'+freq+'.csv')
#     return X, y

# # load the dataset, returns train and test X and y elements
# def load_dataset(freq, win, prefix=''):
#     # load all train
#     trainX, trainy = load_dataset_group('train', freq, win, prefix)
#     print(trainX.shape, trainy.shape)
#     # load all test
#     testX, testy = load_dataset_group('test', freq, win, prefix)
#     print(testX.shape, testy.shape)
#     # zero-offset class values
# #     trainy = trainy - 1
# #     testy = testy - 1
#     # one hot encode y
#     trainy = to_categorical(trainy)
#     y_true = testy
#     testy = to_categorical(testy)
#     print(trainX.shape, trainy.shape, testX.shape, testy.shape)
#     return trainX, trainy, testX, testy, y_true

# # summarize scores
# def summarize_results(scores):
#     print(scores)
#     m, s = mean(scores), std(scores)
#     print('Accuracy: %.3f%% (+/-%.3f)' % (m, s))

In [None]:
# lstm model

# fit and evaluate a model
# def evaluate_model(trainX, trainy, testX, testy):
#     verbose, epochs, batch_size = 1, 20, 10
#     n_timesteps, n_features, n_outputs = trainX.shape[1], trainX.shape[2], trainy.shape[1]
#     model = Sequential()
#     model.add(LSTM(100, input_shape=(n_timesteps,n_features)))
#     model.add(Dropout(0.5))
#     model.add(Dense(100, activation='relu'))
#     model.add(Dense(n_outputs, activation='softmax'))
#     model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])
#     # fit network
# #     model.fit(trainX, trainy, validation_split=0.2, epochs=epochs, batch_size=batch_size, verbose=verbose)
#     model.fit(trainX, trainy, epochs=epochs, batch_size=batch_size, verbose=verbose)
#     # evaluate model
#     _, accuracy = model.evaluate(testX, testy, batch_size=batch_size, verbose=1)
#     prediction = model.predict_classes(testX)
#     return prediction, accuracy, model

# # run an experiment
# def run_experiment(freq, win, repeats=1):
#     # load data
#     trainX, trainy, testX, testy, y_true = load_dataset(freq=freq, win=win, prefix='../data/processed/')
# #     score = evaluate_model(trainX, trainy, testX, testy)
# #     print(score)
#     # repeat experiment
#     scores = list()
# #     for r in range(repeats):
#     pred_classes, score, model = evaluate_model(trainX, trainy, testX, testy)
#     print(score)
#     score = score * 100.0
#     print('>#%d: %.3f' % (1, score))
#     scores.append(score)
# #     print(classification_report(y_true, pred_classes))
# #     print(confusion_matrix(y_true, pred_classes))
#     # summarize results
#     summarize_results(scores)
#     return pred_classes, y_true, model

# # run the experiment
# freq = '50'
# win = '50'
# pred_classes, y_true, model = run_experiment(freq, win)