In [1]:
import pandas as pd
from pandas import read_csv
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score, auc
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neural_network import MLPClassifier

# start with linear classifiers, non-linear ones and eventually the more complex neural nets

In [2]:
# load a single file as a numpy array
# def load_file(filepath):
#     dataframe = read_csv(filepath, header=None, delim_whitespace=True)
#     return dataframe.values

# load a dataset group, such as train or test
# def load_dataset_group(group, prefix=''):
#     # load input data
#     X = load_file(prefix + group + '/X_'+group+'.txt')
#     # load class output
#     y = load_file(prefix + group + '/y_'+group+'.txt')
#     return X, y

# load the dataset, returns train and test X and y elements
# def load_dataset(prefix=''):
#     # load all train
#     trainX, trainy = load_dataset_group('train', prefix + 'HARDataset/')
#     print(trainX.shape, trainy.shape)
#     # load all test
#     testX, testy = load_dataset_group('test', prefix + 'HARDataset/')
#     print(testX.shape, testy.shape)
#     # flatten y
#     trainy, testy = trainy[:,0], testy[:,0]
#     print(trainX.shape, trainy.shape, testX.shape, testy.shape)
#     return trainX, trainy, testX, testy

# def load_dataset(prefix=''):
#     # load all train
#     train = load_dataset_group('train', prefix + 'HARDataset/')
#     print(trainX.shape, trainy.shape)
#     # load all test
#     testX, testy = load_dataset_group('test', prefix + 'HARDataset/')
#     print(testX.shape, testy.shape)
#     # flatten y
#     trainy, testy = trainy[:,0], testy[:,0]
#     print(trainX.shape, trainy.shape, testX.shape, testy.shape)
#     return trainX, trainy, testX, testy

In [3]:
# path = '../data/processed/flori5hz.csv'
# values = load_file(path)
# #X, y = load_dataset_group()
# values

In [2]:
train_df = pd.read_csv('../data/processed/train.csv')

test_df = pd.read_csv('../data/processed/test.csv')

In [3]:
X_train = train_df.drop('state', axis=1)
y_train = train_df['state']
X_test = test_df.drop('state', axis=1)
y_test = test_df['state']

In [4]:
# create a dict of standard models to evaluate {name:object}
def define_models(models=dict()):
    # nonlinear models
    models['knn'] = KNeighborsClassifier(n_neighbors=15)
    models['cart'] = DecisionTreeClassifier()
    models['svm'] = SVC()
    models['bayes'] = GaussianNB()
    # ensemble models
    models['bag'] = BaggingClassifier(n_estimators=100)
    models['rf'] = RandomForestClassifier(n_estimators=100)
    models['et'] = ExtraTreesClassifier(n_estimators=100)
    models['gbm'] = GradientBoostingClassifier(n_estimators=100)
    # sgd is sensitive to feature scaling
    models['sgd'] = SGDClassifier()
    models['gp'] = GaussianProcessClassifier()
    models['mlp'] = MLPClassifier()
    print('Defined %d models' % len(models))
    return models

In [5]:
# get model list
models = define_models()

Defined 11 models


In [6]:
# evaluate a single model
def evaluate_model(trainX, trainy, testX, testy, model):
    # fit the model
    model.fit(trainX, trainy)
    # make predictions
    yhat = model.predict(testX)
    # evaluate predictions
    accuracy = balanced_accuracy_score(testy, yhat)
    #roc_auc = roc_auc_score(testy, yhat)
    return accuracy * 100.0
    #return roc_auc * 100.0

# evaluate a dict of models {name:object}, returns {name:score}
def evaluate_models(trainX, trainy, testX, testy, models):
    results = dict()
    for name, model in models.items():
        # evaluate the model
        results[name] = evaluate_model(trainX, trainy, testX, testy, model)
        # show process
        print('>%s: %.3f' % (name, results[name]))
    return results

# print and plot the results
def summarize_results(results, maximize=True):
    # create a list of (name, mean(scores)) tuples
    mean_scores = [(k,v) for k,v in results.items()]
    # sort tuples by mean score
    mean_scores = sorted(mean_scores, key=lambda x: x[1])
    # reverse for descending order (e.g. for accuracy)
    if maximize:
        mean_scores = list(reversed(mean_scores))
    print()
    for name, score in mean_scores:
        print('Name=%s, Score=%.3f' % (name, score))

In [7]:
# evaluate models
results = evaluate_models(X_train, y_train, X_test, y_test, models)

>rf: 55.820




>svm: 25.000




>mlp: 61.202
>gp: 44.037
>cart: 49.041
>et: 60.189
>knn: 42.058
>bag: 59.287
>bayes: 40.601
>gbm: 64.374
>sgd: 25.848




In [14]:
### summarize results
summarize_results(results)


Name=gbm, Score=64.542
Name=et, Score=60.521
Name=bag, Score=58.618
Name=rf, Score=56.030
Name=cart, Score=49.793
Name=bayes, Score=40.601
Name=knn, Score=32.264
Name=svm, Score=25.000
Name=rnn, Score=25.000


In [53]:
# Random Forest

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)
y_pred
acc_random_forest = round(random_forest.score(X_test, y_test) * 100, 2)

cnt = 0
sitting = 0
standing = 0
for i in range(0, len(y_pred)):
    if not y_pred[i] == y_test[i]:
        cnt += 1
        if y_pred[i] == 1:
            sitting += 1
        else:
            standing += 1
        print('predicted', y_pred[i], 'actual', y_test[i])
        
print('\nTotal:', len(y_test))
print('Wrong:', cnt)
print('Accuracy:', acc_random_forest)
print('False positive:', sitting)
print('False negative:', standing)

predicted 0 actual 1
predicted 0 actual 1
predicted 0 actual 1
predicted 0 actual 1
predicted 0 actual 1
predicted 0 actual 1
predicted 1 actual 0
predicted 1 actual 0
predicted 1 actual 0
predicted 1 actual 0
predicted 1 actual 0
predicted 1 actual 0
predicted 1 actual 0
predicted 1 actual 0
predicted 1 actual 0
predicted 1 actual 0
predicted 1 actual 0
predicted 1 actual 0
predicted 1 actual 0
predicted 1 actual 0
predicted 1 actual 0
predicted 1 actual 0
predicted 1 actual 0
predicted 1 actual 0
predicted 1 actual 0
predicted 1 actual 0
predicted 1 actual 0
predicted 1 actual 0
predicted 1 actual 0
predicted 1 actual 0
predicted 1 actual 0
predicted 0 actual 1
predicted 0 actual 1
predicted 1 actual 0
predicted 1 actual 0
predicted 1 actual 0
predicted 1 actual 0
predicted 1 actual 0
predicted 1 actual 0
predicted 1 actual 0
predicted 1 actual 0
predicted 1 actual 0
predicted 1 actual 0
predicted 1 actual 0
predicted 1 actual 0
predicted 1 actual 0
predicted 1 actual 0
predicted 1 a

In [33]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV

# Choose the type of classifier. 
clf = RandomForestClassifier()

# Choose some parameter combinations to try
parameters = {'n_estimators': [4, 6, 9], 
              'max_features': ['log2', 'sqrt','auto'], 
              'criterion': ['entropy', 'gini'],
              'max_depth': [2, 3, 5, 10], 
              'min_samples_split': [2, 3, 5],
              'min_samples_leaf': [1,5,8]
             }

# Type of scoring used to compare parameter combinations
acc_scorer = make_scorer(accuracy_score)

# Run the grid search
grid_obj = GridSearchCV(clf, parameters, scoring=acc_scorer)
grid_obj = grid_obj.fit(X_train, y_train)

# Set the clf to the best combination of parameters
clf = grid_obj.best_estimator_

# Fit the best algorithm to the data. 
clf.fit(X_train, y_train)

predictions = clf.predict(X_test)
print(accuracy_score(y_test, predictions))



0.9321824907521579


In [74]:
from sklearn.model_selection import KFold, cross_val_score
import numpy as np

y_all = train_df['state']
X_all = train_df.drop(['state'], axis=1)
            
def run_kfold(clf):
    kf = KFold(n_splits=10)
    outcomes = []
    fold = 0
    for train_index, test_index in kf.split(X_all):
        fold += 1
        X_train, X_test = X_all.values[train_index], X_all.values[test_index]
        y_train, y_test = y_all.values[train_index], y_all.values[test_index]
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        accuracy = roc_auc_score(y_test, predictions)
        outcomes.append(accuracy)
        print("Fold {0} accuracy: {1}".format(fold, accuracy))  
    mean_outcome = np.mean(outcomes)
    print("Mean Accuracy: {0}".format(mean_outcome)) 

run_kfold(clf)

Fold 1 accuracy: 0.8263403263403264
Fold 2 accuracy: 0.8705299834953236
Fold 3 accuracy: 0.8874085145861251
Fold 4 accuracy: 0.6933730715287517
Fold 5 accuracy: 0.7494215077335283
Fold 6 accuracy: 0.8207166587565259
Fold 7 accuracy: 0.7740182328190742
Fold 8 accuracy: 0.7897727272727273
Fold 9 accuracy: 0.7609765899728725
Fold 10 accuracy: 0.9246376811594202
Mean Accuracy: 0.8097195293664676


In [78]:
predictions = clf.predict(X_test)
print(roc_auc_score(y_test, predictions))

0.528693111487044
