In [1]:
import pandas as pd
from pandas import read_csv
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score, auc
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neural_network import MLPClassifier

# Plan: start with linear classifiers, then try non-linear ones, and finally the more complex neural nets.

In [2]:
# load a single file as a numpy array
# def load_file(filepath):
#     dataframe = read_csv(filepath, header=None, delim_whitespace=True)
#     return dataframe.values

# load a dataset group, such as train or test
# def load_dataset_group(group, prefix=''):
#     # load input data
#     X = load_file(prefix + group + '/X_'+group+'.txt')
#     # load class output
#     y = load_file(prefix + group + '/y_'+group+'.txt')
#     return X, y

# load the dataset, returns train and test X and y elements
# def load_dataset(prefix=''):
#     # load all train
#     trainX, trainy = load_dataset_group('train', prefix + 'HARDataset/')
#     print(trainX.shape, trainy.shape)
#     # load all test
#     testX, testy = load_dataset_group('test', prefix + 'HARDataset/')
#     print(testX.shape, testy.shape)
#     # flatten y
#     trainy, testy = trainy[:,0], testy[:,0]
#     print(trainX.shape, trainy.shape, testX.shape, testy.shape)
#     return trainX, trainy, testX, testy

# def load_dataset(prefix=''):
#     # load all train
#     train = load_dataset_group('train', prefix + 'HARDataset/')
#     print(trainX.shape, trainy.shape)
#     # load all test
#     testX, testy = load_dataset_group('test', prefix + 'HARDataset/')
#     print(testX.shape, testy.shape)
#     # flatten y
#     trainy, testy = trainy[:,0], testy[:,0]
#     print(trainX.shape, trainy.shape, testX.shape, testy.shape)
#     return trainX, trainy, testX, testy

In [3]:
# path = '../data/processed/flori5hz.csv'
# values = load_file(path)
# #X, y = load_dataset_group()
# values

In [126]:
# Load the pre-split training and test tables from the processed-data folder.
# `read_csv` is the same function as `pd.read_csv` (imported directly from pandas above).
train_df = read_csv('../data/processed/train.csv')
test_df = read_csv('../data/processed/test.csv')

In [127]:
# Targets: the 'state' column of each table.
y_train, y_test = train_df['state'], test_df['state']
# Features: every remaining column except 'name', which later cells use as the
# cross-validation group key rather than as a model input.
X_train = train_df.drop(columns=['state', 'name'])
X_test = test_df.drop(columns=['state', 'name'])

In [134]:
# create a dict of standard models to evaluate {name:object}
def define_models(models=None):
    """Populate and return a ``{name: estimator}`` dict of default classifiers.

    Parameters
    ----------
    models : dict, optional
        Existing dict to add the estimators to. When omitted, a fresh dict is
        created on every call (a ``dict()`` default would be shared between
        calls, so entries added by one call would leak into the next).

    Returns
    -------
    dict
        The same dict that was passed in (or the new one), with one unfitted
        estimator stored under each of the keys set below.
    """
    if models is None:
        models = {}
    # nonlinear models
    models['knn'] = KNeighborsClassifier()
    models['cart'] = DecisionTreeClassifier()
    models['bayes'] = GaussianNB()
    # ensemble models (library-default hyper-parameters)
    models['bag'] = BaggingClassifier()
    models['rf'] = RandomForestClassifier()
    models['et'] = ExtraTreesClassifier()
    models['gbm'] = GradientBoostingClassifier()
    models['gp'] = GaussianProcessClassifier()
    models['mlp'] = MLPClassifier()
    # Currently disabled candidates: SVC ('svm') and SGDClassifier ('sgd').
    # sgd is sensitive to feature scaling.
    print('Defined %d models' % len(models))
    return models

In [135]:
# Build the {name: estimator} dict of candidate classifiers to compare.
models = define_models()

Defined 9 models


In [136]:
# evaluate a single model
def evaluate_model(trainX, trainy, testX, testy, model):
    """Fit `model` on the training split and score it on the test split.

    The estimator is fitted in place. The score is the balanced accuracy of
    its test-set predictions, scaled to a percentage.
    (A roc_auc_score-based variant was tried previously and is not active.)
    """
    model.fit(trainX, trainy)
    predicted = model.predict(testX)
    return balanced_accuracy_score(testy, predicted) * 100.0

# evaluate a dict of models {name:object}, returns {name:score}
def evaluate_models(trainX, trainy, testX, testy, models):
    """Score every estimator in `models` and return a ``{name: score}`` dict.

    Each estimator is passed to `evaluate_model`; a progress line is printed
    as soon as its score is available.
    """
    scores = {}
    for label, estimator in models.items():
        score = evaluate_model(trainX, trainy, testX, testy, estimator)
        scores[label] = score
        # progress line, printed per model
        print('>%s: %.3f' % (label, score))
    return scores

# print the results as a ranked list
def summarize_results(results, maximize=True):
    """Print one ``Name=..., Score=...`` line per entry of `results`, ranked by score.

    Entries are sorted by ascending score; when `maximize` is true that order
    is then reversed so the highest score comes first. A blank line is printed
    before the ranking. Nothing is returned.
    """
    ranking = sorted(results.items(), key=lambda pair: pair[1])
    if maximize:
        # reverse the ascending sort (rather than sorting descending) so that
        # tied entries keep the same relative order as before
        ranking.reverse()
    print()
    for label, value in ranking:
        print('Name=%s, Score=%.3f' % (label, value))

In [137]:
# Fit every candidate on the training split and score it on the test split.
# `results` maps model name -> balanced accuracy expressed as a percentage.
results = evaluate_models(X_train, y_train, X_test, y_test, models)

>cart: 54.852
>gp: 47.742
>bag: 66.819
>et: 53.895
>bayes: 64.270




>mlp: 56.845
>knn: 51.623
>gbm: 63.306
>rf: 57.911




In [139]:
# Print the models ranked from highest to lowest score (maximize defaults to True).
summarize_results(results)


Name=bag, Score=66.819
Name=bayes, Score=64.270
Name=gbm, Score=63.306
Name=rf, Score=57.911
Name=mlp, Score=56.845
Name=cart, Score=54.852
Name=et, Score=53.895
Name=knn, Score=51.623
Name=gp, Score=47.742


In [21]:
# Random Forest

# random_forest = RandomForestClassifier(n_estimators=100)
# random_forest.fit(X_train, y_train)
# y_pred = random_forest.predict(X_test)
# y_pred
# acc_random_forest = round(random_forest.score(X_test, y_test) * 100, 2)

# cnt = 0
# sitting = 0
# standing = 0
# for i in range(0, len(y_pred)):
#     if not y_pred[i] == y_test[i]:
#         cnt += 1
#         if y_pred[i] == 1:
#             sitting += 1
#         else:
#             standing += 1
#         print('predicted', y_pred[i], 'actual', y_test[i])
        
# print('\nTotal:', len(y_test))
# print('Wrong:', cnt)
# print('Accuracy:', acc_random_forest)
# print('False positive:', sitting)
# print('False negative:', standing)

In [148]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, balanced_accuracy_score
# LeaveOneGroupOut is imported here so this cell runs on a fresh kernel
# (it was previously only imported by a later cell).
from sklearn.model_selection import GridSearchCV, LeaveOneGroupOut

# Rebuild the feature/target splits so the cell does not depend on which
# earlier cell last assigned these names.
X_train = train_df.drop(['state', 'name'], axis=1)
y_train = train_df['state']
X_test = test_df.drop(['state', 'name'], axis=1)
y_test = test_df['state']

# One cross-validation group per distinct 'name' value in the training table.
groups = train_df['name']

logo = LeaveOneGroupOut()

# Choose the type of classifier.
clf = RandomForestClassifier()

# Choose some parameter combinations to try.
# 'auto' is intentionally not listed for max_features: for classifiers it was
# an alias of 'sqrt' (a duplicate grid point) and it was removed in
# scikit-learn 1.3, where it raises an error.
parameters = {'n_estimators': [100],
              'max_features': ['log2', 'sqrt', None],
              'criterion': ['entropy', 'gini'],
              'max_depth': [2, 3, 5, 10],
              'min_samples_split': [2, 3, 5],
              'min_samples_leaf': [1, 5, 8]
             }

# Type of scoring used to compare parameter combinations
acc_scorer = make_scorer(balanced_accuracy_score)

# Run the grid search. The splitter object is passed as `cv` and the group
# labels go to `fit`; passing `logo.split(...)` would hand over a one-shot
# generator that cannot be iterated a second time.
grid_obj = GridSearchCV(clf, parameters, scoring=acc_scorer, cv=logo)
grid_obj = grid_obj.fit(X_train, y_train, groups=groups)

# Set the clf to the best combination of parameters. With the default
# refit=True, GridSearchCV has already re-fitted this estimator on the whole
# training set, so no additional clf.fit(...) call is needed.
clf = grid_obj.best_estimator_

# Score the tuned model on the held-out test table.
predictions = clf.predict(X_test)
print(balanced_accuracy_score(y_test, predictions))

ValueError: Invalid parameter criterion for estimator BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=10, n_jobs=None, oob_score=False, random_state=None,
         verbose=0, warm_start=False). Check the list of available parameters with `estimator.get_params().keys()`.

In [147]:
# Hyper-parameter combination selected by the grid search (needs a successfully fitted grid_obj).
grid_obj.best_params_

{'criterion': 'gini',
 'max_depth': 10,
 'max_features': 'log2',
 'min_samples_leaf': 1,
 'min_samples_split': 3,
 'n_estimators': 100}

In [109]:
# Estimator configured with the selected hyper-parameters (needs a successfully fitted grid_obj).
grid_obj.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='log2', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=3,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [150]:
from sklearn.model_selection import LeaveOneGroupOut
from pandas import DataFrame
import numpy as np

X_train = train_df.drop(['state', 'name'], axis=1)
y_train = train_df['state']
X_test = test_df.drop(['state', 'name'], axis=1)
y_test = test_df['state']

# Pool the train and test tables so every 'name' group can be held out once.
# pd.concat replaces DataFrame.append / Series.append, which were removed in
# pandas 2.0; the resulting rows and index are the same.
X_all = pd.concat([X_train, X_test])
y_all = pd.concat([y_train, y_test])

groups = pd.concat([train_df, test_df])['name']

clf = BaggingClassifier()

def run_logo(clf):
    """Leave-one-group-out evaluation of `clf` on the pooled data.

    For each group in the module-level `groups`, `clf` is fitted on the rows
    of all other groups and scored with balanced accuracy on the held-out
    group. A line is printed per group, followed by the mean over groups.

    Parameters
    ----------
    clf : estimator with ``fit`` and ``predict``
        Re-fitted in place once per fold.

    Returns
    -------
    list
        Balanced-accuracy score of each fold, in split order. Scores are
        collected per call, so repeated calls do not affect each other's mean.
    """
    logo = LeaveOneGroupOut()
    # Work on positional arrays: the pooled frames carry duplicate index
    # labels, while the splitter yields positional indices.
    features, labels = X_all.values, y_all.values
    fold_scores = []

    for group, (train_index, test_index) in enumerate(
            logo.split(X_all, groups=groups), start=1):
        clf.fit(features[train_index], labels[train_index])
        predictions = clf.predict(features[test_index])
        accuracy = balanced_accuracy_score(labels[test_index], predictions)
        fold_scores.append(accuracy)
        print("Group {0} accuracy: {1}".format(group, accuracy))
    mean_outcome = np.mean(fold_scores)
    print("Mean Accuracy: {0}".format(mean_outcome))
    return fold_scores

# Per-group scores of the most recent run.
outcomes = run_logo(clf)


Group 1 accuracy: 0.5016320013232272
Group 2 accuracy: 0.3587117027893027
Group 3 accuracy: 0.5619605900169492
Group 4 accuracy: 0.6390848714852511
Mean Accuracy: 0.5153472914036825


In [145]:
# NOTE(review): looks like a leftover debugging/sanity-check cell — confirm it can be removed.
print('what')

what


In [93]:
# Testing Leave-One-Group-Out CV: print the training-row indices of each fold
from sklearn.model_selection import LeaveOneGroupOut
from pandas import DataFrame

# Targets are the 'state' column; features are everything except 'state' and 'name'.
y_train = train_df['state']
y_test = test_df['state']
X_train = train_df.drop(columns=['state', 'name'])
X_test = test_df.drop(columns=['state', 'name'])

# One fold per distinct 'name' value in the training table.
groups = train_df['name']
logo = LeaveOneGroupOut()

for train, test in logo.split(X_train, y_train, groups=groups):
    # only the training indices are shown; `test` is unused here
    print("%s" % (train,))

[   0    1    2 ... 1706 1707 1708]
[1112 1113 1114 ... 2314 2315 2316]
[   0    1    2 ... 2314 2315 2316]
