In [2]:
# Mount Google Drive inside the Colab VM; the CSV files loaded later in this
# notebook are read from paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import numpy as np
from sklearn.preprocessing import Normalizer
from sklearn import svm, metrics
from sklearn.ensemble import RandomForestClassifier

In [4]:
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
from hyperopt.pyll.base import scope
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, StratifiedKFold

In [5]:
def printbest_model(param_space, max_evals=15):
    """Run a TPE hyperparameter search and return the best candidate model.

    The search minimises the loss reported by ``objective`` (defined in this
    notebook) over ``param_space``.

    Args:
        param_space: hyperopt search space (dict of ``hp.*`` expressions).
        max_evals: number of configurations to evaluate (default 15).

    Returns:
        The ``'model'`` entry stored by ``objective`` for the trial with the
        lowest loss. ``objective`` stores the estimator instance it built, so
        the caller is expected to fit it before predicting.
    """
    # Local import so the function works without touching the import cell.
    from hyperopt import space_eval

    hypopt_trials = Trials()
    best_assignment = fmin(fn=objective, space=param_space, algo=tpe.suggest,
                           max_evals=max_evals, trials=hypopt_trials)

    # fmin reports hp.choice parameters as *indices* into their option lists;
    # space_eval maps them back to the actual hyperparameter values so the
    # printed configuration is directly usable.
    best_params = space_eval(param_space, best_assignment)

    # Read loss and model from the same trial record so they always agree.
    best_result = hypopt_trials.best_trial['result']

    print(best_params)
    print(best_result['loss'])

    return best_result['model']

In [6]:
def objective(params):
    """Hyperopt objective: score one random-forest configuration.

    Builds a ``RandomForestClassifier`` from ``params`` and evaluates it with
    5-fold ``cross_val_score`` on the notebook-level ``X_train`` / ``y_train``
    globals (they must exist before the search is started).

    Returns:
        dict with the negated mean CV score as ``'loss'`` (hyperopt minimises),
        ``STATUS_OK`` as ``'status'`` and the estimator instance as ``'model'``.
    """
    candidate = RandomForestClassifier(**params, n_jobs=-1, random_state=123)
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=5)
    mean_score = fold_scores.mean()
    return {'loss': -mean_score, 'status': STATUS_OK, 'model': candidate}

In [7]:
def get_f1_score(X_train, y_train, X_test, y_test, model):
    """Fit ``model`` on the training split and report macro F1 on the test split.

    Both feature matrices are passed through a fresh ``Normalizer`` before
    use. The given ``model`` is fitted in place.

    Args:
        X_train, y_train: training features and labels (same number of rows).
        X_test, y_test: evaluation features and labels (same number of rows).
        model: estimator exposing ``fit`` and ``predict``.

    Returns:
        Macro-averaged F1 score scaled to a percentage (0-100).
    """
    # Each split must carry exactly one label per sample.
    assert X_train.shape[0] == y_train.shape[0]
    assert X_test.shape[0] == y_test.shape[0]

    train_scaled = Normalizer().fit_transform(X_train)
    test_scaled = Normalizer().fit_transform(X_test)

    model.fit(train_scaled, y_train)
    predicted = model.predict(test_scaled)
    macro_f1_pct = metrics.f1_score(y_test, predicted, average='macro') * 100

    print(f'f1-score on features: {macro_f1_pct} \n')
    return macro_f1_pct

In [22]:
def get_stratified_kfold_f1score(X_train, y_train, model, n_splits=5):
    """Estimate macro F1 with stratified k-fold cross-validation.

    Each fold is scored by ``get_f1_score`` (which also prints the per-fold
    value); the mean over folds is printed and returned.

    Args:
        X_train: feature array, indexable by an integer index array.
        y_train: label array aligned with ``X_train``.
        model: estimator with ``fit``/``predict``; it is re-fitted on every fold.
        n_splits: number of stratified folds (default 5).

    Returns:
        Mean of the per-fold macro F1 percentages. Earlier versions only
        printed this value and returned None.
    """
    skf = StratifiedKFold(n_splits=n_splits)
    f1_scores = []
    for train_index, test_index in skf.split(X_train, y_train):
        X_train_fold, X_test_fold = X_train[train_index], X_train[test_index]
        y_train_fold, y_test_fold = y_train[train_index], y_train[test_index]

        f1_scores.append(
            get_f1_score(X_train_fold, y_train_fold, X_test_fold, y_test_fold, model)
        )

    mean_f1 = np.mean(f1_scores)
    print(f'stratified kfold f1score: {mean_f1}')
    return mean_f1

In [9]:
def feature_sum(vec_list):
    """Return the element-wise sum of a sequence of equal-length vectors.

    Args:
        vec_list: non-empty sequence (list or ndarray) of vectors that all
            have the same shape. The input is not modified.

    Returns:
        The summed vector as a plain Python list (or a Python scalar when
        ``vec_list`` is a flat sequence of numbers).

    Raises:
        ValueError: if ``vec_list`` is empty or its rows have differing
            lengths. The previous implementation printed the data and called
            ``exit()`` from a bare ``except``, which terminates the kernel.
    """
    stacked = np.asarray(vec_list)
    if stacked.ndim == 0 or stacked.shape[0] == 0:
        raise ValueError('feature_sum() requires at least one vector')
    if stacked.dtype == object:
        # Older NumPy builds turn ragged input into an object array instead
        # of raising; reject it explicitly rather than summing Python objects.
        raise ValueError('feature_sum() requires vectors of identical shape')

    # Accumulate in the array's own dtype, as the original in-place loop did.
    return stacked.sum(axis=0, dtype=stacked.dtype).tolist()


def mean_feature_sum(vec_list):
    """Return the element-wise mean of the vectors in ``vec_list``.

    The vectors are summed with ``feature_sum`` and the total is divided by
    the number of vectors; the result is returned as a plain Python list.
    """
    count = len(vec_list)
    total = np.array(feature_sum(vec_list))
    return (total / count).tolist()

In [10]:
def get_svm_wordembds():
    """Print a banner and return an unfitted polynomial-kernel ``svm.SVC``.

    The classifier is created with ``random_state=123``.
    """
    print('................ svm classifier.........')
    return svm.SVC(kernel='poly', random_state=123)

In [11]:
def get_rfc_wordembds():
    """Print a banner and return an unfitted 200-tree ``RandomForestClassifier``.

    The classifier is created with ``random_state=123``.
    """
    print('................ rfc classifier.........')
    return RandomForestClassifier(n_estimators=200, random_state=123)

In [12]:
def perform_classification_on_rawfeatures(X_train, y_train, X_test, y_test):
    """Score the baseline SVM and random-forest classifiers on the given split.

    For each classifier (SVM first, then random forest) a fresh model is
    built and evaluated with ``get_f1_score``, which prints the macro F1
    percentage. Nothing is returned.
    """
    print('................ before clustering.........')

    # Each factory prints its own banner right before its model is scored.
    for build_model in (get_svm_wordembds, get_rfc_wordembds):
        get_f1_score(X_train, y_train, X_test, y_test, build_model())

In [13]:
def extract_feature_info_lstmdata(label_cnt, step_cnt, features, subject_activity_data, channel_cnt=6):
    """Collapse per-step, per-channel features into one vector per sample.

    Sample ``idx`` owns rows ``idx*step_cnt`` up to ``(idx+1)*step_cnt`` along
    axis 1 of ``features``. For each sample the rows are averaged per channel
    and the channel means are then averaged into a single vector.

    Args:
        label_cnt: number of samples to extract.
        step_cnt: number of consecutive rows belonging to one sample.
        features: array indexed as ``features[channel, row, feature]``.
        subject_activity_data: row-aligned metadata; column 1 of a sample's
            first row is used as its class label.
        channel_cnt: number of leading channels of ``features`` to average
            (default 6, the value that used to be hard-coded).

    Returns:
        ``(X, y)``: list of averaged feature vectors (lists) and list of labels.
    """
    X = []
    y = []

    for idx in range(label_cnt):
        lower_lim = idx * step_cnt
        upper_lim = lower_lim + step_cnt

        # The label is read from the first row of the sample's block.
        y.append(subject_activity_data[lower_lim][1])

        channel_means = [
            mean_feature_sum(features[channel, lower_lim:upper_lim, :])
            for channel in range(channel_cnt)
        ]
        X.append(mean_feature_sum(channel_means))

    return X, y

In [14]:
def perform_tuning_on_rawfeatures():
    """Tune a random forest over a fixed search space.

    Builds the hyperopt search space for ``RandomForestClassifier`` and
    delegates the search to ``printbest_model``.

    Returns:
        Whatever ``printbest_model`` returns for the best trial.
    """
    param_space_rfc = {
        'criterion': hp.choice('criterion', ['gini', 'entropy']),
        'max_depth': hp.choice('max_depth', [10, 13, 15, 17, 20]),
        'min_samples_split': hp.choice('min_samples_split', [4, 5, 6]),
        'min_samples_leaf': hp.choice('min_samples_leaf', [2, 3]),
        'max_features': hp.choice('max_features', ['log2']),
        # Must be the boolean True, not the string 'True': scikit-learn
        # versions that validate constructor parameters reject a str here.
        'bootstrap': hp.choice('bootstrap', [True]),
        'n_estimators': hp.choice('n_estimators', [450, 500, 520, 550, 600])
    }

    best_model_rfc = printbest_model(param_space_rfc)
    return best_model_rfc

In [15]:
def perform_clf(features_train, features_test, subject_activity_data_train, subject_activity_data_test,
                train_label_cnt=7352, test_label_cnt=2947):
    """Build per-sample feature matrices and score the baseline classifiers.

    Args:
        features_train, features_test: arrays indexed as
            ``[channel, row, feature]``.
        subject_activity_data_train, subject_activity_data_test: row-aligned
            metadata from which the class labels are read.
        train_label_cnt: number of training samples (default 7352, the value
            that used to be hard-coded; presumably the UCI HAR train size).
        test_label_cnt: number of test samples (default 2947, previously
            hard-coded as well).

    Returns:
        ``(X_train, y_train, X_test, y_test)`` with float32 feature matrices
        of shape ``(n_samples, feature_dim)`` and int32 label vectors.
    """
    feature_dim = features_train.shape[2]
    # Rows per sample are derived from the training data and reused for the
    # test data, exactly as before.
    step_cnt = features_train.shape[1] // train_label_cnt

    X_train, y_train = extract_feature_info_lstmdata(
        train_label_cnt, step_cnt, features_train, subject_activity_data_train)
    X_test, y_test = extract_feature_info_lstmdata(
        test_label_cnt, step_cnt, features_test, subject_activity_data_test)

    X_train = np.array(X_train).reshape(-1, feature_dim).astype('float32')
    y_train = np.array(y_train).astype('int32')
    X_test = np.array(X_test).reshape(-1, feature_dim).astype('float32')
    y_test = np.array(y_test).astype('int32')

    perform_classification_on_rawfeatures(X_train, y_train, X_test, y_test)

    return X_train, y_train, X_test, y_test

In [16]:
# Load the label/subject metadata and the LSTM-derived sensor features for
# both splits from Google Drive.
subject_activity_data_train = np.loadtxt(
    '/content/drive/MyDrive/lstm_data/activity_subject_data_train.csv',
    delimiter=',',
)
sensor_features_train = np.loadtxt(
    '/content/drive/MyDrive/lstm_data/UCIHAR_sensor_features_lstm_tuned_train.csv',
    delimiter=',',
)
subject_activity_data_test = np.loadtxt(
    '/content/drive/MyDrive/lstm_data/activity_subject_data_test.csv',
    delimiter=',',
)
sensor_features_test = np.loadtxt(
    '/content/drive/MyDrive/lstm_data/UCIHAR_sensor_features_lstm_tuned_test.csv',
    delimiter=',',
)

# The feature CSVs are split into 6 equal row blocks, one per channel, and
# reshaped to (6, rows_per_channel, feature_dim).
feature_dim = sensor_features_train.shape[1]
train_channel_len = sensor_features_train.shape[0] // 6
test_channel_len = sensor_features_test.shape[0] // 6

features_train = sensor_features_train.reshape(6, train_channel_len, feature_dim)
features_test = sensor_features_test.reshape(6, test_channel_len, feature_dim)

In [17]:
# Build the per-sample train/test matrices and print the baseline scores.
# The resulting X_train / y_train are notebook globals that objective() reads
# during hyperparameter tuning.
X_train, y_train, X_test, y_test = perform_clf(features_train, features_test, subject_activity_data_train, subject_activity_data_test)

................ before clustering.........
................ svm classifier.........
f1-score on features: 80.48088742279141 

................ rfc classifier.........
f1-score on features: 79.76636250264872 



In [18]:
#best_model_rfc = perform_tuning_on_rawfeatures()
#best_model_rfc = RandomForestClassifier(n_estimators=450,criterion='entropy', max_depth=20, min_samples_split=6, min_samples_leaf=2, max_features='log2', bootstrap=True , n_jobs=-1, random_state=123)
#get_f1_score(X_train, y_train, X_test, y_test, best_model_rfc)

In [23]:
#perform_classification_on_rawfeatures(X_train, y_train, X_test, y_test)
# Cross-validate the baseline random forest on the training split only.
get_stratified_kfold_f1score(X_train, y_train, get_rfc_wordembds())

................ rfc classifier.........
f1-score on features: 81.61649053489421 

f1-score on features: 77.61973781692694 

f1-score on features: 76.83806000795195 

f1-score on features: 76.91039285161033 

f1-score on features: 80.77099093977664 

stratified kfold f1score: 78.75113443023201
