#### Import necessary libraries and set display options

In [1]:
import warnings
warnings.filterwarnings("ignore")

import datetime as dt
import os
import pickle
import numpy as np
import pandas as pd
from math import ceil
from scipy import stats
import matplotlib.pyplot as plt
from detecta import detect_peaks
from scipy.signal import butter, lfilter
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
import joblib


pd.set_option('display.max_columns',100)
pd.set_option('display.max_rows',200)

In [2]:
time_domain_df = pd.read_csv('../data/time_domain_windows.csv')
freq_domain_df = pd.read_csv('../data/freq_domain_windows.csv')
time_freq_domain_df = pd.read_csv('../data/time_freq_domain_windows.csv')
dct_domain_df = pd.read_csv('../data/dct_domain_windows.csv')

### 5. Splitting and Preprocessing

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV, GroupShuffleSplit, GroupKFold, ParameterGrid, train_test_split, ShuffleSplit, BaseShuffleSplit, PredefinedSplit
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, make_scorer


In [4]:
def group_ml_pipe(X, y, groups, preprocessor, model, hyperparameters, score, 
                  randomized_iter, random_seed=1030):
    
    """A function to collate performance of multiple model runs and optimize
    hyperparameters through GridSearchCV.
    
    Parameters
    ----------
    X : Unprocessed feature matrix
    y : Target variable
    preprocessor : A column transformer object that defines preprocessing on 
                    each feature; pass None if not required
    model : Initialized model
    model_params : Dict object of hyperparameters/regularization params
                    to pass to GridSearchCV
    score : Sklearn's scorer object (or metric string) that specifies
            GridSearchCV scoring strategy
    randomized_iter : Number of random iterations to run through to 
                        pick optimized params.
      
    Returns
    -------
    trial_results : list of dictionaries with salient information of each GSCV run"""
    
    trial_results = []
    
    for i in range(1,randomized_iter+1):
        print('Running trial {}'.format(i))
        random_state = random_seed*i
        
        iter_info = {'trail_iter':i,
                     'random_state':random_state}
        
        gss = GroupShuffleSplit(n_splits=1, test_size=6, random_state=random_state)
        other_index, test_index = next(gss.split(X, y, groups=groups))

        X_other, y_other, groups_other = X.iloc[other_index], y.iloc[other_index], groups.iloc[other_index]
        X_test, y_test, groups_test = X.iloc[test_index], y.iloc[test_index], groups.iloc[test_index]

        gkf = GroupKFold(n_splits=7)

        pipe = make_pipeline(std_scaler, model)
        grid = GridSearchCV(pipe, param_grid=hyperparameters, scoring=score,
                            cv=gkf, return_train_score=True, verbose=True, n_jobs=-1)
        grid.fit(X_other, y_other, groups=groups_other)
        
        iter_info['grid'] = grid
        
        if grid.scorer_.__dict__['_sign']<0:
            best_score = abs(grid.score(X_test,y_test))
            maximized = False
        else:
            best_score = grid.score(X_test,y_test)
            maximized = True
        
        iter_info['best_test_score'] = {'score':best_score, 'maximized':maximized}
        iter_info['best_params'] = grid.best_params_
        iter_info['y_test_pred'] = grid.predict(X_test)
        iter_info['cv_results'] = grid.cv_results_
        
        trial_results.append(iter_info)
        print('Completed trial {}'.format(i))
        
    return trial_results


def gather_trial_results(df, exclftrs, target, groupftr, preprocessor, model, 
                         hyperparameters, score, trials, trial_type):
    X = df.drop(columns=exclftrs)
    y = df[target]
    groups = df[groupftr]
    trial_results = group_ml_pipe(X, y, groups, preprocessor, model, hyperparameters, score, 
                                  randomized_iter=trials)
    
    for result in trial_results:
        result['trial_type']=trial_type
    return trial_results

In [50]:
std_scaler = StandardScaler()
model = SVC()
hyperparameters = {'svc__C': [0.01, 0.1, 1, 10, 100],
                  'svc__gamma': [0.01, 0.05, 0.1, 0.5, 1]}
score = make_scorer(f1_score, average='macro')

# svc_timefreq_results = gather_trial_results(df=time_freq_domain_df, exclftrs=['epoch_start','epoch_end','class','user'],
#                                             target='class', groupftr='user', preprocessor=std_scaler, model=model,
#                                             hyperparameters=hyperparameters, score=score, 
#                                             trials=5, trial_type='time-freq')

# joblib.dump(svc_timefreq_results, '../results/svc_timefreq_results.pkl')


# svc_dct_results = gather_trial_results(df=dct_domain_df, exclftrs=['epoch_start','epoch_end','class','user'],
#                                        target='class', groupftr='user', preprocessor=std_scaler, model=model,
#                                        hyperparameters=hyperparameters, score=score, 
#                                        trials=5, trial_type='dct')

# joblib.dump(svc_dct_results, '../results/svc_dct_results.pkl')


Running trial 1
Fitting 7 folds for each of 25 candidates, totalling 175 fits
Completed trial 1
Running trial 2
Fitting 7 folds for each of 25 candidates, totalling 175 fits
Completed trial 2
Running trial 3
Fitting 7 folds for each of 25 candidates, totalling 175 fits
Completed trial 3
Running trial 4
Fitting 7 folds for each of 25 candidates, totalling 175 fits
Completed trial 4
Running trial 5
Fitting 7 folds for each of 25 candidates, totalling 175 fits
Completed trial 5
Running trial 1
Fitting 7 folds for each of 25 candidates, totalling 175 fits
Completed trial 1
Running trial 2
Fitting 7 folds for each of 25 candidates, totalling 175 fits
Completed trial 2
Running trial 3
Fitting 7 folds for each of 25 candidates, totalling 175 fits
Completed trial 3
Running trial 4
Fitting 7 folds for each of 25 candidates, totalling 175 fits
Completed trial 4
Running trial 5
Fitting 7 folds for each of 25 candidates, totalling 175 fits
Completed trial 5


['../results/svc_dct_results.pkl']

In [53]:
from sklearn.ensemble import RandomForestClassifier

std_scaler = StandardScaler()
model = RandomForestClassifier()
hyperparameters = {
    'randomforestclassifier__max_features': [0.25, 0.5, 0.75, 1.0],
    'randomforestclassifier__max_depth': [2, 4, 6, 8, 10, 12, 14]
}

score = make_scorer(f1_score, average='macro')


# rfc_timefreq_results = gather_trial_results(df=time_freq_domain_df, exclftrs=['epoch_start','epoch_end','class','user'],
#                                             target='class', groupftr='user', preprocessor=std_scaler, model=model,
#                                             hyperparameters=hyperparameters, score=score,
#                                             trials=5, trial_type='time-freq')
# joblib.dump(rfc_timefreq_results, '../results/rfc_timefreq_results.pkl')


# rfc_dct_results = gather_trial_results(df=dct_domain_df, exclftrs=['epoch_start','epoch_end','class','user'],
#                                        target='class', groupftr='user', preprocessor=std_scaler, model=model,
#                                        hyperparameters=hyperparameters, score=score,
#                                        trials=5, trial_type='dct')
# joblib.dump(rfc_dct_results, '../results/rfc_dct_results.pkl')



Running trial 1
Fitting 7 folds for each of 28 candidates, totalling 196 fits
Completed trial 1
Running trial 2
Fitting 7 folds for each of 28 candidates, totalling 196 fits
Completed trial 2
Running trial 3
Fitting 7 folds for each of 28 candidates, totalling 196 fits
Completed trial 3
Running trial 4
Fitting 7 folds for each of 28 candidates, totalling 196 fits
Completed trial 4
Running trial 5
Fitting 7 folds for each of 28 candidates, totalling 196 fits
Completed trial 5
Running trial 1
Fitting 7 folds for each of 28 candidates, totalling 196 fits
Completed trial 1
Running trial 2
Fitting 7 folds for each of 28 candidates, totalling 196 fits
Completed trial 2
Running trial 3
Fitting 7 folds for each of 28 candidates, totalling 196 fits
Completed trial 3
Running trial 4
Fitting 7 folds for each of 28 candidates, totalling 196 fits
Completed trial 4
Running trial 5
Fitting 7 folds for each of 28 candidates, totalling 196 fits
Completed trial 5


['../results/rfc_dct_results.pkl']

In [54]:
from sklearn.linear_model import LogisticRegression

alpha_arr = np.logspace(-5,5,11,base=10)

std_scaler = StandardScaler()
model = LogisticRegression()
hyperparameters = {
    'logisticregression__C': 1/alpha_arr,
    'logisticregression__penalty':['l2'],
    'logisticregression__max_iter':[10000],
    'logisticregression__multi_class':['ovr'],
    'logisticregression__solver':['liblinear']
}

score = make_scorer(f1_score, average='macro')


# lr_timefreq_results = gather_trial_results(df=time_freq_domain_df, exclftrs=['epoch_start','epoch_end','class','user'],
#                                             target='class', groupftr='user', preprocessor=std_scaler, model=model,
#                                             hyperparameters=hyperparameters, score=score,
#                                             trials=5, trial_type='time-freq')
# joblib.dump(lr_timefreq_results, '../results/lr_timefreq_results.pkl')


# lr_dct_results = gather_trial_results(df=dct_domain_df, exclftrs=['epoch_start','epoch_end','class','user'],
#                                        target='class', groupftr='user', preprocessor=std_scaler, model=model,
#                                        hyperparameters=hyperparameters, score=score,
#                                        trials=5, trial_type='dct')
# joblib.dump(lr_dct_results, '../results/lr_dct_results.pkl')


Running trial 1
Fitting 7 folds for each of 11 candidates, totalling 77 fits
Completed trial 1
Running trial 2
Fitting 7 folds for each of 11 candidates, totalling 77 fits
Completed trial 2
Running trial 3
Fitting 7 folds for each of 11 candidates, totalling 77 fits
Completed trial 3
Running trial 4
Fitting 7 folds for each of 11 candidates, totalling 77 fits
Completed trial 4
Running trial 5
Fitting 7 folds for each of 11 candidates, totalling 77 fits
Completed trial 5
Running trial 1
Fitting 7 folds for each of 11 candidates, totalling 77 fits
Completed trial 1
Running trial 2
Fitting 7 folds for each of 11 candidates, totalling 77 fits
Completed trial 2
Running trial 3
Fitting 7 folds for each of 11 candidates, totalling 77 fits
Completed trial 3
Running trial 4
Fitting 7 folds for each of 11 candidates, totalling 77 fits
Completed trial 4
Running trial 5
Fitting 7 folds for each of 11 candidates, totalling 77 fits
Completed trial 5


['../results/lr_dct_results.pkl']

In [10]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

import xgboost

std_scaler = StandardScaler()
model = xgboost.XGBClassifier(use_label_encoder=True, objective='multi:softprob', verbosity = 0, silent = True)

hyperparameters = {
    'xgbclassifier__learning_rate': [0.01,0.025,0.05,0.1,0.25,0.5,1],
    'xgbclassifier__max_depth': [2, 4, 6, 8, 10, 12, 14]
}

score = make_scorer(f1_score, average='macro')

xgb_timefreq_results = gather_trial_results(df=time_freq_domain_df, exclftrs=['epoch_start','epoch_end','class','user'],
                                            target='class', groupftr='user', preprocessor=std_scaler, model=model,
                                            hyperparameters=hyperparameters, score=score,
                                            trials=5, trial_type='time-freq')

joblib.dump(lr_timefreq_results, '../results/xgb_timefreq_results.pkl')


Running trial 1
Fitting 7 folds for each of 49 candidates, totalling 343 fits


  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index








































Completed trial 1
Running trial 2
Fitting 7 folds for each of 49 candidates, totalling 343 fits










































Completed trial 2
Running trial 3
Fitting 7 folds for each of 49 candidates, totalling 343 fits








































Completed trial 3
Running trial 4
Fitting 7 folds for each of 49 candidates, totalling 343 fits








































Completed trial 4
Running trial 5
Fitting 7 folds for each of 49 candidates, totalling 343 fits








































Completed trial 5


NameError: name 'lr_timefreq_results' is not defined

In [11]:
joblib.dump(xgb_timefreq_results, '../results/xgb_timefreq_results.pkl')

['../results/xgb_timefreq_results.pkl']

In [None]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

import xgboost

std_scaler = StandardScaler()
model = xgboost.XGBClassifier(use_label_encoder=True, objective='multi:softprob', verbosity = 0, silent = True)

hyperparameters = {
    'xgbclassifier__learning_rate': [0.01,0.025,0.05,0.1,0.25,0.5,1],
    'xgbclassifier__max_depth': [2, 4, 6, 8, 10, 12, 14]
}

score = make_scorer(f1_score, average='macro')

xgb_dct_results = gather_trial_results(df=dct_domain_df, exclftrs=['epoch_start','epoch_end','class','user'],
                                            target='class', groupftr='user', preprocessor=std_scaler, model=model,
                                            hyperparameters=hyperparameters, score=score,
                                            trials=5, trial_type='time-freq')

joblib.dump(xgb_dct_results, '../results/xgb_dct_results.pkl')


Running trial 1
Fitting 7 folds for each of 49 candidates, totalling 343 fits


  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index








































Completed trial 1
Running trial 2
Fitting 7 folds for each of 49 candidates, totalling 343 fits












