In [1]:
import sys
# sys.path is a list of absolute path strings
sys.path.append('C:\Projects\Private\PropStar')
sys.path.append('C:\Projects\Private\PropStar\datasets')
from gridsearch.EstimatorSelectionHelper import EstimatorSelectionHelper

In [32]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, roc_auc_score, f1_score

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer


In [33]:
search_space = {
    'RandomForestClassifier': {
        'classifier__n_estimators': [16, 32],
        'classifier__max_depth': [None],  # 'null' is equivalent to None in Python
        'classifier__min_samples_split': [2],
        'classifier__min_samples_leaf': [1],
        },
    'ExtraTreesClassifier': {
        'classifier__n_estimators': [16, 32],
        'classifier__max_depth': [None],
        'classifier__min_samples_split': [2],
        'classifier__min_samples_leaf': [1],
        },
    'AdaBoostClassifier': {
        'classifier__n_estimators': [16, 32],
        'classifier__learning_rate': [0.01],
        },
    'GradientBoostingClassifier': {
        'classifier__n_estimators': [16, 32],
        'classifier__learning_rate': [0.01, 0.1],
        'classifier__max_depth': [5],
    }
}


In [56]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import numpy as np
import pandas as pd
from scipy.fftpack import fft

# Custom transformer for time-based features
class TimeFeaturesExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, time_column_name='timestamp'):
        self.time_column_name = time_column_name
    
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # Ensure that the input is a pandas DataFrame with datetime dtype for the timestamp column
        time_data = pd.to_datetime(X, errors='coerce')
        if time_data.dtype == '<M8[ns]':  # Check if it's a datetime Series
            # Extract time-related features
            features = pd.DataFrame({
                'year': time_data.dt.year,
                'month': time_data.dt.month,
                'day': time_data.dt.day,
                'hour': time_data.dt.hour,
                'minute': time_data.dt.minute,
                'second': time_data.dt.second,
                
                # 'sin_hour': np.sin(2 * np.pi * time_data.dt.hour / 24),
                # 'cos_hour': np.cos(2 * np.pi * time_data.dt.hour / 24)
            })
            return features
        else:
            raise ValueError("Input must be a pandas Series with datetime64 dtype.")

# Custom transformer for statistical features
class StatisticalFeaturesExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, window_size=54):  # Example window size ~10 seconds
        self.window_size = window_size

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        rolling = X.rolling(window=self.window_size, min_periods=1)
        return np.hstack([rolling.mean().values, rolling.std().values])

# Custom transformer for frequency domain features
class FFTFeaturesExtractor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return np.abs(fft(X, axis=0))

# Column selector helper
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.columns]

sensor_columns = ['acceleration_x', 'acceleration_y', 'acceleration_z', 'gyro_x', 'gyro_y', 'gyro_z']
timestamp_column = ['timestamp']

# Define the feature engineering pipeline
feature_engineering_pipeline = FeatureUnion([
    ('time_features', ColumnTransformer([('time_extractor', TimeFeaturesExtractor(), 'timestamp')])),
    ('sensor_statistical', Pipeline([
        ('selector', ColumnSelector(sensor_columns)),
        ('statistical_features', StatisticalFeaturesExtractor()),
        ('imputer', SimpleImputer(strategy='mean')),  # Handle missing values if necessary
        ('scaler', StandardScaler())  # Scaling features
    ])),
    ('sensor_fft', Pipeline([
        ('selector', ColumnSelector(sensor_columns)),
        ('fft', FFTFeaturesExtractor()),
        ('imputer', SimpleImputer(strategy='mean')),  # Handle missing values if necessary
        ('scaler', StandardScaler())  # Scaling FFT features
    ])),

    ('wrist_encoder', Pipeline([
        ('selector', ColumnSelector(['wrist'])),
        ('encoder', OneHotEncoder())
    ]))
])



# Combine feature engineering pipeline with a model in a full pipeline
full_pipeline = Pipeline([
    ('features', feature_engineering_pipeline),
    # Add your classifier pipeline here
])

scoring = {
    'accuracy': make_scorer(accuracy_score),
    'f1': make_scorer(f1_score),
    'roc_auc_score': make_scorer(roc_auc_score)
}

# Kinematics data

In [57]:
data = pd.read_csv('../datasets/Kinematics_Data.csv')
data.drop('username', axis=1, inplace=True)
data['timestamp'] = pd.to_datetime(data['date'] + ' ' + data['time'],format='%Y-%m-%d %H:%M:%S:%f')
data.drop(['date', 'time'], axis=1, inplace=True)

In [58]:
data

Unnamed: 0,wrist,activity,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,timestamp
0,0,0,0.2650,-0.7814,-0.0076,-0.0590,0.0325,-2.9296,2017-06-30 13:51:15.847724020
1,0,0,0.6722,-1.1233,-0.2344,-0.1757,0.0208,0.1269,2017-06-30 13:51:16.246945023
2,0,0,0.4399,-1.4817,0.0722,-0.9105,0.1063,-2.4367,2017-06-30 13:51:16.446233987
3,0,0,0.3031,-0.8125,0.0888,0.1199,-0.4099,-2.9336,2017-06-30 13:51:16.646117985
4,0,0,0.4814,-0.9312,0.0359,0.0527,0.4379,2.4922,2017-06-30 13:51:16.846738994
...,...,...,...,...,...,...,...,...,...
88583,0,0,0.3084,-0.8376,-0.1327,0.4823,2.0124,0.6048,2017-07-09 20:09:15.317911028
88584,0,0,0.4977,-1.0027,-0.4397,0.1022,-1.2565,-0.0761,2017-07-09 20:09:15.517889022
88585,0,0,0.4587,-1.1780,-0.2827,-1.4500,-0.2792,-1.2616,2017-07-09 20:09:15.717828989
88586,0,0,0.2590,-0.8582,-0.0759,-1.5165,0.4560,-1.7755,2017-07-09 20:09:15.917932987


In [59]:
X = data.drop('activity', axis=1)
y = data['activity']

In [60]:
es = EstimatorSelectionHelper(search_space=search_space)
es.fit(X, y, fe_pipeline=feature_engineering_pipeline, scoring=scoring)

Running GridSearchCV for RandomForestClassifier.
Fitting 10 folds for each of 2 candidates, totalling 20 fits


KeyboardInterrupt: 

In [51]:
for name, pipeline in feature_engineering_pipeline.transformer_list:
    print(f"Testing {name} pipeline")
    try:
        transformed_sample = pipeline.fit_transform(X)
        # print(f"{name} pipeline output:\n", transformed_sample)
    except Exception as e:
        print(f"Error in {name} pipeline: {e}")

Testing time_features pipeline
Error in time_features pipeline: to assemble mappings requires at least that [year, month, day] be specified: [day,month,year] is missing
Testing sensor_statistical pipeline
Testing sensor_fft pipeline
Testing wrist_encoder pipeline


In [54]:
try:
    ct = ColumnTransformer([('time_extractor', TimeFeaturesExtractor(), 'timestamp')])
    ct.fit_transform(X)
    # Repeat for each custom transformer
except Exception as e:
    print("Error in transformer: ", e)

In [55]:
X['timestamp'].dt.year

0        2017
1        2017
2        2017
3        2017
4        2017
         ... 
88583    2017
88584    2017
88585    2017
88586    2017
88587    2017
Name: timestamp, Length: 88588, dtype: int32