In [2]:
import sys

# sys.path is a list of absolute path strings.
# Raw string literals keep the Windows backslashes literal: sequences such as
# "\P" and "\d" are not valid escapes in a normal string literal and only work
# by accident (newer Python versions warn about them).
for _project_dir in (
    r'C:\Projects\Private\PropStar',
    r'C:\Projects\Private\PropStar\datasets',
):
    # Guard against duplicates so re-running the cell does not grow sys.path.
    if _project_dir not in sys.path:
        sys.path.append(_project_dir)
from gridsearch.EstimatorSelectionHelper import EstimatorSelectionHelper

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, roc_auc_score, f1_score

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

In [4]:
def _tree_ensemble_grid():
    """Return a fresh copy of the grid shared by the two tree-ensemble entries."""
    return {
        'classifier__n_estimators': [16, 32],
        'classifier__max_depth': [None],  # single candidate: None
        'classifier__min_samples_split': [2],
        'classifier__min_samples_leaf': [1],
    }


# Hyper-parameter grid per estimator name. Keys use the "classifier__" prefix,
# i.e. they address a pipeline step named "classifier".
search_space = dict(
    RandomForestClassifier=_tree_ensemble_grid(),
    ExtraTreesClassifier=_tree_ensemble_grid(),
    AdaBoostClassifier={
        'classifier__n_estimators': [16, 32],
        'classifier__learning_rate': [0.01],
    },
    GradientBoostingClassifier={
        'classifier__n_estimators': [16, 32],
        'classifier__learning_rate': [0.01, 0.1],
        'classifier__max_depth': [5],
    },
)


In [5]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import numpy as np
import pandas as pd
from scipy.fftpack import fft

# Custom transformer for time-based features
class TimeFeaturesExtractor(BaseEstimator, TransformerMixin):
    """Turn a column of timestamps into clock-based features.

    The output is a DataFrame with the columns ``hour``, ``minute``,
    ``second``, ``sin_hour`` and ``cos_hour`` (the hour encoded on a 24-hour
    cycle). Values that cannot be parsed are coerced to ``NaT`` and therefore
    yield ``NaN`` features.

    Parameters
    ----------
    time_column_name : str, default='timestamp'
        Column picked when ``transform`` receives a DataFrame that contains it.
    """

    def __init__(self, time_column_name='timestamp'):
        self.time_column_name = time_column_name

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Extract the time features.

        Parameters
        ----------
        X : pandas.Series, pandas.DataFrame or 1-D / single-column array-like
            Timestamps (datetime values or strings parseable by
            ``pd.to_datetime``).
        y : ignored

        Returns
        -------
        pandas.DataFrame
            One row per input timestamp, indexed like the input Series.

        Raises
        ------
        ValueError
            If the input cannot be reduced to one datetime column.
        """
        time_data = self._to_datetime_series(X)
        # Dtype-family check instead of the literal '<M8[ns]' string so that
        # tz-aware and non-nanosecond datetime columns are accepted as well.
        if not (isinstance(time_data, pd.Series)
                and pd.api.types.is_datetime64_any_dtype(time_data)):
            raise ValueError("Input must be a pandas Series with datetime64 dtype.")

        hour = time_data.dt.hour
        hour_angle = 2 * np.pi * hour / 24
        return pd.DataFrame({
            'hour': hour,
            'minute': time_data.dt.minute,
            'second': time_data.dt.second,
            'sin_hour': np.sin(hour_angle),
            'cos_hour': np.cos(hour_angle)
        })

    def _to_datetime_series(self, X):
        """Reduce ``X`` to a single column and parse it with ``pd.to_datetime``."""
        if isinstance(X, pd.DataFrame):
            if self.time_column_name in X.columns:
                X = X[self.time_column_name]
            elif X.shape[1] == 1:
                # e.g. the output of a list-based column selector
                X = X.iloc[:, 0]
            # Otherwise the frame is handed to pd.to_datetime unchanged, which
            # keeps the previous behaviour for multi-column frames.
        elif not isinstance(X, pd.Series):
            values = np.asarray(X)
            if values.ndim == 2 and values.shape[1] == 1:
                values = values[:, 0]
            if values.ndim != 1:
                raise ValueError(
                    "Expected a single column of timestamps, got an array "
                    f"of shape {values.shape}."
                )
            # Wrap in a Series so the `.dt` accessor is available afterwards.
            X = pd.Series(values)
        return pd.to_datetime(X, errors='coerce')

# Custom transformer for statistical features
class StatisticalFeaturesExtractor(BaseEstimator, TransformerMixin):
    """Rolling mean and rolling standard deviation of every input column.

    Parameters
    ----------
    window_size : int, default=54
        Number of rows in the rolling window (original author's note:
        "Example window size ~10 seconds").
    """

    def __init__(self, window_size=54):
        self.window_size = window_size

    def fit(self, X, y=None):
        """Stateless transformer: nothing to fit."""
        return self

    def transform(self, X, y=None):
        """Return a 2-D array: rolling means first, then rolling stds, column-wise."""
        frame = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)
        # min_periods=1 lets the first rows use a shorter (partial) window.
        windowed = frame.rolling(window=self.window_size, min_periods=1)
        rolling_mean = windowed.mean().values
        rolling_std = windowed.std().values
        return np.hstack([rolling_mean, rolling_std])

# Custom transformer for frequency domain features
class FFTFeaturesExtractor(BaseEstimator, TransformerMixin):
    """Magnitude of the FFT taken along axis 0 (down each column) of the input."""

    def fit(self, X, y=None):
        """Stateless transformer: nothing to fit."""
        return self

    def transform(self, X, y=None):
        """Return ``|fft(X)|`` computed over axis 0; output has the input's shape."""
        spectrum = fft(X, axis=0)
        return np.abs(spectrum)

# Column selector helper
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Pick columns from the input by label via ``X[columns]``.

    Parameters
    ----------
    columns : label or list of labels
        Passed unchanged to ``X[...]``; with a pandas DataFrame a single label
        gives a Series and a list of labels gives a DataFrame.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Stateless transformer: nothing to fit."""
        return self

    def transform(self, X, y=None):
        """Return ``X[self.columns]``.

        ``y`` is accepted and ignored so the signature matches the other
        transformers defined in this notebook.
        """
        return X[self.columns]

# NOTE(review): the frame displayed further down in this notebook has columns
# named accX/accY/accZ/gyroX/gyroY/gyroZ and no 'wrist' column — confirm that
# the names below match the dataset actually being used.
sensor_columns = [
    'acceleration_x', 'acceleration_y', 'acceleration_z',
    'gyro_x', 'gyro_y', 'gyro_z',
]
timestamp_column = ['timestamp']

# Branch 1: rolling statistics of the sensor channels, imputed then scaled.
sensor_statistical_branch = Pipeline(steps=[
    ('selector', ColumnSelector(sensor_columns)),
    ('statistical_features', StatisticalFeaturesExtractor()),
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Branch 2: FFT magnitudes of the sensor channels, imputed then scaled.
sensor_fft_branch = Pipeline(steps=[
    ('selector', ColumnSelector(sensor_columns)),
    ('fft', FFTFeaturesExtractor()),
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Branch 3: one-hot encoding of the 'wrist' column.
wrist_encoder_branch = Pipeline(steps=[
    ('selector', ColumnSelector(['wrist'])),
    ('encoder', OneHotEncoder()),
])

# Define the feature engineering pipeline: the branches run side by side and
# their outputs are concatenated column-wise.
feature_engineering_pipeline = FeatureUnion(transformer_list=[
    ('sensor_statistical', sensor_statistical_branch),
    ('sensor_fft', sensor_fft_branch),
    # Disabled branch, kept for reference:
    # ('time_features', Pipeline([
    #     ('selector', ColumnSelector(timestamp_column)),
    #     # ('imputer', SimpleImputer(strategy='mean')),
    #     ('time_extractor', TimeFeaturesExtractor()),
    #     ('scaler', StandardScaler())
    # ])),
    ('wrist_encoder', wrist_encoder_branch),
])

# Combine feature engineering pipeline with a model in a full pipeline.
# Only the feature step is present so far; no classifier step has been added.
full_pipeline = Pipeline(steps=[
    ('features', feature_engineering_pipeline),
    # Add your classifier pipeline here
])


# Accelerometer gyro

In [8]:
# Load the raw accelerometer/gyroscope recording.
data = pd.read_csv('../datasets/accelerometer_gyro_mobile_phone_dataset.csv')

# Drop the rows whose timestamp string contains the literal date "6/25/2022".
# regex=False makes the intent (plain substring match) explicit.
has_date_stamp = data['timestamp'].str.contains("6/25/2022", regex=False)
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the original (avoids pandas' SettingWithCopyWarning).
data = data[~has_date_stamp].copy()

# Prefix a fixed dummy date and hour and turn the decimal comma into a dot so
# the strings match the format parsed below.
# NOTE(review): presumably the raw values look like "MM:SS,f" — confirm.
data['timestamp'] = '2024-01-01 00:' + data['timestamp'].str.replace(',', '.', regex=False)

# Convert to datetime
data['timestamp'] = pd.to_datetime(data['timestamp'], format='%Y-%m-%d %H:%M:%S.%f')

In [9]:
# Show the prepared frame; the rendered output lists only the first and last rows.
data

Unnamed: 0,accX,accY,accZ,gyroX,gyroY,gyroZ,timestamp,Activity
0,-0.496517,3.785628,8.954828,-0.142849,-0.126159,-0.022539,2024-01-01 00:34:22.900,1
1,-0.462388,3.869603,9.281898,0.084349,0.096695,0.092130,2024-01-01 00:34:23.000,1
2,-0.296084,3.820505,8.930728,0.061763,0.051543,0.071287,2024-01-01 00:34:23.100,1
3,-0.469723,3.890110,8.744067,0.007641,0.028679,0.109433,2024-01-01 00:34:23.200,1
4,-0.472418,4.109105,8.941207,-0.123640,0.099057,0.051943,2024-01-01 00:34:23.300,1
...,...,...,...,...,...,...,...,...
31986,-0.488734,1.610800,10.610386,0.079187,-0.174218,-0.050365,2024-01-01 00:03:15.100,1
31987,-0.049397,2.769092,7.008276,-0.083853,0.007656,-0.045658,2024-01-01 00:03:15.200,1
31988,0.291294,3.002007,6.732400,0.005984,-0.058994,-0.087044,2024-01-01 00:03:15.300,1
31989,0.256267,4.069138,8.687933,0.061487,-0.016278,-0.088728,2024-01-01 00:03:15.400,1


In [10]:
# Class balance: number of rows per Activity label.
rows_by_activity = data.groupby('Activity')
rows_by_activity['Activity'].count()

Activity
0      570
1    31420
Name: Activity, dtype: int64

In [117]:
# Target is the 'Activity' column; every other column stays in the feature frame.
y = data['Activity']
X = data.drop(columns='Activity')

In [None]:
# NOTE(review): `scoring` is not assigned in any cell visible in this notebook;
# on a fresh kernel this cell would raise NameError unless it is defined
# elsewhere — confirm.
es = EstimatorSelectionHelper(search_space=search_space)
fit_options = {
    'fe_pipeline': feature_engineering_pipeline,
    'scoring': scoring,
}
es.fit(X, y, **fit_options)

Running GridSearchCV for RandomForestClassifier.
Fitting 10 folds for each of 2 candidates, totalling 20 fits


In [113]:
# Smoke test: fit and transform each branch of the FeatureUnion on its own so a
# failing branch can be identified by name.
# The loop variables are module-level names; `pipeline` stays bound to the last
# branch after the loop and is read again by a later cell.
for name, pipeline in feature_engineering_pipeline.transformer_list:
    print(f"Testing {name} pipeline")
    try:
        # The result is only computed to trigger errors; it is not inspected.
        transformed_sample = pipeline.fit_transform(X)
        # print(f"{name} pipeline output:\n", transformed_sample)
    except Exception as e:
        # Deliberately broad: report the failure and continue with the next branch.
        print(f"Error in {name} pipeline: {e}")

Testing sensor_statistical pipeline
Testing sensor_fft pipeline
Testing wrist_encoder pipeline


In [102]:
# Step-by-step check of the timestamp branch: select -> extract -> scale,
# printing each intermediate result. Any failure is reported, not raised.
try:
    # A plain string (not a list) is passed, so the selector yields a Series.
    selector = ColumnSelector('timestamp')
    selected_data = selector.fit(X).transform(X)
    print("ColumnSelector output:\n", selected_data)

    time_extractor = TimeFeaturesExtractor()
    time_extracted = time_extractor.fit(selected_data).transform(selected_data)
    print("Time extractor output:\n", time_extracted)

    scaler = StandardScaler()
    scaled_data = scaler.fit(time_extracted).transform(time_extracted)
    print("Scaled data output:\n", scaled_data)

    # Repeat for each custom transformer
except Exception as e:
    print("Error in transformer: ", e)

ColumnSelector output:
 0       2017-06-30 13:51:15.847724020
1       2017-06-30 13:51:16.246945023
2       2017-06-30 13:51:16.446233987
3       2017-06-30 13:51:16.646117985
4       2017-06-30 13:51:16.846738994
                     ...             
88583   2017-07-09 20:09:15.317911028
88584   2017-07-09 20:09:15.517889022
88585   2017-07-09 20:09:15.717828989
88586   2017-07-09 20:09:15.917932987
88587   2017-07-09 20:09:16.117410004
Name: timestamp, Length: 88588, dtype: datetime64[ns]
Time extractor output:
        hour  minute  second  sin_hour  cos_hour
0        13      51      15 -0.258819 -0.965926
1        13      51      16 -0.258819 -0.965926
2        13      51      16 -0.258819 -0.965926
3        13      51      16 -0.258819 -0.965926
4        13      51      16 -0.258819 -0.965926
...     ...     ...     ...       ...       ...
88583    20       9      15 -0.866025  0.500000
88584    20       9      15 -0.866025  0.500000
88585    20       9      15 -0.866025  0.500000


In [86]:
# Inspect the `verbose` attribute of whatever object `pipeline` is bound to.
# NOTE(review): `pipeline` is only assigned as a loop variable in the cells
# visible here — confirm which object this is meant to inspect.
pipeline.verbose

False